annotate HuffmanSet.cpp @ 42:4c283daa42c7

Optimize diacritics removal.
author Tom Fredrik Blenning Klaussen <bfg@blenning.no>
date Sun, 09 Sep 2012 16:09:52 +0200
parents f711ddb56ae7
children f8d0ea827db3
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
21
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
1 #include "HuffmanString.hpp"
40
f711ddb56ae7 Sort up includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 37
diff changeset
2
f711ddb56ae7 Sort up includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 37
diff changeset
3 #include "Exception/InvalidDataException.hpp"
28
b2c2c2bf2bbd Refactor Exceptions into a separate directory.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 26
diff changeset
4 #include "Exception/NoSuchValueException.hpp"
21
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
5
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
6 #include <QtCore/QHash>
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
7
26
c0ddc978475a Remove debug info from HuffmanSet.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 21
diff changeset
8 HuffmanSet::HuffmanSet() : cutoff(256), numInserts(0), lut(0)
21
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
9 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
10 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
11
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
12 void HuffmanSet::setCutoff(uint cutoff)
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
13 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
14 this->cutoff = cutoff;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
15 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
16
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
17 QStringList HuffmanSet::chunks(const QString& str)
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
18 {
37
c52a0627337c BUGFIX: Chunking got extra values at beginning and end.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 34
diff changeset
19 return str.split("", QString::SkipEmptyParts);
21
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
20 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
21
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
22 BitDecoder* HuffmanSet::createLut(const QMap<QString, uint>& freqTable)
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
23 {
34
fda70a362ed5 Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 28
diff changeset
24 QMultiMap<uint, BitDecoder* > freqs;
21
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
25 for(QMap<QString, uint>::const_iterator it = freqTable.begin();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
26 it != freqTable.end(); ++it) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
27 freqs.insert(it.value(), new BitDecoder(it.key()));
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
28 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
29
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
30 if (freqs.size() == 1) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
31 QList<uint> keys = freqs.keys();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
32 return freqs.take(keys[0]);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
33 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
34
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
35 QList<uint> keys = freqs.keys();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
36
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
37 while (freqs.size() >= 2) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
38 QList<uint> keys = freqs.keys();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
39
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
40 BitDecoder* v0 = freqs.take(keys[0]);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
41 BitDecoder* v1 = freqs.take(keys[1]);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
42
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
43 BitDecoder* n = BitDecoder::merge(v0, v1);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
44
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
45 freqs.insert(keys[0] + keys[1], n);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
46 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
47 BitDecoder* retVal = freqs.values()[0];
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
48 return retVal;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
49 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
50
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
51 QString HuffmanSet::decode(const QBitArray& bits) const
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
52 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
53 return lut->decode(bits);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
54 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
55
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
56 QBitArray HuffmanSet::encode(const QString& string, const QMap<QString, QBitArray>& encoder)
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
57 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
58 QBitArray retVal;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
59 QStringList c = chunks(string);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
60 foreach(const QString& fragment, c) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
61 if (encoder.contains(fragment))
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
62 retVal = BitDecoder::unite(retVal, encoder[fragment]);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
63 else
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
64 throw InvalidDataException();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
65 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
66 return retVal;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
67 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
68
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
69 uint HuffmanSet::totalElements() const
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
70 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
71 return newStrings.size() + map.size();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
72 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
73
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
74 void HuffmanSet::rebuild()
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
75 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
76 QMap<QString, uint> freqTable;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
77
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
78 foreach(key_t key, map.keys()) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
79 foreach(const QString& chunk, chunks(decode(map.value(key)))) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
80 ++freqTable[chunk];
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
81 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
82 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
83 foreach(key_t key, newStrings.keys()) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
84 foreach(const QString& chunk, chunks(newStrings.value(key))) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
85 ++freqTable[chunk];
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
86 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
87 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
88
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
89 BitDecoder* newLut = createLut(freqTable);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
90
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
91 encoder = newLut->createEncoder();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
92
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
93 foreach(key_t key, map.keys()) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
94 map.insert(key, encode(decode(map.value(key)), encoder));
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
95 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
96 foreach(key_t key, newStrings.keys()) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
97 map.insert(key, encode(newStrings.value(key), encoder));
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
98 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
99 numInserts = 0;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
100 delete lut;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
101 lut = newLut;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
102 newStrings.clear();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
103 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
104
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
105 bool HuffmanSet::contains(key_t key) const
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
106 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
107 return newStrings.contains(key) || map.contains(key);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
108 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
109
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
110 HuffmanSet::key_t HuffmanSet::hash(const QString& str)
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
111 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
112 key_t key = qHash(str);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
113 while (contains(key) && value(key) != str)
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
114 ++key;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
115 return key;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
116 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
117
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
118 HuffmanSet::key_t HuffmanSet::insert(const QString& str)
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
119 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
120 key_t key = hash(str);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
121 if (!contains(key)) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
122 try {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
123 QBitArray bits = encode(str, encoder);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
124 map.insert(key, bits);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
125 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
126 catch (InvalidDataException& e) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
127 newStrings.insert(key, str);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
128 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
129 if (++numInserts >= cutoff) {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
130 rebuild();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
131 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
132 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
133 return key;
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
134 }
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
135
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
136 QString HuffmanSet::value(key_t key) const
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
137 {
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
138 if (map.contains(key))
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
139 return decode(map.value(key));
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
140 if (newStrings.contains(key))
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
141 return newStrings.value(key);
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
142 throw NoSuchValueException();
3bcdb8bb6914 Huffman representations.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
143 }