Mercurial > dedupe
diff HuffmanSet.cpp @ 21:3bcdb8bb6914
Huffman representations.
| author | Tom Fredrik Blenning Klaussen <bfg@blenning.no> |
|---|---|
| date | Wed, 05 Sep 2012 21:54:53 +0200 |
| parents | |
| children | c0ddc978475a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/HuffmanSet.cpp Wed Sep 05 21:54:53 2012 +0200 @@ -0,0 +1,150 @@ +#include "HuffmanString.hpp" + +#include <QtCore/QHash> +#include "NoSuchValueException.hpp" +#include <QtCore/QDebug> + +#include "InvalidDataException.hpp" + + +HuffmanSet::HuffmanSet() : cutoff(128), numInserts(0), lut(0) +{ + qDebug() << __FUNCTION__; +} + +void HuffmanSet::setCutoff(uint cutoff) +{ + this->cutoff = cutoff; +} + +QStringList HuffmanSet::chunks(const QString& str) +{ + return str.split(""); +} + +BitDecoder* HuffmanSet::createLut(const QMap<QString, uint>& freqTable) +{ + QMultiMap<uint, BitDecoder* > freqs; + for(QMap<QString, uint>::const_iterator it = freqTable.begin(); + it != freqTable.end(); ++it) { + freqs.insert(it.value(), new BitDecoder(it.key())); + } + + if (freqs.size() == 1) { + QList<uint> keys = freqs.keys(); + return freqs.take(keys[0]); + } + + QList<uint> keys = freqs.keys(); + + while (freqs.size() >= 2) { + QList<uint> keys = freqs.keys(); + + BitDecoder* v0 = freqs.take(keys[0]); + BitDecoder* v1 = freqs.take(keys[1]); + + BitDecoder* n = BitDecoder::merge(v0, v1); + + freqs.insert(keys[0] + keys[1], n); + } + BitDecoder* retVal = freqs.values()[0]; + return retVal; +} + +QString HuffmanSet::decode(const QBitArray& bits) const +{ + return lut->decode(bits); +} + +QBitArray HuffmanSet::encode(const QString& string, const QMap<QString, QBitArray>& encoder) +{ + QBitArray retVal; + QStringList c = chunks(string); + foreach(const QString& fragment, c) { + if (encoder.contains(fragment)) + retVal = BitDecoder::unite(retVal, encoder[fragment]); + else + throw InvalidDataException(); + } + return retVal; +} + +uint HuffmanSet::totalElements() const +{ + return newStrings.size() + map.size(); +} + +void HuffmanSet::rebuild() +{ + qDebug() << __FUNCTION__ << totalElements() << numInserts << newStrings.size(); + + QMap<key_t, QBitArray> newVals; + + QMap<QString, uint> freqTable; + + foreach(key_t key, map.keys()) { + foreach(const QString& chunk, chunks(decode(map.value(key)))) { + ++freqTable[chunk]; + } + } + foreach(key_t key, newStrings.keys()) { + foreach(const QString& chunk, chunks(newStrings.value(key))) { + ++freqTable[chunk]; + } + } + + BitDecoder* newLut = createLut(freqTable); + + encoder = newLut->createEncoder(); + + foreach(key_t key, map.keys()) { + map.insert(key, encode(decode(map.value(key)), encoder)); + } + foreach(key_t key, newStrings.keys()) { + map.insert(key, encode(newStrings.value(key), encoder)); + } + numInserts = 0; + delete lut; + lut = newLut; + newStrings.clear(); +} + +bool HuffmanSet::contains(key_t key) const +{ + return newStrings.contains(key) || map.contains(key); +} + +HuffmanSet::key_t HuffmanSet::hash(const QString& str) +{ + key_t key = qHash(str); + while (contains(key) && value(key) != str) + ++key; + return key; +} + +HuffmanSet::key_t HuffmanSet::insert(const QString& str) +{ + key_t key = hash(str); + if (!contains(key)) { + try { + QBitArray bits = encode(str, encoder); + map.insert(key, bits); + } + catch (InvalidDataException& e) { + newStrings.insert(key, str); + } + if (++numInserts >= cutoff) { + rebuild(); + } + } + return key; +} + +QString HuffmanSet::value(key_t key) const +{ + if (map.contains(key)) + return decode(map.value(key)); + if (newStrings.contains(key)) + return newStrings.value(key); + throw NoSuchValueException(); +}
