Mercurial > dedupe
changeset 21:3bcdb8bb6914
Huffman representations.
| author | Tom Fredrik Blenning Klaussen <bfg@blenning.no> |
|---|---|
| date | Wed, 05 Sep 2012 21:54:53 +0200 |
| parents | 754e12c927b3 |
| children | d62f708ad88b |
| files | HuffmanSet.cpp HuffmanSet.hpp HuffmanString.cpp HuffmanString.hpp TestHuffmanString.cpp |
| diffstat | 5 files changed, 261 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/HuffmanSet.cpp Wed Sep 05 21:54:53 2012 +0200 @@ -0,0 +1,150 @@ +#include "HuffmanString.hpp" + +#include <QtCore/QHash> +#include "NoSuchValueException.hpp" +#include <QtCore/QDebug> + +#include "InvalidDataException.hpp" + + +HuffmanSet::HuffmanSet() : cutoff(128), numInserts(0), lut(0) +{ + qDebug() << __FUNCTION__; +} + +void HuffmanSet::setCutoff(uint cutoff) +{ + this->cutoff = cutoff; +} + +QStringList HuffmanSet::chunks(const QString& str) +{ + return str.split(""); +} + +BitDecoder* HuffmanSet::createLut(const QMap<QString, uint>& freqTable) +{ + QMultiMap<uint, BitDecoder* > freqs; + for(QMap<QString, uint>::const_iterator it = freqTable.begin(); + it != freqTable.end(); ++it) { + freqs.insert(it.value(), new BitDecoder(it.key())); + } + + if (freqs.size() == 1) { + QList<uint> keys = freqs.keys(); + return freqs.take(keys[0]); + } + + QList<uint> keys = freqs.keys(); + + while (freqs.size() >= 2) { + QList<uint> keys = freqs.keys(); + + BitDecoder* v0 = freqs.take(keys[0]); + BitDecoder* v1 = freqs.take(keys[1]); + + BitDecoder* n = BitDecoder::merge(v0, v1); + + freqs.insert(keys[0] + keys[1], n); + } + BitDecoder* retVal = freqs.values()[0]; + return retVal; +} + +QString HuffmanSet::decode(const QBitArray& bits) const +{ + return lut->decode(bits); +} + +QBitArray HuffmanSet::encode(const QString& string, const QMap<QString, QBitArray>& encoder) +{ + QBitArray retVal; + QStringList c = chunks(string); + foreach(const QString& fragment, c) { + if (encoder.contains(fragment)) + retVal = BitDecoder::unite(retVal, encoder[fragment]); + else + throw InvalidDataException(); + } + return retVal; +} + +uint HuffmanSet::totalElements() const +{ + return newStrings.size() + map.size(); +} + +void HuffmanSet::rebuild() +{ + qDebug() << __FUNCTION__ << totalElements() << numInserts << newStrings.size(); + + QMap<key_t, QBitArray> newVals; + + QMap<QString, uint> freqTable; + + foreach(key_t key, map.keys()) { + foreach(const QString& chunk, chunks(decode(map.value(key)))) { + ++freqTable[chunk]; + } + } + foreach(key_t key, newStrings.keys()) { + foreach(const QString& chunk, chunks(newStrings.value(key))) { + ++freqTable[chunk]; + } + } + + BitDecoder* newLut = createLut(freqTable); + + encoder = newLut->createEncoder(); + + foreach(key_t key, map.keys()) { + map.insert(key, encode(decode(map.value(key)), encoder)); + } + foreach(key_t key, newStrings.keys()) { + map.insert(key, encode(newStrings.value(key), encoder)); + } + numInserts = 0; + delete lut; + lut = newLut; + newStrings.clear(); +} + +bool HuffmanSet::contains(key_t key) const +{ + return newStrings.contains(key) || map.contains(key); +} + +HuffmanSet::key_t HuffmanSet::hash(const QString& str) +{ + key_t key = qHash(str); + while (contains(key) && value(key) != str) + ++key; + return key; +} + +HuffmanSet::key_t HuffmanSet::insert(const QString& str) +{ + key_t key = hash(str); + if (!contains(key)) { + try { + QBitArray bits = encode(str, encoder); + map.insert(key, bits); + } + catch (InvalidDataException& e) { + newStrings.insert(key, str); + } + if (++numInserts >= cutoff) { + rebuild(); + } + } + return key; +} + +QString HuffmanSet::value(key_t key) const +{ + if (map.contains(key)) + return decode(map.value(key)); + if (newStrings.contains(key)) + return newStrings.value(key); + throw NoSuchValueException(); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/HuffmanSet.hpp Wed Sep 05 21:54:53 2012 +0200 @@ -0,0 +1,39 @@ +#ifndef HUFFMANSET_HPP +#define HUFFMANSET_HPP + +#include <QtCore/QMap> +#include <QtCore/QString> +#include <QtCore/QStringList> +#include <QtCore/QBitArray> + +#include "BitDecoder.hpp" + +class HuffmanSet { +public: + typedef uint key_t; + +private: + QMap<key_t, QString> newStrings; + QMap<key_t, QBitArray> map; + QMap<QString, QBitArray> encoder; + uint cutoff; + uint numInserts; + BitDecoder* lut; + + +public: + HuffmanSet(); + void setCutoff(uint cutoff); + static QStringList chunks(const QString& str); + BitDecoder* createLut(const QMap<QString, uint>& freqTable); + QString decode(const QBitArray& bits) const; + static QBitArray encode(const QString& string, const QMap<QString, QBitArray>& encoder); + void rebuild(); + bool contains(key_t key) const; + uint totalElements() const; + key_t hash(const QString& str); + key_t insert(const QString& str); + QString value(key_t key) const; +}; + +#endif //HUFFMANSET_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/HuffmanString.cpp Wed Sep 05 21:54:53 2012 +0200 @@ -0,0 +1,33 @@ +#include "HuffmanString.hpp" + +HuffmanSet* HuffmanString::set = 0; + +HuffmanString::HuffmanString(const QString& str, HuffmanSet* set) +{ + set = this->set; + if (!set) + set = new HuffmanSet(); + this->set = set; + + key = set->insert(str); +} + +QString HuffmanString::toString() const +{ + return set->value(key); +} + +HuffmanString::operator QString() const +{ + return toString(); +} + +bool HuffmanString::operator<(const HuffmanString& rhs) const +{ + return this->toString() < rhs.toString(); +} + +bool HuffmanString::operator==(const HuffmanString& rhs) const +{ + return this->toString() == rhs.toString(); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/HuffmanString.hpp Wed Sep 05 21:54:53 2012 +0200 @@ -0,0 +1,19 @@ +#ifndef HUFFMANSTRING_HPP +#define HUFFMANSTRING_HPP + +#include <QtCore/QString> +#include "HuffmanSet.hpp" + +class HuffmanString { + static HuffmanSet* set; + HuffmanSet::key_t key; +public: + + HuffmanString(const QString& str = QString(), HuffmanSet* set = NULL); + QString toString() const; + operator QString() const; + bool operator<(const HuffmanString& rhs) const; + bool operator==(const HuffmanString& rhs) const; +}; + +#endif //HUFFMANSTRING_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TestHuffmanString.cpp Wed Sep 05 21:54:53 2012 +0200 @@ -0,0 +1,20 @@ +#include "HuffmanString.hpp" +#include "TestFramework.hpp" + +#include <QtCore/QDebug> + +BOOST_AUTO_TEST_CASE( TestSimple ) +{ + HuffmanSet set; + set.setCutoff(1); + + QList<HuffmanString> strList; + for (uint n = 0; n < 100; ++n) { + QString str = QString("test%1").arg(n); + strList << HuffmanString(str, &set); + for (uint i = 0; i <= n; ++i) { + QString str = QString("test%1").arg(i); + BOOST_REQUIRE_EQUAL(strList[i], str); + } + } +}
