Mercurial > dedupe
changeset 78:9744ec195be3
Encapsulate EditDistance with caching.
| author | Tom Fredrik Blenning Klaussen <bfg@bfgconsult.no> |
|---|---|
| date | Thu, 10 Oct 2013 01:07:52 +0200 |
| parents | a827f3687c4a |
| children | 114be42a612c |
| files | CMakeLists.txt CachedEditDistance.cpp CachedEditDistance.hpp DataController.cpp EditDistance.cpp EditDistance.hpp FileDBLink.cpp TestDataBase.hpp TestMemoryDBLink.cpp TestSQLGenerator.cpp |
| diffstat | 10 files changed, 94 insertions(+), 58 deletions(-) [+] |
line wrap: on
line diff
--- a/CMakeLists.txt Sat Feb 16 19:00:54 2013 +0100 +++ b/CMakeLists.txt Thu Oct 10 01:07:52 2013 +0200 @@ -69,6 +69,7 @@ SET(CLASS_SOURCES BitArray.cpp BitDecoder.cpp + CachedEditDistance.cpp ConfigurationProcessing.cpp DataController.cpp EditDistance.cpp @@ -89,6 +90,7 @@ SET(CLASS_HEADERS BitArray.hpp BitDecoder.hpp + CachedEditDistance.hpp ConfigurationProcessing.hpp DataController.hpp EditDistance.hpp @@ -220,7 +222,7 @@ coverage_test.info 'moc_*' --output-file coverage.preprocessed2 DEPENDS coverage_test.info - COMMENT "Removing \"moc_\"-files" + COMMENT "Removing \"moc_\"-files" ) @@ -237,7 +239,7 @@ ADD_CUSTOM_TARGET(coverage_presentation genhtml -q ${CMAKE_CURRENT_BINARY_DIR}/coverage.preprocessed --output-directory - ${CMAKE_CURRENT_BINARY_DIR}/coverage_presentation + ${CMAKE_CURRENT_BINARY_DIR}/coverage_presentation DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/coverage.preprocessed )
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/CachedEditDistance.cpp Thu Oct 10 01:07:52 2013 +0200
@@ -0,0 +1,33 @@
+#include "CachedEditDistance.hpp"
+
+#include "CompileTimeConstants.h"
+#include "ConfigurationProcessing.hpp"
+
+CachedEditDistance::cacheType* CachedEditDistance::cache = 0;
+//CachedEditDistance::cacheType CachedEditDistance::cache;
+
+int CachedEditDistance::Compute(QString a, QString b, bool remove) {
+  if (remove) {
+    removeDiacriticsNoCopy(a);
+    removeDiacriticsNoCopy(b);
+  }
+
+  if ( a == b)
+    return 0;
+
+  OrderedPair<UniqueString> lup(a, b);
+
+  if (cache == 0) {
+    QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION);
+    CachedEditDistance::cache = new cacheType(cacheLocation, "EditLUT");
+  }
+  boost::optional<int> res = cache->value(lup);
+  if (res)
+    return *res;
+
+  int retVal = EditDistance::Compute(a, b, false);
+
+  cache->insert(lup, retVal);
+
+  return retVal;
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/CachedEditDistance.hpp Thu Oct 10 01:07:52 2013 +0200
@@ -0,0 +1,48 @@
+#ifndef CACHEDEDITDISTANCE_HPP
+#define CACHEDEDITDISTANCE_HPP
+
+#include "DBCache.hpp"
+
+#include "EditDistance.hpp"
+
+template<typename Value>
+struct InsertRegulator<OrderedPair<UniqueString>, Value >
+{
+  uint n;
+  void start()
+  {
+    n = 0;
+    HuffmanString::getSet().setAutoRebuild(false);
+  }
+
+  static void finish()
+  {
+    HuffmanString::getSet().rebuild();
+    HuffmanString::getSet().setAutoRebuild(true);
+  }
+
+  void next()
+  {
+    if (++n == 2048)
+      HuffmanString::getSet().rebuild();
+  }
+};
+
+class CachedEditDistance {
+protected:
+  typedef DBCache<OrderedPair<UniqueString>, int, true> cacheType;
+public:
+  static int Compute(QString a, QString b, bool removeDiacritics = false);
+  static void removeDiacriticsNoCopy(QString& in)
+  {
+    EditDistance::removeDiacriticsNoCopy(in);
+  }
+  static QString removeDiacritics(const QString& in)
+  {
+    return EditDistance::removeDiacritics(in);
+  }
+
+  static cacheType* cache;
+};
+
+#endif //CACHEDEDITDISTANCE_HPP
--- a/DataController.cpp Sat Feb 16 19:00:54 2013 +0100 +++ b/DataController.cpp Thu Oct 10 01:07:52 2013 +0200 @@ -2,7 +2,7 @@ #include "CompileTimeConstants.h" #include "ConfigurationProcessing.hpp" -#include "EditDistance.hpp" +#include "CachedEditDistance.hpp" #include "SqliteDBLink.hpp" #include "Exception/PermissionException.hpp" @@ -267,7 +267,7 @@ int absoluteCutoff = line->name().length() * editDistanceCutoff; foreach(QSharedPointer<FileDBLink::DBInfo> dup, elems) { if(dup != line) { - int distance = EditDistance::Compute(line->name(), dup->name()); + int distance = CachedEditDistance::Compute(line->name(), dup->name()); if (distance <= absoluteCutoff) { oList.insert(distance, dup); @@ -486,7 +486,7 @@ if (showGUI) { setupGUI(); - + QTimer* populator = new QTimer(this); populator->setSingleShot(true); populator->setInterval(50);
--- a/EditDistance.cpp Sat Feb 16 19:00:54 2013 +0100 +++ b/EditDistance.cpp Thu Oct 10 01:07:52 2013 +0200 @@ -1,15 +1,9 @@ #include "EditDistance.hpp" -#include "CompileTimeConstants.h" -#include "ConfigurationProcessing.hpp" - #include <boost/numeric/ublas/matrix.hpp> #define CharComparer(A, B) (QChar(A) == QChar(B)) -EditDistance::cacheType* EditDistance::cache = 0; -//EditDistance::cacheType EditDistance::cache; - void EditDistance::removeDiacriticsNoCopy(QString& in) { for(QString::iterator c = in.begin(); @@ -34,19 +28,6 @@ removeDiacriticsNoCopy(b); } - if ( a == b) - return 0; - - OrderedPair<UniqueString> lup(a, b); - - if (cache == 0) { - QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION); - EditDistance::cache = new cacheType(cacheLocation, "EditLUT"); - } - boost::optional<int> res = cache->value(lup); - if (res) - return *res; - uint s1 = a.size(); uint s2 = b.size(); @@ -76,7 +57,6 @@ // Return final value int retVal = d(s1, s2); - cache->insert(lup, retVal); return retVal; }
--- a/EditDistance.hpp Sat Feb 16 19:00:54 2013 +0100 +++ b/EditDistance.hpp Thu Oct 10 01:07:52 2013 +0200 @@ -1,40 +1,13 @@ #ifndef EDITDISTANCE_HPP #define EDITDISTANCE_HPP -#include "DBCache.hpp" - -template<typename Value> -struct InsertRegulator<OrderedPair<UniqueString>, Value > -{ - uint n; - void start() - { - n = 0; - HuffmanString::getSet().setAutoRebuild(false); - } - - static void finish() - { - HuffmanString::getSet().rebuild(); - HuffmanString::getSet().setAutoRebuild(true); - } - - void next() - { - if (++n == 2048) - HuffmanString::getSet().rebuild(); - } -}; +#include <QtCore/QString> class EditDistance { -protected: - typedef DBCache<OrderedPair<UniqueString>, int, true> cacheType; public: static int Compute(QString a, QString b, bool removeDiacritics = false); static void removeDiacriticsNoCopy(QString& in); static QString removeDiacritics(const QString& in); - - static cacheType* cache; }; #endif //EDITDISTANCE_HPP
--- a/FileDBLink.cpp Sat Feb 16 19:00:54 2013 +0100 +++ b/FileDBLink.cpp Thu Oct 10 01:07:52 2013 +0200 @@ -1,6 +1,6 @@ #include "FileDBLink.hpp" -#include "EditDistance.hpp" +#include "CachedEditDistance.hpp" #include "Exception/PermissionException.hpp" @@ -142,7 +142,7 @@ if (info == *it2) continue; QString p2 = (*it2)->name(); - int dist = EditDistance::Compute(p1, p2, false); + int dist = CachedEditDistance::Compute(p1, p2, false); if (dist < minDist) { minDist = dist; other = (*it2)->path();
--- a/TestDataBase.hpp Sat Feb 16 19:00:54 2013 +0100 +++ b/TestDataBase.hpp Thu Oct 10 01:07:52 2013 +0200 @@ -9,7 +9,7 @@ QSqlDatabase getDatabase(); TestDatabase(); ~TestDatabase(); - + private: const QString connectionName; QSqlDatabase *db;
--- a/TestMemoryDBLink.cpp Sat Feb 16 19:00:54 2013 +0100 +++ b/TestMemoryDBLink.cpp Thu Oct 10 01:07:52 2013 +0200 @@ -61,5 +61,5 @@ prev = info->path(); } } - + }
--- a/TestSQLGenerator.cpp Sat Feb 16 19:00:54 2013 +0100 +++ b/TestSQLGenerator.cpp Thu Oct 10 01:07:52 2013 +0200 @@ -171,7 +171,7 @@ .arg(fieldCreateString2); BOOST_REQUIRE(query.exec(createQuery)); - + QString queryString = QString("INSERT into %1 (%2, %3) VALUES(%4, %5);") .arg(tableName) .arg(SQLGenerator<int>::fieldName("value1"))
