PageRenderTime 39ms CodeModel.GetById 10ms app.highlight 25ms RepoModel.GetById 1ms app.codeStats 0ms

/src/libtomahawk/database/fuzzyindex.cpp

Relevant Search: With Applications for Solr and Elasticsearch

For more in-depth reading about search, ranking, and generally everything you could ever want to know about how Lucene, Elasticsearch, or Solr work under the hood, I highly suggest this book. It is easily one of the most interesting technical books I have read in a long time. If you are tasked with solving search relevance problems, even if not in Solr or Elasticsearch, it should be your first reference. Amazon Affiliate Link
http://github.com/tomahawk-player/tomahawk
C++ | 355 lines | 265 code | 70 blank | 20 comment | 17 complexity | 08917fc3370de98328eced8a8acb8f4c MD5 | raw file
  1/* === This file is part of Tomahawk Player - <http://tomahawk-player.org> ===
  2 *
  3 *   Copyright 2010-2011, Christian Muehlhaeuser <muesli@tomahawk-player.org>
  4 *
  5 *   Tomahawk is free software: you can redistribute it and/or modify
  6 *   it under the terms of the GNU General Public License as published by
  7 *   the Free Software Foundation, either version 3 of the License, or
  8 *   (at your option) any later version.
  9 *
 10 *   Tomahawk is distributed in the hope that it will be useful,
 11 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 13 *   GNU General Public License for more details.
 14 *
 15 *   You should have received a copy of the GNU General Public License
 16 *   along with Tomahawk. If not, see <http://www.gnu.org/licenses/>.
 17 */
 18
 19#include "FuzzyIndex.h"
 20
 21#include <QDir>
 22#include <QTime>
 23
 24#include <CLucene.h>
 25#include <CLucene/queryParser/MultiFieldQueryParser.h>
 26
 27#include "DatabaseCommand_UpdateSearchIndex.h"
 28#include "DatabaseImpl.h"
 29#include "Database.h"
 30#include "utils/TomahawkUtils.h"
 31#include "utils/Logger.h"
 32#include "Source.h"
 33
 34using namespace lucene::analysis;
 35using namespace lucene::analysis::standard;
 36using namespace lucene::document;
 37using namespace lucene::store;
 38using namespace lucene::index;
 39using namespace lucene::queryParser;
 40using namespace lucene::search;
 41
 42
 43FuzzyIndex::FuzzyIndex( QObject* parent, bool wipe )
 44    : QObject( parent )
 45    , m_luceneReader( 0 )
 46    , m_luceneSearcher( 0 )
 47{
 48    QString m_lucenePath = TomahawkUtils::appDataDir().absoluteFilePath( "tomahawk.lucene" );
 49    QByteArray path = m_lucenePath.toUtf8();
 50    const char* cPath = path.constData();
 51
 52    bool failed = false;
 53    tDebug() << "Opening Lucene directory:" << path;
 54    try
 55    {
 56        m_analyzer = _CLNEW SimpleAnalyzer();
 57        m_luceneDir = FSDirectory::getDirectory( cPath );
 58    }
 59    catch ( CLuceneError& error )
 60    {
 61        tDebug() << "Caught CLucene error:" << error.what();
 62        failed = true;
 63    }
 64
 65    if ( failed )
 66    {
 67        tDebug() << "Initializing RAM directory instead.";
 68
 69        m_luceneDir = _CLNEW RAMDirectory();
 70        wipe = true;
 71    }
 72
 73    if ( wipe )
 74        wipeIndex();
 75}
 76
 77
// Destroys the CLucene objects held by this index.
FuzzyIndex::~FuzzyIndex()
{
    // The searcher is constructed on top of the reader (see search()), so it goes first.
    delete m_luceneSearcher;
    delete m_luceneReader;
    delete m_analyzer;
    delete m_luceneDir;
}
 85
 86
// Empties the index and schedules a rebuild.
//
// Runs a beginIndexing()/endIndexing() pair with nothing appended in between
// (beginIndexing() re-creates the index), then queues updateIndex() through a
// zero-delay single-shot timer. Currently always returns true.
bool
FuzzyIndex::wipeIndex()
{
    tLog( LOGVERBOSE ) << "Wiping fuzzy index...";
    beginIndexing();
    endIndexing();

    QTimer::singleShot( 0, this, SLOT( updateIndex() ) );

    return true; // FIXME
}
 98
 99
100void
101FuzzyIndex::updateIndex()
102{
103    DatabaseCommand* cmd = new DatabaseCommand_UpdateSearchIndex();
104    Database::instance()->enqueue( QSharedPointer<DatabaseCommand>( cmd ) );
105}
106
107
108void
109FuzzyIndex::beginIndexing()
110{
111    m_mutex.lock();
112
113    try
114    {
115        qDebug() << Q_FUNC_INFO << "Starting indexing.";
116        if ( m_luceneReader != 0 )
117        {
118            qDebug() << "Deleting old lucene stuff.";
119            m_luceneSearcher->close();
120            m_luceneReader->close();
121            delete m_luceneSearcher;
122            delete m_luceneReader;
123            m_luceneSearcher = 0;
124            m_luceneReader = 0;
125        }
126
127        qDebug() << "Creating new index writer.";
128        IndexWriter luceneWriter( m_luceneDir, m_analyzer, true );
129    }
130    catch( CLuceneError& error )
131    {
132        tDebug() << "Caught CLucene error:" << error.what();
133        Q_ASSERT( false );
134    }
135}
136
137
// Finishes an indexing run: unlocks m_mutex (locked by beginIndexing()) and
// then emits indexReady().
void
FuzzyIndex::endIndexing()
{
    m_mutex.unlock();
    emit indexReady();
}
144
145
146void
147FuzzyIndex::appendFields( const QMap< unsigned int, QMap< QString, QString > >& trackData )
148{
149    try
150    {
151        tDebug() << "Appending to index:" << trackData.count();
152        bool create = !IndexReader::indexExists( TomahawkUtils::appDataDir().absoluteFilePath( "tomahawk.lucene" ).toStdString().c_str() );
153        IndexWriter luceneWriter( m_luceneDir, m_analyzer, create );
154        Document doc;
155
156        QMapIterator< unsigned int, QMap< QString, QString > > it( trackData );
157        while ( it.hasNext() )
158        {
159            it.next();
160            unsigned int id = it.key();
161            QMap< QString, QString > values = it.value();
162
163            if ( values.contains( "track" ) )
164            {
165                doc.add( *( _CLNEW Field( _T( "fulltext" ), DatabaseImpl::sortname( QString( "%1 %2" ).arg( values.value( "artist" ) ).arg( values.value( "track" ) ) ).toStdWString().c_str(),
166                                          Field::STORE_NO | Field::INDEX_UNTOKENIZED ) ) );
167
168                doc.add( *( _CLNEW Field( _T( "track" ), DatabaseImpl::sortname( values.value( "track" ) ).toStdWString().c_str(),
169                                          Field::STORE_NO | Field::INDEX_UNTOKENIZED ) ) );
170
171                doc.add( *( _CLNEW Field( _T( "artist" ), DatabaseImpl::sortname( values.value( "artist" ) ).toStdWString().c_str(),
172                                          Field::STORE_NO | Field::INDEX_UNTOKENIZED ) ) );
173
174                doc.add( *( _CLNEW Field( _T( "artistid" ), values.value( "artistid" ).toStdWString().c_str(),
175                                          Field::STORE_YES | Field::INDEX_NO ) ) );
176
177                doc.add( *( _CLNEW Field( _T( "trackid" ), QString::number( id ).toStdWString().c_str(),
178                                          Field::STORE_YES | Field::INDEX_NO ) ) );
179            }
180            else if ( values.contains( "album" ) )
181            {
182                doc.add( *( _CLNEW Field( _T( "album" ), DatabaseImpl::sortname( values.value( "album" ) ).toStdWString().c_str(),
183                                          Field::STORE_NO | Field::INDEX_UNTOKENIZED ) ) );
184
185                doc.add( *( _CLNEW Field( _T( "albumid" ), QString::number( id ).toStdWString().c_str(),
186                                          Field::STORE_YES | Field::INDEX_NO ) ) );
187            }
188            else
189                Q_ASSERT( false );
190
191            luceneWriter.addDocument( &doc );
192            doc.clear();
193        }
194
195        luceneWriter.optimize();
196        luceneWriter.close();
197    }
198    catch( CLuceneError& error )
199    {
200        tDebug() << "Caught CLucene error:" << error.what();
201
202        QTimer::singleShot( 0, this, SLOT( wipeIndex() ) );
203    }
204}
205
206
// Only emits indexReady(); nothing is loaded here.
void
FuzzyIndex::loadLuceneIndex()
{
    emit indexReady();
}
212
213
214QMap< int, float >
215FuzzyIndex::search( const Tomahawk::query_ptr& query )
216{
217    QMutexLocker lock( &m_mutex );
218
219    QMap< int, float > resultsmap;
220    try
221    {
222        if ( !m_luceneReader )
223        {
224            if ( !IndexReader::indexExists( TomahawkUtils::appDataDir().absoluteFilePath( "tomahawk.lucene" ).toStdString().c_str() ) )
225            {
226                qDebug() << Q_FUNC_INFO << "index didn't exist.";
227                return resultsmap;
228            }
229
230            m_luceneReader = IndexReader::open( m_luceneDir );
231            m_luceneSearcher = _CLNEW IndexSearcher( m_luceneReader );
232        }
233
234        float minScore;
235        const TCHAR** fields = 0;
236        MultiFieldQueryParser parser( fields, m_analyzer );
237        BooleanQuery* qry = _CLNEW BooleanQuery();
238
239        if ( query->isFullTextQuery() )
240        {
241            QString escapedQuery = QString::fromWCharArray( parser.escape( DatabaseImpl::sortname( query->fullTextQuery() ).toStdWString().c_str() ) );
242
243            Term* term = _CLNEW Term( _T( "track" ), escapedQuery.toStdWString().c_str() );
244            Query* fqry = _CLNEW FuzzyQuery( term );
245            qry->add( fqry, true, BooleanClause::SHOULD );
246
247            term = _CLNEW Term( _T( "artist" ), escapedQuery.toStdWString().c_str() );
248            fqry = _CLNEW FuzzyQuery( term );
249            qry->add( fqry, true, BooleanClause::SHOULD );
250
251            term = _CLNEW Term( _T( "fulltext" ), escapedQuery.toStdWString().c_str() );
252            fqry = _CLNEW FuzzyQuery( term );
253            qry->add( fqry, true, BooleanClause::SHOULD );
254
255            minScore = 0.00;
256        }
257        else
258        {
259            QString track = QString::fromWCharArray( parser.escape( DatabaseImpl::sortname( query->track() ).toStdWString().c_str() ) );
260            QString artist = QString::fromWCharArray( parser.escape( DatabaseImpl::sortname( query->artist() ).toStdWString().c_str() ) );
261//            QString album = QString::fromWCharArray( parser.escape( query->album().toStdWString().c_str() ) );
262
263            Term* term = _CLNEW Term( _T( "track" ), track.toStdWString().c_str() );
264            Query* fqry = _CLNEW FuzzyQuery( term );
265            qry->add( fqry, true, BooleanClause::MUST );
266
267            term = _CLNEW Term( _T( "artist" ), artist.toStdWString().c_str() );
268            fqry = _CLNEW FuzzyQuery( term );
269            qry->add( fqry, true, BooleanClause::MUST );
270
271            minScore = 0.00;
272        }
273
274        Hits* hits = m_luceneSearcher->search( qry );
275        for ( uint i = 0; i < hits->length(); i++ )
276        {
277            Document* d = &hits->doc( i );
278
279            float score = hits->score( i );
280            int id = QString::fromWCharArray( d->get( _T( "trackid" ) ) ).toInt();
281
282            if ( score > minScore )
283            {
284                resultsmap.insert( id, score );
285//                tDebug() << "Index hit:" << id << score << QString::fromWCharArray( ((Query*)qry)->toString() );
286            }
287        }
288
289        delete hits;
290        delete qry;
291    }
292    catch( CLuceneError& error )
293    {
294        tDebug() << "Caught CLucene error:" << error.what() << query->toString();
295
296        QTimer::singleShot( 0, this, SLOT( wipeIndex() ) );
297    }
298
299    return resultsmap;
300}
301
302
303QMap< int, float >
304FuzzyIndex::searchAlbum( const Tomahawk::query_ptr& query )
305{
306    Q_ASSERT( query->isFullTextQuery() );
307
308    QMutexLocker lock( &m_mutex );
309
310    QMap< int, float > resultsmap;
311    try
312    {
313        if ( !m_luceneReader )
314        {
315            if ( !IndexReader::indexExists( TomahawkUtils::appDataDir().absoluteFilePath( "tomahawk.lucene" ).toStdString().c_str() ) )
316            {
317                qDebug() << Q_FUNC_INFO << "index didn't exist.";
318                return resultsmap;
319            }
320
321            m_luceneReader = IndexReader::open( m_luceneDir );
322            m_luceneSearcher = _CLNEW IndexSearcher( m_luceneReader );
323        }
324
325        QueryParser parser( _T( "album" ), m_analyzer );
326        QString escapedName = QString::fromWCharArray( parser.escape( DatabaseImpl::sortname( query->fullTextQuery() ).toStdWString().c_str() ) );
327
328        Query* qry = _CLNEW FuzzyQuery( _CLNEW Term( _T( "album" ), escapedName.toStdWString().c_str() ) );
329        Hits* hits = m_luceneSearcher->search( qry );
330        for ( uint i = 0; i < hits->length(); i++ )
331        {
332            Document* d = &hits->doc( i );
333
334            float score = hits->score( i );
335            int id = QString::fromWCharArray( d->get( _T( "albumid" ) ) ).toInt();
336
337            if ( score > 0.30 )
338            {
339                resultsmap.insert( id, score );
340//                tDebug() << "Index hit:" << id << score;
341            }
342        }
343
344        delete hits;
345        delete qry;
346    }
347    catch( CLuceneError& error )
348    {
349        tDebug() << "Caught CLucene error:" << error.what();
350
351        QTimer::singleShot( 0, this, SLOT( wipeIndex() ) );
352    }
353
354    return resultsmap;
355}