build_whoosh_index.py - This script builds a full-text search index for library datasets using the Whoosh library.

/scripts/data_libraries/build_whoosh_index.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 90 lines · 71 code · 8 blank · 11 comment · 15 complexity · 8d241b1150c9c156eec014b63c4da6a5 MD5 · raw file


#!/usr/bin/env python
"""
Build index for full-text whoosh search of files in data libraries.

Requires configuration settings in universe_wsgi.ini. See the whoosh settings
in the data library search section for more details.

Run from the ~/scripts/data_libraries directory:
%sh build_whoosh_index.sh
"""
import sys, os, csv, urllib, urllib2, ConfigParser
    
# Rebuild sys.path so that the "lib" directory under the current working
# directory comes first and the original first entry is dropped.
# NOTE(review): this relies on the script being started from a directory that
# contains lib/ (presumably the Galaxy root, via the shell wrapper) - confirm.
new_path = [ os.path.join( os.getcwd(), "lib" ) ]
new_path.extend( sys.path[1:] ) # remove scripts/ from the path
sys.path = new_path

from galaxy import eggs

# Whoosh is compatible with Python 2.5+.  Try to import Whoosh (and the Galaxy
# modules the indexer needs) and set a flag to indicate whether search is enabled.
# NOTE(review): only ImportError is caught here; any other exception raised by
# the require() calls below will propagate and abort the script.
try:
    # Must run before the whoosh imports below.
    eggs.require( "Whoosh" )

    from whoosh.filedb.filestore import FileStorage
    from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
    from whoosh.index import Index
    whoosh_search_enabled = True
    # Index schema: id is declared STORED, the remaining four fields TEXT.
    schema = Schema( id=STORED, name=TEXT, info=TEXT, dbkey=TEXT, message=TEXT )
    import galaxy.model.mapping
    from galaxy import config, model
    import pkg_resources
    pkg_resources.require( "SQLAlchemy >= 0.4" )
except ImportError, e:
    # Any failed import above disables indexing; the __main__ block checks this flag.
    whoosh_search_enabled = False
    schema = None

def build_index( sa_session, whoosh_index_dir ):
    storage = FileStorage( whoosh_index_dir )
    index = storage.create_index( schema )
    writer = index.writer()
    def to_unicode( a_basestr ):
        if type( a_basestr ) is str:
            return unicode( a_basestr, 'utf-8' )
        else:
            return a_basestr
    lddas_indexed = 0
    for id, name, info, dbkey, message in get_lddas( sa_session ):
        writer.add_document( id=id,
                             name=to_unicode( name ),
                             info=to_unicode( info ),
                             dbkey=to_unicode( dbkey ),
                             message=to_unicode( message ) )
        lddas_indexed += 1
    writer.commit()
    print "Number of active library datasets indexed: ", lddas_indexed

def get_lddas( sa_session ):
    """
    Generate one ( id, name, info, dbkey, message ) tuple for every
    LibraryDatasetDatasetAssociation whose deleted flag is False.

    info is the library dataset's info text with every 'no info' substring
    removed; it becomes '' when the text is empty or starts with 'upload'.
    message falls back to '' when the association has no message.
    """
    active_lddas = sa_session.query( model.LibraryDatasetDatasetAssociation ).filter_by( deleted=False )
    for ldda in active_lddas:
        ldda_id, name = ldda.id, ldda.name
        raw_info = ldda.library_dataset.get_info()
        if not raw_info or raw_info.startswith( 'upload' ):
            # Nothing worth indexing: missing text or an upload notice.
            info = ''
        else:
            info = raw_info.replace( 'no info', '' )
        dbkey = ldda.metadata.dbkey
        yield ldda_id, name, info, dbkey, ldda.message or ''

def get_sa_session_and_needed_config_settings( ini_file ):
    """
    Read the "app:main" section of ini_file and return a
    ( sa_session, config_settings ) pair.

    config_settings is a config.Configuration built from that section.  The
    session comes from galaxy.model.mapping.init(), called with
    create_tables=False and with the configured database connection string,
    or a sqlite URL built from config_settings.database when none is set.
    """
    # 'here' is supplied as a parser default and set to the current directory.
    parser = ConfigParser.ConfigParser( { 'here' : os.getcwd() } )
    parser.read( ini_file )
    app_settings = dict( parser.items( "app:main" ) )
    config_settings = config.Configuration( **app_settings )
    db_con = config_settings.database_connection or ( "sqlite:///%s?isolation_level=IMMEDIATE" % config_settings.database )
    mapped_model = galaxy.model.mapping.init( config_settings.file_path, db_con, engine_options={}, create_tables=False )
    return mapped_model.context.current, config_settings

if __name__ == "__main__":
    # Entry point: the first command-line argument is the ini file whose
    # "app:main" section holds the database and whoosh_index_dir settings.
    if whoosh_search_enabled:
        if len( sys.argv ) < 2:
            # Report the missing argument instead of dying with a bare IndexError.
            sys.stderr.write( "Usage: %s <ini_file>\n" % sys.argv[0] )
            sys.exit( 1 )
        ini_file = sys.argv[1]
        sa_session, config_settings = get_sa_session_and_needed_config_settings( ini_file )
        whoosh_index_dir = config_settings.whoosh_index_dir
        build_index( sa_session, whoosh_index_dir )
    else:
        # The guarded imports at the top of the file failed.  Say so rather
        # than exiting silently; the exit status is left unchanged.
        sys.stderr.write( "Whoosh or a required Galaxy module could not be imported; no index was built.\n" )

Summary ✨

This script builds a full-text search index for library datasets using the Whoosh library. It requires configuration settings in the universe_wsgi.ini file, which are used to connect to the Galaxy database and specify the directory where the search index will be stored. The script iterates over all active (non-deleted) library datasets in the database, extracts the id, name, info, dbkey, and message of each one, and adds them to the search index using Whoosh’s writer class. Once all library datasets have been indexed, the script commits the changes and prints a message indicating the number of library datasets that were indexed.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (5)

'def' Ensure functions have docstrings for documentation
36 40 56 72
'type(' Use isinstance() for type checking instead of type()
41