PageRenderTime 21ms CodeModel.GetById 13ms app.highlight 6ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/data_libraries/build_whoosh_index.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 90 lines | 73 code | 6 blank | 11 comment | 8 complexity | 8d241b1150c9c156eec014b63c4da6a5 MD5 | raw file
 1#!/usr/bin/env python
 2"""
 3Build index for full-text whoosh search of files in data libraries.
 4
 5Requires configuration settings in universe_wsgi.ini. See the whoosh settings
 6in the data library search section for more details.
 7
 8Run from the ~/scripts/data_libraries directory:
 9%sh build_whoosh_index.sh
10"""
11import sys, os, csv, urllib, urllib2, ConfigParser
12    
# Search ./lib (relative to the current working directory) first, and drop the
# first entry of the original path so that scripts/ is no longer searched.
new_path = [ os.path.join( os.getcwd(), "lib" ) ] + sys.path[1:]
sys.path = new_path
16
17from galaxy import eggs
18
# Whoosh is compatible with Python 2.5+. Try to import Whoosh, together with
# the other modules this script needs, and set a flag to indicate whether
# search is enabled.
try:
    eggs.require( "Whoosh" )

    from whoosh.filedb.filestore import FileStorage
    from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
    from whoosh.index import Index
    whoosh_search_enabled = True
    # Fields written by build_index(): id uses the STORED type, the rest TEXT.
    schema = Schema( id=STORED, name=TEXT, info=TEXT, dbkey=TEXT, message=TEXT )
    import galaxy.model.mapping
    from galaxy import config, model
    import pkg_resources
    pkg_resources.require( "SQLAlchemy >= 0.4" )
except ImportError:
    # Any failed import above disables indexing. The exception object was never
    # used, so it is not bound to a name.
    whoosh_search_enabled = False
    schema = None
35
def build_index( sa_session, whoosh_index_dir ):
    """
    Create a Whoosh index in whoosh_index_dir from the module-level schema and
    add one document for every tuple yielded by get_lddas( sa_session ).

    sa_session is passed straight through to get_lddas(); whoosh_index_dir is
    the directory handed to FileStorage. The number of documents added is
    printed to standard output after the writer is committed. Returns None.
    """
    def to_unicode( value ):
        # Byte strings (Python 2 str, including subclasses) are decoded as
        # UTF-8; anything else, e.g. unicode or None, is passed through as is.
        if isinstance( value, str ):
            return unicode( value, 'utf-8' )
        return value

    storage = FileStorage( whoosh_index_dir )
    index = storage.create_index( schema )
    writer = index.writer()
    lddas_indexed = 0
    # ldda_id rather than id, to avoid shadowing the builtin.
    for ldda_id, name, info, dbkey, message in get_lddas( sa_session ):
        writer.add_document( id=ldda_id,
                             name=to_unicode( name ),
                             info=to_unicode( info ),
                             dbkey=to_unicode( dbkey ),
                             message=to_unicode( message ) )
        lddas_indexed += 1
    writer.commit()
    # A single parenthesised expression prints the same text whether print is
    # a statement or a function.
    print( "%s %s" % ( "Number of active library datasets indexed: ", lddas_indexed ) )
55
def get_lddas( sa_session ):
    """
    Generate an ( id, name, info, dbkey, message ) tuple for each
    LibraryDatasetDatasetAssociation returned by a query filtered on
    deleted=False.

    info comes from the association's library_dataset.get_info(); it becomes
    '' when empty or when it starts with 'upload', otherwise every occurrence
    of 'no info' is removed from it. A false message is yielded as ''.
    """
    active_lddas = sa_session.query( model.LibraryDatasetDatasetAssociation ).filter_by( deleted=False )
    for ldda in active_lddas:
        ldda_id, name = ldda.id, ldda.name
        info = ldda.library_dataset.get_info()
        if not info or info.startswith( 'upload' ):
            info = ''
        else:
            info = info.replace( 'no info', '' )
        dbkey = ldda.metadata.dbkey
        message = ldda.message or ''
        yield ldda_id, name, info, dbkey, message
71
def get_sa_session_and_needed_config_settings( ini_file ):
    """
    Read ini_file and return ( session, config_settings ).

    The options of the "app:main" section (parsed with 'here' defaulting to the
    current working directory) are passed as keyword arguments to
    config.Configuration. When that configuration has no database_connection,
    a sqlite URL is built from its database attribute. The model mapping is
    initialised with create_tables=False and its context.current is returned
    as the session.
    """
    parser = ConfigParser.ConfigParser( { 'here' : os.getcwd() } )
    parser.read( ini_file )
    app_settings = dict( parser.items( "app:main" ) )
    config_settings = config.Configuration( **app_settings )
    # The sqlite fallback is only evaluated when no connection string is set.
    db_con = ( config_settings.database_connection
               or "sqlite:///%s?isolation_level=IMMEDIATE" % config_settings.database )
    # Named mapped_model so the module-level `model` import is not shadowed.
    mapped_model = galaxy.model.mapping.init( config_settings.file_path, db_con, engine_options={}, create_tables=False )
    return mapped_model.context.current, config_settings
84
if __name__ == "__main__":
    # Command line: build_whoosh_index.py <ini_file>
    if not whoosh_search_enabled:
        # The guarded imports at the top of the file failed. Report it instead
        # of finishing silently without building an index.
        sys.stderr.write( "Whoosh search is not enabled (a required import failed); no index was built.\n" )
    elif len( sys.argv ) < 2:
        # A missing argument exits with a usage message (non-zero status)
        # instead of an IndexError traceback.
        sys.exit( "Usage: python build_whoosh_index.py <ini_file>" )
    else:
        ini_file = sys.argv[1]
        sa_session, config_settings = get_sa_session_and_needed_config_settings( ini_file )
        whoosh_index_dir = config_settings.whoosh_index_dir
        build_index( sa_session, whoosh_index_dir )