/scripts/data_libraries/build_whoosh_index.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 90 lines · 71 code · 8 blank · 11 comment · 15 complexity · 8d241b1150c9c156eec014b63c4da6a5 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Build index for full-text whoosh search of files in data libraries.
  4. Requires configuration settings in universe_wsgi.ini. See the whoosh settings
  5. in the data library search section for more details.
  6. Run from the ~/scripts/data_libraries directory:
  7. %sh build_whoosh_index.sh
  8. """
  9. import sys, os, csv, urllib, urllib2, ConfigParser
  10. new_path = [ os.path.join( os.getcwd(), "lib" ) ]
  11. new_path.extend( sys.path[1:] ) # remove scripts/ from the path
  12. sys.path = new_path
  13. from galaxy import eggs
  14. # Whoosh is compatible with Python 2.5+ Try to import Whoosh and set flag to indicate whether search is enabled.
  15. try:
  16. eggs.require( "Whoosh" )
  17. from whoosh.filedb.filestore import FileStorage
  18. from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
  19. from whoosh.index import Index
  20. whoosh_search_enabled = True
  21. schema = Schema( id=STORED, name=TEXT, info=TEXT, dbkey=TEXT, message=TEXT )
  22. import galaxy.model.mapping
  23. from galaxy import config, model
  24. import pkg_resources
  25. pkg_resources.require( "SQLAlchemy >= 0.4" )
  26. except ImportError, e:
  27. whoosh_search_enabled = False
  28. schema = None
  29. def build_index( sa_session, whoosh_index_dir ):
  30. storage = FileStorage( whoosh_index_dir )
  31. index = storage.create_index( schema )
  32. writer = index.writer()
  33. def to_unicode( a_basestr ):
  34. if type( a_basestr ) is str:
  35. return unicode( a_basestr, 'utf-8' )
  36. else:
  37. return a_basestr
  38. lddas_indexed = 0
  39. for id, name, info, dbkey, message in get_lddas( sa_session ):
  40. writer.add_document( id=id,
  41. name=to_unicode( name ),
  42. info=to_unicode( info ),
  43. dbkey=to_unicode( dbkey ),
  44. message=to_unicode( message ) )
  45. lddas_indexed += 1
  46. writer.commit()
  47. print "Number of active library datasets indexed: ", lddas_indexed
  48. def get_lddas( sa_session ):
  49. for ldda in sa_session.query( model.LibraryDatasetDatasetAssociation ).filter_by( deleted=False ):
  50. id = ldda.id
  51. name = ldda.name
  52. info = ldda.library_dataset.get_info()
  53. if info and not info.startswith( 'upload' ):
  54. info = info.replace( 'no info', '' )
  55. else:
  56. info = ''
  57. dbkey = ldda.metadata.dbkey
  58. if ldda.message:
  59. message = ldda.message
  60. else:
  61. message = ''
  62. yield id, name, info, dbkey, message
  63. def get_sa_session_and_needed_config_settings( ini_file ):
  64. conf_parser = ConfigParser.ConfigParser( { 'here' : os.getcwd() } )
  65. conf_parser.read( ini_file )
  66. kwds = dict()
  67. for key, value in conf_parser.items( "app:main" ):
  68. kwds[ key ] = value
  69. config_settings = config.Configuration( **kwds )
  70. db_con = config_settings.database_connection
  71. if not db_con:
  72. db_con = "sqlite:///%s?isolation_level=IMMEDIATE" % config_settings.database
  73. model = galaxy.model.mapping.init( config_settings.file_path, db_con, engine_options={}, create_tables=False )
  74. return model.context.current, config_settings
  75. if __name__ == "__main__":
  76. if whoosh_search_enabled:
  77. ini_file = sys.argv[1]
  78. sa_session, config_settings = get_sa_session_and_needed_config_settings( ini_file )
  79. whoosh_index_dir = config_settings.whoosh_index_dir
  80. build_index( sa_session, whoosh_index_dir )