PageRenderTime 47ms CodeModel.GetById 34ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/galaxy/tools/data/__init__.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 465 lines | 380 code | 29 blank | 56 comment | 42 complexity | c0a6c0f82a4295f04ddef2fd95589803 MD5 | raw file
  1. """
  2. Manage tool data tables, which store (at the application level) data that is
  3. used by tools, for example in the generation of dynamic options. Tables are
  4. loaded and stored by names which tools use to refer to them. This allows
  5. users to configure data tables for a local Galaxy instance without needing
  6. to modify the tool configurations.
  7. """
import logging
import os
import os.path
import shutil
import tempfile
from collections import OrderedDict

from galaxy import util
from galaxy.util.odict import odict
  15. log = logging.getLogger( __name__ )
  16. DEFAULT_TABLE_TYPE = 'tabular'
  17. class ToolDataTableManager( object ):
  18. """Manages a collection of tool data tables"""
  19. def __init__( self, tool_data_path, config_filename=None ):
  20. self.tool_data_path = tool_data_path
  21. # This stores all defined data table entries from both the tool_data_table_conf.xml file and the shed_tool_data_table_conf.xml file
  22. # at server startup. If tool shed repositories are installed that contain a valid file named tool_data_table_conf.xml.sample, entries
  23. # from that file are inserted into this dict at the time of installation.
  24. self.data_tables = {}
  25. if config_filename:
  26. self.load_from_config_file( config_filename, self.tool_data_path, from_shed_config=False )
  27. def __getitem__( self, key ):
  28. return self.data_tables.__getitem__( key )
  29. def __contains__( self, key ):
  30. return self.data_tables.__contains__( key )
  31. def get( self, name, default=None ):
  32. try:
  33. return self[ name ]
  34. except KeyError:
  35. return default
  36. def get_tables( self ):
  37. return self.data_tables
  38. def load_from_config_file( self, config_filename, tool_data_path, from_shed_config=False ):
  39. """
  40. This method is called under 3 conditions:
  41. 1. When the ToolDataTableManager is initialized (see __init__ above).
  42. 2. Just after the ToolDataTableManager is initialized and the additional entries defined by shed_tool_data_table_conf.xml
  43. are being loaded into the ToolDataTableManager.data_tables.
  44. 3. When a tool shed repository that includes a tool_data_table_conf.xml.sample file is being installed into a local
  45. Galaxy instance. In this case, we have 2 entry types to handle, files whose root tag is <tables>, for example:
  46. """
  47. tree = util.parse_xml( config_filename )
  48. root = tree.getroot()
  49. table_elems = []
  50. for table_elem in root.findall( 'table' ):
  51. table = ToolDataTable.from_elem( table_elem, tool_data_path, from_shed_config )
  52. table_elems.append( table_elem )
  53. if table.name not in self.data_tables:
  54. self.data_tables[ table.name ] = table
  55. log.debug( "Loaded tool data table '%s'", table.name )
  56. else:
  57. log.debug( "Loading another instance of data table '%s', attempting to merge content.", table.name )
  58. self.data_tables[ table.name ].merge_tool_data_table( table, allow_duplicates=False ) #only merge content, do not persist to disk, do not allow duplicate rows when merging
  59. # FIXME: This does not account for an entry with the same unique build ID, but a different path.
  60. return table_elems
  61. def add_new_entries_from_config_file( self, config_filename, tool_data_path, shed_tool_data_table_config, persist=False ):
  62. """
  63. This method is called when a tool shed repository that includes a tool_data_table_conf.xml.sample file is being
  64. installed into a local galaxy instance. We have 2 cases to handle, files whose root tag is <tables>, for example::
  65. <tables>
  66. <!-- Location of Tmap files -->
  67. <table name="tmap_indexes" comment_char="#">
  68. <columns>value, dbkey, name, path</columns>
  69. <file path="tool-data/tmap_index.loc" />
  70. </table>
  71. </tables>
  72. and files whose root tag is <table>, for example::
  73. <!-- Location of Tmap files -->
  74. <table name="tmap_indexes" comment_char="#">
  75. <columns>value, dbkey, name, path</columns>
  76. <file path="tool-data/tmap_index.loc" />
  77. </table>
  78. """
  79. error_message = ''
  80. try:
  81. table_elems = self.load_from_config_file( config_filename=config_filename,
  82. tool_data_path=tool_data_path,
  83. from_shed_config=True )
  84. except Exception, e:
  85. error_message = 'Error attempting to parse file %s: %s' % ( str( os.path.split( config_filename )[ 1 ] ), str( e ) )
  86. log.debug( error_message )
  87. table_elems = []
  88. if persist:
  89. # Persist Galaxy's version of the changed tool_data_table_conf.xml file.
  90. self.to_xml_file( shed_tool_data_table_config, table_elems )
  91. return table_elems, error_message
  92. def to_xml_file( self, shed_tool_data_table_config, new_elems=None, remove_elems=None ):
  93. """
  94. Write the current in-memory version of the shed_tool_data_table_conf.xml file to disk.
  95. remove_elems are removed before new_elems are added.
  96. """
  97. if not ( new_elems or remove_elems ):
  98. log.debug( 'ToolDataTableManager.to_xml_file called without any elements to add or remove.' )
  99. return #no changes provided, no need to persist any changes
  100. if not new_elems:
  101. new_elems = []
  102. if not remove_elems:
  103. remove_elems = []
  104. full_path = os.path.abspath( shed_tool_data_table_config )
  105. #FIXME: we should lock changing this file by other threads / head nodes
  106. try:
  107. tree = util.parse_xml( full_path )
  108. root = tree.getroot()
  109. out_elems = [ elem for elem in root ]
  110. except Exception, e:
  111. out_elems = []
  112. log.debug( 'Could not parse existing tool data table config, assume no existing elements: %s', e )
  113. for elem in remove_elems:
  114. #handle multiple occurrences of remove elem in existing elems
  115. while elem in out_elems:
  116. remove_elems.remove( elem )
  117. #add new elems
  118. out_elems.extend( new_elems )
  119. with open( full_path, 'wb' ) as out:
  120. out.write( '<?xml version="1.0"?>\n<tables>\n' )
  121. for elem in out_elems:
  122. out.write( util.xml_to_string( elem ) )
  123. out.write( '</tables>\n' )
  124. os.chmod( full_path, 0644 )
  125. class ToolDataTable( object ):
  126. @classmethod
  127. def from_elem( cls, table_elem, tool_data_path, from_shed_config ):
  128. table_type = table_elem.get( 'type', 'tabular' )
  129. assert table_type in tool_data_table_types, "Unknown data table type '%s'" % type
  130. return tool_data_table_types[ table_type ]( table_elem, tool_data_path, from_shed_config=from_shed_config )
  131. def __init__( self, config_element, tool_data_path, from_shed_config = False):
  132. self.name = config_element.get( 'name' )
  133. self.comment_char = config_element.get( 'comment_char' )
  134. self.empty_field_value = config_element.get( 'empty_field_value', '' )
  135. self.empty_field_values = {}
  136. self.filenames = odict()
  137. self.tool_data_path = tool_data_path
  138. self.missing_index_file = None
  139. # increment this variable any time a new entry is added, or when the table is totally reloaded
  140. # This value has no external meaning, and does not represent an abstract version of the underlying data
  141. self._loaded_content_version = 1
  142. def _update_version( self ):
  143. self._loaded_content_version += 1
  144. return self._loaded_content_version
  145. def get_empty_field_by_name( self, name ):
  146. return self.empty_field_values.get( name, self.empty_field_value )
  147. def _add_entry( self, entry, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
  148. raise NotImplementedError( "Abstract method" )
  149. def add_entry( self, entry, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
  150. self._add_entry( entry, allow_duplicates=allow_duplicates, persist=persist, persist_on_error=persist_on_error, entry_source=entry_source, **kwd )
  151. return self._update_version()
  152. def add_entries( self, entries, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
  153. if entries:
  154. for entry in entries:
  155. self.add_entry( entry, allow_duplicates=allow_duplicates, persist=persist, persist_on_error=persist_on_error, entry_source=entry_source, **kwd )
  156. return self._loaded_content_version
  157. def is_current_version( self, other_version ):
  158. return self._loaded_content_version == other_version
  159. def merge_tool_data_table( self, other_table, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
  160. raise NotImplementedError( "Abstract method" )
  161. class TabularToolDataTable( ToolDataTable ):
  162. """
  163. Data stored in a tabular / separated value format on disk, allows multiple
  164. files to be merged but all must have the same column definitions::
  165. <table type="tabular" name="test">
  166. <column name='...' index = '...' />
  167. <file path="..." />
  168. <file path="..." />
  169. </table>
  170. """
  171. type_key = 'tabular'
  172. def __init__( self, config_element, tool_data_path, from_shed_config = False):
  173. super( TabularToolDataTable, self ).__init__( config_element, tool_data_path, from_shed_config)
  174. self.data = []
  175. self.configure_and_load( config_element, tool_data_path, from_shed_config)
  176. def configure_and_load( self, config_element, tool_data_path, from_shed_config = False):
  177. """
  178. Configure and load table from an XML element.
  179. """
  180. self.separator = config_element.get( 'separator', '\t' )
  181. self.comment_char = config_element.get( 'comment_char', '#' )
  182. # Configure columns
  183. self.parse_column_spec( config_element )
  184. #store repo info if available:
  185. repo_elem = config_element.find( 'tool_shed_repository' )
  186. if repo_elem is not None:
  187. repo_info = dict( tool_shed=repo_elem.find( 'tool_shed' ).text, name=repo_elem.find( 'repository_name' ).text,
  188. owner=repo_elem.find( 'repository_owner' ).text, installed_changeset_revision=repo_elem.find( 'installed_changeset_revision' ).text )
  189. else:
  190. repo_info = None
  191. # Read every file
  192. for file_element in config_element.findall( 'file' ):
  193. filename = file_path = file_element.get( 'path', None )
  194. found = False
  195. if file_path is None:
  196. log.debug( "Encountered a file element (%s) that does not contain a path value when loading tool data table '%s'.", util.xml_to_string( file_element ), self.name )
  197. continue
  198. #FIXME: splitting on and merging paths from a configuration file when loading is wonky
  199. # Data should exist on disk in the state needed, i.e. the xml configuration should
  200. # point directly to the desired file to load. Munging of the tool_data_tables_conf.xml.sample
  201. # can be done during installing / testing / metadata resetting with the creation of a proper
  202. # tool_data_tables_conf.xml file, containing correct <file path=> attributes. Allowing a
  203. # path.join with a different root should be allowed, but splitting should not be necessary.
  204. if tool_data_path and from_shed_config:
  205. # Must identify with from_shed_config as well, because the
  206. # regular galaxy app has and uses tool_data_path.
  207. # We're loading a tool in the tool shed, so we cannot use the Galaxy tool-data
  208. # directory which is hard-coded into the tool_data_table_conf.xml entries.
  209. filename = os.path.split( file_path )[ 1 ]
  210. filename = os.path.join( tool_data_path, filename )
  211. if os.path.exists( filename ):
  212. found = True
  213. else:
  214. # Since the path attribute can include a hard-coded path to a specific directory
  215. # (e.g., <file path="tool-data/cg_crr_files.loc" />) which may not be the same value
  216. # as self.tool_data_path, we'll parse the path to get the filename and see if it is
  217. # in self.tool_data_path.
  218. file_path, file_name = os.path.split( filename )
  219. if file_path and file_path != self.tool_data_path:
  220. corrected_filename = os.path.join( self.tool_data_path, file_name )
  221. if os.path.exists( corrected_filename ):
  222. filename = corrected_filename
  223. found = True
  224. if found:
  225. self.data.extend( self.parse_file_fields( open( filename ) ) )
  226. self._update_version()
  227. else:
  228. self.missing_index_file = filename
  229. log.warn( "Cannot find index file '%s' for tool data table '%s'" % ( filename, self.name ) )
  230. if filename not in self.filenames or not self.filenames[ filename ][ 'found' ]:
  231. self.filenames[ filename ] = dict( found=found, filename=filename, from_shed_config=from_shed_config, tool_data_path=tool_data_path,
  232. config_element=config_element, tool_shed_repository=repo_info )
  233. else:
  234. log.debug( "Filename '%s' already exists in filenames (%s), not adding", filename, self.filenames.keys() )
  235. def merge_tool_data_table( self, other_table, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
  236. assert self.columns == other_table.columns, "Merging tabular data tables with non matching columns is not allowed: %s:%s != %s:%s" % ( self.name, self.columns, other_table.name, other_table.columns )
  237. #merge filename info
  238. for filename, info in other_table.filenames.iteritems():
  239. if filename not in self.filenames:
  240. self.filenames[ filename ] = info
  241. #add data entries and return current data table version
  242. return self.add_entries( other_table.data, allow_duplicates=allow_duplicates, persist=persist, persist_on_error=persist_on_error, entry_source=entry_source, **kwd )
  243. def handle_found_index_file( self, filename ):
  244. self.missing_index_file = None
  245. self.data.extend( self.parse_file_fields( open( filename ) ) )
  246. def get_fields( self ):
  247. return self.data
  248. def get_version_fields( self ):
  249. return ( self._loaded_content_version, self.data )
  250. def parse_column_spec( self, config_element ):
  251. """
  252. Parse column definitions, which can either be a set of 'column' elements
  253. with a name and index (as in dynamic options config), or a shorthand
  254. comma separated list of names in order as the text of a 'column_names'
  255. element.
  256. A column named 'value' is required.
  257. """
  258. self.columns = {}
  259. if config_element.find( 'columns' ) is not None:
  260. column_names = util.xml_text( config_element.find( 'columns' ) )
  261. column_names = [ n.strip() for n in column_names.split( ',' ) ]
  262. for index, name in enumerate( column_names ):
  263. self.columns[ name ] = index
  264. self.largest_index = index
  265. else:
  266. for column_elem in config_element.findall( 'column' ):
  267. name = column_elem.get( 'name', None )
  268. assert name is not None, "Required 'name' attribute missing from column def"
  269. index = column_elem.get( 'index', None )
  270. assert index is not None, "Required 'index' attribute missing from column def"
  271. index = int( index )
  272. self.columns[name] = index
  273. if index > self.largest_index:
  274. self.largest_index = index
  275. empty_field_value = column_elem.get( 'empty_field_value', None )
  276. if empty_field_value is not None:
  277. self.empty_field_values[ name ] = empty_field_value
  278. assert 'value' in self.columns, "Required 'value' column missing from column def"
  279. if 'name' not in self.columns:
  280. self.columns['name'] = self.columns['value']
  281. def parse_file_fields( self, reader ):
  282. """
  283. Parse separated lines from file and return a list of tuples.
  284. TODO: Allow named access to fields using the column names.
  285. """
  286. separator_char = (lambda c: '<TAB>' if c == '\t' else c)(self.separator)
  287. rval = []
  288. for i, line in enumerate( reader ):
  289. if line.lstrip().startswith( self.comment_char ):
  290. continue
  291. line = line.rstrip( "\n\r" )
  292. if line:
  293. fields = line.split( self.separator )
  294. if self.largest_index < len( fields ):
  295. rval.append( fields )
  296. else:
  297. log.warn( "Line %i in tool data table '%s' is invalid (HINT: "
  298. "'%s' characters must be used to separate fields):\n%s"
  299. % ( ( i + 1 ), self.name, separator_char, line ) )
  300. return rval
  301. def get_column_name_list( self ):
  302. rval = []
  303. for i in range( self.largest_index + 1 ):
  304. found_column = False
  305. for name, index in self.columns.iteritems():
  306. if index == i:
  307. if not found_column:
  308. rval.append( name )
  309. elif name == 'value':
  310. #the column named 'value' always has priority over other named columns
  311. rval[ -1 ] = name
  312. found_column = True
  313. if not found_column:
  314. rval.append( None )
  315. return rval
  316. def get_entry( self, query_attr, query_val, return_attr, default=None ):
  317. """
  318. Returns table entry associated with a col/val pair.
  319. """
  320. query_col = self.columns.get( query_attr, None )
  321. if query_col is None:
  322. return default
  323. return_col = self.columns.get( return_attr, None )
  324. if return_col is None:
  325. return default
  326. rval = default
  327. # Look for table entry.
  328. for fields in self.data:
  329. if fields[ query_col ] == query_val:
  330. rval = fields[ return_col ]
  331. break
  332. return rval
  333. def _add_entry( self, entry, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
  334. #accepts dict or list of columns
  335. if isinstance( entry, dict ):
  336. fields = []
  337. for column_name in self.get_column_name_list():
  338. if column_name not in entry:
  339. log.debug( "Using default column value for column '%s' when adding data table entry (%s) to table '%s'.", column_name, entry, self.name )
  340. field_value = self.get_empty_field_by_name( column_name )
  341. else:
  342. field_value = entry[ column_name ]
  343. fields.append( field_value )
  344. else:
  345. fields = entry
  346. is_error = False
  347. if self.largest_index < len( fields ):
  348. fields = self._replace_field_separators( fields )
  349. if fields not in self.data or allow_duplicates:
  350. self.data.append( fields )
  351. else:
  352. log.debug( "Attempted to add fields (%s) to data table '%s', but this entry already exists and allow_duplicates is False.", fields, self.name )
  353. is_error = True
  354. else:
  355. log.error( "Attempted to add fields (%s) to data table '%s', but there were not enough fields specified ( %i < %i ).", fields, self.name, len( fields ), self.largest_index + 1 )
  356. is_error = True
  357. filename = None
  358. if persist and ( not is_error or persist_on_error ):
  359. if entry_source:
  360. #if dict, assume is compatible info dict, otherwise call method
  361. if isinstance( entry_source, dict ):
  362. source_repo_info = entry_source
  363. else:
  364. source_repo_info = entry_source.get_tool_shed_repository_info_dict()
  365. else:
  366. source_repo_info = None
  367. for name, value in self.filenames.iteritems():
  368. repo_info = value.get( 'tool_shed_repository', None )
  369. if ( not source_repo_info and not repo_info ) or ( source_repo_info and repo_info and source_repo_info == repo_info ):
  370. filename = name
  371. break
  372. if filename is None:
  373. #should we default to using any filename here instead?
  374. log.error( "Unable to determine filename for persisting data table '%s' values: '%s'.", self.name, fields )
  375. is_error = True
  376. else:
  377. #FIXME: Need to lock these files for editing
  378. log.debug( "Persisting changes to file: %s", filename )
  379. try:
  380. data_table_fh = open( filename, 'r+b' )
  381. except IOError, e:
  382. log.warning( 'Error opening data table file (%s) with r+b, assuming file does not exist and will open as wb: %s', self.filename, e )
  383. data_table_fh = open( filename, 'wb' )
  384. if os.stat( filename )[6] != 0:
  385. # ensure last existing line ends with new line
  386. data_table_fh.seek( -1, 2 ) #last char in file
  387. last_char = data_table_fh.read( 1 )
  388. if last_char not in [ '\n', '\r' ]:
  389. data_table_fh.write( '\n' )
  390. data_table_fh.write( "%s\n" % ( self.separator.join( fields ) ) )
  391. return not is_error
  392. def _replace_field_separators( self, fields, separator=None, replace=None, comment_char=None ):
  393. #make sure none of the fields contain separator
  394. #make sure separator replace is different from comment_char,
  395. #due to possible leading replace
  396. if separator is None:
  397. separator = self.separator
  398. if replace is None:
  399. if separator == " ":
  400. if comment_char == "\t":
  401. replace = "_"
  402. else:
  403. replace = "\t"
  404. else:
  405. if comment_char == " ":
  406. replace = "_"
  407. else:
  408. replace = " "
  409. return map( lambda x: x.replace( separator, replace ), fields )
  410. # Registry of tool data types by type_key
  411. tool_data_table_types = dict( [ ( cls.type_key, cls ) for cls in [ TabularToolDataTable ] ] )