PageRenderTime 65ms CodeModel.GetById 34ms app.highlight 25ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/tools/data/__init__.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 465 lines | 380 code | 29 blank | 56 comment | 49 complexity | c0a6c0f82a4295f04ddef2fd95589803 MD5 | raw file
  1"""
  2Manage tool data tables, which store (at the application level) data that is
  3used by tools, for example in the generation of dynamic options. Tables are
  4loaded and stored by names which tools use to refer to them. This allows
  5users to configure data tables for a local Galaxy instance without needing
  6to modify the tool configurations.
  7"""
  8
  9import logging
 10import os
 11import os.path
 12import shutil
 13import tempfile
 14
 15from galaxy import util
 16from galaxy.util.odict import odict
 17
# Module-level logger.
log = logging.getLogger( __name__ )

# Default data table type name; corresponds to TabularToolDataTable.type_key.
DEFAULT_TABLE_TYPE = 'tabular'
 21
 22class ToolDataTableManager( object ):
 23    """Manages a collection of tool data tables"""
 24
 25    def __init__( self, tool_data_path, config_filename=None ):
 26        self.tool_data_path = tool_data_path
 27        # This stores all defined data table entries from both the tool_data_table_conf.xml file and the shed_tool_data_table_conf.xml file
 28        # at server startup. If tool shed repositories are installed that contain a valid file named tool_data_table_conf.xml.sample, entries
 29        # from that file are inserted into this dict at the time of installation.
 30        self.data_tables = {}
 31        if config_filename:
 32            self.load_from_config_file( config_filename, self.tool_data_path, from_shed_config=False )
 33
 34    def __getitem__( self, key ):
 35        return self.data_tables.__getitem__( key )
 36
 37    def __contains__( self, key ):
 38        return self.data_tables.__contains__( key )
 39
 40    def get( self, name, default=None ):
 41        try:
 42            return self[ name ]
 43        except KeyError:
 44            return default
 45
 46    def get_tables( self ):
 47        return self.data_tables
 48
 49    def load_from_config_file( self, config_filename, tool_data_path, from_shed_config=False ):
 50        """
 51        This method is called under 3 conditions:
 52
 53        1. When the ToolDataTableManager is initialized (see __init__ above).
 54        2. Just after the ToolDataTableManager is initialized and the additional entries defined by shed_tool_data_table_conf.xml
 55           are being loaded into the ToolDataTableManager.data_tables.
 56        3. When a tool shed repository that includes a tool_data_table_conf.xml.sample file is being installed into a local
 57           Galaxy instance.  In this case, we have 2 entry types to handle, files whose root tag is <tables>, for example:
 58        """
 59        tree = util.parse_xml( config_filename )
 60        root = tree.getroot()
 61        table_elems = []
 62        for table_elem in root.findall( 'table' ):
 63            table = ToolDataTable.from_elem( table_elem, tool_data_path, from_shed_config )
 64            table_elems.append( table_elem )
 65            if table.name not in self.data_tables:
 66                self.data_tables[ table.name ] = table
 67                log.debug( "Loaded tool data table '%s'", table.name )
 68            else:
 69                log.debug( "Loading another instance of data table '%s', attempting to merge content.", table.name )
 70                self.data_tables[ table.name ].merge_tool_data_table( table, allow_duplicates=False ) #only merge content, do not persist to disk, do not allow duplicate rows when merging
 71                # FIXME: This does not account for an entry with the same unique build ID, but a different path.
 72        return table_elems
 73
 74    def add_new_entries_from_config_file( self, config_filename, tool_data_path, shed_tool_data_table_config, persist=False ):
 75        """
 76        This method is called when a tool shed repository that includes a tool_data_table_conf.xml.sample file is being
 77        installed into a local galaxy instance.  We have 2 cases to handle, files whose root tag is <tables>, for example::
 78
 79            <tables>
 80                <!-- Location of Tmap files -->
 81                <table name="tmap_indexes" comment_char="#">
 82                    <columns>value, dbkey, name, path</columns>
 83                    <file path="tool-data/tmap_index.loc" />
 84                </table>
 85            </tables>
 86
 87        and files whose root tag is <table>, for example::
 88
 89            <!-- Location of Tmap files -->
 90            <table name="tmap_indexes" comment_char="#">
 91                <columns>value, dbkey, name, path</columns>
 92                <file path="tool-data/tmap_index.loc" />
 93            </table>
 94
 95        """
 96        error_message = ''
 97        try:
 98            table_elems = self.load_from_config_file( config_filename=config_filename,
 99                                                      tool_data_path=tool_data_path,
100                                                      from_shed_config=True )
101        except Exception, e:
102            error_message = 'Error attempting to parse file %s: %s' % ( str( os.path.split( config_filename )[ 1 ] ), str( e ) )
103            log.debug( error_message )
104            table_elems = []
105        if persist:
106            # Persist Galaxy's version of the changed tool_data_table_conf.xml file.
107            self.to_xml_file( shed_tool_data_table_config, table_elems )
108        return table_elems, error_message
109
110    def to_xml_file( self, shed_tool_data_table_config, new_elems=None, remove_elems=None ):
111        """
112        Write the current in-memory version of the shed_tool_data_table_conf.xml file to disk.
113        remove_elems are removed before new_elems are added.
114        """
115        if not ( new_elems or remove_elems ):
116            log.debug( 'ToolDataTableManager.to_xml_file called without any elements to add or remove.' )
117            return #no changes provided, no need to persist any changes
118        if not new_elems:
119            new_elems = []
120        if not remove_elems:
121            remove_elems = []
122        full_path = os.path.abspath( shed_tool_data_table_config )
123        #FIXME: we should lock changing this file by other threads / head nodes
124        try:
125            tree = util.parse_xml( full_path )
126            root = tree.getroot()
127            out_elems = [ elem for elem in root ]
128        except Exception, e:
129            out_elems = []
130            log.debug( 'Could not parse existing tool data table config, assume no existing elements: %s', e )
131        for elem in remove_elems:
132            #handle multiple occurrences of remove elem in existing elems
133            while elem in out_elems:
134                remove_elems.remove( elem )
135        #add new elems
136        out_elems.extend( new_elems )
137        with open( full_path, 'wb' ) as out:
138            out.write( '<?xml version="1.0"?>\n<tables>\n' )
139            for elem in out_elems:
140                out.write( util.xml_to_string( elem ) )
141            out.write( '</tables>\n' )
142        os.chmod( full_path, 0644 )
143
class ToolDataTable( object ):
    """
    Abstract base class for a single tool data table.  Concrete subclasses
    (registered in ``tool_data_table_types``) implement ``_add_entry`` and
    ``merge_tool_data_table``.
    """

    @classmethod
    def from_elem( cls, table_elem, tool_data_path, from_shed_config ):
        """Factory: instantiate the table class named by the element's 'type' attribute."""
        table_type = table_elem.get( 'type', DEFAULT_TABLE_TYPE )
        # BUGFIX: the assertion message previously interpolated the builtin `type`
        # instead of the unrecognized table_type value.
        assert table_type in tool_data_table_types, "Unknown data table type '%s'" % table_type
        return tool_data_table_types[ table_type ]( table_elem, tool_data_path, from_shed_config=from_shed_config )

    def __init__( self, config_element, tool_data_path, from_shed_config=False ):
        self.name = config_element.get( 'name' )
        self.comment_char = config_element.get( 'comment_char' )
        self.empty_field_value = config_element.get( 'empty_field_value', '' )
        self.empty_field_values = {}
        self.filenames = odict()
        self.tool_data_path = tool_data_path
        self.missing_index_file = None
        # increment this variable any time a new entry is added, or when the table is totally reloaded
        # This value has no external meaning, and does not represent an abstract version of the underlying data
        self._loaded_content_version = 1

    def _update_version( self ):
        """Bump and return the in-memory content version counter."""
        self._loaded_content_version += 1
        return self._loaded_content_version

    def get_empty_field_by_name( self, name ):
        """Return the per-column empty value for ``name``, falling back to the table default."""
        return self.empty_field_values.get( name, self.empty_field_value )

    def _add_entry( self, entry, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
        raise NotImplementedError( "Abstract method" )

    def add_entry( self, entry, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
        """Add a single entry via the subclass hook and return the new content version."""
        self._add_entry( entry, allow_duplicates=allow_duplicates, persist=persist, persist_on_error=persist_on_error, entry_source=entry_source, **kwd )
        return self._update_version()

    def add_entries( self, entries, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
        """Add each entry in ``entries`` (may be empty) and return the current content version."""
        if entries:
            for entry in entries:
                self.add_entry( entry, allow_duplicates=allow_duplicates, persist=persist, persist_on_error=persist_on_error, entry_source=entry_source, **kwd )
        return self._loaded_content_version

    def is_current_version( self, other_version ):
        """True if ``other_version`` matches the current in-memory content version."""
        return self._loaded_content_version == other_version

    def merge_tool_data_table( self, other_table, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
        raise NotImplementedError( "Abstract method" )
189
class TabularToolDataTable( ToolDataTable ):
    """
    Data stored in a tabular / separated value format on disk, allows multiple
    files to be merged but all must have the same column definitions::

        <table type="tabular" name="test">
            <column name='...' index = '...' />
            <file path="..." />
            <file path="..." />
        </table>

    """

    type_key = 'tabular'

    def __init__( self, config_element, tool_data_path, from_shed_config=False ):
        super( TabularToolDataTable, self ).__init__( config_element, tool_data_path, from_shed_config )
        # Parsed rows from every loaded file; each entry is a list of column values.
        self.data = []
        self.configure_and_load( config_element, tool_data_path, from_shed_config )

    def configure_and_load( self, config_element, tool_data_path, from_shed_config=False ):
        """
        Configure separator, comment char, and columns from an XML element, then
        parse the file referenced by every <file> element into ``self.data``.
        """
        self.separator = config_element.get( 'separator', '\t' )
        self.comment_char = config_element.get( 'comment_char', '#' )
        # Configure columns
        self.parse_column_spec( config_element )

        # store repo info if available:
        repo_elem = config_element.find( 'tool_shed_repository' )
        if repo_elem is not None:
            repo_info = dict( tool_shed=repo_elem.find( 'tool_shed' ).text, name=repo_elem.find( 'repository_name' ).text,
                              owner=repo_elem.find( 'repository_owner' ).text, installed_changeset_revision=repo_elem.find( 'installed_changeset_revision' ).text )
        else:
            repo_info = None
        # Read every file
        for file_element in config_element.findall( 'file' ):
            filename = file_path = file_element.get( 'path', None )
            found = False
            if file_path is None:
                log.debug( "Encountered a file element (%s) that does not contain a path value when loading tool data table '%s'.", util.xml_to_string( file_element ), self.name )
                continue

            # FIXME: splitting on and merging paths from a configuration file when loading is wonky
            # Data should exist on disk in the state needed, i.e. the xml configuration should
            # point directly to the desired file to load. Munging of the tool_data_tables_conf.xml.sample
            # can be done during installing / testing / metadata resetting with the creation of a proper
            # tool_data_tables_conf.xml file, containing correct <file path=> attributes. Allowing a
            # path.join with a different root should be allowed, but splitting should not be necessary.
            if tool_data_path and from_shed_config:
                # Must identify with from_shed_config as well, because the
                # regular galaxy app has and uses tool_data_path.
                # We're loading a tool in the tool shed, so we cannot use the Galaxy tool-data
                # directory which is hard-coded into the tool_data_table_conf.xml entries.
                filename = os.path.split( file_path )[ 1 ]
                filename = os.path.join( tool_data_path, filename )
            if os.path.exists( filename ):
                found = True
            else:
                # Since the path attribute can include a hard-coded path to a specific directory
                # (e.g., <file path="tool-data/cg_crr_files.loc" />) which may not be the same value
                # as self.tool_data_path, we'll parse the path to get the filename and see if it is
                # in self.tool_data_path.
                file_path, file_name = os.path.split( filename )
                if file_path and file_path != self.tool_data_path:
                    corrected_filename = os.path.join( self.tool_data_path, file_name )
                    if os.path.exists( corrected_filename ):
                        filename = corrected_filename
                        found = True

            if found:
                # BUGFIX: close the index file after parsing; the handle was previously leaked.
                with open( filename ) as data_file:
                    self.data.extend( self.parse_file_fields( data_file ) )
                self._update_version()
            else:
                self.missing_index_file = filename
                log.warn( "Cannot find index file '%s' for tool data table '%s'" % ( filename, self.name ) )

            if filename not in self.filenames or not self.filenames[ filename ][ 'found' ]:
                self.filenames[ filename ] = dict( found=found, filename=filename, from_shed_config=from_shed_config, tool_data_path=tool_data_path,
                                                   config_element=config_element, tool_shed_repository=repo_info )
            else:
                log.debug( "Filename '%s' already exists in filenames (%s), not adding", filename, self.filenames.keys() )

    def merge_tool_data_table( self, other_table, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
        """
        Merge rows from ``other_table`` into this table; column definitions must
        match exactly.  Returns the resulting content version.
        """
        assert self.columns == other_table.columns, "Merging tabular data tables with non matching columns is not allowed: %s:%s != %s:%s" % ( self.name, self.columns, other_table.name, other_table.columns )
        # merge filename info
        for filename, info in other_table.filenames.items():
            if filename not in self.filenames:
                self.filenames[ filename ] = info
        # add data entries and return current data table version
        return self.add_entries( other_table.data, allow_duplicates=allow_duplicates, persist=persist, persist_on_error=persist_on_error, entry_source=entry_source, **kwd )

    def handle_found_index_file( self, filename ):
        """Load a previously-missing index file and clear the missing-file flag."""
        self.missing_index_file = None
        # BUGFIX: close the index file after parsing; the handle was previously leaked.
        with open( filename ) as data_file:
            self.data.extend( self.parse_file_fields( data_file ) )

    def get_fields( self ):
        """Return all parsed rows."""
        return self.data

    def get_version_fields( self ):
        """Return ``( content_version, rows )``."""
        return ( self._loaded_content_version, self.data )

    def parse_column_spec( self, config_element ):
        """
        Parse column definitions, which can either be a set of 'column' elements
        with a name and index (as in dynamic options config), or a shorthand
        comma separated list of names in order as the text of a 'column_names'
        element.

        A column named 'value' is required.
        """
        self.columns = {}
        # BUGFIX: largest_index must exist before the <column> branch below
        # compares against it; previously it was only assigned in the
        # <columns> branch, so <column> elements raised AttributeError.
        self.largest_index = 0
        if config_element.find( 'columns' ) is not None:
            column_names = util.xml_text( config_element.find( 'columns' ) )
            column_names = [ n.strip() for n in column_names.split( ',' ) ]
            for index, name in enumerate( column_names ):
                self.columns[ name ] = index
                self.largest_index = index
        else:
            for column_elem in config_element.findall( 'column' ):
                name = column_elem.get( 'name', None )
                assert name is not None, "Required 'name' attribute missing from column def"
                index = column_elem.get( 'index', None )
                assert index is not None, "Required 'index' attribute missing from column def"
                index = int( index )
                self.columns[ name ] = index
                if index > self.largest_index:
                    self.largest_index = index
                empty_field_value = column_elem.get( 'empty_field_value', None )
                if empty_field_value is not None:
                    self.empty_field_values[ name ] = empty_field_value
        assert 'value' in self.columns, "Required 'value' column missing from column def"
        if 'name' not in self.columns:
            # 'name' falls back to the same column as 'value'.
            self.columns[ 'name' ] = self.columns[ 'value' ]

    def parse_file_fields( self, reader ):
        """
        Parse separated lines from ``reader`` and return a list of field lists,
        skipping comment lines and lines with too few columns.

        TODO: Allow named access to fields using the column names.
        """
        # Human-readable separator for log messages.
        separator_char = '<TAB>' if self.separator == '\t' else self.separator

        rval = []
        for i, line in enumerate( reader ):
            if line.lstrip().startswith( self.comment_char ):
                continue
            line = line.rstrip( "\n\r" )
            if line:
                fields = line.split( self.separator )
                if self.largest_index < len( fields ):
                    rval.append( fields )
                else:
                    log.warn( "Line %i in tool data table '%s' is invalid (HINT: "
                              "'%s' characters must be used to separate fields):\n%s"
                              % ( ( i + 1 ), self.name, separator_char, line ) )
        return rval

    def get_column_name_list( self ):
        """Return column names ordered by index, with None for unnamed indexes."""
        rval = []
        for i in range( self.largest_index + 1 ):
            found_column = False
            for name, index in self.columns.items():
                if index == i:
                    if not found_column:
                        rval.append( name )
                    elif name == 'value':
                        # the column named 'value' always has priority over other named columns
                        rval[ -1 ] = name
                    found_column = True
            if not found_column:
                rval.append( None )
        return rval

    def get_entry( self, query_attr, query_val, return_attr, default=None ):
        """
        Returns table entry associated with a col/val pair.
        """
        query_col = self.columns.get( query_attr, None )
        if query_col is None:
            return default
        return_col = self.columns.get( return_attr, None )
        if return_col is None:
            return default
        rval = default
        # Look for table entry.
        for fields in self.data:
            if fields[ query_col ] == query_val:
                rval = fields[ return_col ]
                break
        return rval

    def _add_entry( self, entry, allow_duplicates=True, persist=False, persist_on_error=False, entry_source=None, **kwd ):
        """
        Add one row; ``entry`` is either a dict keyed by column name or a list
        of column values.  When ``persist`` is set, append the row to the
        matching backing file.  Returns True on success, False on error.
        """
        # accepts dict or list of columns
        if isinstance( entry, dict ):
            fields = []
            for column_name in self.get_column_name_list():
                if column_name not in entry:
                    log.debug( "Using default column value for column '%s' when adding data table entry (%s) to table '%s'.", column_name, entry, self.name )
                    field_value = self.get_empty_field_by_name( column_name )
                else:
                    field_value = entry[ column_name ]
                fields.append( field_value )
        else:
            fields = entry
        is_error = False
        if self.largest_index < len( fields ):
            fields = self._replace_field_separators( fields )
            if fields not in self.data or allow_duplicates:
                self.data.append( fields )
            else:
                log.debug( "Attempted to add fields (%s) to data table '%s', but this entry already exists and allow_duplicates is False.", fields, self.name )
                is_error = True
        else:
            log.error( "Attempted to add fields (%s) to data table '%s', but there were not enough fields specified ( %i < %i ).", fields, self.name, len( fields ), self.largest_index + 1 )
            is_error = True
        filename = None

        if persist and ( not is_error or persist_on_error ):
            if entry_source:
                # if dict, assume is compatible info dict, otherwise call method
                if isinstance( entry_source, dict ):
                    source_repo_info = entry_source
                else:
                    source_repo_info = entry_source.get_tool_shed_repository_info_dict()
            else:
                source_repo_info = None
            # Pick the backing file whose tool shed repository info matches the entry's source.
            for name, value in self.filenames.items():
                repo_info = value.get( 'tool_shed_repository', None )
                if ( not source_repo_info and not repo_info ) or ( source_repo_info and repo_info and source_repo_info == repo_info ):
                    filename = name
                    break
            if filename is None:
                # should we default to using any filename here instead?
                log.error( "Unable to determine filename for persisting data table '%s' values: '%s'.", self.name, fields )
                is_error = True
            else:
                # FIXME: Need to lock these files for editing
                log.debug( "Persisting changes to file: %s", filename )
                try:
                    data_table_fh = open( filename, 'r+b' )
                except IOError as e:
                    # BUGFIX: this message previously referenced the non-existent
                    # attribute self.filename, raising AttributeError inside the handler.
                    log.warning( 'Error opening data table file (%s) with r+b, assuming file does not exist and will open as wb: %s', filename, e )
                    data_table_fh = open( filename, 'wb' )
                try:
                    if os.stat( filename )[ 6 ] != 0:  # stat index 6 is st_size
                        # ensure last existing line ends with new line
                        data_table_fh.seek( -1, 2 )  # last char in file
                        last_char = data_table_fh.read( 1 )
                        if last_char not in [ '\n', '\r' ]:
                            data_table_fh.write( '\n' )
                    data_table_fh.write( "%s\n" % ( self.separator.join( fields ) ) )
                finally:
                    # BUGFIX: close the backing file after writing; the handle was previously leaked.
                    data_table_fh.close()
        return not is_error

    def _replace_field_separators( self, fields, separator=None, replace=None, comment_char=None ):
        """
        Return a copy of ``fields`` with any embedded separator character
        replaced, so that added rows cannot corrupt the column layout.
        """
        # make sure none of the fields contain separator
        # make sure separator replace is different from comment_char,
        # due to possible leading replace
        if separator is None:
            separator = self.separator
        if replace is None:
            if separator == " ":
                if comment_char == "\t":
                    replace = "_"
                else:
                    replace = "\t"
            else:
                if comment_char == " ":
                    replace = "_"
                else:
                    replace = " "
        return [ field.replace( separator, replace ) for field in fields ]
463
# Registry mapping each table type_key to its implementing class; consulted by
# ToolDataTable.from_elem when instantiating tables from XML.
tool_data_table_types = dict( ( table_class.type_key, table_class ) for table_class in ( TabularToolDataTable, ) )