
/lib/galaxy/datatypes/tabular.py

https://bitbucket.org/cistrome/cistrome-harvard/
  1"""
  2Tabular datatype
  3
  4"""
  5import pkg_resources
  6pkg_resources.require( "bx-python" )
  7import gzip
  8import logging
  9import os
 10from cgi import escape
 11from galaxy import util
 12from galaxy.datatypes import data
 13from galaxy.datatypes import metadata
 14from galaxy.datatypes.checkers import is_gzip
 15from galaxy.datatypes.metadata import MetadataElement
 16from galaxy.datatypes.sniff import get_headers, get_test_fname
 17from galaxy.util.json import to_json_string
 18import dataproviders
 19
 20log = logging.getLogger(__name__)
 21
 22@dataproviders.decorators.has_dataproviders
 23class Tabular( data.Text ):
 24    """Tab delimited data"""
 25
 26    # All tabular data is chunkable.
 27    CHUNKABLE = True
 28    CHUNK_SIZE = 50000
 29
 30    """Add metadata elements"""
 31    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=False, optional=True, no_value=0 )
 32    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
 33    MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )
 34    MetadataElement( name="column_names", default=[], desc="Column names", readonly=True, visible=False, optional=True, no_value=[] )
 35
    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = 100000, max_guess_type_data_lines = None, **kwd ):
        """
        Tries to determine the number of columns as well as those columns that
        contain numerical values in the dataset.  A skip parameter is used
        because various tabular data types reuse this function, and their data
        type classes are responsible for determining how many invalid comment
        lines should be skipped. Using None for skip will cause skip to be
        zero, but the first line will be processed as a header. A
        max_data_lines parameter is used because various tabular data types
        reuse this function, and their data type classes are responsible for
        determining how many data lines should be processed to ensure that the
        non-optional metadata parameters are properly set; if used, optional
        metadata parameters will be set to None, unless the entire file has
        already been read. Using None for max_data_lines will process all data
        lines.

        Items of interest:

        1. We treat 'overwrite' as always True (we always want to set tabular metadata when called).
        2. If a tabular file has no data, it will have one column of type 'str'.
        3. We used to check only the first 100 lines when setting metadata and this class's
           set_peek() method read the entire file to determine the number of lines in the file.
           Since metadata can now be processed on cluster nodes, we've merged the line count portion
           of the set_peek() processing here, and we now check the entire contents of the file.
        """
        # Store original skip value to check with later
        requested_skip = skip
        if skip is None:
            skip = 0
        column_type_set_order = [ 'int', 'float', 'list', 'str' ]  # Order to set column types in
        default_column_type = column_type_set_order[-1]  # Default column type is lowest in list
        column_type_compare_order = list( column_type_set_order )  # Order to compare column types
        column_type_compare_order.reverse()
        def type_overrules_type( column_type1, column_type2 ):
            if column_type1 is None or column_type1 == column_type2:
                return False
            if column_type2 is None:
                return True
            for column_type in column_type_compare_order:
                if column_type1 == column_type:
                    return True
                if column_type2 == column_type:
                    return False
            # Neither column type was found in our ordered list; this should not happen
            raise ValueError( "Tried to compare unknown column types" )
        def is_int( column_text ):
            try:
                int( column_text )
                return True
            except:
                return False
        def is_float( column_text ):
            try:
                float( column_text )
                return True
            except:
                if column_text.strip().lower() == 'na':
                    return True  # 'na' is special-cased to be a float
                return False
        def is_list( column_text ):
            return "," in column_text
        def is_str( column_text ):
            # anything, except an empty string, is True
            if column_text == "":
                return False
            return True
        is_column_type = {}  # Dict mapping column type string to its checking function
        for column_type in column_type_set_order:
            is_column_type[column_type] = locals()[ "is_%s" % ( column_type ) ]
        def guess_column_type( column_text ):
            for column_type in column_type_set_order:
                if is_column_type[column_type]( column_text ):
                    return column_type
            return None
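        # Worked example of the helpers above (values are illustrative):
        #   guess_column_type( '42' )  -> 'int'    (first match in column_type_set_order)
        #   guess_column_type( '1.5' ) -> 'float'
        #   guess_column_type( '1,2' ) -> 'list'
        #   guess_column_type( 'foo' ) -> 'str'
        #   guess_column_type( '' )    -> None     (empty cells carry no type information)
        # type_overrules_type( 'str', 'int' ) is True because 'str' comes later in
        # column_type_set_order, i.e. the more general type always wins for a column.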
        data_lines = 0
        comment_lines = 0
        column_types = []
        first_line_column_types = [default_column_type]  # default value is one column of type str
        if dataset.has_data():
            # NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
            dataset_fh = open( dataset.file_name )
            i = 0
            while True:
                line = dataset_fh.readline()
                if not line:
                    break
                line = line.rstrip( '\r\n' )
                if i < skip or not line or line.startswith( '#' ):
                    # We'll call blank lines comments
                    comment_lines += 1
                else:
                    data_lines += 1
                    if max_guess_type_data_lines is None or data_lines <= max_guess_type_data_lines:
                        fields = line.split( '\t' )
                        for field_count, field in enumerate( fields ):
                            if field_count >= len( column_types ):  # found a previously unknown column, we append None
                                column_types.append( None )
                            column_type = guess_column_type( field )
                            if type_overrules_type( column_type, column_types[field_count] ):
                                column_types[field_count] = column_type
                    if i == 0 and requested_skip is None:
                        # This is our first line, people seem to like to upload files that have a header line, but do not
                        # start with '#' (i.e. all column types would then most likely be detected as str).  We will assume
                        # that the first line is always a header (this was previous behavior - it was always skipped).  When
                        # the requested skip is None, we only use the data from the first line if we have no other data for
                        # a column.  This is far from perfect, as
                        # 1,2,3	1.1	2.2	qwerty
                        # 0	0		1,2,3
                        # will be detected as
                        # "column_types": ["int", "int", "float", "list"]
                        # instead of
                        # "column_types": ["list", "float", "float", "str"]  *** would seem to be the 'Truth' by manual
                        # observation that the first line should be included as data.  The old method would have detected as
                        # "column_types": ["int", "int", "str", "list"]
                        first_line_column_types = column_types
                        column_types = [ None for col in first_line_column_types ]
                if max_data_lines is not None and data_lines >= max_data_lines:
                    if dataset_fh.tell() != dataset.get_size():
                        data_lines = None  # Clear optional data_lines metadata value
                        comment_lines = None  # Clear optional comment_lines metadata value; additional comment lines could appear below this point
                    break
                i += 1
            dataset_fh.close()

        # we err on the side of the larger number of columns:
        # first we pad our column_types by using data from the first line
        if len( first_line_column_types ) > len( column_types ):
            for column_type in first_line_column_types[len( column_types ):]:
                column_types.append( column_type )
        # Now we fill any unknown (None) column_types with data from the first line
        for i in range( len( column_types ) ):
            if column_types[i] is None:
                if len( first_line_column_types ) <= i or first_line_column_types[i] is None:
                    column_types[i] = default_column_type
                else:
                    column_types[i] = first_line_column_types[i]
        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.comment_lines = comment_lines
        dataset.metadata.column_types = column_types
        dataset.metadata.columns = len( column_types )

    def make_html_table( self, dataset, **kwargs ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append( self.make_html_peek_header( dataset, **kwargs ) )
            out.append( self.make_html_peek_rows( dataset, **kwargs ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def make_html_peek_header( self, dataset, skipchars=None, column_names=None, column_number_format='%s', column_parameter_alias=None, **kwargs ):
        if skipchars is None:
            skipchars = []
        if column_names is None:
            column_names = []
        if column_parameter_alias is None:
            column_parameter_alias = {}
        out = []
        try:
            if not column_names and dataset.metadata.column_names:
                column_names = dataset.metadata.column_names

            columns = dataset.metadata.columns
            if columns is None:
                columns = dataset.metadata.spec.columns.no_value
            column_headers = [None] * columns

            # fill in empty headers with data from column_names
            for i in range( min( columns, len( column_names ) ) ):
                if column_headers[i] is None and column_names[i] is not None:
                    column_headers[i] = column_names[i]

            # fill in empty headers from ColumnParameters set in the metadata
            for name, spec in dataset.metadata.spec.items():
                if isinstance( spec.param, metadata.ColumnParameter ):
                    try:
                        i = int( getattr( dataset.metadata, name ) ) - 1
                    except:
                        i = -1
                    if 0 <= i < columns and column_headers[i] is None:
                        column_headers[i] = column_parameter_alias.get(name, name)

            out.append( '<tr>' )
            for i, header in enumerate( column_headers ):
                out.append( '<th>' )
                if header is None:
                    out.append( column_number_format % str( i + 1 ) )
                else:
                    out.append( '%s.%s' % ( str( i + 1 ), escape( header ) ) )
                out.append( '</th>' )
            out.append( '</tr>' )
        except Exception, exc:
            log.exception( 'make_html_peek_header failed on HDA %s' % dataset.id )
            raise Exception, "Can't create peek header %s" % str( exc )
        return "".join( out )

    def make_html_peek_rows( self, dataset, skipchars=None, **kwargs ):
        if skipchars is None:
            skipchars = []
        out = []
        try:
            if not dataset.peek:
                dataset.set_peek()
            columns = dataset.metadata.columns
            if columns is None:
                columns = dataset.metadata.spec.columns.no_value
            for line in dataset.peek.splitlines():
                if line.startswith( tuple( skipchars ) ):
                    out.append( '<tr><td colspan="100%%">%s</td></tr>' % escape( line ) )
                elif line:
                    elems = line.split( '\t' )
                    # we may have an invalid comment line or invalid data
                    if len( elems ) != columns:
                        out.append( '<tr><td colspan="100%%">%s</td></tr>' % escape( line ) )
                    else:
                        out.append( '<tr>' )
                        for elem in elems:
                            out.append( '<td>%s</td>' % escape( elem ) )
                        out.append( '</tr>' )
        except Exception, exc:
            log.exception( 'make_html_peek_rows failed on HDA %s' % dataset.id )
            raise Exception, "Can't create peek rows %s" % str( exc )
        return "".join( out )

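    # get_chunk() serves CHUNK_SIZE-byte slices of the file aligned to line
    # boundaries: when starting mid-file it skips ahead to the next newline, and it
    # keeps reading past CHUNK_SIZE until the chunk ends on a newline.  A sketch of
    # the round trip (trans and dataset are hypothetical objects):
    #   Tabular().get_chunk( trans, dataset, 0 )
    #   -> '{"ck_data": "...first ~50000 bytes...", "ck_index": 1}'
    # The returned ck_index is the next chunk to request; display_data() accepts it
    # back via its chunk parameter to page through the file.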
    def get_chunk(self, trans, dataset, chunk):
        ck_index = int(chunk)
        f = open(dataset.file_name)
        f.seek(ck_index * self.CHUNK_SIZE)
        # If we aren't at the start of the file, seek to next newline.  Do this better eventually.
        if f.tell() != 0:
            cursor = f.read(1)
            while cursor and cursor != '\n':
                cursor = f.read(1)
        ck_data = f.read(self.CHUNK_SIZE)
        cursor = f.read(1)
        while cursor and ck_data[-1] != '\n':
            ck_data += cursor
            cursor = f.read(1)
        return to_json_string( { 'ck_data': util.unicodify( ck_data ), 'ck_index': ck_index + 1 } )

    def display_data(self, trans, dataset, preview=False, filename=None, to_ext=None, chunk=None, **kwd):
        preview = util.string_as_bool( preview )
        if chunk:
            return self.get_chunk(trans, dataset, chunk)
        elif to_ext or not preview:
            return self._serve_raw(trans, dataset, to_ext)
        elif dataset.metadata.columns > 50:
            # Fancy tabular display is only suitable for datasets without an incredibly large number of columns.
            # We should add a new datatype 'matrix', with its own draw method, suitable for this kind of data.
            # For now, default to the old behavior, ugly as it is.  Remove this after adding 'matrix'.
            max_peek_size = 1000000  # 1 MB
            if os.stat( dataset.file_name ).st_size < max_peek_size:
                return open( dataset.file_name )
            else:
                trans.response.set_content_type( "text/html" )
                return trans.stream_template_mako( "/dataset/large_file.mako",
                                            truncated_data = open( dataset.file_name ).read(max_peek_size),
                                            data = dataset)
        else:
            column_names = 'null'
            if dataset.metadata.column_names:
                column_names = dataset.metadata.column_names
            elif hasattr(dataset.datatype, 'column_names'):
                column_names = dataset.datatype.column_names
            column_types = dataset.metadata.column_types
            if not column_types:
                column_types = []
            column_number = dataset.metadata.columns
            if column_number is None:
                column_number = 'null'
            return trans.fill_template( "/dataset/tabular_chunked.mako",
                        dataset = dataset,
                        chunk = self.get_chunk(trans, dataset, 0),
                        column_number = column_number,
                        column_names = column_names,
                        column_types = column_types )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False):
        super(Tabular, self).set_peek( dataset, line_count=line_count, is_multi_byte=is_multi_byte)
        if dataset.metadata.comment_lines:
            dataset.blurb = "%s, %s comments" % ( dataset.blurb, util.commaify( str( dataset.metadata.comment_lines ) ) )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return self.make_html_table( dataset )

    def displayable( self, dataset ):
        try:
            return dataset.has_data() \
                and dataset.state == dataset.states.OK \
                and dataset.metadata.columns > 0 \
                and dataset.metadata.data_lines != 0
        except:
            return False

    def as_gbrowse_display_file( self, dataset, **kwd ):
        return open( dataset.file_name )

    def as_ucsc_display_file( self, dataset, **kwd ):
        return open( dataset.file_name )

    # ------------- Dataproviders
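    # The factories below register named dataproviders on this datatype.  A usage
    # sketch (hda is a hypothetical dataset instance; this assumes the base
    # datatype's dataprovider() dispatcher):
    #   for row in hda.datatype.dataprovider( hda, 'column', indeces=[ 0, 2 ] ):
    #       ...  # each row is a list holding the parsed values of columns 1 and 3
    # 'column' and 'dict' use the settings passed in; the 'dataset-' variants pull
    # column settings from the dataset's metadata instead.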
    @dataproviders.decorators.dataprovider_factory( 'column', dataproviders.column.ColumnarDataProvider.settings )
    def column_dataprovider( self, dataset, **settings ):
        """Uses column settings that are passed in"""
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.column.ColumnarDataProvider( dataset_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dataset-column',
                                                    dataproviders.column.ColumnarDataProvider.settings )
    def dataset_column_dataprovider( self, dataset, **settings ):
        """Attempts to get column settings from dataset.metadata"""
        return dataproviders.dataset.DatasetColumnarDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dict', dataproviders.column.DictDataProvider.settings )
    def dict_dataprovider( self, dataset, **settings ):
        """Uses column settings that are passed in"""
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.column.DictDataProvider( dataset_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dataset-dict', dataproviders.column.DictDataProvider.settings )
    def dataset_dict_dataprovider( self, dataset, **settings ):
        """Attempts to get column settings from dataset.metadata"""
        return dataproviders.dataset.DatasetDictDataProvider( dataset, **settings )


class Taxonomy( Tabular ):
    def __init__(self, **kwd):
        """Initialize taxonomy datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'TaxId', 'Root', 'Superkingdom', 'Kingdom', 'Subkingdom',
                             'Superphylum', 'Phylum', 'Subphylum', 'Superclass', 'Class', 'Subclass',
                             'Superorder', 'Order', 'Suborder', 'Superfamily', 'Family', 'Subfamily',
                             'Tribe', 'Subtribe', 'Genus', 'Subgenus', 'Species', 'Subspecies'
                             ]

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )


@dataproviders.decorators.has_dataproviders
class Sam( Tabular ):
    file_ext = 'sam'
    track_type = "ReadTrack"
    data_sources = { "data": "bam", "index": "bigwig" }
    def __init__(self, **kwd):
        """Initialize sam datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ', 'CIGAR',
                             'MRNM', 'MPOS', 'ISIZE', 'SEQ', 'QUAL', 'OPT'
                             ]

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )

    def sniff( self, filename ):
        """
        Determines whether the file is in SAM format

        A file in SAM format consists of lines of tab-separated data.
        The following header line may be the first line::

          @QNAME  FLAG    RNAME   POS     MAPQ    CIGAR   MRNM    MPOS    ISIZE   SEQ     QUAL
          or
          @QNAME  FLAG    RNAME   POS     MAPQ    CIGAR   MRNM    MPOS    ISIZE   SEQ     QUAL    OPT

        Data in the OPT column is optional and can consist of tab-separated data

        For complete details see http://samtools.sourceforge.net/SAM1.pdf

        Rules for sniffing as True::

            There must be 11 or more columns of data on each line
            Columns 2 (FLAG), 4 (POS), 5 (MAPQ), 8 (MPOS), and 9 (ISIZE) must be numbers (9 can be negative)
            We will only check that up to the first 5 alignments are correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Sam().sniff( fname )
        False
        >>> fname = get_test_fname( '1.sam' )
        >>> Sam().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (a blank line anywhere also ends the scan)
                if line:
                    if line[0] != '@':
                        linePieces = line.split('\t')
                        if len(linePieces) < 11:
                            return False
                        try:
                            check = int(linePieces[1])
                            check = int(linePieces[3])
                            check = int(linePieces[4])
                            check = int(linePieces[7])
                            check = int(linePieces[8])
                        except ValueError:
                            return False
                        count += 1
                        if count == 5:
                            return True
            fh.close()
            if count < 5 and count > 0:
                return True
        except:
            pass
        return False

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = 5, **kwd ):
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            comment_lines = 0
            if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
                # If the dataset is larger than optional_metadata, just count comment lines.
                for i, l in enumerate(dataset_fh):
                    if l.startswith('@'):
                        comment_lines += 1
                    else:
                        # No more comments, and the file is too big to look at the whole thing.  Give up.
                        dataset.metadata.data_lines = None
                        break
            else:
                # Otherwise, read the whole thing and set num data lines.
                for i, l in enumerate(dataset_fh):
                    if l.startswith('@'):
                        comment_lines += 1
                dataset.metadata.data_lines = i + 1 - comment_lines
            dataset_fh.close()
            dataset.metadata.comment_lines = comment_lines
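            # 12 columns matches column_names above: the 11 mandatory SAM fields
            # plus one catch-all 'OPT' column for the optional tags.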
            dataset.metadata.columns = 12
            dataset.metadata.column_types = ['str', 'int', 'str', 'int', 'int', 'str', 'str', 'int', 'int', 'str', 'str', 'str']

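    # Usage sketch for merge() below (hypothetical paths):
    #   Sam.merge( [ 'part0.sam', 'part1.sam' ], 'merged.sam' )
    # moves the first part into place, then appends only the alignment (non-'@')
    # lines of the remaining parts, so the header appears once.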
    def merge( split_files, output_file):
        """
        Multiple SAM files may each have headers. Since the headers should all be the same, remove
        the headers from files 1-n, keeping them in the first file only
        """
        cmd = 'mv %s %s' % ( split_files[0], output_file )
        result = os.system(cmd)
        if result != 0:
            raise Exception('Result %s from %s' % (result, cmd))
        if len(split_files) > 1:
            cmd = 'egrep -v "^@" %s >> %s' % ( ' '.join(split_files[1:]), output_file )
            result = os.system(cmd)
            if result != 0:
                raise Exception('Result %s from %s' % (result, cmd))
    merge = staticmethod(merge)

    # ------------- Dataproviders
    # sam does not use '#' to indicate comments/headers - we need to strip out those headers from the std. providers
    #TODO:?? seems like there should be an easier way to do this - metadata.comment_char?
    @dataproviders.decorators.dataprovider_factory( 'line', dataproviders.line.FilteredLineDataProvider.settings )
    def line_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).line_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'regex-line', dataproviders.line.RegexLineDataProvider.settings )
    def regex_line_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).regex_line_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'column', dataproviders.column.ColumnarDataProvider.settings )
    def column_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).column_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dataset-column',
                                                    dataproviders.column.ColumnarDataProvider.settings )
    def dataset_column_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).dataset_column_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dict', dataproviders.column.DictDataProvider.settings )
    def dict_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).dict_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'dataset-dict', dataproviders.column.DictDataProvider.settings )
    def dataset_dict_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return super( Sam, self ).dataset_dict_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'header', dataproviders.line.RegexLineDataProvider.settings )
    def header_dataprovider( self, dataset, **settings ):
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        headers_source = dataproviders.line.RegexLineDataProvider( dataset_source, regex_list=[ '^@' ] )
        return dataproviders.line.RegexLineDataProvider( headers_source, **settings )

    @dataproviders.decorators.dataprovider_factory( 'id-seq-qual', dict_dataprovider.settings )
    def id_seq_qual_dataprovider( self, dataset, **settings ):
        # provided as an example of a specified column dict (w/o metadata)
        settings[ 'indeces' ] = [ 0, 9, 10 ]
        settings[ 'column_names' ] = [ 'id', 'seq', 'qual' ]
        return self.dict_dataprovider( dataset, **settings )

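    # In the SAM column order (0-based), index 2 is RNAME and index 3 is POS, so
    # the providers below read (chrom, start, end) from columns 2, 3, 3 - i.e. a
    # single-base region at each alignment's start.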
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, 2, 3, 3, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        settings[ 'comment_char' ] = '@'
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, 2, 3, 3, True, **settings )

    #@dataproviders.decorators.dataprovider_factory( 'samtools' )
    #def samtools_dataprovider( self, dataset, **settings ):
    #    dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
    #    return dataproviders.dataset.SamtoolsDataProvider( dataset_source, **settings )


@dataproviders.decorators.has_dataproviders
class Pileup( Tabular ):
    """Tab delimited data in pileup (6- or 10-column) format"""
    file_ext = "pileup"
    line_class = "genomic coordinate"
    data_sources = { "data": "tabix" }

    """Add metadata elements"""
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=2, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="baseCol", default=3, desc="Reference base column", param=metadata.ColumnParameter )

    def init_meta( self, dataset, copy_from=None ):
        Tabular.init_meta( self, dataset, copy_from=copy_from )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_parameter_alias={'chromCol':'Chrom', 'startCol':'Start', 'baseCol':'Base'} )

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [ ("lines", "Remove erroneous lines") ]

    def sniff( self, filename ):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len( hdr ) < 3:
                        return False
                    try:
                        # chrom start is in column 1 (with 0-based columns)
                        # and reference base is in column 2
                        check = int( hdr[1] )
                        assert hdr[2] in [ 'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n' ]
                    except:
                        return False
            return True
        except:
            return False

    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )


@dataproviders.decorators.has_dataproviders
class Vcf( Tabular ):
    """ Variant Call Format for describing SNPs and other simple genome variations. """
    track_type = "VariantTrack"
    data_sources = { "data": "tabix", "index": "bigwig" }

    file_ext = 'vcf'
    column_names = [ 'Chrom', 'Pos', 'ID', 'Ref', 'Alt', 'Qual', 'Filter', 'Info', 'Format', 'data' ]

    MetadataElement( name="columns", default=10, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','int','str','str','str','int','str','list','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", desc="Score column for visualization", default=[5], param=metadata.ColumnParameter, optional=True, multiple=True, visible=False )
    MetadataElement( name="sample_names", default=[], desc="Sample names", readonly=True, visible=False, optional=True, no_value=[] )

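    # A valid VCF file begins with a line such as '##fileformat=VCFv4.1' (the
    # version number varies), so sniffing only needs to look at the first line.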
    def sniff( self, filename ):
        headers = get_headers( filename, '\n', count=1 )
        return headers[0][0].startswith("##fileformat=VCF")

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )

    def set_meta( self, dataset, **kwd ):
        Tabular.set_meta( self, dataset, **kwd )
        source = open( dataset.file_name )

        # Skip comments.
        line = None
        for line in source:
            if not line.startswith( '##' ):
                break

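        # The header line looks like this (sample names here are illustrative):
        #   #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  NA00001  NA00002
        # Everything after the nine fixed fields is a sample name, hence the
        # [ 9: ] slice below.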
        if line and line.startswith( '#' ):
            # Found header line, get sample names.
            dataset.metadata.sample_names = line.split()[ 9: ]

    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, 0, 1, 1, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )


class Eland( Tabular ):
    """Support for the export.txt.gz file used by Illumina's ELANDv2e aligner"""
    file_ext = '_export.txt.gz'
    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=[], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False, no_value=[] )
    MetadataElement( name="comment_lines", default=0, desc="Number of comments", readonly=True, visible=False )
    MetadataElement( name="tiles", default=[], param=metadata.ListParameter, desc="Set of tiles", readonly=True, visible=False, no_value=[] )
    MetadataElement( name="reads", default=[], param=metadata.ListParameter, desc="Set of reads", readonly=True, visible=False, no_value=[] )
    MetadataElement( name="lanes", default=[], param=metadata.ListParameter, desc="Set of lanes", readonly=True, visible=False, no_value=[] )
    MetadataElement( name="barcodes", default=[], param=metadata.ListParameter, desc="Set of barcodes", readonly=True, visible=False, no_value=[] )

    def __init__(self, **kwd):
        """Initialize eland datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['MACHINE', 'RUN_NO', 'LANE', 'TILE', 'X', 'Y',
                             'INDEX', 'READ_NO', 'SEQ', 'QUAL', 'CHROM', 'CONTIG',
                             'POSITION', 'STRAND', 'DESC', 'SRAS', 'PRAS', 'PART_CHROM',
                             'PART_CONTIG', 'PART_OFFSET', 'PART_STRAND', 'FILT'
                             ]

    def make_html_table( self, dataset, skipchars=None ):
        """Create HTML table, used for displaying peek"""
        if skipchars is None:
            skipchars = []
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
            # Add unnamed headers for any extra columns in the data
            if dataset.metadata.columns - len( self.column_names ) > 0:
                for i in range( len( self.column_names ), dataset.metadata.columns ):
                    out.append( '<th>%s</th>' % str( i+1 ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff( self, filename ):
        """
        Determines whether the file is in ELAND export format

        A file in ELAND export format consists of lines of tab-separated data.
        There is no header.

        Rules for sniffing as True::

            - There must be 22 columns on each line
            - LANE, TILE, X, Y, INDEX, READ_NO, SEQ, QUAL, POSITION, *STRAND, FILT must be correct
            - We will only check that up to the first 5 alignments are correctly formatted.
        """
        fh = None
        try:
            compress = is_gzip(filename)
            if compress:
                fh = gzip.GzipFile(filename, 'r')
            else:
                fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (a blank line anywhere also ends the scan)
                if line:
                    linePieces = line.split('\t')
                    if len(linePieces) != 22:
                        fh.close()
                        return False
                    try:
                        if long(linePieces[1]) < 0:
                            raise Exception('Out of range')
                        if long(linePieces[2]) < 0:
                            raise Exception('Out of range')
                        if long(linePieces[3]) < 0:
                            raise Exception('Out of range')
                        check = int(linePieces[4])
                        check = int(linePieces[5])
                        # can get a lot more specific
                    except ValueError:
                        fh.close()
                        return False
                    count += 1
                    if count == 5:
                        break
            if count > 0:
                fh.close()
                return True
        except:
            pass
        if fh:
            fh.close()
        return False

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = 5, **kwd ):
        if dataset.has_data():
            compress = is_gzip(dataset.file_name)
            if compress:
                dataset_fh = gzip.GzipFile(dataset.file_name, 'r')
            else:
                dataset_fh = open( dataset.file_name )
            lanes = {}
            tiles = {}
            barcodes = {}
            reads = {}
            # Should always read the entire file (until we devise a more clever way to pass metadata on)
            #if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            #    # If the dataset is larger than optional_metadata, just count comment lines.
            #    dataset.metadata.data_lines = None
            #else:
            #    # Otherwise, read the whole thing and set num data lines.
            for i, line in enumerate(dataset_fh):
                if line:
                    linePieces = line.split('\t')
                    if len(linePieces) != 22:
                        raise Exception('%s:%d:Corrupt line!' % (dataset.file_name, i))
                    lanes[linePieces[2]] = 1
                    tiles[linePieces[3]] = 1
                    barcodes[linePieces[6]] = 1
                    reads[linePieces[7]] = 1
            dataset.metadata.data_lines = i + 1
            dataset_fh.close()
            dataset.metadata.comment_lines = 0
            dataset.metadata.columns = 21
            dataset.metadata.column_types = ['str', 'int', 'int', 'int', 'int', 'int', 'str', 'int', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str']
            dataset.metadata.lanes = lanes.keys()
            dataset.metadata.tiles = ["%04d" % int(t) for t in tiles.keys()]
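            # A barcode of '0' appears to mean the lane was not indexed; report it
            # as 'NoIndex' and pass all other barcodes through unchanged.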
            dataset.metadata.barcodes = filter(lambda x: x != '0', barcodes.keys()) + ['NoIndex' for x in barcodes.keys() if x == '0']
            dataset.metadata.reads = reads.keys()


class ElandMulti( Tabular ):
    file_ext = 'elandmulti'

    def sniff( self, filename ):
        return False


class FeatureLocationIndex( Tabular ):
    """
    An index that stores feature locations in tabular format.
    """
    file_ext = 'fli'
    MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str', 'str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False, no_value=[] )