PageRenderTime 88ms CodeModel.GetById 35ms app.highlight 45ms RepoModel.GetById 1ms app.codeStats 1ms

/lib/galaxy_utils/sequence/fastq.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 752 lines | 745 code | 1 blank | 6 comment | 47 complexity | 1272cbccc204b9274f9767aeb80058c9 MD5 | raw file
  1##Dan Blankenberg
  2import math
  3import string
  4import transform
  5from sequence import SequencingRead
  6from fasta import fastaSequence
  7
  8class fastqSequencingRead( SequencingRead ):
  9    format = 'sanger' #sanger is default
 10    ascii_min = 33
 11    ascii_max = 126
 12    quality_min = 0
 13    quality_max = 93
 14    score_system = 'phred' #phred or solexa
 15    sequence_space = 'base' #base or color
 16    @classmethod
 17    def get_class_by_format( cls, format ):
 18        assert format in FASTQ_FORMATS, 'Unknown format type specified: %s' % format
 19        return FASTQ_FORMATS[ format ]
 20    @classmethod
 21    def convert_score_phred_to_solexa( cls, decimal_score_list ):
 22        def phred_to_solexa( score ):
 23            if score <= 0: #can't take log10( 1 - 1 ); make <= 0 into -5
 24                return -5
 25            return int( round( 10.0 * math.log10( math.pow( 10.0, ( float( score ) / 10.0 ) ) - 1.0 ) ) )
 26        return map( phred_to_solexa, decimal_score_list )
 27    @classmethod
 28    def convert_score_solexa_to_phred( cls, decimal_score_list ):
 29        def solexa_to_phred( score ):
 30            return int( round( 10.0 * math.log10( math.pow( 10.0, ( float( score ) / 10.0 ) ) + 1.0 ) ) )
 31        return map( solexa_to_phred, decimal_score_list )
 32    @classmethod
 33    def restrict_scores_to_valid_range( cls, decimal_score_list ):
 34        def restrict_score( score ):
 35            return max( min( score, cls.quality_max ), cls.quality_min )
 36        return map( restrict_score, decimal_score_list )
 37    @classmethod
 38    def transform_scores_to_valid_range( cls, decimal_score_list):
 39        cls_quality_max = cls.quality_max
 40        cls_quality_min = cls.quality_min
 41        for i in range( len( decimal_score_list ) ):
 42            score = decimal_score_list[i]
 43            if(score > cls_quality_max):
 44                transformed_score = cls_quality_max
 45            elif( score < cls_quality_min ):
 46                transformed_score = cls_quality_min
 47            else:
 48                transformed_score = score
 49            decimal_score_list[i] = str(transformed_score)
 50    @classmethod
 51    def transform_scores_to_valid_range_ascii( cls, decimal_score_list ):
 52        cls_quality_max = cls.quality_max
 53        cls_quality_min = cls.quality_min
 54        to_quality = cls.ascii_min - cls.quality_min
 55        for i in range( len( decimal_score_list ) ):
 56            score = decimal_score_list[i]
 57            if(score > cls_quality_max):
 58                transformed_score = cls_quality_max
 59            elif( score < cls_quality_min ):
 60                transformed_score = cls_quality_min
 61            else:
 62                transformed_score = score
 63            transformed_score = chr(transformed_score + to_quality)
 64            decimal_score_list[i] = transformed_score
 65    @classmethod
 66    def convert_base_to_color_space( cls, sequence ):
 67        return cls.color_space_converter.to_color_space( sequence )
 68    @classmethod
 69    def convert_color_to_base_space( cls, sequence ):
 70        return cls.color_space_converter.to_base_space( sequence )
 71    def is_ascii_encoded( self ):
 72        #as per fastq definition only decimal quality strings can have spaces (and TABs for our purposes) in them (and must have a trailing space)
 73        if ' ' in self.quality:
 74            return False
 75        if '\t' in self.quality:
 76            return False
 77        return True
 78    def get_ascii_quality_scores( self ):
 79        if self.is_ascii_encoded():
 80            return list( self.quality )
 81        else:
 82            quality = self.quality.rstrip() #decimal scores should have a trailing space
 83            if quality:
 84                try:
 85                    to_quality = self.ascii_min - self.quality_min
 86                    return [ chr( int( val ) + to_quality ) for val in quality.split() ]
 87                except ValueError, e:
 88                    raise ValueError( 'Error Parsing quality String. ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) )
 89            else:
 90                return []
 91    def get_ascii_quality_scores_len( self ):
 92        """
 93        Compute ascii quality score length, without generating relatively
 94        expensive qualty score array.
 95        """
 96        if self.is_ascii_encoded():
 97            return len( self.quality )
 98        else:
 99            quality = self.quality.rstrip()
100            if quality:
101                try:
102                    return len( quality.split() )
103                except ValueError, e:
104                    raise ValueError( 'Error Parsing quality String. ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) )
105            else:
106                return 0
107    def get_decimal_quality_scores( self ):
108        return self.__get_decimal_quality_scores(self.is_ascii_encoded())
109    def __get_decimal_quality_scores( self, ascii ):
110        if ascii:
111            to_quality = self.quality_min - self.ascii_min
112            return [ ord( val ) + to_quality for val in self.quality ]
113        else:
114            quality = self.quality.rstrip() #decimal scores should have a trailing space
115            if quality:
116                return [ int( val ) for val in quality.split() if val.strip() ]
117            else:
118                return []
119    def convert_read_to_format( self, format, force_quality_encoding = None ):
120        assert format in FASTQ_FORMATS, 'Unknown format type specified: %s' % format
121        assert force_quality_encoding in [ None, 'ascii', 'decimal' ], 'Invalid force_quality_encoding: %s' % force_quality_encoding
122        new_class = FASTQ_FORMATS[ format ]
123        new_read = new_class()
124        new_read.identifier = self.identifier
125        if self.sequence_space == new_class.sequence_space:
126            new_read.sequence = self.sequence
127        else:
128            if self.sequence_space == 'base':
129                new_read.sequence = self.convert_base_to_color_space( self.sequence )
130            else:
131                new_read.sequence = self.convert_color_to_base_space( self.sequence )
132        new_read.description = self.description
133        is_ascii = self.is_ascii_encoded()
134        if self.score_system != new_read.score_system:
135            if self.score_system == 'phred':
136                score_list = self.convert_score_phred_to_solexa( self.__get_decimal_quality_scores(is_ascii) )
137            else:
138                score_list = self.convert_score_solexa_to_phred( self.__get_decimal_quality_scores(is_ascii) )
139        else:
140            score_list = self.__get_decimal_quality_scores(is_ascii)
141        if force_quality_encoding is None:
142            if is_ascii:
143                new_encoding = 'ascii'
144            else:
145                new_encoding = 'decimal'
146        else:
147            new_encoding = force_quality_encoding
148        if new_encoding == 'ascii':
149            new_class.transform_scores_to_valid_range_ascii( score_list )
150            new_read.quality = "".join( score_list )
151        else:  # decimal
152            new_class.transform_scores_to_valid_range( score_list )
153            new_read.quality = "%s " % " ".join( score_list ) #need trailing space to be valid decimal fastq
154        return new_read
155    def get_sequence( self ):
156        return self.sequence
157    def slice( self, left_column_offset, right_column_offset ):
158        new_read = fastqSequencingRead.get_class_by_format( self.format )()
159        new_read.identifier = self.identifier
160        new_read.sequence = self.get_sequence()[left_column_offset:right_column_offset]
161        new_read.description = self.description
162        if self.is_ascii_encoded():
163            new_read.quality = self.quality[left_column_offset:right_column_offset]
164        else:
165            quality = map( str, self.get_decimal_quality_scores()[left_column_offset:right_column_offset] )
166            if quality:
167                new_read.quality = "%s " % " ".join( quality )
168            else:
169                new_read.quality = ''
170        return new_read
171    def is_valid_format( self ):
172        if self.is_ascii_encoded():
173            for val in self.get_ascii_quality_scores():
174                val = ord( val )
175                if val < self.ascii_min or val > self.ascii_max:
176                    return False
177        else:
178            for val in self.get_decimal_quality_scores():
179                if val < self.quality_min or val > self.quality_max:
180                    return False
181        if not self.is_valid_sequence():
182            return False
183        return True
184    def is_valid_sequence( self ):
185        for base in self.get_sequence():
186            if base not in self.valid_sequence_list:
187                return False
188        return True
189    def insufficient_quality_length( self ):
190        return self.get_ascii_quality_scores_len() < len( self.sequence )
191    def assert_sequence_quality_lengths( self ):
192        qual_len = self.get_ascii_quality_scores_len()
193        seq_len = len( self.sequence )
194        assert qual_len == seq_len, "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i)" % ( qual_len, seq_len )
195    def reverse( self, clone = True ):
196        #need to override how decimal quality scores are reversed
197        if clone:
198            rval = self.clone()
199        else:
200            rval = self
201        rval.sequence = transform.reverse( self.sequence )
202        if rval.is_ascii_encoded():
203            rval.quality = rval.quality[::-1]
204        else:
205            rval.quality = reversed( rval.get_decimal_quality_scores() )
206            rval.quality = "%s " % " ".join( map( str, rval.quality ) )
207        return rval
208    def apply_galaxy_conventions( self ):
209        pass
210
211class fastqSangerRead( fastqSequencingRead ):
212    format = 'sanger'
213    ascii_min = 33
214    ascii_max = 126
215    quality_min = 0
216    quality_max = 93
217    score_system = 'phred'
218    sequence_space = 'base'
219
220class fastqIlluminaRead( fastqSequencingRead ):
221    format = 'illumina'
222    ascii_min = 64
223    ascii_max = 126
224    quality_min = 0
225    quality_max = 62
226    score_system = 'phred'
227    sequence_space = 'base'
228
229class fastqSolexaRead( fastqSequencingRead ):
230    format = 'solexa'
231    ascii_min = 59
232    ascii_max = 126
233    quality_min = -5
234    quality_max = 62
235    score_system = 'solexa'
236    sequence_space = 'base'
237
238class fastqCSSangerRead( fastqSequencingRead ):
239    format = 'cssanger' #color space
240    ascii_min = 33
241    ascii_max = 126
242    quality_min = 0
243    quality_max = 93
244    score_system = 'phred'
245    sequence_space = 'color'
246    valid_sequence_list = map( str, range( 7 ) ) + [ '.' ]
247    def __len__( self ):
248        if self.has_adapter_base(): #Adapter base is not counted in length of read
249            return len( self.sequence ) - 1
250        return fastqSequencingRead.__len__( self )
251    def has_adapter_base( self ):
252        if self.sequence and self.sequence[0] in string.letters: #adapter base must be a letter
253            return True
254        return False
255    def insufficient_quality_length( self ):
256        if self.has_adapter_base():
257            return self.get_ascii_quality_scores_len() + 1 < len( self.sequence )
258        return fastqSequencingRead.insufficient_quality_length( self )
259    def assert_sequence_quality_lengths( self ):
260        if self.has_adapter_base():
261            qual_len = self.get_ascii_quality_scores_len()
262            seq_len = len( self.sequence )
263            assert ( qual_len + 1 == seq_len ) or ( qual_len == seq_len ), "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i with adapter base)" % ( qual_len, seq_len ) #SRA adds FAKE/DUMMY quality scores to the adapter base, we'll allow the reading of the Improper score here, but remove it in the Reader when "apply_galaxy_conventions" is set to True
264        else:
265            return fastqSequencingRead.assert_sequence_quality_lengths( self )
266    def get_sequence( self ):
267        if self.has_adapter_base():
268            return self.sequence[1:]
269        return self.sequence
270    def reverse( self, clone = True ):
271        #need to override how color space is reversed
272        if clone:
273            rval = self.clone()
274        else:
275            rval = self
276        if rval.has_adapter_base():
277            adapter = rval.sequence[0]
278            #sequence = rval.sequence[1:]
279            rval.sequence = self.color_space_converter.to_color_space( transform.reverse( self.color_space_converter.to_base_space( rval.sequence ) ), adapter_base = adapter )
280        else:
281            rval.sequence = transform.reverse( rval.sequence )
282
283        if rval.is_ascii_encoded():
284            rval.quality = rval.quality[::-1]
285        else:
286            rval.quality = reversed( rval.get_decimal_quality_scores() )
287            rval.quality = "%s " % " ".join( map( str, rval.quality ) )
288        return rval
289    def complement( self, clone = True ):
290        #need to override how color space is complemented
291        if clone:
292            rval = self.clone()
293        else:
294            rval = self
295        if rval.has_adapter_base(): #No adapter, color space stays the same
296            adapter = rval.sequence[0]
297            sequence = rval.sequence[1:]
298            if adapter.lower() != 'u':
299                adapter = transform.DNA_complement( adapter )
300            else:
301                adapter = transform.RNA_complement( adapter )
302            rval.sequence = "%s%s" % ( adapter, sequence )
303        return rval
304    def change_adapter( self, new_adapter, clone = True ):
305        #if new_adapter is empty, remove adapter, otherwise replace with new_adapter
306        if clone:
307            rval = self.clone()
308        else:
309            rval = self
310        if rval.has_adapter_base():
311            if new_adapter:
312                if new_adapter != rval.sequence[0]:
313                    rval.sequence = rval.color_space_converter.to_color_space( rval.color_space_converter.to_base_space( rval.sequence ), adapter_base = new_adapter )
314            else:
315                rval.sequence = rval.sequence[1:]
316        elif new_adapter:
317            rval.sequence = "%s%s" % ( new_adapter, rval.sequence )
318        return rval
319    def apply_galaxy_conventions( self ):
320        if self.has_adapter_base() and len( self.sequence ) == len( self.get_ascii_quality_scores() ): #SRA adds FAKE/DUMMY quality scores to the adapter base, we remove them here
321            if self.is_ascii_encoded():
322                self.quality = self.quality[1:]
323            else:
324                self.quality = " ".join( map( str, self.get_decimal_quality_scores()[1:] ) )
325
326FASTQ_FORMATS = {}
327for format in [ fastqIlluminaRead, fastqSolexaRead, fastqSangerRead, fastqCSSangerRead ]:
328    FASTQ_FORMATS[ format.format ] = format
329
330
331class fastqAggregator( object ):
332    VALID_FORMATS = FASTQ_FORMATS.keys()
333    def __init__( self,  ):
334        self.ascii_values_used = [] #quick lookup of all ascii chars used
335        self.seq_lens = {} #counts of seqs by read len
336        self.nuc_index_quality = [] #counts of scores by read column
337        self.nuc_index_base = [] #counts of bases by read column
338    def consume_read( self, fastq_read ):
339        #ascii values used
340        for val in fastq_read.get_ascii_quality_scores():
341            if val not in self.ascii_values_used:
342                self.ascii_values_used.append( val )
343        #lengths
344        seq_len = len( fastq_read )
345        self.seq_lens[ seq_len ] = self.seq_lens.get( seq_len, 0 ) + 1
346        #decimal qualities by column
347        for i, val in enumerate( fastq_read.get_decimal_quality_scores() ):
348            if i == len( self.nuc_index_quality ):
349                self.nuc_index_quality.append( {} )
350            self.nuc_index_quality[ i ][ val ] = self.nuc_index_quality[ i ].get( val, 0 ) + 1
351        #bases by column
352        for i, nuc in enumerate( fastq_read.get_sequence() ):
353            if i == len( self.nuc_index_base ):
354                self.nuc_index_base.append( {} )
355            nuc = nuc.upper()
356            self.nuc_index_base[ i ][ nuc ] = self.nuc_index_base[ i ].get( nuc, 0 ) + 1
357    def get_valid_formats( self, check_list = None ):
358        if not check_list:
359            check_list = self.VALID_FORMATS
360        rval = []
361        sequence = []
362        for nuc_dict in self.nuc_index_base:
363            for nuc in nuc_dict.keys():
364                if nuc not in sequence:
365                    sequence.append( nuc )
366        sequence = "".join( sequence )
367        quality = "".join( self.ascii_values_used )
368        for fastq_format in check_list:
369            fastq_read = fastqSequencingRead.get_class_by_format( fastq_format )()
370            fastq_read.quality = quality
371            fastq_read.sequence = sequence
372            if fastq_read.is_valid_format():
373                rval.append( fastq_format )
374        return rval
375    def get_ascii_range( self ):
376        if not self.ascii_values_used:
377            return None
378        return ( min( self.ascii_values_used ), max( self.ascii_values_used ) )
379    def get_decimal_range( self ):
380        if not self.nuc_index_quality:
381            return None
382        decimal_values_used = []
383        for scores in self.nuc_index_quality:
384            decimal_values_used.extend( scores.keys() )
385        return ( min( decimal_values_used ), max( decimal_values_used ) )
386    def get_length_counts( self ):
387        return self.seq_lens
388    def get_max_read_length( self ):
389        return len( self.nuc_index_quality )
390    def get_read_count_for_column( self, column ):
391        if column >= len( self.nuc_index_quality ):
392            return 0
393        return sum( self.nuc_index_quality[ column ].values() )
394    def get_read_count( self ):
395        return self.get_read_count_for_column( 0 )
396    def get_base_counts_for_column( self, column ):
397        return self.nuc_index_base[ column ]
398    def get_score_list_for_column( self, column ):
399        return self.nuc_index_quality[ column ].keys()
400    def get_score_min_for_column( self, column ):
401        return min( self.nuc_index_quality[ column ].keys() )
402    def get_score_max_for_column( self, column ):
403        return max( self.nuc_index_quality[ column ].keys() )
404    def get_score_sum_for_column( self, column ):
405        return sum( score * count for score, count in self.nuc_index_quality[ column ].iteritems() )
406    def get_score_at_position_for_column( self, column, position ):
407        score_value_dict = self.nuc_index_quality[ column ]
408        scores = sorted( score_value_dict.keys() )
409        for score in scores:
410            if score_value_dict[ score ] <= position:
411                position -= score_value_dict[ score ]
412            else:
413                return score
414    def get_summary_statistics_for_column( self, i ):
415        def _get_med_pos( size ):
416            halfed = int( size / 2 )
417            if size % 2 == 1:
418                return [ halfed ]
419            return[ halfed - 1, halfed ]
420        read_count = self.get_read_count_for_column( i )
421
422        min_score = self.get_score_min_for_column( i )
423        max_score = self.get_score_max_for_column( i )
424        sum_score = self.get_score_sum_for_column( i )
425        mean_score = float( sum_score ) / float( read_count )
426        #get positions
427        med_pos = _get_med_pos( read_count )
428        if 0 in med_pos:
429            q1_pos = [ 0 ]
430            q3_pos = [ read_count - 1 ]
431        else:
432            q1_pos = _get_med_pos( min( med_pos ) )
433            q3_pos = []
434            for pos in q1_pos:
435                q3_pos.append( max( med_pos ) + 1 + pos )
436        #get scores at position
437        med_score = float( sum( [ self.get_score_at_position_for_column( i, pos ) for pos in med_pos ] ) ) / float( len( med_pos ) )
438        q1 = float( sum( [ self.get_score_at_position_for_column( i, pos ) for pos in q1_pos ] ) ) / float( len( q1_pos ) )
439        q3 = float( sum( [ self.get_score_at_position_for_column( i, pos ) for pos in q3_pos ] ) ) / float( len( q3_pos ) )
440        #determine iqr and step
441        iqr = q3 - q1
442        step = 1.5 * iqr
443
444        #Determine whiskers and outliers
445        outliers = []
446        score_list = sorted( self.get_score_list_for_column( i ) )
447        left_whisker = q1 - step
448        for score in score_list:
449            if left_whisker <= score:
450                left_whisker = score
451                break
452            else:
453                outliers.append( score )
454
455        right_whisker = q3 + step
456        score_list.reverse()
457        for score in score_list:
458            if right_whisker >= score:
459                right_whisker = score
460                break
461            else:
462                outliers.append( score )
463
464        column_stats = { 'read_count': read_count,
465                         'min_score': min_score,
466                         'max_score': max_score,
467                         'sum_score': sum_score,
468                         'mean_score': mean_score,
469                         'q1': q1,
470                         'med_score': med_score,
471                         'q3': q3,
472                         'iqr': iqr,
473                         'left_whisker': left_whisker,
474                         'right_whisker': right_whisker,
475                         'outliers': outliers }
476        return column_stats
477
478class fastqReader( object ):
479    def __init__( self, fh, format = 'sanger', apply_galaxy_conventions = False ):
480        self.file = fh
481        self.format = format
482        self.apply_galaxy_conventions = apply_galaxy_conventions
483    def close( self ):
484        return self.file.close()
485    def next(self):
486        while True:
487            fastq_header = self.file.readline()
488            if not fastq_header:
489                raise StopIteration
490            fastq_header = fastq_header.rstrip( '\n\r' )
491            #remove empty lines, apparently extra new lines at end of file is common?
492            if fastq_header:
493                break
494
495        assert fastq_header.startswith( '@' ), 'Invalid fastq header: %s' % fastq_header
496        rval = fastqSequencingRead.get_class_by_format( self.format )()
497        rval.identifier = fastq_header
498        while True:
499            line = self.file.readline()
500            if not line:
501                raise Exception( 'Invalid FASTQ file: could not find quality score of sequence identifier %s.' % rval.identifier )
502            line = line.rstrip( '\n\r' )
503            if line.startswith( '+' ) and ( len( line ) == 1 or line[1:].startswith( fastq_header[1:] ) ):
504                rval.description = line
505                break
506            rval.append_sequence( line )
507        while rval.insufficient_quality_length():
508            line = self.file.readline()
509            if not line:
510                break
511            rval.append_quality( line )
512        rval.assert_sequence_quality_lengths()
513        if self.apply_galaxy_conventions:
514            rval.apply_galaxy_conventions()
515        return rval
516    def __iter__( self ):
517        while True:
518            yield self.next()
519
520class ReadlineCountFile( object ):
521    def __init__( self, f ):
522        self.__file = f
523        self.readline_count = 0
524    def readline( self, *args, **kwds ):
525        self.readline_count += 1
526        return self.__file.readline( *args, **kwds )
527    def __getattr__( self, name ):
528        return getattr( self.__file, name )
529
530class fastqVerboseErrorReader( fastqReader ):
531    MAX_PRINT_ERROR_BYTES = 1024
532    def __init__( self, fh, **kwds ):
533        super( fastqVerboseErrorReader, self ).__init__( ReadlineCountFile( fh ), **kwds  )
534        self.last_good_identifier = None
535    def next( self ):
536        last_good_end_offset = self.file.tell()
537        last_readline_count = self.file.readline_count
538        try:
539            block = super( fastqVerboseErrorReader, self ).next()
540            self.last_good_identifier = block.identifier
541            return block
542        except StopIteration, e:
543            raise e
544        except Exception, e:
545            print "There was an error reading your input file. Your input file is likely malformed.\nIt is suggested that you double-check your original input file for errors -- helpful information for this purpose has been provided below.\nHowever, if you think that you have encountered an actual error with this tool, please do tell us by using the bug reporting mechanism.\n\nThe reported error is: '%s'." % e
546            if self.last_good_identifier is not None:
547                print "The last valid FASTQ read had an identifier of '%s'." % self.last_good_identifier
548            else:
549                print "The error occurred at the start of your file and no valid FASTQ reads were found."
550            error_offset = self.file.tell()
551            error_byte_count = error_offset - last_good_end_offset
552            print_error_bytes = min( self.MAX_PRINT_ERROR_BYTES, error_byte_count )
553            print "The error in your file occurs between lines '%i' and '%i', which corresponds to byte-offsets '%i' and '%i', and contains the text (%i of %i bytes shown):\n" % ( last_readline_count + 1, self.file.readline_count, last_good_end_offset, error_offset, print_error_bytes, error_byte_count )
554            self.file.seek( last_good_end_offset )
555            print self.file.read( print_error_bytes )
556            raise e
557
558class fastqNamedReader( object ):
559    def __init__( self, fh, format = 'sanger', apply_galaxy_conventions = False  ):
560        self.file = fh
561        self.format = format
562        self.reader = fastqReader( self.file, self.format )
563        #self.last_offset = self.file.tell()
564        self.offset_dict = {}
565        self.eof = False
566        self.apply_galaxy_conventions = apply_galaxy_conventions
567    def close( self ):
568        return self.file.close()
569    def get( self, sequence_identifier ):
570        # Input is either a sequence ID or a sequence object
571        if not isinstance( sequence_identifier, basestring ):
572            # Input was a sequence object (not a sequence ID). Get the sequence ID
573            sequence_identifier = sequence_identifier.identifier
574        # Get only the ID part of the sequence header
575        sequence_id, sequence_sep, sequence_desc = sequence_identifier.partition(' ')
576        rval = None
577        if sequence_id in self.offset_dict:
578            initial_offset = self.file.tell()
579            seq_offset = self.offset_dict[ sequence_id ].pop( 0 )
580            if not self.offset_dict[ sequence_id ]:
581                del self.offset_dict[ sequence_id ]
582            self.file.seek( seq_offset )
583            rval = self.reader.next()
584            #assert rval.id == sequence_id, 'seq id mismatch' #should be able to remove this
585            self.file.seek( initial_offset )
586        else:
587            while True:
588                offset = self.file.tell()
589                try:
590                    fastq_read = self.reader.next()
591                except StopIteration:
592                    self.eof = True
593                    break #eof, id not found, will return None
594                fastq_read_id, fastq_read_sep, fastq_read_desc = fastq_read.identifier.partition(' ')
595                if fastq_read_id == sequence_id:
596                    rval = fastq_read
597                    break
598                else:
599                    if fastq_read_id not in self.offset_dict:
600                        self.offset_dict[ fastq_read_id ] = []
601                    self.offset_dict[ fastq_read_id ].append( offset )
602        if rval is not None and self.apply_galaxy_conventions:
603            rval.apply_galaxy_conventions()
604        return rval
605    def has_data( self ):
606        #returns a string representation of remaining data, or empty string (False) if no data remaining
607        eof = self.eof
608        count = 0
609        rval = ''
610        if self.offset_dict:
611            count = sum( map( len, self.offset_dict.values() ) )
612        if not eof:
613            offset = self.file.tell()
614            try:
615                fastq_read = self.reader.next()
616            except StopIteration:
617                eof = True
618            self.file.seek( offset )
619        if count:
620            rval = "There were %i known sequence reads not utilized. " % count
621        if not eof:
622            rval = "%s%s" % ( rval, "An additional unknown number of reads exist in the input that were not utilized." )
623        return rval
624
625class fastqWriter( object ):
626    def __init__( self, fh, format = None, force_quality_encoding = None ):
627        self.file = fh
628        self.format = format
629        self.force_quality_encoding = force_quality_encoding
630    def write( self, fastq_read ):
631        if self.format:
632            fastq_read = fastq_read.convert_read_to_format( self.format, force_quality_encoding = self.force_quality_encoding )
633        self.file.write( str( fastq_read ) )
634    def close( self ):
635        return self.file.close()
636
637class fastqJoiner( object ):
638    def __init__( self, format, force_quality_encoding = None ):
639        self.format = format
640        self.force_quality_encoding = force_quality_encoding
641    def join( self, read1, read2 ):
642        read1_id, read1_sep, read1_desc = read1.identifier.partition(' ')
643        read2_id, read2_sep, read2_desc = read2.identifier.partition(' ')
644        if read1_id.endswith( '/2' ) and read2_id.endswith( '/1' ):
645            #swap 1 and 2
646            tmp = read1
647            read1 = read2
648            read2 = tmp
649            del tmp
650        if read1_id.endswith( '/1' ) and read2_id.endswith( '/2' ):
651            read1_id = read1_id[:-2]
652
653        identifier = read1_id
654        if read1_desc:
655            identifier = identifier + ' ' + read1_desc
656
657        #use force quality encoding, if not present force to encoding of first read
658        force_quality_encoding = self.force_quality_encoding
659        if not force_quality_encoding:
660            if read1.is_ascii_encoded():
661                force_quality_encoding = 'ascii'
662            else:
663                force_quality_encoding = 'decimal'
664
665        new_read1 = read1.convert_read_to_format( self.format, force_quality_encoding = force_quality_encoding )
666        new_read2 = read2.convert_read_to_format( self.format, force_quality_encoding = force_quality_encoding )
667        rval = FASTQ_FORMATS[ self.format ]()
668        rval.identifier = identifier
669        if len( read1.description ) > 1:
670            rval.description = "+%s" % ( identifier[1:] )
671        else:
672            rval.description = '+'
673        if rval.sequence_space == 'color':
674            #need to handle color space joining differently
675            #convert to nuc space, join, then convert back
676            rval.sequence = rval.convert_base_to_color_space( new_read1.convert_color_to_base_space( new_read1.sequence ) + new_read2.convert_color_to_base_space( new_read2.sequence ) )
677        else:
678            rval.sequence = new_read1.sequence + new_read2.sequence
679        if force_quality_encoding == 'ascii':
680            rval.quality = new_read1.quality + new_read2.quality
681        else:
682            rval.quality = "%s %s" % ( new_read1.quality.strip(), new_read2.quality.strip() )
683        return rval
684    def get_paired_identifier( self, fastq_read ):
685        read_id, read_sep, read_desc = fastq_read.identifier.partition(' ')
686        if read_id[-2] == '/':
687            if read_id[-1] == "1":
688                read_id = "%s2" % read_id[:-1]
689            elif read_id[-1] == "2":
690                read_id = "%s1" % read_id[:-1]
691        return read_id
692    def is_first_mate( self, sequence_id ):
693        is_first = None
694        if not isinstance( sequence_id, basestring ):
695            sequence_id = sequence_id.identifier
696        sequence_id, sequence_sep, sequence_desc = sequence_id.partition(' ')
697        if sequence_id[-2] == '/':
698            if sequence_id[-1] == "1":
699                is_first = True
700            else:
701                is_first = False
702        return is_first
703
704class fastqSplitter( object ):
705    def split( self, fastq_read ):
706        length = len( fastq_read )
707        #Only reads of even lengths can be split
708        if length % 2 != 0:
709            return None, None
710        half = int( length / 2 )
711        read1 = fastq_read.slice( 0, half )
712        read1.identifier += "/1"
713        if len( read1.description ) > 1:
714            read1.description += "/1"
715        read2 = fastq_read.slice( half, None )
716        read2.identifier += "/2"
717        if len( read2.description ) > 1:
718            read2.description += "/2"
719        return read1, read2
720
721class fastqCombiner( object ):
722    def __init__( self, format ):
723        self.format = format
724    def combine(self, fasta_seq, quality_seq ):
725        fastq_read = fastqSequencingRead.get_class_by_format( self.format )()
726        fastq_read.identifier = "@%s" % fasta_seq.identifier[1:]
727        fastq_read.description = '+'
728        fastq_read.sequence = fasta_seq.sequence
729        fastq_read.quality = quality_seq.sequence
730        return fastq_read
731
732class fastqFakeFastaScoreReader( object ):
733    def __init__( self, format = 'sanger', quality_encoding = None ):
734        self.fastq_read = fastqSequencingRead.get_class_by_format( format )()
735        if quality_encoding != 'decimal':
736            quality_encoding = 'ascii'
737        self.quality_encoding = quality_encoding
738    def close( self ):
739        return #nothing to close
740    def get( self, sequence ):
741        assert isinstance( sequence, fastaSequence ), 'fastqFakeFastaScoreReader requires a fastaSequence object as the parameter'
742        #add sequence to fastq_read, then get_sequence(), color space adapters do not have quality score values
743        self.fastq_read.sequence = sequence.sequence
744        new_sequence = fastaSequence()
745        new_sequence.identifier = sequence.identifier
746        if self.quality_encoding == 'ascii':
747            new_sequence.sequence = chr( self.fastq_read.ascii_max ) * len( self.fastq_read.get_sequence() )
748        else:
749            new_sequence.sequence = ( "%i " % self.fastq_read.quality_max ) * len( self.fastq_read.get_sequence() )
750        return new_sequence
751    def has_data( self ):
752        return '' #No actual data exist, none can be remaining