PageRenderTime 33ms CodeModel.GetById 9ms app.highlight 19ms RepoModel.GetById 1ms app.codeStats 1ms

/lib/galaxy/datatypes/converters/fastqsolexa_to_qual_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 95 lines | 73 code | 3 blank | 19 comment | 12 complexity | 343bf5cd1af92f4bd8b5a1a328d6ce86 MD5 | raw file
 1#!/usr/bin/env python
 2"""
 3convert fastqsolexa file to separated sequence and quality files.
 4
 5assume each sequence and quality score are contained in one line
 6the order should be:
 71st line: @title_of_seq
 82nd line: nucleotides
 93rd line: +title_of_qualityscore (might be skipped)
104th line: quality scores
11(in three forms: a. digits, b. ASCII codes, the first char as the coding base, c. ASCII codes without the first char.)
12
13Usage:
14%python fastqsolexa_to_qual_converter.py <your_fastqsolexa_filename> <output_seq_filename> <output_score_filename>
15"""
16import sys, os
17from math import *
18
19assert sys.version_info[:2] >= ( 2, 4 )
20
21def stop_err( msg ):
22    sys.stderr.write( "%s" % msg )
23    sys.exit()
24
25def __main__():
26    infile_name = sys.argv[1]
27    outfile_score = open( sys.argv[2], 'w' )
28    datatype = sys.argv[3]
29    qual_title_startswith = ''
30    seq_title_startswith = ''
31    default_coding_value = 64
32    fastq_block_lines = 0
33
34    for i, line in enumerate( file( infile_name ) ):
35        line = line.rstrip()
36        if not line or line.startswith( '#' ):
37            continue
38        fastq_block_lines = ( fastq_block_lines + 1 ) % 4
39        line_startswith = line[0:1]
40        if fastq_block_lines == 1:
41            # first line is @title_of_seq
42            if not seq_title_startswith:
43                seq_title_startswith = line_startswith
44            if line_startswith != seq_title_startswith:
45                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
46            read_title = line[1:]
47        elif fastq_block_lines == 2:
48            # second line is nucleotides
49            read_length = len( line )
50        elif fastq_block_lines == 3:
51            # third line is +title_of_qualityscore (might be skipped)
52            if not qual_title_startswith:
53                qual_title_startswith = line_startswith
54            if line_startswith != qual_title_startswith:
55                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
56            quality_title = line[1:]
57            if quality_title and read_title != quality_title:
58                stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )
59            if not quality_title:
60                outfile_score.write( '>%s\n' % read_title )
61            else:
62                outfile_score.write( '>%s\n' % line[1:] )
63        else:
64            # fourth line is quality scores
65            qual = ''
66            fastq_integer = True
67            # peek: ascii or digits?
68            val = line.split()[0]
69
70            try:
71                check = int( val )
72                fastq_integer = True
73            except:
74                fastq_integer = False
75
76            if fastq_integer: # digits
77                qual = line
78            else:
79                # ascii
80                quality_score_length = len( line )
81                if quality_score_length == read_length + 1:
82                    quality_score_startswith = ord( line[0:1] )
83                    line = line[1:]
84                elif quality_score_length == read_length:
85                    quality_score_startswith = default_coding_value
86                else:
87                    stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )
88                for j, char in enumerate( line ):
89                    score = ord( char ) - quality_score_startswith    # 64
90                    qual = "%s%s " % ( qual, str( score ) )
91            outfile_score.write( '%s\n' % qual )
92
93    outfile_score.close()
94
95if __name__ == "__main__": __main__()