PageRenderTime 18ms CodeModel.GetById 1ms app.highlight 14ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/next_gen_conversion/fastq_gen_conv.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 172 lines | 141 code | 4 blank | 27 comment | 34 complexity | ab0cdcd1329846ec45f0279ec31d510e MD5 | raw file
  1"""
  2Converts any type of FASTQ file to Sanger type  and makes small adjustments if necessary.
  3
  4usage: %prog [options]
  5   -i, --input=i: Input FASTQ candidate file
  6   -r, --origType=r: Original type
  7   -a, --allOrNot=a: Whether or not to check all blocks
  8   -b, --blocks=b: Number of blocks to check
  9   -o, --output=o: Output file
 10
 11usage: %prog input_file oroutput_file
 12"""
 13
 14import math, sys
 15from galaxy import eggs
 16import pkg_resources; pkg_resources.require( "bx-python" )
 17from bx.cookbook import doc_optparse
 18
 19def stop_err( msg ):
 20    sys.stderr.write( "%s\n" % msg )
 21    sys.exit()
 22    
 23def all_bases_valid(seq):
 24    """Confirm that the sequence contains only bases"""
 25    valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N']
 26    for base in seq:
 27        if base not in valid_bases:
 28            return False
 29    return True
 30
 31def __main__():
 32    #Parse Command Line
 33    options, args = doc_optparse.parse( __doc__ )
 34    orig_type = options.origType
 35    if orig_type == 'sanger' and options.allOrNot == 'not':
 36        max_blocks = int(options.blocks)
 37    else:
 38        max_blocks = -1
 39    fin = file(options.input, 'r')
 40    fout = file(options.output, 'w')
 41    range_min = 1000
 42    range_max = -5
 43    block_num = 0
 44    bad_blocks = 0
 45    base_len = -1
 46    line_count = 0
 47    lines = []
 48    line = fin.readline()
 49    while line:
 50        if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks:
 51            fout.write(line)
 52            if line_count % 4 == 0:
 53                block_num += 1
 54            line_count += 1
 55        elif line.strip():
 56            # the line that starts a block, with a name
 57            if line_count % 4 == 0 and line.startswith('@'):
 58                lines.append(line)
 59            else:
 60                # if we expect a sequence of bases
 61                if line_count % 4 == 1 and all_bases_valid(line.strip()):
 62                    lines.append(line)
 63                    base_len = len(line.strip())
 64                # if we expect the second name line
 65                elif line_count % 4 == 2 and line.startswith('+'):
 66                    lines.append(line)
 67                # if we expect a sequence of qualities and it's the expected length
 68                elif line_count % 4 == 3:
 69                    split_line = line.strip().split()
 70                    # decimal qualities
 71                    if len(split_line) == base_len:
 72                        # convert
 73                        phred_list = []
 74                        for ch in split_line:
 75                            int_ch = int(ch)
 76                            if int_ch < range_min:
 77                                range_min = int_ch
 78                            if int_ch > range_max:
 79                                range_max = int_ch
 80                            if int_ch >= 0 and int_ch <= 93:
 81                                phred_list.append(chr(int_ch + 33))
 82                        # make sure we haven't lost any quality values
 83                        if len(phred_list) == base_len:
 84                            # print first three lines
 85                            for l in lines:
 86                                fout.write(l)
 87                            # print converted quality line
 88                            fout.write(''.join(phred_list))
 89                            # reset
 90                            lines = []
 91                            base_len = -1
 92                        # abort if so
 93                        else:
 94                            bad_blocks += 1
 95                            lines = []
 96                            base_len = -1
 97                    # ascii qualities
 98                    elif len(split_line[0]) == base_len:
 99                        qualities = []
100                        # print converted quality line
101                        if orig_type == 'illumina':
102                            for c in line.strip():
103                                if ord(c) - 64 < range_min:
104                                    range_min = ord(c) - 64
105                                if ord(c) - 64 > range_max:
106                                    range_max = ord(c) - 64
107                                if ord(c) < 64 or ord(c) > 126:
108                                    bad_blocks += 1
109                                    base_len = -1
110                                    lines = []
111                                    break
112                                else:
113                                    qualities.append( chr( ord(c) - 31 ) )
114                            quals = ''.join(qualities)
115                        elif orig_type == 'solexa':
116                            for c in line.strip():
117                                if ord(c) - 64 < range_min:
118                                    range_min = ord(c) - 64
119                                if ord(c) - 64 > range_max:
120                                    range_max = ord(c) - 64
121                                if ord(c) < 59 or ord(c) > 126:
122                                    bad_blocks += 1
123                                    base_len = -1
124                                    lines = []
125                                    break
126                                else:
127                                    p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) )
128                                    qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) )
129                            quals = ''.join(qualities)
130                        else:  # 'sanger'
131                            for c in line.strip():
132                                if ord(c) - 33 < range_min:
133                                    range_min = ord(c) - 33
134                                if ord(c) - 33 > range_max:
135                                    range_max = ord(c) - 33
136                                if ord(c) < 33 or ord(c) > 126:
137                                    bad_blocks += 1
138                                    base_len = -1
139                                    lines = []
140                                    break
141                                else:
142                                    qualities.append(c)
143                            quals = ''.join(qualities)
144                        # make sure we don't have bad qualities
145                        if len(quals) == base_len:
146                            # print first three lines
147                            for l in lines:
148                                fout.write(l)
149                            # print out quality line
150                            fout.write(quals+'\n')
151                        # reset
152                        lines = []
153                        base_len = -1
154                    else:
155                        bad_blocks += 1
156                        base_len = -1
157                        lines = []
158                    # mark the successful end of a block
159                    block_num += 1
160            line_count += 1
161        line = fin.readline()
162    fout.close()
163    fin.close()
164    if range_min != 1000 and range_min != -5:
165        outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max)
166    else:
167        outmsg = ''
168    if bad_blocks > 0:
169        outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks)
170    sys.stdout.write(outmsg)
171
172if __name__=="__main__": __main__()