PageRenderTime 55ms CodeModel.GetById 24ms app.highlight 27ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/metag_tools/short_reads_figure_score.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 248 lines | 223 code | 12 blank | 13 comment | 41 complexity | d20c0e6d1e959fcd5374fa7fc82bf077 MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3boxplot:
  4- box: first quartile and third quartile
  5- line inside the box: median
  6- outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
  7           IQR = third quartile - first quartile
  8- The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
  9"""
 10
 11import os, sys, math, tempfile, re
 12from rpy import *
 13
 14assert sys.version_info[:2] >= ( 2, 4 )
 15
 16def stop_err( msg ):
 17    sys.stderr.write( "%s\n" % msg )
 18    sys.exit()
 19
 20def merge_to_20_datapoints( score ):
 21    number_of_points = 20
 22    read_length = len( score )
 23    step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )
 24    scores = []
 25    point = 1
 26    point_sum = 0
 27    step_average = 0
 28    score_points = 0
 29    
 30    for i in xrange( 1, read_length ):
 31        if i < ( point * step ):
 32            point_sum += int( score[i] )
 33            step_average += 1
 34        else:
 35            point_avg = point_sum * 1.0 / step_average
 36            scores.append( point_avg )
 37            point += 1
 38            point_sum = 0
 39            step_average = 0                       
 40    if step_average > 0:
 41        point_avg = point_sum * 1.0 / step_average
 42        scores.append( point_avg )
 43    if len( scores ) > number_of_points:
 44        last_avg = 0
 45        for j in xrange( number_of_points - 1, len( scores ) ):
 46            last_avg += scores[j]
 47        last_avg = last_avg / ( len(scores) - number_of_points + 1 )
 48    else:    
 49        last_avg = scores[-1]
 50    score_points = []
 51    for k in range( number_of_points - 1 ):
 52        score_points.append( scores[k] )
 53    score_points.append( last_avg )
 54    return score_points
 55
 56def __main__():
 57
 58    invalid_lines = 0
 59
 60    infile_score_name = sys.argv[1].strip()
 61    outfile_R_name = sys.argv[2].strip()
 62
 63    infile_name = infile_score_name
 64
 65    # Determine tabular or fasta format within the first 100 lines
 66    seq_method = None
 67    data_type = None
 68    for i, line in enumerate( file( infile_name ) ):
 69        line = line.rstrip( '\r\n' )
 70        if not line or line.startswith( '#' ):
 71            continue
 72        if data_type == None:
 73            if line.startswith( '>' ):
 74                data_type = 'fasta'
 75                continue
 76            elif len( line.split( '\t' ) ) > 0:
 77                fields = line.split()
 78                for score in fields:
 79                    try:
 80                        int( score )
 81                        data_type = 'tabular'
 82                        seq_method = 'solexa'
 83                        break
 84                    except:
 85                        break
 86        elif data_type == 'fasta':
 87            fields = line.split()
 88            for score in fields:
 89                try: 
 90                    int( score )
 91                    seq_method = '454'
 92                    break
 93                except:
 94                    break
 95        if i == 100:
 96            break
 97
 98    if data_type is None:
 99        stop_err( 'This tool can only use fasta data or tabular data.' ) 
100    if seq_method is None:
101        stop_err( 'Invalid data for fasta format.')
102
103    # Determine fixed length or variable length within the first 100 lines
104    read_length = 0
105    variable_length = False
106    if seq_method == 'solexa':
107        for i, line in enumerate( file( infile_name ) ):
108            line = line.rstrip( '\r\n' )
109            if not line or line.startswith( '#' ):
110                continue
111            scores = line.split('\t')
112            if read_length == 0:
113                read_length = len( scores )
114            if read_length != len( scores ):
115                variable_length = True
116                break
117            if i == 100:
118                break
119    elif seq_method == '454':
120        score = ''
121        for i, line in enumerate( file( infile_name ) ):
122            line = line.rstrip( '\r\n' )
123            if not line or line.startswith( '#' ):
124                continue
125            if line.startswith( '>' ):
126                if len( score ) > 0:
127                    score = score.split()
128                    if read_length == 0:
129                        read_length = len( score )
130                    if read_length != len( score ):
131                        variable_length = True
132                        break
133                score = ''
134            else:
135                score = score + ' ' + line
136            if i == 100:
137                break
138
139    if variable_length:
140        number_of_points = 20
141    else:
142        number_of_points = read_length
143    read_length_threshold = 100 # minimal read length for 454 file
144    score_points = []   
145    score_matrix = []
146    invalid_scores = 0   
147
148    if seq_method == 'solexa':
149        for i, line in enumerate( open( infile_name ) ):
150            line = line.rstrip( '\r\n' )
151            if not line or line.startswith( '#' ):
152                continue
153            tmp_array = []
154            scores = line.split( '\t' )
155            for bases in scores:
156                nuc_errors = bases.split()
157                try:
158                    nuc_errors[0] = int( nuc_errors[0] )
159                    nuc_errors[1] = int( nuc_errors[1] )
160                    nuc_errors[2] = int( nuc_errors[2] )
161                    nuc_errors[3] = int( nuc_errors[3] )
162                    big = max( nuc_errors )
163                except:
164                    #print 'Invalid numbers in the file. Skipped.'
165                    invalid_scores += 1
166                    big = 0
167                tmp_array.append( big )                        
168            score_points.append( tmp_array )
169    elif seq_method == '454':
170        # skip the last fasta sequence
171        score = ''
172        for i, line in enumerate( open( infile_name ) ):
173            line = line.rstrip( '\r\n' )
174            if not line or line.startswith( '#' ):
175                continue
176            if line.startswith( '>' ):
177                if len( score ) > 0:
178                    score = ['0'] + score.split()
179                    read_length = len( score )
180                    tmp_array = []
181                    if not variable_length:
182                        score.pop(0)
183                        score_points.append( score )
184                        tmp_array = score
185                    elif read_length > read_length_threshold:
186                        score_points_tmp = merge_to_20_datapoints( score )
187                        score_points.append( score_points_tmp )
188                        tmp_array = score_points_tmp
189                score = ''
190            else:
191                score = "%s %s" % ( score, line )
192        if len( score ) > 0:
193            score = ['0'] + score.split()
194            read_length = len( score )
195            if not variable_length:
196                score.pop(0)
197                score_points.append( score )
198            elif read_length > read_length_threshold:
199                score_points_tmp = merge_to_20_datapoints( score )
200                score_points.append( score_points_tmp )
201                tmp_array = score_points_tmp
202
203    # reverse the matrix, for R
204    for i in range( number_of_points - 1 ):
205        tmp_array = []
206        for j in range( len( score_points ) ):
207            try:
208                tmp_array.append( int( score_points[j][i] ) )
209            except:
210                invalid_lines += 1
211        score_matrix.append( tmp_array )
212
213    # generate pdf figures
214    #outfile_R_pdf = outfile_R_name 
215    #r.pdf( outfile_R_pdf )
216    outfile_R_png = outfile_R_name
217    r.bitmap( outfile_R_png )
218    
219    title = "boxplot of quality scores"
220    empty_score_matrix_columns = 0
221    for i, subset in enumerate( score_matrix ):
222        if not subset:
223            empty_score_matrix_columns += 1
224            score_matrix[i] = [0]
225            
226    if not variable_length:
227        r.boxplot( score_matrix, xlab="location in read length", main=title )
228    else:
229        r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
230        x_old_range = []
231        x_new_range = []
232        step = read_length_threshold / number_of_points 
233        for i in xrange( 0, read_length_threshold, step ):
234            x_old_range.append( ( i / step ) )
235            x_new_range.append( i )
236        r.axis( 1, x_old_range, x_new_range )
237    r.dev_off()
238
239    if invalid_scores > 0:
240        print 'Skipped %d invalid scores. ' % invalid_scores
241    if invalid_lines > 0:
242        print 'Skipped %d invalid lines. ' % invalid_lines
243    if empty_score_matrix_columns > 0:
244        print '%d missing scores in score_matrix. ' % empty_score_matrix_columns
245
246    r.quit(save = "no")
247
248if __name__=="__main__":__main__()