/tools/metag_tools/short_reads_figure_score.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 248 lines · 210 code · 21 blank · 17 comment · 71 complexity · d20c0e6d1e959fcd5374fa7fc82bf077 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. boxplot:
  4. - box: first quartile and third quartile
  5. - line inside the box: median
  6. - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
  7. IQR = third quartile - first quartile
  8. - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
  9. """
  10. import os, sys, math, tempfile, re
  11. from rpy import *
  12. assert sys.version_info[:2] >= ( 2, 4 )
  13. def stop_err( msg ):
  14. sys.stderr.write( "%s\n" % msg )
  15. sys.exit()
  16. def merge_to_20_datapoints( score ):
  17. number_of_points = 20
  18. read_length = len( score )
  19. step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )
  20. scores = []
  21. point = 1
  22. point_sum = 0
  23. step_average = 0
  24. score_points = 0
  25. for i in xrange( 1, read_length ):
  26. if i < ( point * step ):
  27. point_sum += int( score[i] )
  28. step_average += 1
  29. else:
  30. point_avg = point_sum * 1.0 / step_average
  31. scores.append( point_avg )
  32. point += 1
  33. point_sum = 0
  34. step_average = 0
  35. if step_average > 0:
  36. point_avg = point_sum * 1.0 / step_average
  37. scores.append( point_avg )
  38. if len( scores ) > number_of_points:
  39. last_avg = 0
  40. for j in xrange( number_of_points - 1, len( scores ) ):
  41. last_avg += scores[j]
  42. last_avg = last_avg / ( len(scores) - number_of_points + 1 )
  43. else:
  44. last_avg = scores[-1]
  45. score_points = []
  46. for k in range( number_of_points - 1 ):
  47. score_points.append( scores[k] )
  48. score_points.append( last_avg )
  49. return score_points
  50. def __main__():
  51. invalid_lines = 0
  52. infile_score_name = sys.argv[1].strip()
  53. outfile_R_name = sys.argv[2].strip()
  54. infile_name = infile_score_name
  55. # Determine tabular or fasta format within the first 100 lines
  56. seq_method = None
  57. data_type = None
  58. for i, line in enumerate( file( infile_name ) ):
  59. line = line.rstrip( '\r\n' )
  60. if not line or line.startswith( '#' ):
  61. continue
  62. if data_type == None:
  63. if line.startswith( '>' ):
  64. data_type = 'fasta'
  65. continue
  66. elif len( line.split( '\t' ) ) > 0:
  67. fields = line.split()
  68. for score in fields:
  69. try:
  70. int( score )
  71. data_type = 'tabular'
  72. seq_method = 'solexa'
  73. break
  74. except:
  75. break
  76. elif data_type == 'fasta':
  77. fields = line.split()
  78. for score in fields:
  79. try:
  80. int( score )
  81. seq_method = '454'
  82. break
  83. except:
  84. break
  85. if i == 100:
  86. break
  87. if data_type is None:
  88. stop_err( 'This tool can only use fasta data or tabular data.' )
  89. if seq_method is None:
  90. stop_err( 'Invalid data for fasta format.')
  91. # Determine fixed length or variable length within the first 100 lines
  92. read_length = 0
  93. variable_length = False
  94. if seq_method == 'solexa':
  95. for i, line in enumerate( file( infile_name ) ):
  96. line = line.rstrip( '\r\n' )
  97. if not line or line.startswith( '#' ):
  98. continue
  99. scores = line.split('\t')
  100. if read_length == 0:
  101. read_length = len( scores )
  102. if read_length != len( scores ):
  103. variable_length = True
  104. break
  105. if i == 100:
  106. break
  107. elif seq_method == '454':
  108. score = ''
  109. for i, line in enumerate( file( infile_name ) ):
  110. line = line.rstrip( '\r\n' )
  111. if not line or line.startswith( '#' ):
  112. continue
  113. if line.startswith( '>' ):
  114. if len( score ) > 0:
  115. score = score.split()
  116. if read_length == 0:
  117. read_length = len( score )
  118. if read_length != len( score ):
  119. variable_length = True
  120. break
  121. score = ''
  122. else:
  123. score = score + ' ' + line
  124. if i == 100:
  125. break
  126. if variable_length:
  127. number_of_points = 20
  128. else:
  129. number_of_points = read_length
  130. read_length_threshold = 100 # minimal read length for 454 file
  131. score_points = []
  132. score_matrix = []
  133. invalid_scores = 0
  134. if seq_method == 'solexa':
  135. for i, line in enumerate( open( infile_name ) ):
  136. line = line.rstrip( '\r\n' )
  137. if not line or line.startswith( '#' ):
  138. continue
  139. tmp_array = []
  140. scores = line.split( '\t' )
  141. for bases in scores:
  142. nuc_errors = bases.split()
  143. try:
  144. nuc_errors[0] = int( nuc_errors[0] )
  145. nuc_errors[1] = int( nuc_errors[1] )
  146. nuc_errors[2] = int( nuc_errors[2] )
  147. nuc_errors[3] = int( nuc_errors[3] )
  148. big = max( nuc_errors )
  149. except:
  150. #print 'Invalid numbers in the file. Skipped.'
  151. invalid_scores += 1
  152. big = 0
  153. tmp_array.append( big )
  154. score_points.append( tmp_array )
  155. elif seq_method == '454':
  156. # skip the last fasta sequence
  157. score = ''
  158. for i, line in enumerate( open( infile_name ) ):
  159. line = line.rstrip( '\r\n' )
  160. if not line or line.startswith( '#' ):
  161. continue
  162. if line.startswith( '>' ):
  163. if len( score ) > 0:
  164. score = ['0'] + score.split()
  165. read_length = len( score )
  166. tmp_array = []
  167. if not variable_length:
  168. score.pop(0)
  169. score_points.append( score )
  170. tmp_array = score
  171. elif read_length > read_length_threshold:
  172. score_points_tmp = merge_to_20_datapoints( score )
  173. score_points.append( score_points_tmp )
  174. tmp_array = score_points_tmp
  175. score = ''
  176. else:
  177. score = "%s %s" % ( score, line )
  178. if len( score ) > 0:
  179. score = ['0'] + score.split()
  180. read_length = len( score )
  181. if not variable_length:
  182. score.pop(0)
  183. score_points.append( score )
  184. elif read_length > read_length_threshold:
  185. score_points_tmp = merge_to_20_datapoints( score )
  186. score_points.append( score_points_tmp )
  187. tmp_array = score_points_tmp
  188. # reverse the matrix, for R
  189. for i in range( number_of_points - 1 ):
  190. tmp_array = []
  191. for j in range( len( score_points ) ):
  192. try:
  193. tmp_array.append( int( score_points[j][i] ) )
  194. except:
  195. invalid_lines += 1
  196. score_matrix.append( tmp_array )
  197. # generate pdf figures
  198. #outfile_R_pdf = outfile_R_name
  199. #r.pdf( outfile_R_pdf )
  200. outfile_R_png = outfile_R_name
  201. r.bitmap( outfile_R_png )
  202. title = "boxplot of quality scores"
  203. empty_score_matrix_columns = 0
  204. for i, subset in enumerate( score_matrix ):
  205. if not subset:
  206. empty_score_matrix_columns += 1
  207. score_matrix[i] = [0]
  208. if not variable_length:
  209. r.boxplot( score_matrix, xlab="location in read length", main=title )
  210. else:
  211. r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
  212. x_old_range = []
  213. x_new_range = []
  214. step = read_length_threshold / number_of_points
  215. for i in xrange( 0, read_length_threshold, step ):
  216. x_old_range.append( ( i / step ) )
  217. x_new_range.append( i )
  218. r.axis( 1, x_old_range, x_new_range )
  219. r.dev_off()
  220. if invalid_scores > 0:
  221. print 'Skipped %d invalid scores. ' % invalid_scores
  222. if invalid_lines > 0:
  223. print 'Skipped %d invalid lines. ' % invalid_lines
  224. if empty_score_matrix_columns > 0:
  225. print '%d missing scores in score_matrix. ' % empty_score_matrix_columns
  226. r.quit(save = "no")
  227. if __name__=="__main__":__main__()