/tools/metag_tools/short_reads_figure_high_quality_length.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 165 lines · 146 code · 15 blank · 4 comment · 54 complexity · b8288050a3e8a7ca6783600340bba3db MD5 · raw file

  1. #!/usr/bin/env python
  2. import os, sys, math, tempfile, zipfile, re
  3. from rpy import *
  4. assert sys.version_info[:2] >= ( 2, 4 )
  5. def stop_err( msg ):
  6. sys.stderr.write( "%s\n" % msg )
  7. sys.exit()
  8. def unzip( filename ):
  9. zip_file = zipfile.ZipFile( filename, 'r' )
  10. tmpfilename = tempfile.NamedTemporaryFile().name
  11. for name in zip_file.namelist():
  12. file( tmpfilename, 'a' ).write( zip_file.read( name ) )
  13. zip_file.close()
  14. return tmpfilename
  15. def __main__():
  16. infile_score_name = sys.argv[1].strip()
  17. outfile_R_name = sys.argv[2].strip()
  18. try:
  19. score_threshold = int( sys.argv[3].strip() )
  20. except:
  21. stop_err( 'Threshold for quality score must be numerical.' )
  22. infile_is_zipped = False
  23. if zipfile.is_zipfile( infile_score_name ):
  24. infile_is_zipped = True
  25. infile_name = unzip( infile_score_name )
  26. else:
  27. infile_name = infile_score_name
  28. # detect whether it's tabular or fasta format
  29. seq_method = None
  30. data_type = None
  31. for i, line in enumerate( file( infile_name ) ):
  32. line = line.rstrip( '\r\n' )
  33. if not line or line.startswith( '#' ):
  34. continue
  35. if data_type == None:
  36. if line.startswith( '>' ):
  37. data_type = 'fasta'
  38. continue
  39. elif len( line.split( '\t' ) ) > 0:
  40. fields = line.split()
  41. for score in fields:
  42. try:
  43. int( score )
  44. data_type = 'tabular'
  45. seq_method = 'solexa'
  46. break
  47. except:
  48. break
  49. elif data_type == 'fasta':
  50. fields = line.split()
  51. for score in fields:
  52. try:
  53. int( score )
  54. seq_method = '454'
  55. break
  56. except:
  57. break
  58. if i == 100:
  59. break
  60. if data_type is None:
  61. stop_err( 'This tool can only use fasta data or tabular data.' )
  62. if seq_method is None:
  63. stop_err( 'Invalid data for fasta format.')
  64. cont_high_quality = []
  65. invalid_lines = 0
  66. invalid_scores = 0
  67. if seq_method == 'solexa':
  68. for i, line in enumerate( open( infile_name ) ):
  69. line = line.rstrip( '\r\n' )
  70. if not line or line.startswith( '#' ):
  71. continue
  72. locs = line.split( '\t' )
  73. for j, base in enumerate( locs ):
  74. nuc_errors = base.split()
  75. try:
  76. nuc_errors[0] = int( nuc_errors[0] )
  77. nuc_errors[1] = int( nuc_errors[1] )
  78. nuc_errors[2] = int( nuc_errors[2] )
  79. nuc_errors[3] = int( nuc_errors[3] )
  80. big = max( nuc_errors )
  81. except:
  82. invalid_scores += 1
  83. big = 0
  84. if j == 0:
  85. cont_high_quality.append(1)
  86. else:
  87. if big >= score_threshold:
  88. cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
  89. else:
  90. cont_high_quality.append(1)
  91. else: # seq_method == '454'
  92. tmp_score = ''
  93. for i, line in enumerate( open( infile_name ) ):
  94. line = line.rstrip( '\r\n' )
  95. if not line or line.startswith( '#' ):
  96. continue
  97. if line.startswith( '>' ):
  98. if len( tmp_score ) > 0:
  99. locs = tmp_score.split()
  100. for j, base in enumerate( locs ):
  101. try:
  102. base = int( base )
  103. except:
  104. invalid_scores += 1
  105. base = 0
  106. if j == 0:
  107. cont_high_quality.append(1)
  108. else:
  109. if base >= score_threshold:
  110. cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
  111. else:
  112. cont_high_quality.append(1)
  113. tmp_score = ''
  114. else:
  115. tmp_score = "%s %s" % ( tmp_score, line )
  116. if len( tmp_score ) > 0:
  117. locs = tmp_score.split()
  118. for j, base in enumerate( locs ):
  119. try:
  120. base = int( base )
  121. except:
  122. invalid_scores += 1
  123. base = 0
  124. if j == 0:
  125. cont_high_quality.append(1)
  126. else:
  127. if base >= score_threshold:
  128. cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
  129. else:
  130. cont_high_quality.append(1)
  131. # generate pdf figures
  132. cont_high_quality = array ( cont_high_quality )
  133. outfile_R_pdf = outfile_R_name
  134. r.pdf( outfile_R_pdf )
  135. title = "Histogram of continuous high quality scores"
  136. xlim_range = [ 1, max( cont_high_quality ) ]
  137. nclass = max( cont_high_quality )
  138. if nclass > 100:
  139. nclass = 100
  140. r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
  141. r.dev_off()
  142. if infile_is_zipped and os.path.exists( infile_name ):
  143. # Need to delete temporary file created when we unzipped the infile archive
  144. os.remove( infile_name )
  145. if invalid_lines > 0:
  146. print 'Skipped %d invalid lines. ' % invalid_lines
  147. if invalid_scores > 0:
  148. print 'Skipped %d invalid scores. ' % invalid_scores
  149. r.quit( save="no" )
  150. if __name__=="__main__":__main__()