/tools/metag_tools/short_reads_trim_seq.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 234 lines · 207 code · 18 blank · 9 comment · 81 complexity · bb27dceaddd51b02ed901900a39c6ab5 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. trim reads based on the quality scores
  4. input: read file and quality score file
  5. output: trimmed read file
  6. """
  7. import os, sys, math, tempfile, re
  8. assert sys.version_info[:2] >= ( 2, 4 )
  9. def stop_err( msg ):
  10. sys.stderr.write( "%s\n" % msg )
  11. sys.exit()
  12. def append_to_outfile( outfile_name, seq_title, segments ):
  13. segments = segments.split( ',' )
  14. if len( segments ) > 1:
  15. outfile = open( outfile_name, 'a' )
  16. for i in range( len( segments ) ):
  17. outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) )
  18. outfile.close()
  19. elif segments[0]:
  20. outfile = open( outfile_name, 'a' )
  21. outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) )
  22. outfile.close()
  23. def trim_seq( seq, score, arg, trim_score, threshold ):
  24. seq_method = '454'
  25. trim_pos = 0
  26. # trim after a certain position
  27. if arg.isdigit():
  28. keep_homopolymers = False
  29. trim_pos = int( arg )
  30. if trim_pos > 0 and trim_pos < len( seq ):
  31. seq = seq[0:trim_pos]
  32. else:
  33. keep_homopolymers = arg=='yes'
  34. new_trim_seq = ''
  35. max_segment = 0
  36. for i in range( len( seq ) ):
  37. if i >= len( score ):
  38. score.append(-1)
  39. if int( score[i] ) >= trim_score:
  40. pass_nuc = seq[ i:( i + 1 ) ]
  41. else:
  42. if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ):
  43. pass_nuc = seq[ i:( i + 1 ) ]
  44. else:
  45. pass_nuc = ' '
  46. new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc )
  47. # find the max substrings
  48. segments = new_trim_seq.split()
  49. max_segment = ''
  50. len_max_segment = 0
  51. if threshold == 0:
  52. for seg in segments:
  53. if len_max_segment < len( seg ):
  54. max_segment = '%s,' % seg
  55. len_max_segment = len( seg )
  56. elif len_max_segment == len( seg ):
  57. max_segment = '%s%s,' % ( max_segment, seg )
  58. else:
  59. for seg in segments:
  60. if len( seg ) >= threshold:
  61. max_segment = '%s%s,' % ( max_segment, seg )
  62. return max_segment[ 0:-1 ]
  63. def __main__():
  64. try:
  65. threshold_trim = int( sys.argv[1].strip() )
  66. except:
  67. stop_err( "Minimal quality score must be numeric." )
  68. try:
  69. threshold_report = int( sys.argv[2].strip() )
  70. except:
  71. stop_err( "Minimal length of trimmed reads must be numeric." )
  72. outfile_seq_name = sys.argv[3].strip()
  73. infile_seq_name = sys.argv[4].strip()
  74. infile_score_name = sys.argv[5].strip()
  75. arg = sys.argv[6].strip()
  76. seq_infile_name = infile_seq_name
  77. score_infile_name = infile_score_name
  78. # Determine quailty score format: tabular or fasta format within the first 100 lines
  79. seq_method = None
  80. data_type = None
  81. for i, line in enumerate( file( score_infile_name ) ):
  82. line = line.rstrip( '\r\n' )
  83. if not line or line.startswith( '#' ):
  84. continue
  85. if data_type == None:
  86. if line.startswith( '>' ):
  87. data_type = 'fasta'
  88. continue
  89. elif len( line.split( '\t' ) ) > 0:
  90. fields = line.split()
  91. for score in fields:
  92. try:
  93. int( score )
  94. data_type = 'tabular'
  95. seq_method = 'solexa'
  96. break
  97. except:
  98. break
  99. elif data_type == 'fasta':
  100. fields = line.split()
  101. for score in fields:
  102. try:
  103. int( score )
  104. seq_method = '454'
  105. break
  106. except:
  107. break
  108. if i == 100:
  109. break
  110. if data_type is None:
  111. stop_err( 'This tool can only use fasta data or tabular data.' )
  112. if seq_method is None:
  113. stop_err( 'Invalid data for fasta format.')
  114. if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ):
  115. seq = None
  116. score = None
  117. score_found = False
  118. score_file = open( score_infile_name, 'r' )
  119. for i, line in enumerate( open( seq_infile_name ) ):
  120. line = line.rstrip( '\r\n' )
  121. if not line or line.startswith( '#' ):
  122. continue
  123. if line.startswith( '>' ):
  124. if seq:
  125. scores = []
  126. if data_type == 'fasta':
  127. score = None
  128. score_found = False
  129. score_line = 'start'
  130. while not score_found and score_line:
  131. score_line = score_file.readline().rstrip( '\r\n' )
  132. if not score_line or score_line.startswith( '#' ):
  133. continue
  134. if score_line.startswith( '>' ):
  135. if score:
  136. scores = score.split()
  137. score_found = True
  138. score = None
  139. else:
  140. for val in score_line.split():
  141. try:
  142. int( val )
  143. except:
  144. score_file.close()
  145. stop_err( "Non-numerical value '%s' in score file." % val )
  146. if not score:
  147. score = score_line
  148. else:
  149. score = '%s %s' % ( score, score_line )
  150. elif data_type == 'tabular':
  151. score = score_file.readline().rstrip('\r\n')
  152. loc = score.split( '\t' )
  153. for base in loc:
  154. nuc_error = base.split()
  155. try:
  156. nuc_error[0] = int( nuc_error[0] )
  157. nuc_error[1] = int( nuc_error[1] )
  158. nuc_error[2] = int( nuc_error[2] )
  159. nuc_error[3] = int( nuc_error[3] )
  160. big = max( nuc_error )
  161. except:
  162. score_file.close()
  163. stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
  164. scores.append( big )
  165. if scores:
  166. new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
  167. append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )
  168. seq_title = line
  169. seq = None
  170. else:
  171. if not seq:
  172. seq = line
  173. else:
  174. seq = "%s%s" % ( seq, line )
  175. if seq:
  176. scores = []
  177. if data_type == 'fasta':
  178. score = None
  179. while score_line:
  180. score_line = score_file.readline().rstrip( '\r\n' )
  181. if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ):
  182. continue
  183. for val in score_line.split():
  184. try:
  185. int( val )
  186. except:
  187. score_file.close()
  188. stop_err( "Non-numerical value '%s' in score file." % val )
  189. if not score:
  190. score = score_line
  191. else:
  192. score = "%s %s" % ( score, score_line )
  193. if score:
  194. scores = score.split()
  195. elif data_type == 'tabular':
  196. score = score_file.readline().rstrip('\r\n')
  197. loc = score.split( '\t' )
  198. for base in loc:
  199. nuc_error = base.split()
  200. try:
  201. nuc_error[0] = int( nuc_error[0] )
  202. nuc_error[1] = int( nuc_error[1] )
  203. nuc_error[2] = int( nuc_error[2] )
  204. nuc_error[3] = int( nuc_error[3] )
  205. big = max( nuc_error )
  206. except:
  207. score_file.close()
  208. stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
  209. scores.append( big )
  210. if scores:
  211. new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
  212. append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )
  213. score_file.close()
  214. else:
  215. stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) )
  216. if __name__ == "__main__": __main__()