/tools/next_gen_conversion/fastq_gen_conv.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 172 lines · 135 code · 5 blank · 32 comment · 60 complexity · ab0cdcd1329846ec45f0279ec31d510e MD5 · raw file

  1. """
  2. Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary.
  3. usage: %prog [options]
  4. -i, --input=i: Input FASTQ candidate file
  5. -r, --origType=r: Original type
  6. -a, --allOrNot=a: Whether or not to check all blocks
  7. -b, --blocks=b: Number of blocks to check
  8. -o, --output=o: Output file
  9. usage: %prog input_file oroutput_file
  10. """
  11. import math, sys
  12. from galaxy import eggs
  13. import pkg_resources; pkg_resources.require( "bx-python" )
  14. from bx.cookbook import doc_optparse
  15. def stop_err( msg ):
  16. sys.stderr.write( "%s\n" % msg )
  17. sys.exit()
  18. def all_bases_valid(seq):
  19. """Confirm that the sequence contains only bases"""
  20. valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N']
  21. for base in seq:
  22. if base not in valid_bases:
  23. return False
  24. return True
  25. def __main__():
  26. #Parse Command Line
  27. options, args = doc_optparse.parse( __doc__ )
  28. orig_type = options.origType
  29. if orig_type == 'sanger' and options.allOrNot == 'not':
  30. max_blocks = int(options.blocks)
  31. else:
  32. max_blocks = -1
  33. fin = file(options.input, 'r')
  34. fout = file(options.output, 'w')
  35. range_min = 1000
  36. range_max = -5
  37. block_num = 0
  38. bad_blocks = 0
  39. base_len = -1
  40. line_count = 0
  41. lines = []
  42. line = fin.readline()
  43. while line:
  44. if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks:
  45. fout.write(line)
  46. if line_count % 4 == 0:
  47. block_num += 1
  48. line_count += 1
  49. elif line.strip():
  50. # the line that starts a block, with a name
  51. if line_count % 4 == 0 and line.startswith('@'):
  52. lines.append(line)
  53. else:
  54. # if we expect a sequence of bases
  55. if line_count % 4 == 1 and all_bases_valid(line.strip()):
  56. lines.append(line)
  57. base_len = len(line.strip())
  58. # if we expect the second name line
  59. elif line_count % 4 == 2 and line.startswith('+'):
  60. lines.append(line)
  61. # if we expect a sequence of qualities and it's the expected length
  62. elif line_count % 4 == 3:
  63. split_line = line.strip().split()
  64. # decimal qualities
  65. if len(split_line) == base_len:
  66. # convert
  67. phred_list = []
  68. for ch in split_line:
  69. int_ch = int(ch)
  70. if int_ch < range_min:
  71. range_min = int_ch
  72. if int_ch > range_max:
  73. range_max = int_ch
  74. if int_ch >= 0 and int_ch <= 93:
  75. phred_list.append(chr(int_ch + 33))
  76. # make sure we haven't lost any quality values
  77. if len(phred_list) == base_len:
  78. # print first three lines
  79. for l in lines:
  80. fout.write(l)
  81. # print converted quality line
  82. fout.write(''.join(phred_list))
  83. # reset
  84. lines = []
  85. base_len = -1
  86. # abort if so
  87. else:
  88. bad_blocks += 1
  89. lines = []
  90. base_len = -1
  91. # ascii qualities
  92. elif len(split_line[0]) == base_len:
  93. qualities = []
  94. # print converted quality line
  95. if orig_type == 'illumina':
  96. for c in line.strip():
  97. if ord(c) - 64 < range_min:
  98. range_min = ord(c) - 64
  99. if ord(c) - 64 > range_max:
  100. range_max = ord(c) - 64
  101. if ord(c) < 64 or ord(c) > 126:
  102. bad_blocks += 1
  103. base_len = -1
  104. lines = []
  105. break
  106. else:
  107. qualities.append( chr( ord(c) - 31 ) )
  108. quals = ''.join(qualities)
  109. elif orig_type == 'solexa':
  110. for c in line.strip():
  111. if ord(c) - 64 < range_min:
  112. range_min = ord(c) - 64
  113. if ord(c) - 64 > range_max:
  114. range_max = ord(c) - 64
  115. if ord(c) < 59 or ord(c) > 126:
  116. bad_blocks += 1
  117. base_len = -1
  118. lines = []
  119. break
  120. else:
  121. p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) )
  122. qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) )
  123. quals = ''.join(qualities)
  124. else: # 'sanger'
  125. for c in line.strip():
  126. if ord(c) - 33 < range_min:
  127. range_min = ord(c) - 33
  128. if ord(c) - 33 > range_max:
  129. range_max = ord(c) - 33
  130. if ord(c) < 33 or ord(c) > 126:
  131. bad_blocks += 1
  132. base_len = -1
  133. lines = []
  134. break
  135. else:
  136. qualities.append(c)
  137. quals = ''.join(qualities)
  138. # make sure we don't have bad qualities
  139. if len(quals) == base_len:
  140. # print first three lines
  141. for l in lines:
  142. fout.write(l)
  143. # print out quality line
  144. fout.write(quals+'\n')
  145. # reset
  146. lines = []
  147. base_len = -1
  148. else:
  149. bad_blocks += 1
  150. base_len = -1
  151. lines = []
  152. # mark the successful end of a block
  153. block_num += 1
  154. line_count += 1
  155. line = fin.readline()
  156. fout.close()
  157. fin.close()
  158. if range_min != 1000 and range_min != -5:
  159. outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max)
  160. else:
  161. outmsg = ''
  162. if bad_blocks > 0:
  163. outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks)
  164. sys.stdout.write(outmsg)
  165. if __name__=="__main__": __main__()