/tools/metag_tools/fastqsolexa_to_fasta_qual.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 109 lines · 76 code · 10 blank · 23 comment · 23 complexity · 4bf493b93916713a11ead7671f5fd7eb MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. convert fastqsolexa file to separated sequence and quality files.
  4. assume each sequence and quality score are contained in one line
  5. the order should be:
  6. 1st line: @title_of_seq
  7. 2nd line: nucleotides
  8. 3rd line: +title_of_qualityscore (might be skipped)
  9. 4th line: quality scores
  10. (in three forms: a. digits, b. ASCII codes, the first char as the coding base, c. ASCII codes without the first char.)
  11. Usage:
  12. %python fastqsolexa_to_fasta_qual.py <your_fastqsolexa_filename> <output_seq_filename> <output_score_filename> <datatype>
  13. """
  14. import sys, os
  15. from math import *
  16. assert sys.version_info[:2] >= ( 2, 4 )
  17. def stop_err( msg ):
  18. sys.stderr.write( "%s" % msg )
  19. sys.exit()
  20. def __main__():
  21. infile_name = sys.argv[1]
  22. outfile_seq = open( sys.argv[2], 'w' )
  23. outfile_score = open( sys.argv[3], 'w' )
  24. datatype = sys.argv[4]
  25. seq_title_startswith = ''
  26. qual_title_startswith = ''
  27. default_coding_value = 64
  28. fastq_block_lines = 0
  29. for i, line in enumerate( file( infile_name ) ):
  30. line = line.rstrip()
  31. if not line or line.startswith( '#' ):
  32. continue
  33. fastq_block_lines = ( fastq_block_lines + 1 ) % 4
  34. line_startswith = line[0:1]
  35. if fastq_block_lines == 1:
  36. # first line is @title_of_seq
  37. if not seq_title_startswith:
  38. seq_title_startswith = line_startswith
  39. if line_startswith != seq_title_startswith:
  40. outfile_seq.close()
  41. outfile_score.close()
  42. stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
  43. read_title = line[1:]
  44. outfile_seq.write( '>%s\n' % line[1:] )
  45. elif fastq_block_lines == 2:
  46. # second line is nucleotides
  47. read_length = len( line )
  48. outfile_seq.write( '%s\n' % line )
  49. elif fastq_block_lines == 3:
  50. # third line is +title_of_qualityscore ( might be skipped )
  51. if not qual_title_startswith:
  52. qual_title_startswith = line_startswith
  53. if line_startswith != qual_title_startswith:
  54. outfile_seq.close()
  55. outfile_score.close()
  56. stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
  57. quality_title = line[1:]
  58. if quality_title and read_title != quality_title:
  59. outfile_seq.close()
  60. outfile_score.close()
  61. stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )
  62. if not quality_title:
  63. outfile_score.write( '>%s\n' % read_title )
  64. else:
  65. outfile_score.write( '>%s\n' % line[1:] )
  66. else:
  67. # fourth line is quality scores
  68. qual = ''
  69. fastq_integer = True
  70. # peek: ascii or digits?
  71. val = line.split()[0]
  72. try:
  73. check = int( val )
  74. fastq_integer = True
  75. except:
  76. fastq_integer = False
  77. if fastq_integer:
  78. # digits
  79. qual = line
  80. else:
  81. # ascii
  82. quality_score_length = len( line )
  83. if quality_score_length == read_length + 1:
  84. # first char is qual_score_startswith
  85. qual_score_startswith = ord( line[0:1] )
  86. line = line[1:]
  87. elif quality_score_length == read_length:
  88. qual_score_startswith = default_coding_value
  89. else:
  90. stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )
  91. for j, char in enumerate( line ):
  92. score = ord( char ) - qual_score_startswith # 64
  93. qual = "%s%s " % ( qual, str( score ) )
  94. outfile_score.write( '%s\n' % qual )
  95. outfile_seq.close()
  96. outfile_score.close()
  97. if __name__ == "__main__": __main__()