/lib/galaxy/datatypes/converters/fastqsolexa_to_qual_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 95 lines · 66 code · 8 blank · 21 comment · 23 complexity · 343bf5cd1af92f4bd8b5a1a328d6ce86 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. convert a fastqsolexa file to a quality score file.
  4. assume each sequence and quality score are contained in one line
  5. the order should be:
  6. 1st line: @title_of_seq
  7. 2nd line: nucleotides
  8. 3rd line: +title_of_qualityscore (might be skipped)
  9. 4th line: quality scores
  10. (in three forms: a. digits, b. ASCII codes, the first char as the coding base, c. ASCII codes without the first char.)
  11. Usage:
  12. %python fastqsolexa_to_qual_converter.py <your_fastqsolexa_filename> <output_score_filename> <datatype>
  13. """
  14. import sys, os
  15. from math import *
  # Abort at import time on interpreters older than Python 2.4.
  assert not ( sys.version_info[:2] < ( 2, 4 ) )
  def stop_err( msg ):
      """
      Report a fatal error and terminate the script.

      msg is written to standard error (without an added newline) and the
      interpreter is then asked to exit with status 1 by raising SystemExit,
      so that callers checking the exit code see the failure.
      """
      sys.stderr.write( "%s" % msg )
      # A bare sys.exit() would report success (status 0) to the caller.
      sys.exit( 1 )
  def _decode_quality_line( line, read_length, line_number, default_coding_value=64 ):
      """
      Return one quality line as the text to write to the score file.

      line                  the quality line, already stripped of trailing whitespace
      read_length           number of bases on the matching nucleotide line
      line_number           1-based input line number, used in the error message
      default_coding_value  offset subtracted from each character when the
                            line does not carry its own coding base

      If the first whitespace-separated token parses as an integer the line is
      taken to be digits already and is returned unchanged. Otherwise it is
      treated as ASCII codes: when it is one character longer than the read,
      the first character supplies the offset; when it has the same length,
      default_coding_value is the offset. Any other length is reported through
      stop_err(), which terminates the script.
      """
      # peek: ascii or digits?
      try:
          int( line.split()[0] )
          fastq_integer = True
      except ValueError:
          fastq_integer = False
      if fastq_integer:
          return line

      quality_score_length = len( line )
      if quality_score_length == read_length + 1:
          # the first char is the coding base, the rest are the scores
          quality_score_startswith = ord( line[0:1] )
          line = line[1:]
      elif quality_score_length == read_length:
          quality_score_startswith = default_coding_value
      else:
          stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( line_number, quality_score_length, read_length ) )
      # Every score is followed by a single space, including the last one.
      return ''.join( [ '%d ' % ( ord( char ) - quality_score_startswith ) for char in line ] )


  def __main__():
      """
      Convert a fastqsolexa file into a quality score file.

      Command-line arguments (read from sys.argv):
        1: path of the fastqsolexa input file
        2: path of the quality score file to write
      Any additional arguments are ignored.

      Blank lines and lines starting with '#' are skipped. The remaining
      lines are consumed in blocks of four: title, nucleotides, score title
      (its text after the first character may be empty) and quality scores.
      For every block a '>title' line and a line of scores are written.
      The first character of the first title line and of the first score
      title line are remembered, and every later block must use the same
      ones. Malformed input is reported through stop_err(), which terminates
      the script.
      """
      infile_name = sys.argv[1]
      outfile_score = open( sys.argv[2], 'w' )
      qual_title_startswith = ''
      seq_title_startswith = ''
      default_coding_value = 64
      fastq_block_lines = 0
      # try/finally keeps both files closed even when stop_err() exits.
      try:
          infile = open( infile_name )
          try:
              for i, line in enumerate( infile ):
                  line = line.rstrip()
                  # NOTE(review): this also skips a quality line that begins with '#'.
                  if not line or line.startswith( '#' ):
                      continue
                  # position inside the four-line block: 1, 2, 3, then 0
                  fastq_block_lines = ( fastq_block_lines + 1 ) % 4
                  line_startswith = line[0:1]
                  if fastq_block_lines == 1:
                      # first line is @title_of_seq
                      if not seq_title_startswith:
                          seq_title_startswith = line_startswith
                      if line_startswith != seq_title_startswith:
                          stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
                      read_title = line[1:]
                  elif fastq_block_lines == 2:
                      # second line is nucleotides
                      read_length = len( line )
                  elif fastq_block_lines == 3:
                      # third line is +title_of_qualityscore (title text may be empty)
                      if not qual_title_startswith:
                          qual_title_startswith = line_startswith
                      if line_startswith != qual_title_startswith:
                          stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
                      quality_title = line[1:]
                      if quality_title and read_title != quality_title:
                          stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )
                      # A non-empty score title equals the read title at this point.
                      outfile_score.write( '>%s\n' % read_title )
                  else:
                      # fourth line is quality scores
                      qual = _decode_quality_line( line, read_length, i + 1, default_coding_value )
                      outfile_score.write( '%s\n' % qual )
          finally:
              infile.close()
      finally:
          outfile_score.close()
  # Run the converter only when this file is executed as a script.
  if __name__ == "__main__":
      __main__()