/lib/galaxy/datatypes/converters/fastqsolexa_to_fasta_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 55 lines · 31 code · 7 blank · 17 comment · 9 complexity · 100c3ff13a4d750bb8d91cf40544c158 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Convert a fastqsolexa file to a FASTA sequence file (quality scores are skipped, not written).
  4. Assumes each sequence and each quality-score string is contained on a single line;
  5. the order should be:
  6. 1st line: @title_of_seq
  7. 2nd line: nucleotides
  8. 3rd line: +title_of_qualityscore (might be skipped)
  9. 4th line: quality scores
  10. (in three forms: a. digits, b. ASCII codes, the first char as the coding base, c. ASCII codes without the first char.)
  11. Usage:
  12. %python fastqsolexa_to_fasta_converter.py <your_fastqsolexa_filename> <output_seq_filename> (any further arguments, e.g. <output_score_filename>, are ignored)
  13. """
  14. import sys, os
  15. from math import *
# Fail at import time on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
  17. def stop_err( msg ):
  18. sys.stderr.write( "%s" % msg )
  19. sys.exit()
  20. def __main__():
  21. infile_name = sys.argv[1]
  22. outfile = open( sys.argv[2], 'w' )
  23. fastq_block_lines = 0
  24. seq_title_startswith = ''
  25. for i, line in enumerate( file( infile_name ) ):
  26. line = line.rstrip() # eliminate trailing space and new line characters
  27. if not line or line.startswith( '#' ):
  28. continue
  29. fastq_block_lines = ( fastq_block_lines + 1 ) % 4
  30. line_startswith = line[0:1]
  31. if fastq_block_lines == 1:
  32. # line 1 is sequence title
  33. if not seq_title_startswith:
  34. seq_title_startswith = line_startswith
  35. if seq_title_startswith != line_startswith:
  36. stop_err( 'Invalid fastqsolexa format at line %d: %s.' %( i + 1, line ) )
  37. read_title = line[ 1: ]
  38. outfile.write( '>%s\n' % line[1:] )
  39. elif fastq_block_lines == 2:
  40. # line 2 is nucleotides
  41. read_length = len( line )
  42. outfile.write( '%s\n' % line )
  43. else:
  44. pass
  45. outfile.close()
  46. if __name__ == "__main__": __main__()