/tools/fastq/fastq_paired_end_interlacer.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 59 lines · 47 code · 9 blank · 3 comment · 12 complexity · 33dd533483d53744a7b82af3d3bbd7db MD5 · raw file

  1. #Florent Angly
  2. import sys
  3. from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner
  4. def main():
  5. mate1_filename = sys.argv[1]
  6. mate1_type = sys.argv[2] or 'sanger'
  7. mate2_filename = sys.argv[3]
  8. mate2_type = sys.argv[4] or 'sanger'
  9. outfile_pairs = sys.argv[5]
  10. outfile_singles = sys.argv[6]
  11. if mate1_type != mate2_type:
  12. print "WARNING: You are trying to interlace files of two different types: %s and %s." % ( mate1_type, mate2_type )
  13. return
  14. type = mate1_type
  15. joiner = fastqJoiner( type )
  16. out_pairs = fastqWriter( open( outfile_pairs, 'wb' ), format = type )
  17. out_singles = fastqWriter( open( outfile_singles, 'wb' ), format = type )
  18. # Pairs + singles present in mate1
  19. nof_singles = 0
  20. nof_pairs = 0
  21. mate2_input = fastqNamedReader( open( mate2_filename, 'rb' ), format = type )
  22. i = None
  23. for i, mate1 in enumerate( fastqReader( open( mate1_filename, 'rb' ), format = type ) ):
  24. mate2 = mate2_input.get( joiner.get_paired_identifier( mate1 ) )
  25. if mate2:
  26. out_pairs.write( mate1 )
  27. out_pairs.write( mate2 )
  28. nof_pairs += 1
  29. else:
  30. out_singles.write( mate1 )
  31. nof_singles += 1
  32. # Singles present in mate2
  33. mate1_input = fastqNamedReader( open( mate1_filename, 'rb' ), format = type )
  34. j = None
  35. for j, mate2 in enumerate( fastqReader( open( mate2_filename, 'rb' ), format = type ) ):
  36. mate1 = mate1_input.get( joiner.get_paired_identifier( mate2 ) )
  37. if not mate1:
  38. out_singles.write( mate2 )
  39. nof_singles += 1
  40. if (i is None) and (j is None):
  41. print "Your input files contained no valid FASTQ sequences."
  42. else:
  43. print 'There were %s single reads.' % ( nof_singles )
  44. print 'Interlaced %s pairs of sequences.' % ( nof_pairs )
  45. mate1_input.close()
  46. mate2_input.close()
  47. out_pairs.close()
  48. out_singles.close()
  49. if __name__ == "__main__":
  50. main()