/tools/fastq/fastq_paired_end_deinterlacer.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 66 lines · 52 code · 11 blank · 3 comment · 13 complexity · e70b9161a93788830811a83b80ffdea4 MD5 · raw file

  1. #Florent Angly
  2. import sys
  3. from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner
  4. def main():
  5. input_filename = sys.argv[1]
  6. input_type = sys.argv[2] or 'sanger'
  7. mate1_filename = sys.argv[3]
  8. mate2_filename = sys.argv[4]
  9. single1_filename = sys.argv[5]
  10. single2_filename = sys.argv[6]
  11. type = input_type
  12. input = fastqNamedReader( open( input_filename, 'rb' ), format = type )
  13. mate1_out = fastqWriter( open( mate1_filename, 'wb' ), format = type )
  14. mate2_out = fastqWriter( open( mate2_filename, 'wb' ), format = type )
  15. single1_out = fastqWriter( open( single1_filename, 'wb' ), format = type )
  16. single2_out = fastqWriter( open( single2_filename, 'wb' ), format = type )
  17. joiner = fastqJoiner( type )
  18. i = None
  19. skip_count = 0
  20. found = {}
  21. for i, read in enumerate( fastqReader( open( input_filename, 'rb' ), format = type ) ):
  22. if read.identifier in found:
  23. del found[read.identifier]
  24. continue
  25. mate1 = input.get( read.identifier )
  26. mate2 = input.get( joiner.get_paired_identifier( mate1 ) )
  27. if mate2:
  28. # This is a mate pair
  29. found[mate2.identifier] = None
  30. if joiner.is_first_mate( mate1 ):
  31. mate1_out.write( mate1 )
  32. mate2_out.write( mate2 )
  33. else:
  34. mate1_out.write( mate2 )
  35. mate2_out.write( mate1 )
  36. else:
  37. # This is a single
  38. skip_count += 1
  39. if joiner.is_first_mate( mate1 ):
  40. single1_out.write( mate1 )
  41. else:
  42. single2_out.write( mate1 )
  43. if i is None:
  44. print "Your input file contained no valid FASTQ sequences."
  45. else:
  46. if skip_count:
  47. print 'There were %i reads with no mate.' % skip_count
  48. print 'De-interlaced %s pairs of sequences.' % ( (i - skip_count + 1)/2 )
  49. input.close()
  50. mate1_out.close()
  51. mate2_out.close()
  52. single1_out.close()
  53. single2_out.close()
  54. if __name__ == "__main__":
  55. main()