/scripts/pair_up_reads.py

https://github.com/vanbug/denovo-assembly-tutorial · Python · 84 lines · 56 code · 13 blank · 15 comment · 11 complexity · c8983fa76838c9cfe3edf12034352125 MD5 · raw file

  1. #!/usr/bin/env python
  2. # based on this SeqAnswers thread
  3. # http://seqanswers.com/forums/showthread.php?t=6140
  4. # original written by Peter Cock
  5. import sys
  6. from Bio.SeqIO.QualityIO import FastqGeneralIterator #Biopython 1.51 or later
  7. ##########################################################
  8. #
  9. # Change the following settings to suit your needs
  10. #
  11. input_forward_filename = sys.argv[1]
  12. input_reverse_filename = sys.argv[2]
  13. output_paired_forward_filename = sys.argv[3]
  14. output_paired_reverse_filename = sys.argv[4]
  15. output_orphan_filename = sys.argv[5]
  16. f_suffix = "/1"
  17. r_suffix = "/2"
  18. ##########################################################
  19. if f_suffix:
  20. f_suffix_crop = -len(f_suffix)
  21. def f_name(title):
  22. """Remove the suffix from a forward read name."""
  23. name = title.split()[0]
  24. assert name.endswith(f_suffix), name
  25. return name[:f_suffix_crop]
  26. else:
  27. def f_name(title):
  28. return title.split()[0]
  29. if r_suffix:
  30. r_suffix_crop = -len(r_suffix)
  31. def r_name(title):
  32. """Remove the suffix from a reverse read name."""
  33. name = title.split()[0]
  34. assert name.endswith(r_suffix), name
  35. return name[:r_suffix_crop]
  36. else:
  37. def r_name(title):
  38. return title.split()[0]
  39. print "Scaning reverse file to build list of names..."
  40. reverse_ids = set()
  41. paired_ids = set()
  42. for title, seq, qual in FastqGeneralIterator(open(input_reverse_filename)):
  43. reverse_ids.add(r_name(title))
  44. print "Processing forward file..."
  45. forward_handle = open(output_paired_forward_filename, "w")
  46. orphan_handle = open(output_orphan_filename, "w")
  47. for title, seq, qual in FastqGeneralIterator(open(input_forward_filename)):
  48. name = f_name(title)
  49. if name in reverse_ids:
  50. #Paired
  51. paired_ids.add(name)
  52. reverse_ids.remove(name) #frees a little memory
  53. forward_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
  54. else:
  55. #Orphan
  56. orphan_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
  57. forward_handle.close()
  58. del reverse_ids #frees memory, although we won't need more now
  59. print "Processing reverse file..."
  60. reverse_handle = open(output_paired_reverse_filename, "w")
  61. for title, seq, qual in FastqGeneralIterator(open(input_reverse_filename)):
  62. name = r_name(title)
  63. if name in paired_ids:
  64. #Paired
  65. reverse_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
  66. else:
  67. #Orphan
  68. orphan_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
  69. orphan_handle.close()
  70. reverse_handle.close()
  71. print "Done"