/tools/vcf_tools/intersect.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 181 lines · 116 code · 30 blank · 35 comment · 36 complexity · e716eb1310462ed789d5e7c63821ee5e MD5 · raw file

  1. #!/usr/bin/python
  2. import os.path
  3. import sys
  4. import optparse
  5. import bedClass
  6. from bedClass import *
  7. import vcfClass
  8. from vcfClass import *
  9. import tools
  10. from tools import *
  11. if __name__ == "__main__":
  12. main()
  13. # Intersect two vcf files. It is assumed that the two files are
  14. # sorted by genomic coordinates and the reference sequences are
  15. # in the same order.
  16. def intersectVcf(v1, v2, priority, outputFile):
  17. success1 = v1.getRecord()
  18. success2 = v2.getRecord()
  19. currentReferenceSequence = v1.referenceSequence
  20. # As soon as the end of either file is reached, there can be no
  21. # more intersecting SNPs, so terminate.
  22. while success1 and success2:
  23. if v1.referenceSequence == v2.referenceSequence and v1.referenceSequence == currentReferenceSequence:
  24. if v1.position == v2.position:
  25. writeVcfRecord(priority, v1, v2, outputFile)
  26. success1 = v1.getRecord()
  27. success2 = v2.getRecord()
  28. elif v2.position > v1.position: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
  29. elif v1.position > v2.position: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)
  30. else:
  31. if v1.referenceSequence == currentReferenceSequence: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
  32. elif v2.referenceSequence == currentReferenceSequence: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)
  33. # If the last record for a reference sequence is the same for both vcf
  34. # files, they will both have referenceSequences different from the
  35. # current reference sequence. Change the reference sequence to reflect
  36. # this and proceed.
  37. else:
  38. if v1.referenceSequence != v2.referenceSequence:
  39. print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
  40. print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
  41. print >> sys.stderr, "\t", v1.referenceSequence, " and ", v2.referenceSequence
  42. exit(1)
  43. currentReferenceSequence = v1.referenceSequence
  44. # Intersect a vcf file and a bed file. It is assumed that the
  45. # two files are sorted by genomic coordinates and the reference
  46. # sequences are in the same order.
  47. def intersectVcfBed(v, b, outputFile):
  48. successb = b.getRecord()
  49. successv = v.getRecord()
  50. currentReferenceSequence = v.referenceSequence
  51. # As soon as the end of the first file is reached, there are no
  52. # more intersections and the program can terminate.
  53. while successv:
  54. if v.referenceSequence == b.referenceSequence:
  55. if v.position < b.start: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
  56. elif v.position > b.end: successb = b.parseBed(v.referenceSequence, v.position)
  57. else:
  58. outputFile.write(v.record)
  59. successv = v.getRecord()
  60. else:
  61. if v.referenceSequence == currentReferenceSequence: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
  62. if b.referenceSequence == currentReferenceSequence: successb = b.parseBed(v.referenceSequence, v.position)
  63. currentReferenceSequence = v.referenceSequence
  64. def main():
  65. # Parse the command line options
  66. usage = "Usage: vcfPytools.py intersect [options]"
  67. parser = optparse.OptionParser(usage = usage)
  68. parser.add_option("-i", "--in",
  69. action="append", type="string",
  70. dest="vcfFiles", help="input vcf files")
  71. parser.add_option("-b", "--bed",
  72. action="store", type="string",
  73. dest="bedFile", help="input bed vcf file")
  74. parser.add_option("-o", "--out",
  75. action="store", type="string",
  76. dest="output", help="output vcf file")
  77. parser.add_option("-f", "--priority-file",
  78. action="store", type="string",
  79. dest="priorityFile", help="output records from this vcf file")
  80. (options, args) = parser.parse_args()
  81. # Check that a single vcf file is given.
  82. if options.vcfFiles == None:
  83. parser.print_help()
  84. print >> sys.stderr, "\nAt least one vcf file (--in, -i) is required for performing intersection."
  85. exit(1)
  86. elif len(options.vcfFiles) > 2:
  87. parser.print_help()
  88. print >> sys.stderr, "\nAt most, two vcf files (--in, -i) can be submitted for performing intersection."
  89. exit(1)
  90. elif len(options.vcfFiles) == 1 and not options.bedFile:
  91. parser.print_help()
  92. print >> sys.stderr, "\nIf only one vcf file (--in, -i) is specified, a bed file is also required for performing intersection."
  93. exit(1)
  94. # Set the output file to stdout if no output file was specified.
  95. outputFile, writeOut = setOutput(options.output) # tools.py
  96. # If intersecting with a bed file, call the bed intersection routine.
  97. if options.bedFile:
  98. v = vcf() # Define vcf object.
  99. b = bed() # Define bed object.
  100. # Open the files.
  101. v.openVcf(options.vcfFiles[0])
  102. b.openBed(options.bedFile)
  103. # Read in the header information.
  104. v.parseHeader(options.vcfFiles[0], writeOut)
  105. taskDescriptor = "##vcfPytools=intersect " + options.vcfFiles[0] + ", " + options.bedFile
  106. writeHeader(outputFile, v, False, taskDescriptor) # tools.py
  107. # Intersect the vcf file with the bed file.
  108. intersectVcfBed(v, b, outputFile)
  109. # Check that the input files had the same list of reference sequences.
  110. # If not, it is possible that there were some problems.
  111. checkReferenceSequenceLists(v.referenceSequenceList, b.referenceSequenceList) # tools.py
  112. # Close the files.
  113. v.closeVcf(options.vcfFiles[0])
  114. b.closeBed(options.bedFile)
  115. else:
  116. priority = setVcfPriority(options.priorityFile, options.vcfFiles)
  117. v1 = vcf() # Define vcf object.
  118. v2 = vcf() # Define vcf object.
  119. # Open the vcf files.
  120. v1.openVcf(options.vcfFiles[0])
  121. v2.openVcf(options.vcfFiles[1])
  122. # Read in the header information.
  123. v1.parseHeader(options.vcfFiles[0], writeOut)
  124. v2.parseHeader(options.vcfFiles[1], writeOut)
  125. if priority == 3:
  126. v3 = vcf() # Generate a new vcf object that will contain the header information of the new file.
  127. mergeHeaders(v1, v2, v3) # tools.py
  128. v1.processInfo = True
  129. v2.processInfo = True
  130. else: checkDataSets(v1, v2)
  131. #print v1.samplesList
  132. #print v2.samplesList
  133. # Check that the header for the two files contain the same samples.
  134. if v1.samplesList != v2.samplesList:
  135. print >> sys.stderr, "vcf files contain different samples (or sample order)."
  136. exit(1)
  137. else:
  138. taskDescriptor = "##vcfPytools=intersect " + v1.filename + ", " + v2.filename
  139. if priority == 3: writeHeader(outputFile, v3, False, taskDescriptor)
  140. elif (priority == 2 and v2.hasHeader) or not v1.hasHeader: writeHeader(outputFile, v2, False, taskDescriptor) # tools.py
  141. else: writeHeader(outputFile, v1, False, taskDescriptor) # tools.py
  142. # Intersect the two vcf files.
  143. intersectVcf(v1, v2, priority, outputFile)
  144. # Check that the input files had the same list of reference sequences.
  145. # If not, it is possible that there were some problems.
  146. checkReferenceSequenceLists(v1.referenceSequenceList, v2.referenceSequenceList) # tools.py
  147. # Close the vcf files.
  148. v1.closeVcf(options.vcfFiles[0])
  149. v2.closeVcf(options.vcfFiles[1])
  150. # End the program.
  151. return 0