PageRenderTime 15ms CodeModel.GetById 1ms app.highlight 11ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/vcf_tools/intersect.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 181 lines | 116 code | 30 blank | 35 comment | 23 complexity | e716eb1310462ed789d5e7c63821ee5e MD5 | raw file
  1#!/usr/bin/python
  2
  3import os.path
  4import sys
  5import optparse
  6
  7import bedClass
  8from bedClass import *
  9
 10import vcfClass
 11from vcfClass import *
 12
 13import tools
 14from tools import *
 15
 16if __name__ == "__main__":
 17  main()
 18
 19# Intersect two vcf files.  It is assumed that the two files are
 20# sorted by genomic coordinates and the reference sequences are
 21# in the same order.
 22def intersectVcf(v1, v2, priority, outputFile):
 23  success1 = v1.getRecord()
 24  success2 = v2.getRecord()
 25  currentReferenceSequence = v1.referenceSequence
 26
 27# As soon as the end of either file is reached, there can be no
 28# more intersecting SNPs, so terminate.
 29  while success1 and success2:
 30    if v1.referenceSequence == v2.referenceSequence and v1.referenceSequence == currentReferenceSequence:
 31      if v1.position == v2.position:
 32        writeVcfRecord(priority, v1, v2, outputFile)
 33        success1 = v1.getRecord()
 34        success2 = v2.getRecord()
 35      elif v2.position > v1.position: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
 36      elif v1.position > v2.position: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)
 37    else:
 38      if v1.referenceSequence == currentReferenceSequence: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
 39      elif v2.referenceSequence == currentReferenceSequence: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)
 40
 41# If the last record for a reference sequence is the same for both vcf
 42# files, they will both have referenceSequences different from the
 43# current reference sequence.  Change the reference sequence to reflect
 44# this and proceed.
 45      else:
 46        if v1.referenceSequence != v2.referenceSequence:
 47          print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
 48          print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
 49          print >> sys.stderr, "\t", v1.referenceSequence, " and ", v2.referenceSequence
 50          exit(1)
 51      currentReferenceSequence = v1.referenceSequence
 52
 53# Intersect a vcf file and a bed file.  It is assumed that the 
 54# two files are sorted by genomic coordinates and the reference
 55# sequences are in the same order.
 56def intersectVcfBed(v, b, outputFile):
 57  successb = b.getRecord()
 58  successv = v.getRecord()
 59  currentReferenceSequence = v.referenceSequence
 60
 61# As soon as the end of the first file is reached, there are no
 62# more intersections and the program can terminate.
 63  while successv:
 64    if v.referenceSequence == b.referenceSequence:
 65      if v.position < b.start: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
 66      elif v.position > b.end: successb = b.parseBed(v.referenceSequence, v.position)
 67      else:
 68        outputFile.write(v.record)
 69        successv = v.getRecord()
 70    else:
 71      if v.referenceSequence == currentReferenceSequence: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
 72      if b.referenceSequence == currentReferenceSequence: successb = b.parseBed(v.referenceSequence, v.position)
 73      currentReferenceSequence = v.referenceSequence
 74
 75def main():
 76
 77# Parse the command line options
 78  usage = "Usage: vcfPytools.py intersect [options]"
 79  parser = optparse.OptionParser(usage = usage)
 80  parser.add_option("-i", "--in",
 81                    action="append", type="string",
 82                    dest="vcfFiles", help="input vcf files")
 83  parser.add_option("-b", "--bed",
 84                    action="store", type="string",
 85                    dest="bedFile", help="input bed vcf file")
 86  parser.add_option("-o", "--out",
 87                    action="store", type="string",
 88                    dest="output", help="output vcf file")
 89  parser.add_option("-f", "--priority-file",
 90                    action="store", type="string",
 91                    dest="priorityFile", help="output records from this vcf file")
 92
 93  (options, args) = parser.parse_args()
 94
 95# Check that a single  vcf file is given.
 96  if options.vcfFiles == None:
 97    parser.print_help()
 98    print >> sys.stderr, "\nAt least one vcf file (--in, -i) is required for performing intersection."
 99    exit(1)
100  elif len(options.vcfFiles) > 2:
101    parser.print_help()
102    print >> sys.stderr, "\nAt most, two vcf files (--in, -i) can be submitted for performing intersection."
103    exit(1)
104  elif len(options.vcfFiles) == 1 and not options.bedFile:
105    parser.print_help()
106    print >> sys.stderr, "\nIf only one vcf file (--in, -i) is specified, a bed file is also required for performing intersection."
107    exit(1)
108
109# Set the output file to stdout if no output file was specified.
110  outputFile, writeOut = setOutput(options.output) # tools.py
111
112# If intersecting with a bed file, call the bed intersection routine.
113  if options.bedFile:
114    v = vcf() # Define vcf object.
115    b = bed() # Define bed object.
116
117# Open the files.
118    v.openVcf(options.vcfFiles[0])
119    b.openBed(options.bedFile)
120
121# Read in the header information.
122    v.parseHeader(options.vcfFiles[0], writeOut)
123    taskDescriptor = "##vcfPytools=intersect " + options.vcfFiles[0] + ", " + options.bedFile
124    writeHeader(outputFile, v, False, taskDescriptor) # tools.py
125
126# Intersect the vcf file with the bed file.
127    intersectVcfBed(v, b, outputFile)
128
129# Check that the input files had the same list of reference sequences.
130# If not, it is possible that there were some problems.
131    checkReferenceSequenceLists(v.referenceSequenceList, b.referenceSequenceList) # tools.py
132
133# Close the files.
134    v.closeVcf(options.vcfFiles[0])
135    b.closeBed(options.bedFile)
136
137  else:
138    priority = setVcfPriority(options.priorityFile, options.vcfFiles)
139    v1 = vcf() # Define vcf object.
140    v2 = vcf() # Define vcf object.
141
142# Open the vcf files.
143    v1.openVcf(options.vcfFiles[0])
144    v2.openVcf(options.vcfFiles[1])
145
146# Read in the header information.
147    v1.parseHeader(options.vcfFiles[0], writeOut)
148    v2.parseHeader(options.vcfFiles[1], writeOut)
149    if priority == 3:
150      v3 = vcf() # Generate a new vcf object that will contain the header information of the new file.
151      mergeHeaders(v1, v2, v3) # tools.py
152      v1.processInfo = True
153      v2.processInfo = True
154    else: checkDataSets(v1, v2)
155    
156    #print v1.samplesList
157    #print v2.samplesList
158
159# Check that the header for the two files contain the same samples.
160    if v1.samplesList != v2.samplesList:
161      print >> sys.stderr, "vcf files contain different samples (or sample order)."
162      exit(1)
163    else:
164      taskDescriptor = "##vcfPytools=intersect " + v1.filename + ", " + v2.filename
165      if priority == 3: writeHeader(outputFile, v3, False, taskDescriptor)
166      elif (priority == 2 and v2.hasHeader) or not v1.hasHeader: writeHeader(outputFile, v2, False, taskDescriptor) # tools.py
167      else: writeHeader(outputFile, v1, False, taskDescriptor) # tools.py
168
169# Intersect the two vcf files.
170    intersectVcf(v1, v2, priority, outputFile)
171
172# Check that the input files had the same list of reference sequences.
173# If not, it is possible that there were some problems.
174    checkReferenceSequenceLists(v1.referenceSequenceList, v2.referenceSequenceList) # tools.py
175
176# Close the vcf files.
177    v1.closeVcf(options.vcfFiles[0])
178    v2.closeVcf(options.vcfFiles[1])
179
180# End the program.
181  return 0