intersect.py - This is a Python script that performs an int…

/tools/vcf_tools/intersect.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 181 lines · 116 code · 30 blank · 35 comment · 36 complexity · e716eb1310462ed789d5e7c63821ee5e MD5 · raw file


#!/usr/bin/python

import os.path
import sys
import optparse

import bedClass
from bedClass import *

import vcfClass
from vcfClass import *

import tools
from tools import *

if __name__ == "__main__":
  main()

# Intersect two vcf files.  It is assumed that the two files are
# sorted by genomic coordinates and the reference sequences are
# in the same order.
def intersectVcf(v1, v2, priority, outputFile):
  success1 = v1.getRecord()
  success2 = v2.getRecord()
  currentReferenceSequence = v1.referenceSequence

# As soon as the end of either file is reached, there can be no
# more intersecting SNPs, so terminate.
  while success1 and success2:
    if v1.referenceSequence == v2.referenceSequence and v1.referenceSequence == currentReferenceSequence:
      if v1.position == v2.position:
        writeVcfRecord(priority, v1, v2, outputFile)
        success1 = v1.getRecord()
        success2 = v2.getRecord()
      elif v2.position > v1.position: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
      elif v1.position > v2.position: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)
    else:
      if v1.referenceSequence == currentReferenceSequence: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
      elif v2.referenceSequence == currentReferenceSequence: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)

# If the last record for a reference sequence is the same for both vcf
# files, they will both have referenceSequences different from the
# current reference sequence.  Change the reference sequence to reflect
# this and proceed.
      else:
        if v1.referenceSequence != v2.referenceSequence:
          print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
          print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
          print >> sys.stderr, "\t", v1.referenceSequence, " and ", v2.referenceSequence
          exit(1)
      currentReferenceSequence = v1.referenceSequence

# Intersect a vcf file and a bed file.  It is assumed that the 
# two files are sorted by genomic coordinates and the reference
# sequences are in the same order.
def intersectVcfBed(v, b, outputFile):
  successb = b.getRecord()
  successv = v.getRecord()
  currentReferenceSequence = v.referenceSequence

# As soon as the end of the first file is reached, there are no
# more intersections and the program can terminate.
  while successv:
    if v.referenceSequence == b.referenceSequence:
      if v.position < b.start: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
      elif v.position > b.end: successb = b.parseBed(v.referenceSequence, v.position)
      else:
        outputFile.write(v.record)
        successv = v.getRecord()
    else:
      if v.referenceSequence == currentReferenceSequence: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
      if b.referenceSequence == currentReferenceSequence: successb = b.parseBed(v.referenceSequence, v.position)
      currentReferenceSequence = v.referenceSequence

def main():

# Parse the command line options
  usage = "Usage: vcfPytools.py intersect [options]"
  parser = optparse.OptionParser(usage = usage)
  parser.add_option("-i", "--in",
                    action="append", type="string",
                    dest="vcfFiles", help="input vcf files")
  parser.add_option("-b", "--bed",
                    action="store", type="string",
                    dest="bedFile", help="input bed vcf file")
  parser.add_option("-o", "--out",
                    action="store", type="string",
                    dest="output", help="output vcf file")
  parser.add_option("-f", "--priority-file",
                    action="store", type="string",
                    dest="priorityFile", help="output records from this vcf file")

  (options, args) = parser.parse_args()

# Check that a single  vcf file is given.
  if options.vcfFiles == None:
    parser.print_help()
    print >> sys.stderr, "\nAt least one vcf file (--in, -i) is required for performing intersection."
    exit(1)
  elif len(options.vcfFiles) > 2:
    parser.print_help()
    print >> sys.stderr, "\nAt most, two vcf files (--in, -i) can be submitted for performing intersection."
    exit(1)
  elif len(options.vcfFiles) == 1 and not options.bedFile:
    parser.print_help()
    print >> sys.stderr, "\nIf only one vcf file (--in, -i) is specified, a bed file is also required for performing intersection."
    exit(1)

# Set the output file to stdout if no output file was specified.
  outputFile, writeOut = setOutput(options.output) # tools.py

# If intersecting with a bed file, call the bed intersection routine.
  if options.bedFile:
    v = vcf() # Define vcf object.
    b = bed() # Define bed object.

# Open the files.
    v.openVcf(options.vcfFiles[0])
    b.openBed(options.bedFile)

# Read in the header information.
    v.parseHeader(options.vcfFiles[0], writeOut)
    taskDescriptor = "##vcfPytools=intersect " + options.vcfFiles[0] + ", " + options.bedFile
    writeHeader(outputFile, v, False, taskDescriptor) # tools.py

# Intersect the vcf file with the bed file.
    intersectVcfBed(v, b, outputFile)

# Check that the input files had the same list of reference sequences.
# If not, it is possible that there were some problems.
    checkReferenceSequenceLists(v.referenceSequenceList, b.referenceSequenceList) # tools.py

# Close the files.
    v.closeVcf(options.vcfFiles[0])
    b.closeBed(options.bedFile)

  else:
    priority = setVcfPriority(options.priorityFile, options.vcfFiles)
    v1 = vcf() # Define vcf object.
    v2 = vcf() # Define vcf object.

# Open the vcf files.
    v1.openVcf(options.vcfFiles[0])
    v2.openVcf(options.vcfFiles[1])

# Read in the header information.
    v1.parseHeader(options.vcfFiles[0], writeOut)
    v2.parseHeader(options.vcfFiles[1], writeOut)
    if priority == 3:
      v3 = vcf() # Generate a new vcf object that will contain the header information of the new file.
      mergeHeaders(v1, v2, v3) # tools.py
      v1.processInfo = True
      v2.processInfo = True
    else: checkDataSets(v1, v2)
    
    #print v1.samplesList
    #print v2.samplesList

# Check that the header for the two files contain the same samples.
    if v1.samplesList != v2.samplesList:
      print >> sys.stderr, "vcf files contain different samples (or sample order)."
      exit(1)
    else:
      taskDescriptor = "##vcfPytools=intersect " + v1.filename + ", " + v2.filename
      if priority == 3: writeHeader(outputFile, v3, False, taskDescriptor)
      elif (priority == 2 and v2.hasHeader) or not v1.hasHeader: writeHeader(outputFile, v2, False, taskDescriptor) # tools.py
      else: writeHeader(outputFile, v1, False, taskDescriptor) # tools.py

# Intersect the two vcf files.
    intersectVcf(v1, v2, priority, outputFile)

# Check that the input files had the same list of reference sequences.
# If not, it is possible that there were some problems.
    checkReferenceSequenceLists(v1.referenceSequenceList, v2.referenceSequenceList) # tools.py

# Close the vcf files.
    v1.closeVcf(options.vcfFiles[0])
    v2.closeVcf(options.vcfFiles[1])

# End the program.
  return 0

Summary ✨

This is a Python script that performs an intersection operation on two VCF files. It reads in the header information from both files and checks if they contain the same samples. If not, it exits with an error message. Otherwise, it proceeds to perform the intersection operation and writes the output to a new file. The script also checks if the input files have the same list of reference sequences and exits with an error message if not.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (23)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
8 11 14
'def' Ensure functions have docstrings for documentation
22 56 75
Complexity hotspot; lines 29 to 31 (total complexity: 5)
29 30 31
Complexity hotspot; lines 35 to 39 (total complexity: 5)
35 36 37 38 39
Complexity hotspot; lines 63 to 67 (total complexity: 5)
63 64 65 66 67
'== None' Use 'is' for None comparisons (e.g., x is None)
96
Complexity hotspot; lines 165 to 167 (total complexity: 5)
165 166 167