vcfClass.py - This is a Python class that reads and parses …

/tools/vcf_tools/vcfClass.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 440 lines · 284 code · 75 blank · 81 comment · 104 complexity · 1d1d461a4e089da585f3c83cde64159f MD5 · raw file

#!/usr/bin/python

import os.path
import sys
import re

class vcf:
  def __init__(self):

# Header info.
    self.filename = ""
    self.hasHeader = True
    self.headerText = ""
    self.headerTitles = ""
    self.vcfFormat = ""
    #self.headerInfoText = ""
    #self.headerFormatText = ""

# Store the info and format tags as well as the lines that describe
# them in a dictionary.
    self.numberDataSets = 0
    self.includedDataSets = {}
    self.infoHeaderTags = {}
    self.infoHeaderString = {}
    self.formatHeaderTags = {}
    self.formatHeaderString = {}

# Genotype information.
    self.genotypes = False
    self.infoField = {}

# Reference sequence information.
    self.referenceSequences = {}
    self.referenceSequenceList = []
    self.referenceSequence = ""

# Record information.
    self.position = -1
    self.samplesList = []

# Determine which fields to process.
    self.processInfo = False
    self.processGenotypes = False
    self.dbsnpVcf = False
    self.hapmapVcf = False

# Open a vcf file.
  def openVcf(self, filename):
    if filename == "stdin":
      self.filehandle = sys.stdin
      self.filename = "stdin"
    else:
      try: self.filehandle = open(filename,"r")
      except IOError:
        print >> sys.stderr, "Failed to find file: ",filename
        exit(1)
      self.filename = os.path.abspath(filename)

# Parse the vcf header.
  def parseHeader(self, filename, writeOut):
    while self.getHeaderLine(filename, writeOut):
      continue

# Determine the type of information in the header line.
  def getHeaderLine(self, filename, writeOut):
    self.headerLine = self.filehandle.readline().rstrip("\n")
    if self.headerLine.startswith("##fileformat"): success = self.getvcfFormat()
    if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info")
    elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format")
    elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut)
    elif self.headerLine.startswith("##"): success = self.headerAdditional()
    elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut)
    else: success = self.noHeader(filename, writeOut)

    return success

# Read VCF format
  def getvcfFormat(self):
      try:
          self.vcfFormat = self.headerLine.split("=",1)[1]
          self.vcfFormat = float( self.vcfFormat.split("VCFv",1)[1] )## Extract the version number rather than the whole string
      except IndexError:
          print >> sys.stderr, "\nError parsing the fileformat"
          print >> sys.stderr, "The following fileformat header is wrongly formatted: ", self.headerLine
          exit(1)
      return True


# Read information on an info field from the header line.
  def headerInfo(self, writeOut, lineType):
    tag = self.headerLine.split("=",1)
    tagID = (tag[1].split("ID=",1))[1].split(",",1)

# Check if this info field has already been defined.
    if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])):
      print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header."
      exit(1)

# Determine the number of entries, entry type and description.
    tagNumber = (tagID[1].split("Number=",1))[1].split(",",1)
    tagType = (tagNumber[1].split("Type=",1))[1].split(",",1)
    try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0]
    except IndexError: tagDescription = ""
    tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0]

# Check that the number of fields associated with the tag is either
# an integer or a '.' to indicate variable number of entries.
    if tagNumber == ".": tagNumber = "variable"
    else:
      if self.vcfFormat<4.1:

        try:
            tagNumber = int(tagNumber)

        except ValueError:
            print >> sys.stderr, "\nError parsing header.  Problem with info tag:", tagID
            print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'"
            exit(1)

    if lineType == "info":
      self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription
      self.infoHeaderString[tagID] = self.headerLine
    if lineType == "format":
      self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription
      self.formatHeaderString[tagID] = self.headerLine

    return True

# Check to see if the records contain information from multiple different
# sources.  If vcfPytools has been used to find the intersection or union
# of two vcf files, the records may have been merged to keep all the
# information available.  If this is the case, there will be a ##FILE line
# for each set of information in the file.  The order of these files needs
# to be maintained.
  def headerFiles(self, writeOut):
    fileID = (self.headerLine.split("ID=",1))[1].split(",",1)
    filename = fileID[1].split("\"",2)[1]
    try: fileID = int(fileID[0])
    except ValueError:
      print >> sys.stderr, "File ID in ##FILE entry must be an integer."
      print >> sys.stderr, self.headerLine
      exit(1)
    if self.includedDataSets.has_key(fileID):
      print >> sys.stderr, "\nERROR: file " + self.filename
      print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values."
      exit(1)
    self.includedDataSets[fileID] = filename

# Set the number of files with information in this vcf file.
    if fileID > self.numberDataSets: self.numberDataSets = fileID

    return True

# Read additional information contained in the header.
  def headerAdditional(self):
    self.headerText += self.headerLine + "\n"

    return True

# Read in the column titles to check that all standard fields
# are present and read in all the samples.
  def headerTitleString(self, filename, writeOut):
    self.headerTitles = self.headerLine + "\n"

# Strip the end of line character from the last infoFields entry.
    infoFields = self.headerLine.split("\t")
    if len(infoFields) > 8:
#      if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename
#      elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename
      self.samplesList = infoFields[9:]
      self.genotypes = True
    elif len(infoFields) == 8:
      if writeOut: print >> sys.stdout, "No samples present in the header.  No genotype information available."
    else:
      print self.headerLine, len(infoFields)
      print >> sys.stderr, "Not all vcf standard fields are available."
      exit(1)

    return False

# If there is no header in the vcf file, close and reopen the
# file so that the first line is avaiable for parsing as a
# vcf record.
  def noHeader(self, filename, writeOut):
    if writeOut: print >> sys.stdout, "No header lines present in", filename
    self.hasHeader = False
    self.closeVcf(filename)
    self.openVcf(filename)

    return False

# Check that info fields exist.
  def checkInfoFields(self, tag):
    if self.infoHeaderTags.has_key(tag) == False:
      print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header."
      exit(1)

# Get the next line of information from the vcf file.
  def getRecord(self):
    self.record = self.filehandle.readline()
    if not self.record: return False

# Set up and execute a regular expression match.
    recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$")
    #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$")
    recordMatch = recordRe.match(self.record)
    if recordMatch == None:
      print >> sys.stderr, "Unable to resolve vcf record.\n"
      print >> sys.stderr, self.record
      exit(1)

    self.referenceSequence = recordMatch.group(1)
    try: self.position = int(recordMatch.group(2))
    except ValueError:
      text = "variant position is not an integer"
      self.generalError(text, "", None)
    self.rsid       = recordMatch.group(3)
    self.ref        = recordMatch.group(4)
    self.alt        = recordMatch.group(5)
    self.quality    = recordMatch.group(6)
    self.filters    = recordMatch.group(7)
    self.info       = recordMatch.group(8)
    self.genotypeString = recordMatch.group(9)
    self.infoTags   = {}

# Check that the quality is an integer or a float.  If not, set the quality
# to zero.
    try: self.quality = float(self.quality)
    except ValueError: self.quality = float(0.)

# If recordMatch.group(9) is not the end of line character, there is
# genotype information with this record.
    if self.genotypeString != "\n": self.hasGenotypes = True
    else: self.hasGenotypes = False

# Add the reference sequence to the dictionary.  If it didn't previously
# exist append the reference sequence to the end of the list as well.
# This ensures that the order in which the reference sequences appeared
# in the header can be preserved.
    if self.referenceSequence not in self.referenceSequences:
      self.referenceSequences[self.referenceSequence] = True
      self.referenceSequenceList.append(self.referenceSequence)

# Check for multiple alternate alleles.
    self.alternateAlleles = self.alt.split(",")
    self.numberAlternateAlleles = len(self.alternateAlleles)

# If required, process the info and genotypes.
    if self.processInfo: self.processInfoFields()
    if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields()

    return True

# Process the info string.
  def processInfoFields(self):

# First break the info string into its constituent elements.
    infoEntries = self.info.split(";")

# As long as some info fields exist, place them into a dictionary.
    for entry in infoEntries:
      infoEntry = entry.split("=")

# If the entry is a flag, there will be no equals and the length of
# infoEntry will be 1.  In this case, set the dictionary entry to the
# whole entry.  If the vcf file has undergone a union or intersection
# operation and contains the information from multiple files, this may
# be a '/' seperate list of flags and so cannot be set to a Boolean value
# yet.
      if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0]
      elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1]

# Process the genotype formats and values.
  def processGenotypeFields(self):
    genotypeEntries = self.genotypeString.split("\t")
    self.genotypeFormatString = genotypeEntries[1]
    self.genotypes = list(genotypeEntries[2:])
    self.genotypeFormats = {}
    self.genotypeFields = {}
    self.genotypeFormats = self.genotypeFormatString.split(":")

# Check that the number of genotype fields is equal to the number of samples
    if len(self.samplesList) != len(self.genotypes):
      text = "The number of genotypes is different to the number of samples"
      self.generalError(text, "", "")

# Add the genotype information to a dictionary.
    for i in range( len(self.samplesList) ):
      genotypeInfo = self.genotypes[i].split(":")
      self.genotypeFields[ self.samplesList[i] ] = {}

# Check that there are as many fields as in the format field.  If not, this must
# be because the information is not known.  In this case, it is permitted that
# the genotype information is either . or ./.
      if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo):
        self.genotypeFields[ self.samplesList[i] ] = "."
      else:
        if len(self.genotypeFormats) != len(genotypeInfo):
          text = "The number of genotype fields is different to the number specified in the format string"
          self.generalError(text, "sample", self.samplesList[i])

        for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j]

# Parse through the vcf file until the correct reference sequence is
# encountered and the position is greater than or equal to that requested.
  def parseVcf(self, referenceSequence, position, writeOut, outputFile):
    success = True
    if self.referenceSequence != referenceSequence:
      while self.referenceSequence != referenceSequence and success:
        if writeOut: outputFile.write(self.record)
        success = self.getRecord()

    while self.referenceSequence == referenceSequence and self.position < position and success:
      if writeOut: outputFile.write(self.record)
      success = self.getRecord()

    return success

# Get the information for a specific info tag.  Also check that it contains
# the correct number and type of entries.
  def getInfo(self, tag):
    result = []

# Check if the tag exists in the header information.  If so,
# determine the number and type of entries asscoiated with this
# tag.
    if self.infoHeaderTags.has_key(tag):
      infoNumber = self.infoHeaderTags[tag][0]
      infoType = self.infoHeaderTags[tag][1]
      numberValues = infoNumber

# First check that the tag exists in the information string.  Then split
# the entry on commas.  For flag entries, do not perform the split.
      if self.infoTags.has_key(tag):
        if numberValues == 0 and infoType == "Flag": result = True
        elif numberValues != 0 and infoType == "Flag":
          print >> sys.stderr, "ERROR"
          exit(1)
        else:
          fields = self.infoTags[tag].split(",")
          if len(fields) != numberValues:
            text = "Unexpected number of entries"
            self.generalError(text, "information tag", tag)

          for i in range(infoNumber):
            try: result.append(fields[i])
            except IndexError:
              text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0]
              self.generalError(text, "tag:", tag)
      else: numberValues = 0

    else:
      text = "information field does not have a definition in the header"
      self.generalError(text, "tag", tag)

    return numberValues, infoType, result

# Get the genotype information.
  def getGenotypeInfo(self, sample, tag):
    result = []
    if self.formatHeaderTags.has_key(tag):
      infoNumber = self.formatHeaderTags[tag][0]
      infoType = self.formatHeaderTags[tag][1]
      numberValues = infoNumber

      if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1:
        numberValues = 0
        result = "."
      else:
        if self.genotypeFields[sample].has_key(tag):
          if tag == "GT":
            if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1:
              text = "Unexected number of characters in genotype (GT) field"
              self.generalError(text, "sample", sample)

# If a diploid call, check whether or not the genotype is phased.
            elif len(self.genotypeFields[sample][tag]) == 3:
              self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False
              result.append( self.genotypeFields[sample][tag][0] )
              result.append( self.genotypeFields[sample][tag][2] )
            elif len(self.genotypeFields[sample][tag]) == 3:
              result.append( self.genotypeFields[sample][tag][0] )
          else:
            fields = self.genotypeFields[sample][tag].split(",")
            if len(fields) != numberValues:
              text = "Unexpected number of characters in " + tag + " field"
              self.generalError(text, "sample", sample)

            for i in range(infoNumber): result.append(fields[i])
    else:
      text = "genotype field does not have a definition in the header"
      self.generalError(text, "tag", tag)

    return numberValues, result

# Parse the dbsnp entry.  If the entry conforms to the required variant type,
# return the dbsnp rsid value, otherwise ".".
  def getDbsnpInfo(self):

# First check that the variant class (VC) is listed as SNP.
    vc = self.info.split("VC=",1)
    if vc[1].find(";") != -1: snp = vc[1].split(";",1)
    else:
      snp = []
      snp.append(vc[1])

    if snp[0].lower() == "snp": rsid = self.rsid
    else: rsid = "."

    return rsid

# Build a new vcf record.
  def buildRecord(self, removeGenotypes):
    record = self.referenceSequence + "\t" + \
                str(self.position) + "\t" + \
                self.rsid + "\t" + \
                self.ref + "\t" + \
                self.alt + "\t" + \
                str(self.quality) + "\t" + \
                self.filters + "\t" + \
                self.info

    if self.hasGenotypes and not removeGenotypes: record += self.genotypeString

    record += "\n"

    return record

# Close the vcf file.
  def closeVcf(self, filename):
    self.filehandle.close()

# Define error messages for different handled errors.
  def generalError(self, text, field, fieldValue):
    print >> sys.stderr, "\nError encountered when attempting to read:"
    print >> sys.stderr, "\treference sequence :\t", self.referenceSequence
    print >> sys.stderr, "\tposition :\t\t", self.position
    if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue
    print >> sys.stderr,  "\n", text
    exit(1)
Summary ✨

This is a Python class that reads and parses VCF (Variant Call Format) files. It has methods for reading the header, parsing the data lines, and building new records with specific fields removed.
Tech Fingerprint

Alerts (30)

'def' Ensure functions have docstrings for documentation
48 60 65 78 90 135 155 162 184 193 199 255 274 306 321 359 398 413 430 434
'open(' Use 'with open()' to ensure Files are properly closed
53
Complexity hotspot; lines 67 to 73 (total complexity: 7)
67 68 69 70 71 72 73
'== None' Use 'is' for None comparisons (e.g., x is None)
207
'list(' Avoid unnecessary list conversions; use generators where possible
277