PageRenderTime 53ms CodeModel.GetById 12ms app.highlight 34ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/vcf_tools/vcfClass.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 440 lines | 398 code | 22 blank | 20 comment | 24 complexity | 1d1d461a4e089da585f3c83cde64159f MD5 | raw file
  1#!/usr/bin/python
  2
  3import os.path
  4import sys
  5import re
  6
  7class vcf:
  8  def __init__(self):
  9
 10# Header info.
 11    self.filename = ""
 12    self.hasHeader = True
 13    self.headerText = ""
 14    self.headerTitles = ""
 15    self.vcfFormat = ""
 16    #self.headerInfoText = ""
 17    #self.headerFormatText = ""
 18
 19# Store the info and format tags as well as the lines that describe
 20# them in a dictionary.
 21    self.numberDataSets = 0
 22    self.includedDataSets = {}
 23    self.infoHeaderTags = {}
 24    self.infoHeaderString = {}
 25    self.formatHeaderTags = {}
 26    self.formatHeaderString = {}
 27
 28# Genotype information.
 29    self.genotypes = False
 30    self.infoField = {}
 31
 32# Reference sequence information.
 33    self.referenceSequences = {}
 34    self.referenceSequenceList = []
 35    self.referenceSequence = ""
 36
 37# Record information.
 38    self.position = -1
 39    self.samplesList = []
 40
 41# Determine which fields to process.
 42    self.processInfo = False
 43    self.processGenotypes = False
 44    self.dbsnpVcf = False
 45    self.hapmapVcf = False
 46
 47# Open a vcf file.
 48  def openVcf(self, filename):
 49    if filename == "stdin":
 50      self.filehandle = sys.stdin
 51      self.filename = "stdin"
 52    else:
 53      try: self.filehandle = open(filename,"r")
 54      except IOError:
 55        print >> sys.stderr, "Failed to find file: ",filename
 56        exit(1)
 57      self.filename = os.path.abspath(filename)
 58
 59# Parse the vcf header.
 60  def parseHeader(self, filename, writeOut):
 61    while self.getHeaderLine(filename, writeOut):
 62      continue
 63
 64# Determine the type of information in the header line.
 65  def getHeaderLine(self, filename, writeOut):
 66    self.headerLine = self.filehandle.readline().rstrip("\n")
 67    if self.headerLine.startswith("##fileformat"): success = self.getvcfFormat()
 68    if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info")
 69    elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format")
 70    elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut)
 71    elif self.headerLine.startswith("##"): success = self.headerAdditional()
 72    elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut)
 73    else: success = self.noHeader(filename, writeOut)
 74
 75    return success
 76
 77# Read VCF format
 78  def getvcfFormat(self):
 79      try:
 80          self.vcfFormat = self.headerLine.split("=",1)[1]
 81          self.vcfFormat = float( self.vcfFormat.split("VCFv",1)[1] )## Extract the version number rather than the whole string
 82      except IndexError:
 83          print >> sys.stderr, "\nError parsing the fileformat"
 84          print >> sys.stderr, "The following fileformat header is wrongly formatted: ", self.headerLine
 85          exit(1)
 86      return True
 87
 88
 89# Read information on an info field from the header line.
 90  def headerInfo(self, writeOut, lineType):
 91    tag = self.headerLine.split("=",1)
 92    tagID = (tag[1].split("ID=",1))[1].split(",",1)
 93
 94# Check if this info field has already been defined.
 95    if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])):
 96      print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header."
 97      exit(1)
 98
 99# Determine the number of entries, entry type and description.
100    tagNumber = (tagID[1].split("Number=",1))[1].split(",",1)
101    tagType = (tagNumber[1].split("Type=",1))[1].split(",",1)
102    try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0]
103    except IndexError: tagDescription = ""
104    tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0]
105
106# Check that the number of fields associated with the tag is either
107# an integer or a '.' to indicate variable number of entries.
108    if tagNumber == ".": tagNumber = "variable"
109    else:
110      if self.vcfFormat<4.1:
111
112        try:
113            tagNumber = int(tagNumber)
114
115        except ValueError:
116            print >> sys.stderr, "\nError parsing header.  Problem with info tag:", tagID
117            print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'"
118            exit(1)
119
120    if lineType == "info":
121      self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription
122      self.infoHeaderString[tagID] = self.headerLine
123    if lineType == "format":
124      self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription
125      self.formatHeaderString[tagID] = self.headerLine
126
127    return True
128
129# Check to see if the records contain information from multiple different
130# sources.  If vcfPytools has been used to find the intersection or union
131# of two vcf files, the records may have been merged to keep all the
132# information available.  If this is the case, there will be a ##FILE line
133# for each set of information in the file.  The order of these files needs
134# to be maintained.
135  def headerFiles(self, writeOut):
136    fileID = (self.headerLine.split("ID=",1))[1].split(",",1)
137    filename = fileID[1].split("\"",2)[1]
138    try: fileID = int(fileID[0])
139    except ValueError:
140      print >> sys.stderr, "File ID in ##FILE entry must be an integer."
141      print >> sys.stderr, self.headerLine
142      exit(1)
143    if self.includedDataSets.has_key(fileID):
144      print >> sys.stderr, "\nERROR: file " + self.filename
145      print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values."
146      exit(1)
147    self.includedDataSets[fileID] = filename
148
149# Set the number of files with information in this vcf file.
150    if fileID > self.numberDataSets: self.numberDataSets = fileID
151
152    return True
153
154# Read additional information contained in the header.
155  def headerAdditional(self):
156    self.headerText += self.headerLine + "\n"
157
158    return True
159
160# Read in the column titles to check that all standard fields
161# are present and read in all the samples.
162  def headerTitleString(self, filename, writeOut):
163    self.headerTitles = self.headerLine + "\n"
164
165# Strip the end of line character from the last infoFields entry.
166    infoFields = self.headerLine.split("\t")
167    if len(infoFields) > 8:
168#      if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename
169#      elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename
170      self.samplesList = infoFields[9:]
171      self.genotypes = True
172    elif len(infoFields) == 8:
173      if writeOut: print >> sys.stdout, "No samples present in the header.  No genotype information available."
174    else:
175      print self.headerLine, len(infoFields)
176      print >> sys.stderr, "Not all vcf standard fields are available."
177      exit(1)
178
179    return False
180
181# If there is no header in the vcf file, close and reopen the
182# file so that the first line is avaiable for parsing as a
183# vcf record.
184  def noHeader(self, filename, writeOut):
185    if writeOut: print >> sys.stdout, "No header lines present in", filename
186    self.hasHeader = False
187    self.closeVcf(filename)
188    self.openVcf(filename)
189
190    return False
191
192# Check that info fields exist.
193  def checkInfoFields(self, tag):
194    if self.infoHeaderTags.has_key(tag) == False:
195      print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header."
196      exit(1)
197
198# Get the next line of information from the vcf file.
199  def getRecord(self):
200    self.record = self.filehandle.readline()
201    if not self.record: return False
202
203# Set up and execute a regular expression match.
204    recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$")
205    #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$")
206    recordMatch = recordRe.match(self.record)
207    if recordMatch == None:
208      print >> sys.stderr, "Unable to resolve vcf record.\n"
209      print >> sys.stderr, self.record
210      exit(1)
211
212    self.referenceSequence = recordMatch.group(1)
213    try: self.position = int(recordMatch.group(2))
214    except ValueError:
215      text = "variant position is not an integer"
216      self.generalError(text, "", None)
217    self.rsid       = recordMatch.group(3)
218    self.ref        = recordMatch.group(4)
219    self.alt        = recordMatch.group(5)
220    self.quality    = recordMatch.group(6)
221    self.filters    = recordMatch.group(7)
222    self.info       = recordMatch.group(8)
223    self.genotypeString = recordMatch.group(9)
224    self.infoTags   = {}
225
226# Check that the quality is an integer or a float.  If not, set the quality
227# to zero.
228    try: self.quality = float(self.quality)
229    except ValueError: self.quality = float(0.)
230
231# If recordMatch.group(9) is not the end of line character, there is
232# genotype information with this record.
233    if self.genotypeString != "\n": self.hasGenotypes = True
234    else: self.hasGenotypes = False
235
236# Add the reference sequence to the dictionary.  If it didn't previously
237# exist append the reference sequence to the end of the list as well.
238# This ensures that the order in which the reference sequences appeared
239# in the header can be preserved.
240    if self.referenceSequence not in self.referenceSequences:
241      self.referenceSequences[self.referenceSequence] = True
242      self.referenceSequenceList.append(self.referenceSequence)
243
244# Check for multiple alternate alleles.
245    self.alternateAlleles = self.alt.split(",")
246    self.numberAlternateAlleles = len(self.alternateAlleles)
247
248# If required, process the info and genotypes.
249    if self.processInfo: self.processInfoFields()
250    if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields()
251
252    return True
253
254# Process the info string.
255  def processInfoFields(self):
256
257# First break the info string into its constituent elements.
258    infoEntries = self.info.split(";")
259
260# As long as some info fields exist, place them into a dictionary.
261    for entry in infoEntries:
262      infoEntry = entry.split("=")
263
264# If the entry is a flag, there will be no equals and the length of
265# infoEntry will be 1.  In this case, set the dictionary entry to the
266# whole entry.  If the vcf file has undergone a union or intersection
267# operation and contains the information from multiple files, this may
268# be a '/' seperate list of flags and so cannot be set to a Boolean value
269# yet.
270      if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0]
271      elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1]
272
273# Process the genotype formats and values.
274  def processGenotypeFields(self):
275    genotypeEntries = self.genotypeString.split("\t")
276    self.genotypeFormatString = genotypeEntries[1]
277    self.genotypes = list(genotypeEntries[2:])
278    self.genotypeFormats = {}
279    self.genotypeFields = {}
280    self.genotypeFormats = self.genotypeFormatString.split(":")
281
282# Check that the number of genotype fields is equal to the number of samples
283    if len(self.samplesList) != len(self.genotypes):
284      text = "The number of genotypes is different to the number of samples"
285      self.generalError(text, "", "")
286
287# Add the genotype information to a dictionary.
288    for i in range( len(self.samplesList) ):
289      genotypeInfo = self.genotypes[i].split(":")
290      self.genotypeFields[ self.samplesList[i] ] = {}
291
292# Check that there are as many fields as in the format field.  If not, this must
293# be because the information is not known.  In this case, it is permitted that
294# the genotype information is either . or ./.
295      if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo):
296        self.genotypeFields[ self.samplesList[i] ] = "."
297      else:
298        if len(self.genotypeFormats) != len(genotypeInfo):
299          text = "The number of genotype fields is different to the number specified in the format string"
300          self.generalError(text, "sample", self.samplesList[i])
301
302        for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j]
303
304# Parse through the vcf file until the correct reference sequence is
305# encountered and the position is greater than or equal to that requested.
306  def parseVcf(self, referenceSequence, position, writeOut, outputFile):
307    success = True
308    if self.referenceSequence != referenceSequence:
309      while self.referenceSequence != referenceSequence and success:
310        if writeOut: outputFile.write(self.record)
311        success = self.getRecord()
312
313    while self.referenceSequence == referenceSequence and self.position < position and success:
314      if writeOut: outputFile.write(self.record)
315      success = self.getRecord()
316
317    return success
318
319# Get the information for a specific info tag.  Also check that it contains
320# the correct number and type of entries.
321  def getInfo(self, tag):
322    result = []
323
324# Check if the tag exists in the header information.  If so,
325# determine the number and type of entries asscoiated with this
326# tag.
327    if self.infoHeaderTags.has_key(tag):
328      infoNumber = self.infoHeaderTags[tag][0]
329      infoType = self.infoHeaderTags[tag][1]
330      numberValues = infoNumber
331
332# First check that the tag exists in the information string.  Then split
333# the entry on commas.  For flag entries, do not perform the split.
334      if self.infoTags.has_key(tag):
335        if numberValues == 0 and infoType == "Flag": result = True
336        elif numberValues != 0 and infoType == "Flag":
337          print >> sys.stderr, "ERROR"
338          exit(1)
339        else:
340          fields = self.infoTags[tag].split(",")
341          if len(fields) != numberValues:
342            text = "Unexpected number of entries"
343            self.generalError(text, "information tag", tag)
344
345          for i in range(infoNumber):
346            try: result.append(fields[i])
347            except IndexError:
348              text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0]
349              self.generalError(text, "tag:", tag)
350      else: numberValues = 0
351
352    else:
353      text = "information field does not have a definition in the header"
354      self.generalError(text, "tag", tag)
355
356    return numberValues, infoType, result
357
358# Get the genotype information.
359  def getGenotypeInfo(self, sample, tag):
360    result = []
361    if self.formatHeaderTags.has_key(tag):
362      infoNumber = self.formatHeaderTags[tag][0]
363      infoType = self.formatHeaderTags[tag][1]
364      numberValues = infoNumber
365
366      if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1:
367        numberValues = 0
368        result = "."
369      else:
370        if self.genotypeFields[sample].has_key(tag):
371          if tag == "GT":
372            if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1:
373              text = "Unexected number of characters in genotype (GT) field"
374              self.generalError(text, "sample", sample)
375
376# If a diploid call, check whether or not the genotype is phased.
377            elif len(self.genotypeFields[sample][tag]) == 3:
378              self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False
379              result.append( self.genotypeFields[sample][tag][0] )
380              result.append( self.genotypeFields[sample][tag][2] )
381            elif len(self.genotypeFields[sample][tag]) == 3:
382              result.append( self.genotypeFields[sample][tag][0] )
383          else:
384            fields = self.genotypeFields[sample][tag].split(",")
385            if len(fields) != numberValues:
386              text = "Unexpected number of characters in " + tag + " field"
387              self.generalError(text, "sample", sample)
388
389            for i in range(infoNumber): result.append(fields[i])
390    else:
391      text = "genotype field does not have a definition in the header"
392      self.generalError(text, "tag", tag)
393
394    return numberValues, result
395
396# Parse the dbsnp entry.  If the entry conforms to the required variant type,
397# return the dbsnp rsid value, otherwise ".".
398  def getDbsnpInfo(self):
399
400# First check that the variant class (VC) is listed as SNP.
401    vc = self.info.split("VC=",1)
402    if vc[1].find(";") != -1: snp = vc[1].split(";",1)
403    else:
404      snp = []
405      snp.append(vc[1])
406
407    if snp[0].lower() == "snp": rsid = self.rsid
408    else: rsid = "."
409
410    return rsid
411
412# Build a new vcf record.
413  def buildRecord(self, removeGenotypes):
414    record = self.referenceSequence + "\t" + \
415                str(self.position) + "\t" + \
416                self.rsid + "\t" + \
417                self.ref + "\t" + \
418                self.alt + "\t" + \
419                str(self.quality) + "\t" + \
420                self.filters + "\t" + \
421                self.info
422
423    if self.hasGenotypes and not removeGenotypes: record += self.genotypeString
424
425    record += "\n"
426
427    return record
428
429# Close the vcf file.
430  def closeVcf(self, filename):
431    self.filehandle.close()
432
433# Define error messages for different handled errors.
434  def generalError(self, text, field, fieldValue):
435    print >> sys.stderr, "\nError encountered when attempting to read:"
436    print >> sys.stderr, "\treference sequence :\t", self.referenceSequence
437    print >> sys.stderr, "\tposition :\t\t", self.position
438    if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue
439    print >> sys.stderr,  "\n", text
440    exit(1)