/tools/vcf_tools/vcfClass.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 440 lines · 284 code · 75 blank · 81 comment · 104 complexity · 1d1d461a4e089da585f3c83cde64159f MD5 · raw file

  1. #!/usr/bin/python
  2. import os.path
  3. import sys
  4. import re
  5. class vcf:
  6. def __init__(self):
  7. # Header info.
  8. self.filename = ""
  9. self.hasHeader = True
  10. self.headerText = ""
  11. self.headerTitles = ""
  12. self.vcfFormat = ""
  13. #self.headerInfoText = ""
  14. #self.headerFormatText = ""
  15. # Store the info and format tags as well as the lines that describe
  16. # them in a dictionary.
  17. self.numberDataSets = 0
  18. self.includedDataSets = {}
  19. self.infoHeaderTags = {}
  20. self.infoHeaderString = {}
  21. self.formatHeaderTags = {}
  22. self.formatHeaderString = {}
  23. # Genotype information.
  24. self.genotypes = False
  25. self.infoField = {}
  26. # Reference sequence information.
  27. self.referenceSequences = {}
  28. self.referenceSequenceList = []
  29. self.referenceSequence = ""
  30. # Record information.
  31. self.position = -1
  32. self.samplesList = []
  33. # Determine which fields to process.
  34. self.processInfo = False
  35. self.processGenotypes = False
  36. self.dbsnpVcf = False
  37. self.hapmapVcf = False
  38. # Open a vcf file.
  39. def openVcf(self, filename):
  40. if filename == "stdin":
  41. self.filehandle = sys.stdin
  42. self.filename = "stdin"
  43. else:
  44. try: self.filehandle = open(filename,"r")
  45. except IOError:
  46. print >> sys.stderr, "Failed to find file: ",filename
  47. exit(1)
  48. self.filename = os.path.abspath(filename)
  49. # Parse the vcf header.
  50. def parseHeader(self, filename, writeOut):
  51. while self.getHeaderLine(filename, writeOut):
  52. continue
  53. # Determine the type of information in the header line.
  54. def getHeaderLine(self, filename, writeOut):
  55. self.headerLine = self.filehandle.readline().rstrip("\n")
  56. if self.headerLine.startswith("##fileformat"): success = self.getvcfFormat()
  57. if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info")
  58. elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format")
  59. elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut)
  60. elif self.headerLine.startswith("##"): success = self.headerAdditional()
  61. elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut)
  62. else: success = self.noHeader(filename, writeOut)
  63. return success
  64. # Read VCF format
  65. def getvcfFormat(self):
  66. try:
  67. self.vcfFormat = self.headerLine.split("=",1)[1]
  68. self.vcfFormat = float( self.vcfFormat.split("VCFv",1)[1] )## Extract the version number rather than the whole string
  69. except IndexError:
  70. print >> sys.stderr, "\nError parsing the fileformat"
  71. print >> sys.stderr, "The following fileformat header is wrongly formatted: ", self.headerLine
  72. exit(1)
  73. return True
  74. # Read information on an info field from the header line.
  75. def headerInfo(self, writeOut, lineType):
  76. tag = self.headerLine.split("=",1)
  77. tagID = (tag[1].split("ID=",1))[1].split(",",1)
  78. # Check if this info field has already been defined.
  79. if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])):
  80. print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header."
  81. exit(1)
  82. # Determine the number of entries, entry type and description.
  83. tagNumber = (tagID[1].split("Number=",1))[1].split(",",1)
  84. tagType = (tagNumber[1].split("Type=",1))[1].split(",",1)
  85. try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0]
  86. except IndexError: tagDescription = ""
  87. tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0]
  88. # Check that the number of fields associated with the tag is either
  89. # an integer or a '.' to indicate variable number of entries.
  90. if tagNumber == ".": tagNumber = "variable"
  91. else:
  92. if self.vcfFormat<4.1:
  93. try:
  94. tagNumber = int(tagNumber)
  95. except ValueError:
  96. print >> sys.stderr, "\nError parsing header. Problem with info tag:", tagID
  97. print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'"
  98. exit(1)
  99. if lineType == "info":
  100. self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription
  101. self.infoHeaderString[tagID] = self.headerLine
  102. if lineType == "format":
  103. self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription
  104. self.formatHeaderString[tagID] = self.headerLine
  105. return True
  106. # Check to see if the records contain information from multiple different
  107. # sources. If vcfPytools has been used to find the intersection or union
  108. # of two vcf files, the records may have been merged to keep all the
  109. # information available. If this is the case, there will be a ##FILE line
  110. # for each set of information in the file. The order of these files needs
  111. # to be maintained.
  112. def headerFiles(self, writeOut):
  113. fileID = (self.headerLine.split("ID=",1))[1].split(",",1)
  114. filename = fileID[1].split("\"",2)[1]
  115. try: fileID = int(fileID[0])
  116. except ValueError:
  117. print >> sys.stderr, "File ID in ##FILE entry must be an integer."
  118. print >> sys.stderr, self.headerLine
  119. exit(1)
  120. if self.includedDataSets.has_key(fileID):
  121. print >> sys.stderr, "\nERROR: file " + self.filename
  122. print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values."
  123. exit(1)
  124. self.includedDataSets[fileID] = filename
  125. # Set the number of files with information in this vcf file.
  126. if fileID > self.numberDataSets: self.numberDataSets = fileID
  127. return True
  128. # Read additional information contained in the header.
  129. def headerAdditional(self):
  130. self.headerText += self.headerLine + "\n"
  131. return True
  132. # Read in the column titles to check that all standard fields
  133. # are present and read in all the samples.
  134. def headerTitleString(self, filename, writeOut):
  135. self.headerTitles = self.headerLine + "\n"
  136. # Strip the end of line character from the last infoFields entry.
  137. infoFields = self.headerLine.split("\t")
  138. if len(infoFields) > 8:
  139. # if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename
  140. # elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename
  141. self.samplesList = infoFields[9:]
  142. self.genotypes = True
  143. elif len(infoFields) == 8:
  144. if writeOut: print >> sys.stdout, "No samples present in the header. No genotype information available."
  145. else:
  146. print self.headerLine, len(infoFields)
  147. print >> sys.stderr, "Not all vcf standard fields are available."
  148. exit(1)
  149. return False
  150. # If there is no header in the vcf file, close and reopen the
  151. # file so that the first line is avaiable for parsing as a
  152. # vcf record.
  153. def noHeader(self, filename, writeOut):
  154. if writeOut: print >> sys.stdout, "No header lines present in", filename
  155. self.hasHeader = False
  156. self.closeVcf(filename)
  157. self.openVcf(filename)
  158. return False
  159. # Check that info fields exist.
  160. def checkInfoFields(self, tag):
  161. if self.infoHeaderTags.has_key(tag) == False:
  162. print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header."
  163. exit(1)
  164. # Get the next line of information from the vcf file.
  165. def getRecord(self):
  166. self.record = self.filehandle.readline()
  167. if not self.record: return False
  168. # Set up and execute a regular expression match.
  169. recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$")
  170. #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$")
  171. recordMatch = recordRe.match(self.record)
  172. if recordMatch == None:
  173. print >> sys.stderr, "Unable to resolve vcf record.\n"
  174. print >> sys.stderr, self.record
  175. exit(1)
  176. self.referenceSequence = recordMatch.group(1)
  177. try: self.position = int(recordMatch.group(2))
  178. except ValueError:
  179. text = "variant position is not an integer"
  180. self.generalError(text, "", None)
  181. self.rsid = recordMatch.group(3)
  182. self.ref = recordMatch.group(4)
  183. self.alt = recordMatch.group(5)
  184. self.quality = recordMatch.group(6)
  185. self.filters = recordMatch.group(7)
  186. self.info = recordMatch.group(8)
  187. self.genotypeString = recordMatch.group(9)
  188. self.infoTags = {}
  189. # Check that the quality is an integer or a float. If not, set the quality
  190. # to zero.
  191. try: self.quality = float(self.quality)
  192. except ValueError: self.quality = float(0.)
  193. # If recordMatch.group(9) is not the end of line character, there is
  194. # genotype information with this record.
  195. if self.genotypeString != "\n": self.hasGenotypes = True
  196. else: self.hasGenotypes = False
  197. # Add the reference sequence to the dictionary. If it didn't previously
  198. # exist append the reference sequence to the end of the list as well.
  199. # This ensures that the order in which the reference sequences appeared
  200. # in the header can be preserved.
  201. if self.referenceSequence not in self.referenceSequences:
  202. self.referenceSequences[self.referenceSequence] = True
  203. self.referenceSequenceList.append(self.referenceSequence)
  204. # Check for multiple alternate alleles.
  205. self.alternateAlleles = self.alt.split(",")
  206. self.numberAlternateAlleles = len(self.alternateAlleles)
  207. # If required, process the info and genotypes.
  208. if self.processInfo: self.processInfoFields()
  209. if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields()
  210. return True
  211. # Process the info string.
  212. def processInfoFields(self):
  213. # First break the info string into its constituent elements.
  214. infoEntries = self.info.split(";")
  215. # As long as some info fields exist, place them into a dictionary.
  216. for entry in infoEntries:
  217. infoEntry = entry.split("=")
  218. # If the entry is a flag, there will be no equals and the length of
  219. # infoEntry will be 1. In this case, set the dictionary entry to the
  220. # whole entry. If the vcf file has undergone a union or intersection
  221. # operation and contains the information from multiple files, this may
  222. # be a '/' seperate list of flags and so cannot be set to a Boolean value
  223. # yet.
  224. if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0]
  225. elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1]
  226. # Process the genotype formats and values.
  227. def processGenotypeFields(self):
  228. genotypeEntries = self.genotypeString.split("\t")
  229. self.genotypeFormatString = genotypeEntries[1]
  230. self.genotypes = list(genotypeEntries[2:])
  231. self.genotypeFormats = {}
  232. self.genotypeFields = {}
  233. self.genotypeFormats = self.genotypeFormatString.split(":")
  234. # Check that the number of genotype fields is equal to the number of samples
  235. if len(self.samplesList) != len(self.genotypes):
  236. text = "The number of genotypes is different to the number of samples"
  237. self.generalError(text, "", "")
  238. # Add the genotype information to a dictionary.
  239. for i in range( len(self.samplesList) ):
  240. genotypeInfo = self.genotypes[i].split(":")
  241. self.genotypeFields[ self.samplesList[i] ] = {}
  242. # Check that there are as many fields as in the format field. If not, this must
  243. # be because the information is not known. In this case, it is permitted that
  244. # the genotype information is either . or ./.
  245. if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo):
  246. self.genotypeFields[ self.samplesList[i] ] = "."
  247. else:
  248. if len(self.genotypeFormats) != len(genotypeInfo):
  249. text = "The number of genotype fields is different to the number specified in the format string"
  250. self.generalError(text, "sample", self.samplesList[i])
  251. for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j]
  252. # Parse through the vcf file until the correct reference sequence is
  253. # encountered and the position is greater than or equal to that requested.
  254. def parseVcf(self, referenceSequence, position, writeOut, outputFile):
  255. success = True
  256. if self.referenceSequence != referenceSequence:
  257. while self.referenceSequence != referenceSequence and success:
  258. if writeOut: outputFile.write(self.record)
  259. success = self.getRecord()
  260. while self.referenceSequence == referenceSequence and self.position < position and success:
  261. if writeOut: outputFile.write(self.record)
  262. success = self.getRecord()
  263. return success
  264. # Get the information for a specific info tag. Also check that it contains
  265. # the correct number and type of entries.
  266. def getInfo(self, tag):
  267. result = []
  268. # Check if the tag exists in the header information. If so,
  269. # determine the number and type of entries asscoiated with this
  270. # tag.
  271. if self.infoHeaderTags.has_key(tag):
  272. infoNumber = self.infoHeaderTags[tag][0]
  273. infoType = self.infoHeaderTags[tag][1]
  274. numberValues = infoNumber
  275. # First check that the tag exists in the information string. Then split
  276. # the entry on commas. For flag entries, do not perform the split.
  277. if self.infoTags.has_key(tag):
  278. if numberValues == 0 and infoType == "Flag": result = True
  279. elif numberValues != 0 and infoType == "Flag":
  280. print >> sys.stderr, "ERROR"
  281. exit(1)
  282. else:
  283. fields = self.infoTags[tag].split(",")
  284. if len(fields) != numberValues:
  285. text = "Unexpected number of entries"
  286. self.generalError(text, "information tag", tag)
  287. for i in range(infoNumber):
  288. try: result.append(fields[i])
  289. except IndexError:
  290. text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0]
  291. self.generalError(text, "tag:", tag)
  292. else: numberValues = 0
  293. else:
  294. text = "information field does not have a definition in the header"
  295. self.generalError(text, "tag", tag)
  296. return numberValues, infoType, result
  297. # Get the genotype information.
  298. def getGenotypeInfo(self, sample, tag):
  299. result = []
  300. if self.formatHeaderTags.has_key(tag):
  301. infoNumber = self.formatHeaderTags[tag][0]
  302. infoType = self.formatHeaderTags[tag][1]
  303. numberValues = infoNumber
  304. if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1:
  305. numberValues = 0
  306. result = "."
  307. else:
  308. if self.genotypeFields[sample].has_key(tag):
  309. if tag == "GT":
  310. if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1:
  311. text = "Unexected number of characters in genotype (GT) field"
  312. self.generalError(text, "sample", sample)
  313. # If a diploid call, check whether or not the genotype is phased.
  314. elif len(self.genotypeFields[sample][tag]) == 3:
  315. self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False
  316. result.append( self.genotypeFields[sample][tag][0] )
  317. result.append( self.genotypeFields[sample][tag][2] )
  318. elif len(self.genotypeFields[sample][tag]) == 3:
  319. result.append( self.genotypeFields[sample][tag][0] )
  320. else:
  321. fields = self.genotypeFields[sample][tag].split(",")
  322. if len(fields) != numberValues:
  323. text = "Unexpected number of characters in " + tag + " field"
  324. self.generalError(text, "sample", sample)
  325. for i in range(infoNumber): result.append(fields[i])
  326. else:
  327. text = "genotype field does not have a definition in the header"
  328. self.generalError(text, "tag", tag)
  329. return numberValues, result
  330. # Parse the dbsnp entry. If the entry conforms to the required variant type,
  331. # return the dbsnp rsid value, otherwise ".".
  332. def getDbsnpInfo(self):
  333. # First check that the variant class (VC) is listed as SNP.
  334. vc = self.info.split("VC=",1)
  335. if vc[1].find(";") != -1: snp = vc[1].split(";",1)
  336. else:
  337. snp = []
  338. snp.append(vc[1])
  339. if snp[0].lower() == "snp": rsid = self.rsid
  340. else: rsid = "."
  341. return rsid
  342. # Build a new vcf record.
  343. def buildRecord(self, removeGenotypes):
  344. record = self.referenceSequence + "\t" + \
  345. str(self.position) + "\t" + \
  346. self.rsid + "\t" + \
  347. self.ref + "\t" + \
  348. self.alt + "\t" + \
  349. str(self.quality) + "\t" + \
  350. self.filters + "\t" + \
  351. self.info
  352. if self.hasGenotypes and not removeGenotypes: record += self.genotypeString
  353. record += "\n"
  354. return record
  355. # Close the vcf file.
  356. def closeVcf(self, filename):
  357. self.filehandle.close()
  358. # Define error messages for different handled errors.
  359. def generalError(self, text, field, fieldValue):
  360. print >> sys.stderr, "\nError encountered when attempting to read:"
  361. print >> sys.stderr, "\treference sequence :\t", self.referenceSequence
  362. print >> sys.stderr, "\tposition :\t\t", self.position
  363. if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue
  364. print >> sys.stderr, "\n", text
  365. exit(1)