PageRenderTime 33ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/vcf_tools/tools.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 188 lines | 135 code | 18 blank | 35 comment | 38 complexity | 98cdfb767429fe7bdb9ce1e3faeebbfa MD5 | raw file
  1. #!/usr/bin/python
  2. import os.path
  3. import sys
  4. import vcfPytools
  5. from vcfPytools import __version__
  6. # Determine whether to output to a file or stdout.
  7. def setOutput(output):
  8. if output == None:
  9. outputFile = sys.stdout
  10. writeOut = False
  11. else:
  12. output = os.path.abspath(output)
  13. outputFile = open(output, 'w')
  14. writeOut = True
  15. return outputFile, writeOut
  16. # Determine which file has priority for writing out records.
  17. def setVcfPriority(priorityFile, vcfFiles):
  18. if priorityFile == None: priority = 0
  19. elif priorityFile == vcfFiles[0]: priority = 1
  20. elif priorityFile == vcfFiles[1]: priority = 2
  21. elif priorityFile.lower() == "merge": priority = 3
  22. else:
  23. print >> sys.stderr, "vcf file give priority must be one of the two input vcf files or merge."
  24. exit(1)
  25. return priority
  26. # If the union or intersection of two vcf files is being performed
  27. # and the output vcf file is to contain the information from both
  28. # files, the headers need to be merged to ensure that all info and
  29. # format entries have an explanation.
  30. def mergeHeaders(v1, v2, v3):
  31. # If either file does not have a header, terminate the program.
  32. # In order to merge the headers, the different fields must be
  33. # checked to ensure the files are compatible.
  34. if not v1.hasHeader or not v2.hasHeader:
  35. print >> sys.stderr, "Both vcf files must have a header in order to merge data sets."
  36. exit(1)
  37. v3.infoHeaderTags = v1.infoHeaderTags.copy()
  38. v3.formatHeaderTags = v1.formatHeaderTags.copy()
  39. v3.numberDataSets = v1.numberDataSets
  40. v3.includedDataSets = v1.includedDataSets.copy()
  41. v3.headerText = v1.headerText
  42. v3.headerTitles = v1.headerTitles
  43. v3.infoHeaderString = v1.infoHeaderString.copy()
  44. v3.formatHeaderString = v1.formatHeaderString.copy()
  45. # Merge the info field descriptions.
  46. for tag in v2.infoHeaderTags:
  47. if v1.infoHeaderTags.has_key(tag):
  48. if v1.infoHeaderTags[tag][0] != v2.infoHeaderTags[tag][0] or \
  49. v1.infoHeaderTags[tag][1] != v2.infoHeaderTags[tag][1]:
  50. print v1.infoHeaderTags[tag][0]
  51. print v1.infoHeaderTags[tag][1]
  52. print v1.infoHeaderTags[tag][2]
  53. print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
  54. exit(1)
  55. else: v3.infoHeaderTags[tag] = v2.infoHeaderTags[tag]
  56. # Merge the format field descriptions.
  57. for tag in v2.formatHeaderTags:
  58. if v1.formatHeaderTags.has_key(tag):
  59. if v1.formatHeaderTags[tag][0] != v2.formatHeaderTags[tag][0] or \
  60. v1.formatHeaderTags[tag][1] != v2.formatHeaderTags[tag][1]:
  61. print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
  62. exit(1)
  63. else: v3.formatHeaderTags[tag] = v2.formatHeaderTags[tag]
  64. # Now check to see if the vcf files contain information from multiple
  65. # records themselves and create an ordered list in which the data
  66. # will appear in the file. For instance, of the first file has
  67. # already got two sets of data and is being intersected with a file
  68. # with one set of data, the order of data in the new vcf file will be
  69. # the two sets from the first file followed by the second, e.g.
  70. # AB=3/2/4, where the 3 and 2 are from the first file and the 4 is the
  71. # value of AC from the second vcf. The header will have a ##FILE for
  72. # each of the three files, so the origin if the data can be recovered.
  73. if v1.numberDataSets == 0:
  74. v3.includedDataSets[v3.numberDataSets + 1] = v1.filename
  75. v3.numberDataSets += 1
  76. if v2.numberDataSets == 0:
  77. v3.includedDataSets[v3.numberDataSets + 1] = v2.filename
  78. v3.numberDataSets += 1
  79. else:
  80. for i in range(1, v2.numberDataSets + 1):
  81. v3.includedDataSets[v3.numberDataSets + 1] = v2.includedDataSets[i]
  82. v3.numberDataSets += 1
  83. # If either of the input files contain multiple data sets (e.g. multiple
  84. # vcf files have undergone intersection or union calculations and all
  85. # information has been retained) and the priority isn't set to 'merge',
  86. # terminate the program. This is to ensure that the origin of the data
  87. # doesn't get confused.
  88. def checkDataSets(v1, v2):
  89. if v1.numberDataSets + v2.numberDataSets != 0:
  90. print >> sys.stderr, "\nERROR:"
  91. print >> sys.stderr, "input vcf file(s) contain data sets from multiple vcf files."
  92. print >> sys.stderr, "Further intersection or union operations must include --priority-file merge"
  93. print >> sys.stderr, "Other tools may be incompatible with this format."
  94. exit(1)
  95. # Write the header to file.
  96. def writeHeader (outputFile, v, removeGenotypes, taskDescriptor):
  97. if not v.hasHeader:
  98. v.headerText = "##fileformat=VCFv4.0\n##source=vcfPytools " + __version__ + "\n"
  99. v.headerTitles = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
  100. outputFile.write(v.headerText) if v.headerText != "" else None
  101. print >> outputFile, taskDescriptor
  102. for tag in v.infoHeaderString: print >> outputFile, v.infoHeaderString[tag]
  103. for tag in v.formatHeaderString: print >> outputFile, v.formatHeaderString[tag]
  104. # Write out a list of files indicating which data set belongs to which file.
  105. if v.numberDataSets != 0:
  106. for i in range(1, v.numberDataSets + 1):
  107. print >> outputFile, "##FILE=<ID=" + str(i) + ",\"" + v.includedDataSets[i] + "\">"
  108. if removeGenotypes:
  109. line = v.headerTitles.rstrip("\n").split("\t")
  110. newHeaderTitles = line[0]
  111. for i in range(1,8):
  112. newHeaderTitles = newHeaderTitles + "\t" + line[i]
  113. newHeaderTitles = newHeaderTitles + "\n"
  114. outputFile.write( newHeaderTitles )
  115. else:
  116. outputFile.write( v.headerTitles )
  117. # Check that the two reference sequence lists are identical.
  118. # If there are a different number or order, the results may
  119. # not be as expected.
  120. def checkReferenceSequenceLists(list1, list2):
  121. errorMessage = False
  122. if len(list1) != len(list2):
  123. print >> sys.stderr, "WARNING: Input files contain a different number of reference sequences."
  124. errorMessage = True
  125. elif list1 != list2:
  126. print >> sys.stderr, "WARNING: Input files contain different or differently ordered reference sequences."
  127. errorMessage = True
  128. if errorMessage:
  129. print >> sys.stderr, "Results may not be as expected."
  130. print >> sys.stderr, "Ensure that input files have the same reference sequences in the same order."
  131. print >> sys.stderr, "Reference sequence lists observed were:\n\t", list1, "\n\t", list2
  132. # Write out a vcf record to file. The record written depends on the
  133. # value of 'priority' and could therefore be the record from either
  134. # of the vcf files, or a combination of them.
  135. def writeVcfRecord(priority, v1, v2, outputFile):
  136. if priority == 0:
  137. if v1.quality >= v2.quality: outputFile.write(v1.record)
  138. else: outputFile.write(v2.record)
  139. elif priority == 1: outputFile.write(v1.record)
  140. elif priority == 2: outputFile.write(v2.record)
  141. elif priority == 3:
  142. # Define the missing entry values (depends on the number of data sets
  143. # in the file).
  144. info = ""
  145. missingEntry1 = missingEntry2 = "."
  146. for i in range(1, v1.numberDataSets): missingEntry1 += "/."
  147. for i in range(1, v2.numberDataSets): missingEntry2 += "/."
  148. secondList = v2.infoTags.copy()
  149. # Build up the info field.
  150. for tag in v1.infoTags:
  151. if secondList.has_key(tag):
  152. if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + v2.infoTags[tag] + ";"
  153. del secondList[tag]
  154. else:
  155. if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + missingEntry2 + ";"
  156. # Now include the info tags that are not populated in the first vcf file.
  157. for tag in secondList:
  158. if v2.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + missingEntry1 + "/" + v2.infoTags[tag] + ";"
  159. # Build the complete record.
  160. info = info.rstrip(";")
  161. record = v1.referenceSequence + "\t" + str(v1.position) + "\t" + v1.rsid + "\t" + v1.ref + "\t" + \
  162. v1.alt + "/" + v2.alt + "\t" + v1.quality + "/" + v2.quality + "\t.\t" + info
  163. print >> outputFile, record
  164. else:
  165. print >> sys.sterr, "Unknown file priority."
  166. exit(1)