PageRenderTime 38ms CodeModel.GetById 22ms app.highlight 13ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/vcf_tools/tools.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 188 lines | 135 code | 18 blank | 35 comment | 31 complexity | 98cdfb767429fe7bdb9ce1e3faeebbfa MD5 | raw file
  1#!/usr/bin/python
  2
  3import os.path
  4import sys
  5import vcfPytools
  6from vcfPytools import __version__
  7
  8# Determine whether to output to a file or stdout.
  9def setOutput(output):
 10  if output == None:
 11    outputFile = sys.stdout
 12    writeOut = False
 13  else:
 14    output = os.path.abspath(output)
 15    outputFile = open(output, 'w')
 16    writeOut = True
 17
 18  return outputFile, writeOut
 19
 20# Determine which file has priority for writing out records.
 21def setVcfPriority(priorityFile, vcfFiles):
 22  if priorityFile == None: priority = 0
 23  elif priorityFile == vcfFiles[0]: priority = 1
 24  elif priorityFile == vcfFiles[1]: priority = 2
 25  elif priorityFile.lower() == "merge": priority = 3
 26  else:
 27    print >> sys.stderr, "vcf file give priority must be one of the two input vcf files or merge."
 28    exit(1)
 29
 30  return priority
 31
 32# If the union or intersection of two vcf files is being performed
 33# and the output vcf file is to contain the information from both
 34# files, the headers need to be merged to ensure that all info and
 35# format entries have an explanation.
 36def mergeHeaders(v1, v2, v3):
 37
 38# If either file does not have a header, terminate the program.
 39# In order to merge the headers, the different fields must be
 40# checked to ensure the files are compatible.
 41  if not v1.hasHeader or not v2.hasHeader:
 42    print >> sys.stderr, "Both vcf files must have a header in order to merge data sets."
 43    exit(1)
 44
 45  v3.infoHeaderTags = v1.infoHeaderTags.copy()
 46  v3.formatHeaderTags = v1.formatHeaderTags.copy()
 47  v3.numberDataSets = v1.numberDataSets
 48  v3.includedDataSets = v1.includedDataSets.copy()
 49  v3.headerText = v1.headerText
 50  v3.headerTitles = v1.headerTitles
 51  v3.infoHeaderString = v1.infoHeaderString.copy()
 52  v3.formatHeaderString = v1.formatHeaderString.copy()
 53
 54# Merge the info field descriptions.
 55  for tag in v2.infoHeaderTags:
 56    if v1.infoHeaderTags.has_key(tag):
 57      if v1.infoHeaderTags[tag][0] != v2.infoHeaderTags[tag][0] or \
 58         v1.infoHeaderTags[tag][1] != v2.infoHeaderTags[tag][1]:
 59        print v1.infoHeaderTags[tag][0]
 60        print v1.infoHeaderTags[tag][1]
 61        print v1.infoHeaderTags[tag][2]
 62        print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
 63        exit(1)
 64    else: v3.infoHeaderTags[tag] = v2.infoHeaderTags[tag]
 65
 66# Merge the format field descriptions.
 67  for tag in v2.formatHeaderTags:
 68    if v1.formatHeaderTags.has_key(tag):
 69      if v1.formatHeaderTags[tag][0] != v2.formatHeaderTags[tag][0] or \
 70         v1.formatHeaderTags[tag][1] != v2.formatHeaderTags[tag][1]:
 71        print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
 72        exit(1)
 73    else: v3.formatHeaderTags[tag] = v2.formatHeaderTags[tag]
 74
 75# Now check to see if the vcf files contain information from multiple
 76# records themselves and create an ordered list in which the data
 77# will appear in the file.  For instance, of the first file has
 78# already got two sets of data and is being intersected with a file
 79# with one set of data, the order of data in the new vcf file will be
 80# the two sets from the first file followed by the second, e.g.
 81# AB=3/2/4, where the 3 and 2 are from the first file and the 4 is the
 82# value of AC from the second vcf.  The header will have a ##FILE for
 83# each of the three files, so the origin if the data can be recovered.
 84  if v1.numberDataSets == 0:
 85    v3.includedDataSets[v3.numberDataSets + 1] = v1.filename
 86    v3.numberDataSets += 1
 87  if v2.numberDataSets == 0:
 88    v3.includedDataSets[v3.numberDataSets + 1] = v2.filename
 89    v3.numberDataSets += 1
 90  else:
 91    for i in range(1, v2.numberDataSets + 1):
 92      v3.includedDataSets[v3.numberDataSets + 1] = v2.includedDataSets[i]
 93      v3.numberDataSets += 1
 94
 95# If either of the input files contain multiple data sets (e.g. multiple
 96# vcf files have undergone intersection or union calculations and all
 97# information has been retained) and the priority isn't set to 'merge',
 98# terminate the program.  This is to ensure that the origin of the data
 99# doesn't get confused.
100def checkDataSets(v1, v2):
101  if v1.numberDataSets + v2.numberDataSets != 0:
102    print >> sys.stderr, "\nERROR:"
103    print >> sys.stderr, "input vcf file(s) contain data sets from multiple vcf files."
104    print >> sys.stderr, "Further intersection or union operations must include --priority-file merge"
105    print >> sys.stderr, "Other tools may be incompatible with this format."
106    exit(1)
107
108# Write the header to file.
109def writeHeader (outputFile, v, removeGenotypes, taskDescriptor):
110  if not v.hasHeader: 
111    v.headerText = "##fileformat=VCFv4.0\n##source=vcfPytools " + __version__ + "\n"
112    v.headerTitles = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
113  outputFile.write(v.headerText) if v.headerText != "" else None
114  print >> outputFile, taskDescriptor
115  for tag in v.infoHeaderString: print >> outputFile, v.infoHeaderString[tag]
116  for tag in v.formatHeaderString: print >> outputFile, v.formatHeaderString[tag]
117
118# Write out a list of files indicating which data set belongs to which file.
119  if v.numberDataSets != 0:
120    for i in range(1, v.numberDataSets + 1):
121      print >> outputFile, "##FILE=<ID=" + str(i) + ",\"" + v.includedDataSets[i] + "\">"
122
123  if removeGenotypes:
124    line = v.headerTitles.rstrip("\n").split("\t")
125    newHeaderTitles = line[0]
126    for i in range(1,8):
127      newHeaderTitles = newHeaderTitles + "\t" + line[i]
128    newHeaderTitles = newHeaderTitles + "\n"
129    outputFile.write( newHeaderTitles )
130  else:
131    outputFile.write( v.headerTitles )
132
133# Check that the two reference sequence lists are identical.
134# If there are a different number or order, the results may
135# not be as expected.
136def checkReferenceSequenceLists(list1, list2):
137  errorMessage = False
138  if len(list1) != len(list2):
139    print >> sys.stderr, "WARNING: Input files contain a different number of reference sequences."
140    errorMessage = True
141  elif list1 != list2:
142    print >> sys.stderr, "WARNING: Input files contain different or differently ordered reference sequences."
143    errorMessage = True
144  if errorMessage:
145    print >> sys.stderr, "Results may not be as expected."
146    print >> sys.stderr, "Ensure that input files have the same reference sequences in the same order."
147    print >> sys.stderr, "Reference sequence lists observed were:\n\t", list1, "\n\t", list2
148
149# Write out a vcf record to file.  The record written depends on the
150# value of 'priority' and could therefore be the record from either
151# of the vcf files, or a combination of them.
152
153def writeVcfRecord(priority, v1, v2, outputFile):
154  if priority == 0:
155    if v1.quality >= v2.quality: outputFile.write(v1.record)
156    else: outputFile.write(v2.record)
157  elif priority == 1: outputFile.write(v1.record)
158  elif priority == 2: outputFile.write(v2.record)
159  elif priority == 3:
160
161# Define the missing entry values (depends on the number of data sets
162# in the file).
163    info = ""
164    missingEntry1 = missingEntry2 = "."
165    for i in range(1, v1.numberDataSets): missingEntry1 += "/."
166    for i in range(1, v2.numberDataSets): missingEntry2 += "/."
167    secondList = v2.infoTags.copy()
168
169# Build up the info field.
170    for tag in v1.infoTags:
171      if secondList.has_key(tag):
172        if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + v2.infoTags[tag] + ";"
173        del secondList[tag]
174      else: 
175        if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + missingEntry2 + ";"
176
177# Now include the info tags that are not populated in the first vcf file.
178    for tag in secondList:
179      if v2.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + missingEntry1 + "/" + v2.infoTags[tag] + ";"
180
181# Build the complete record.
182    info = info.rstrip(";")
183    record = v1.referenceSequence + "\t" + str(v1.position) + "\t" + v1.rsid + "\t" + v1.ref + "\t" + \
184             v1.alt + "/" + v2.alt + "\t" + v1.quality + "/" + v2.quality + "\t.\t" + info
185    print >> outputFile, record
186  else:
187    print >> sys.sterr, "Unknown file priority."
188    exit(1)