PageRenderTime 26ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/maf/vcf_to_maf_customtrack.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 151 lines | 145 code | 5 blank | 1 comment | 6 complexity | 0742547dcd8aa3cdd628b0d172156e3d MD5 | raw file
  1. #Dan Blankenberg
  2. from optparse import OptionParser
  3. import sys
  4. import galaxy_utils.sequence.vcf
  5. from galaxy import eggs
  6. import pkg_resources; pkg_resources.require( "bx-python" )
  7. import bx.align.maf
  8. UNKNOWN_NUCLEOTIDE = '*'
  9. class PopulationVCFParser( object ):
  10. def __init__( self, reader, name ):
  11. self.reader = reader
  12. self.name = name
  13. self.counter = 0
  14. def next( self ):
  15. rval = []
  16. vc = self.reader.next()
  17. for i, allele in enumerate( vc.alt ):
  18. rval.append( ( '%s_%i.%i' % ( self.name, i + 1, self.counter + 1 ), allele ) )
  19. self.counter += 1
  20. return ( vc, rval )
  21. def __iter__( self ):
  22. while True:
  23. yield self.next()
  24. class SampleVCFParser( object ):
  25. def __init__( self, reader ):
  26. self.reader = reader
  27. self.counter = 0
  28. def next( self ):
  29. rval = []
  30. vc = self.reader.next()
  31. alleles = [ vc.ref ] + vc.alt
  32. if 'GT' in vc.format:
  33. gt_index = vc.format.index( 'GT' )
  34. for sample_name, sample_value in zip( vc.sample_names, vc.sample_values ):
  35. gt_indexes = []
  36. for i in sample_value[ gt_index ].replace( '|', '/' ).replace( '\\', '/' ).split( '/' ): #Do we need to consider phase here?
  37. try:
  38. gt_indexes.append( int( i ) )
  39. except:
  40. gt_indexes.append( None )
  41. for i, allele_i in enumerate( gt_indexes ):
  42. if allele_i is not None:
  43. rval.append( ( '%s_%i.%i' % ( sample_name, i + 1, self.counter + 1 ), alleles[ allele_i ] ) )
  44. self.counter += 1
  45. return ( vc, rval )
  46. def __iter__( self ):
  47. while True:
  48. yield self.next()
  49. def main():
  50. usage = "usage: %prog [options] output_file dbkey inputfile pop_name"
  51. parser = OptionParser( usage=usage )
  52. parser.add_option( "-p", "--population", action="store_true", dest="population", default=False, help="Create MAF on a per population basis")
  53. parser.add_option( "-s", "--sample", action="store_true", dest="sample", default=False, help="Create MAF on a per sample basis")
  54. parser.add_option( "-n", "--name", dest="name", default='Unknown Custom Track', help="Name for Custom Track")
  55. parser.add_option( "-g", "--galaxy", action="store_true", dest="galaxy", default=False, help="Tool is being executed by Galaxy (adds extra error messaging).")
  56. ( options, args ) = parser.parse_args()
  57. if len ( args ) < 3:
  58. if options.galaxy:
  59. print >>sys.stderr, "It appears that you forgot to specify an input VCF file, click 'Add new VCF...' to add at least input.\n"
  60. parser.error( "Need to specify an output file, a dbkey and at least one input file" )
  61. if not ( options.population ^ options.sample ):
  62. parser.error( 'You must specify either a per population conversion or a per sample conversion, but not both' )
  63. out = open( args.pop(0), 'wb' )
  64. out.write( 'track name="%s" visibility=pack\n' % options.name.replace( "\"", "'" ) )
  65. maf_writer = bx.align.maf.Writer( out )
  66. dbkey = args.pop(0)
  67. vcf_files = []
  68. if options.population:
  69. i = 0
  70. while args:
  71. filename = args.pop( 0 )
  72. pop_name = args.pop( 0 ).replace( ' ', '_' )
  73. if not pop_name:
  74. pop_name = 'population_%i' % ( i + 1 )
  75. vcf_files.append( PopulationVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ), pop_name ) )
  76. i += 1
  77. else:
  78. while args:
  79. filename = args.pop( 0 )
  80. vcf_files.append( SampleVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ) )
  81. non_spec_skipped = 0
  82. for vcf_file in vcf_files:
  83. for vc, variants in vcf_file:
  84. num_ins = 0
  85. num_dels = 0
  86. for variant_name, variant_text in variants:
  87. if 'D' in variant_text:
  88. num_dels = max( num_dels, int( variant_text[1:] ) )
  89. elif 'I' in variant_text:
  90. num_ins = max( num_ins, len( variant_text ) - 1 )
  91. alignment = bx.align.maf.Alignment()
  92. ref_text = vc.ref + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) )
  93. start_pos = vc.pos - 1
  94. if num_dels and start_pos:
  95. ref_text = UNKNOWN_NUCLEOTIDE + ref_text
  96. start_pos -= 1
  97. alignment.add_component( bx.align.maf.Component( src='%s.%s%s' % (
  98. dbkey, ("chr" if not vc.chrom.startswith("chr") else ""), vc.chrom ),
  99. start = start_pos, size = len( ref_text.replace( '-', '' ) ),
  100. strand = '+', src_size = start_pos + len( ref_text ),
  101. text = ref_text ) )
  102. for variant_name, variant_text in variants:
  103. #FIXME:
  104. ## skip non-spec. compliant data, see: http://1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 for format spec
  105. ## this check is due to data having indels not represented in the published format spec,
  106. ## e.g. 1000 genomes pilot 1 indel data: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/indels/CEU.SRP000031.2010_03.indels.sites.vcf.gz
  107. if variant_text and variant_text[0] in [ '-', '+' ]:
  108. non_spec_skipped += 1
  109. continue
  110. #do we need a left padding unknown nucleotide (do we have deletions)?
  111. if num_dels and start_pos:
  112. var_text = UNKNOWN_NUCLEOTIDE
  113. else:
  114. var_text = ''
  115. if 'D' in variant_text:
  116. cur_num_del = int( variant_text[1:] )
  117. pre_del = min( len( vc.ref ), cur_num_del )
  118. post_del = cur_num_del - pre_del
  119. var_text = var_text + '-' * pre_del + '-' * num_ins + '-' * post_del
  120. var_text = var_text + UNKNOWN_NUCLEOTIDE * ( len( ref_text ) - len( var_text ) )
  121. elif 'I' in variant_text:
  122. cur_num_ins = len( variant_text ) - 1
  123. var_text = var_text + vc.ref + variant_text[1:] + '-' * ( num_ins - cur_num_ins ) + UNKNOWN_NUCLEOTIDE * max( 0, ( num_dels - 1 ) )
  124. else:
  125. var_text = var_text + variant_text + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) )
  126. alignment.add_component( bx.align.maf.Component( src=variant_name, start = 0, size = len( var_text.replace( '-', '' ) ), strand = '+', src_size = len( var_text.replace( '-', '' ) ), text = var_text ) )
  127. maf_writer.write( alignment )
  128. maf_writer.close()
  129. if non_spec_skipped:
  130. print 'Skipped %i non-specification compliant indels.' % non_spec_skipped
  131. if __name__ == "__main__": main()