/lib/galaxy_utils/sequence/vcf.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 112 lines · 92 code · 12 blank · 8 comment · 21 complexity · 33ccbe4bb2d9fe44c6003fcc29437aca MD5 · raw file

  1. #Dan Blankenberg
  2. # See http://www.1000genomes.org/wiki/Analysis/variant-call-format
  3. NOT_A_NUMBER = float( 'NaN' )
  4. class VariantCall( object ):
  5. version = None
  6. header_startswith = None
  7. required_header_fields = None
  8. required_header_length = None
  9. @classmethod
  10. def get_class_by_format( cls, format ):
  11. assert format in VCF_FORMATS, 'Unknown format type specified: %s' % format
  12. return VCF_FORMATS[ format ]
  13. def __init__( self, vcf_line, metadata, sample_names ):
  14. raise Exception( 'Abstract Method' )
  15. class VariantCall33( VariantCall ):
  16. version = 'VCFv3.3'
  17. header_startswith = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
  18. required_header_fields = header_startswith.split( '\t' )
  19. required_header_length = len( required_header_fields )
  20. def __init__( self, vcf_line, metadata, sample_names ):
  21. # Raw line is needed for indexing file.
  22. self.raw_line = vcf_line
  23. self.line = vcf_line.rstrip( '\n\r' )
  24. self.metadata = metadata
  25. self.sample_names = sample_names
  26. self.format = None
  27. self.sample_values = []
  28. #parse line
  29. self.fields = self.line.split( '\t' )
  30. if sample_names:
  31. assert len( self.fields ) == self.required_header_length + len( sample_names ) + 1, 'Provided VCF line (%s) has wrong length (expected: %i)' % ( self.line, self.required_header_length + len( sample_names ) + 1 )
  32. else:
  33. assert len( self.fields ) == self.required_header_length, 'Provided VCF line (%s) has wrong length (expected: %i)' % ( self.line, self.required_header_length)
  34. self.chrom, self.pos, self.id, self.ref, self.alt, self.qual, self.filter, self.info = self.fields[ :self.required_header_length ]
  35. self.pos = int( self.pos )
  36. self.alt = self.alt.split( ',' )
  37. try:
  38. self.qual = float( self.qual )
  39. except:
  40. self.qual = NOT_A_NUMBER #Missing data can be denoted as a '.'
  41. if len( self.fields ) > self.required_header_length:
  42. self.format = self.fields[ self.required_header_length ].split( ':' )
  43. for sample_value in self.fields[ self.required_header_length + 1: ]:
  44. self.sample_values.append( sample_value.split( ':' ) )
  45. class VariantCall40( VariantCall33 ):
  46. version = 'VCFv4.0'
  47. def __init__( self, vcf_line, metadata, sample_names ):
  48. VariantCall33.__init__( self, vcf_line, metadata, sample_names)
  49. class VariantCall41( VariantCall40 ):
  50. version = 'VCFv4.1'
  51. #VCF Format version lookup dict
  52. VCF_FORMATS = {}
  53. for format in [ VariantCall33, VariantCall40, VariantCall41 ]:
  54. VCF_FORMATS[format.version] = format
  55. class Reader( object ):
  56. def __init__( self, fh ):
  57. self.vcf_file = fh
  58. self.metadata = {}
  59. self.header_fields = None
  60. self.metadata_len = 0
  61. self.sample_names = []
  62. self.vcf_class = None
  63. # Read file metadata.
  64. while True:
  65. line = self.vcf_file.readline()
  66. self.metadata_len += len( line )
  67. assert line, 'Invalid VCF file provided.'
  68. line = line.rstrip( '\r\n' )
  69. if self.vcf_class and line.startswith( self.vcf_class.header_startswith ):
  70. # read the header fields, ignoring any blank tabs, which GATK
  71. # VCF produces after the sample
  72. self.header_fields = [l for l in line.split( '\t' ) if l]
  73. if len( self.header_fields ) > self.vcf_class.required_header_length:
  74. for sample_name in self.header_fields[ self.vcf_class.required_header_length + 1 : ]:
  75. self.sample_names.append( sample_name )
  76. break
  77. assert line.startswith( '##' ), 'Non-metadata line found before header'
  78. line = line[2:] #strip ##
  79. metadata = line.split( '=', 1 )
  80. metadata_name = metadata[ 0 ]
  81. if len( metadata ) == 2:
  82. metadata_value = metadata[ 1 ]
  83. else:
  84. metadata_value = None
  85. if metadata_name in self.metadata:
  86. if not isinstance( self.metadata[ metadata_name ], list ):
  87. self.metadata[ metadata_name ] = [ self.metadata[ metadata_name ] ]
  88. self.metadata[ metadata_name ].append( metadata_value )
  89. else:
  90. self.metadata[ metadata_name ] = metadata_value
  91. if metadata_name == 'fileformat':
  92. self.vcf_class = VariantCall.get_class_by_format( metadata_value )
  93. def next( self ):
  94. line = self.vcf_file.readline()
  95. if not line:
  96. raise StopIteration
  97. return self.vcf_class( line, self.metadata, self.sample_names )
  98. def __iter__( self ):
  99. while True:
  100. yield self.next()