PageRenderTime 228ms CodeModel.GetById 218ms app.highlight 8ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy_utils/sequence/vcf.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 112 lines | 92 code | 12 blank | 8 comment | 25 complexity | 33ccbe4bb2d9fe44c6003fcc29437aca MD5 | raw file
  1#Dan Blankenberg
  2# See http://www.1000genomes.org/wiki/Analysis/variant-call-format
  3
  4NOT_A_NUMBER = float( 'NaN' )
  5
  6class VariantCall( object ):
  7    version = None
  8    header_startswith = None
  9    required_header_fields = None
 10    required_header_length = None
 11
 12    @classmethod
 13    def get_class_by_format( cls, format ):
 14        assert format in VCF_FORMATS, 'Unknown format type specified: %s' % format
 15        return VCF_FORMATS[ format ]
 16
 17    def __init__( self, vcf_line, metadata, sample_names ):
 18        raise Exception( 'Abstract Method' )
 19
 20class VariantCall33( VariantCall ):
 21    version = 'VCFv3.3'
 22    header_startswith = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
 23    required_header_fields = header_startswith.split( '\t' )
 24    required_header_length = len( required_header_fields )
 25
 26    def __init__( self, vcf_line, metadata, sample_names ):
 27        # Raw line is needed for indexing file.
 28        self.raw_line = vcf_line
 29        self.line = vcf_line.rstrip( '\n\r' )
 30        self.metadata = metadata
 31        self.sample_names = sample_names
 32        self.format = None
 33        self.sample_values = []
 34
 35        #parse line
 36        self.fields = self.line.split( '\t' )
 37        if sample_names:
 38            assert len( self.fields ) == self.required_header_length + len( sample_names ) + 1, 'Provided VCF line (%s) has wrong length (expected: %i)' % ( self.line, self.required_header_length + len( sample_names ) + 1 )
 39        else:
 40            assert len( self.fields ) == self.required_header_length, 'Provided VCF line (%s) has wrong length (expected: %i)' % ( self.line, self.required_header_length)
 41        self.chrom, self.pos, self.id, self.ref, self.alt, self.qual, self.filter, self.info = self.fields[ :self.required_header_length ]
 42        self.pos = int( self.pos )
 43        self.alt = self.alt.split( ',' )
 44        try:
 45            self.qual = float( self.qual )
 46        except:
 47            self.qual = NOT_A_NUMBER #Missing data can be denoted as a '.'
 48        if len( self.fields ) > self.required_header_length:
 49            self.format = self.fields[ self.required_header_length ].split( ':' )
 50            for sample_value in self.fields[ self.required_header_length + 1: ]:
 51                self.sample_values.append( sample_value.split( ':' ) )
 52
 53class VariantCall40( VariantCall33 ):
 54    version = 'VCFv4.0'
 55    def __init__( self, vcf_line, metadata, sample_names ):
 56        VariantCall33.__init__( self, vcf_line, metadata, sample_names)
 57
 58class VariantCall41( VariantCall40 ):
 59    version = 'VCFv4.1'
 60
 61#VCF Format version lookup dict
 62VCF_FORMATS = {}
 63for format in [ VariantCall33, VariantCall40, VariantCall41 ]:
 64    VCF_FORMATS[format.version] = format
 65
 66class Reader( object ):
 67    def __init__( self, fh ):
 68        self.vcf_file = fh
 69        self.metadata = {}
 70        self.header_fields = None
 71        self.metadata_len = 0
 72        self.sample_names = []
 73        self.vcf_class = None
 74
 75        # Read file metadata.
 76        while True:
 77            line = self.vcf_file.readline()
 78            self.metadata_len += len( line )
 79            assert line, 'Invalid VCF file provided.'
 80            line = line.rstrip( '\r\n' )
 81            if self.vcf_class and line.startswith( self.vcf_class.header_startswith ):
 82                # read the header fields, ignoring any blank tabs, which GATK
 83                # VCF produces after the sample
 84                self.header_fields = [l for l in line.split( '\t' ) if l]
 85                if len( self.header_fields ) > self.vcf_class.required_header_length:
 86                    for sample_name in self.header_fields[ self.vcf_class.required_header_length + 1 : ]:
 87                        self.sample_names.append( sample_name )
 88                break
 89            assert line.startswith( '##' ), 'Non-metadata line found before header'
 90            line = line[2:] #strip ##
 91            metadata = line.split( '=', 1 )
 92            metadata_name = metadata[ 0 ]
 93            if len( metadata ) == 2:
 94                metadata_value = metadata[ 1 ]
 95            else:
 96                metadata_value = None
 97            if metadata_name in self.metadata:
 98                if not isinstance( self.metadata[ metadata_name ], list ):
 99                    self.metadata[ metadata_name ] = [ self.metadata[ metadata_name ] ]
100                self.metadata[ metadata_name ].append( metadata_value )
101            else:
102                self.metadata[ metadata_name ] = metadata_value
103            if metadata_name == 'fileformat':
104                self.vcf_class = VariantCall.get_class_by_format( metadata_value )
105    def next( self ):
106        line = self.vcf_file.readline()
107        if not line:
108            raise StopIteration
109        return self.vcf_class( line, self.metadata, self.sample_names )
110    def __iter__( self ):
111        while True:
112            yield self.next()