PageRenderTime 39ms CodeModel.GetById 11ms app.highlight 23ms RepoModel.GetById 1ms app.codeStats 1ms

/scripts/microbes/util.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 224 lines | 218 code | 3 blank | 3 comment | 4 complexity | 675d1595ca855dd936b404f21dc6d141 MD5 | raw file
  1#!/usr/bin/env python
  2#Dan Blankenberg
  3
  4import sys
  5
  6assert sys.version_info[:2] >= ( 2, 4 )
  7
  8#genbank_to_bed
  9class Region:
 10    def __init__( self ):
 11        self.qualifiers = {}
 12        self.start = None
 13        self.end = None
 14        self.strand = '+'
 15    def set_coordinates_by_location( self, location ):
 16        location = location.strip().lower().replace( '..', ',' )
 17        if "complement(" in location: #if part of the sequence is on the negative strand, it all is?
 18            self.strand = '-' #default of + strand
 19        for remove_text in ["join(", "order(", "complement(", ")"]:
 20            location = location.replace( remove_text, "" )
 21        for number in location.split( ',' ):
 22            number = number.strip('\n\r\t <>,()')
 23            if number:
 24                if "^" in number:
 25                    #a single point
 26                    #check that this is correct for points, ie: 413/NC_005027.gbk:     misc_feature    6636286^6636287  ===> 6636285,6636286
 27                    end = int( number.split( '^' )[0] )
 28                    start = end - 1
 29                else:
 30                    end = int( number )
 31                    start = end - 1 #match BED coordinates
 32                if self.start is None or start < self.start:
 33                    self.start = start
 34                if self.end is None or end > self.end:
 35                    self.end = end
 36
 37class GenBankFeatureParser:
 38    """Parses Features from Single Locus GenBank file"""
 39    def __init__( self, fh, features_list = [] ):
 40        self.fh = fh
 41        self.features = {}
 42        fh.seek(0)
 43        in_features = False
 44        last_feature_name = None
 45        base_indent = 0
 46        last_indent = 0
 47        last_attr_name = None
 48        for line in fh:
 49            if not in_features and line.startswith('FEATURES'):
 50                in_features = True
 51                continue
 52            if in_features:
 53                lstrip = line.lstrip()
 54                if line and lstrip == line:
 55                    break #end of feature block
 56                cur_indent = len( line ) - len( lstrip )
 57                if last_feature_name is None:
 58                    base_indent = cur_indent
 59                if cur_indent == base_indent:
 60                    #a new feature
 61                    last_attr_name = None
 62                    fields = lstrip.split( None, 1 )
 63                    last_feature_name = fields[0].strip()
 64                    if not features_list or ( features_list and last_feature_name in features_list ):
 65                        if last_feature_name not in self.features:
 66                            self.features[last_feature_name] = []
 67                        region = Region()
 68                        region.set_coordinates_by_location( fields[1] )
 69                        self.features[last_feature_name].append( region )
 70                else:
 71                    #add info to last known feature
 72                    line = line.strip()
 73                    if line.startswith( '/' ):
 74                        fields = line[1:].split( '=', 1 )
 75                        if len( fields ) == 2:
 76                            last_attr_name, content = fields
 77                        else:
 78                            #No data 
 79                            last_attr_name = line[1:]
 80                            content = ""
 81                        content = content.strip( '"' )
 82                        if last_attr_name not in self.features[last_feature_name][-1].qualifiers:
 83                            self.features[last_feature_name][-1].qualifiers[last_attr_name] = []
 84                        self.features[last_feature_name][-1].qualifiers[last_attr_name].append( content )
 85                    elif last_attr_name is None and last_feature_name:
 86                        # must still be working on location
 87                        self.features[last_feature_name][-1].set_coordinates_by_location( line )
 88                    else:
 89                        #continuation of multi-line qualifier content
 90                        if last_feature_name.lower() in ['translation']:
 91                            self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s%s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )
 92                        else:
 93                            self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s %s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )
 94                    
 95                
 96    def get_features_by_type( self, feature_type ):
 97        if feature_type not in self.features:
 98            return []
 99        else:
100            return self.features[feature_type]
101
102# Parse A GenBank file and return arrays of BED regions for the corresponding features
103def get_bed_from_genbank(gb_file, chrom, feature_list):
104    genbank_parser = GenBankFeatureParser( open( gb_file ) )
105    features = {}
106    for feature_type in feature_list:
107        features[feature_type]=[]
108        for feature in genbank_parser.get_features_by_type( feature_type ):
109            name = ""
110            for name_tag in ['gene', 'locus_tag', 'db_xref']:
111                if name_tag in feature.qualifiers:
112                    if name: name = name + ";"
113                    name = name + feature.qualifiers[name_tag][0].replace(" ","_")
114            if not name:
115                name = "unknown"
116            
117            features[feature_type].append( "%s\t%s\t%s\t%s\t%s\t%s" % ( chrom, feature.start, feature.end, name, 0, feature.strand ) )#append new bed field here
118    return features
119
120#geneMark to bed
121import sys
122
123#converts GeneMarkHMM to bed
124#returns an array of bed regions
125def get_bed_from_GeneMark(geneMark_filename, chr):
126    orfs = open(geneMark_filename).readlines()
127    while True:
128        line = orfs.pop(0).strip()
129        if line.startswith("--------"):
130            orfs.pop(0)
131            break
132    orfs = "".join(orfs)
133    ctr = 0
134    regions = []
135    for block in orfs.split("\n\n"):
136        if block.startswith("List of Regions of interest"): break
137        best_block = {'start':0,'end':0,'strand':'+','avg_prob':-sys.maxint,'start_prob':-sys.maxint,'name':'DNE'}
138        ctr+=1
139        ctr2=0
140        for line in block.split("\n"):
141            ctr2+=1
142            fields = line.split()
143            start = int(fields.pop(0))-1
144            end = int(fields.pop(0))
145            strand = fields.pop(0)
146            if strand == 'complement':
147                strand = "-"
148            else:
149                strand = "+"
150            frame = fields.pop(0)
151            frame = frame + " " + fields.pop(0)
152            avg_prob = float(fields.pop(0))
153            try:
154                start_prob = float(fields.pop(0))
155            except:
156                start_prob = 0
157            name = "orf_"+str(ctr)+"_"+str(ctr2)
158            if avg_prob >= best_block['avg_prob']:
159                if start_prob > best_block['start_prob']:
160                    best_block = {'start':start,'end':end,'strand':strand,'avg_prob':avg_prob,'start_prob':start_prob,'name':name}
161        regions.append(chr+"\t"+str(best_block['start'])+"\t"+str(best_block['end'])+"\t"+best_block['name']+"\t"+str(int(best_block['avg_prob']*1000))+"\t"+best_block['strand'])
162    return regions
163
164#geneMarkHMM to bed
165#converts GeneMarkHMM to bed
166#returns an array of bed regions
167def get_bed_from_GeneMarkHMM(geneMarkHMM_filename, chr):
168    orfs = open(geneMarkHMM_filename).readlines()
169    while True:
170        line = orfs.pop(0).strip()
171        if line == "Predicted genes": 
172            orfs.pop(0)
173            orfs.pop(0)
174            break
175    regions = []
176    for line in orfs:
177        fields = line.split()
178        name = "gene_number_"+fields.pop(0)
179        strand = fields.pop(0)
180        start = fields.pop(0)
181        if start.startswith("<"): start = 1
182        start = int(start)-1
183        end = fields.pop(0)
184        if end.startswith(">"): end = end[1:]
185        end = int(end)
186        score = 0 # no scores provided
187        regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(score)+"\t"+strand)
188    return regions
189
190#glimmer3 to bed
191#converts glimmer3 to bed, doing some linear scaling (probably not correct?) on scores 
192#returns an array of bed regions
193def get_bed_from_glimmer3(glimmer3_filename, chr):
194    max_score = -sys.maxint
195    min_score = sys.maxint
196    orfs = []
197    for line in open(glimmer3_filename).readlines():
198        if line.startswith(">"): continue
199        fields = line.split()
200        name = fields.pop(0)
201        start = int(fields.pop(0))
202        end = int(fields.pop(0))
203        if int(fields.pop(0))<0:
204            strand = "-"
205            temp = start
206            start = end
207            end = temp
208        else:
209            strand = "+"
210        start = start - 1
211        score = (float(fields.pop(0)))
212        if score > max_score: max_score = score
213        if score < min_score: min_score = score
214        orfs.append((chr,start,end,name,score,strand))
215    
216    delta = 0
217    if min_score < 0: delta = min_score * -1
218    regions = []
219    for (chr,start,end,name,score,strand) in orfs:
220        #need to cast to str because was having the case where 1000.0 was rounded to 999 by int, some sort of precision bug?
221        my_score = int(float(str( ( (score+delta) * (1000-0-(min_score+delta)) ) / ( (max_score + delta) + 0 ))))
222        
223        regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(my_score)+"\t"+strand)
224    return regions