/scripts/microbes/util.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 224 lines · 188 code · 14 blank · 22 comment · 63 complexity · 675d1595ca855dd936b404f21dc6d141 MD5 · raw file
- #!/usr/bin/env python
- #Dan Blankenberg
-
- import sys
-
- assert sys.version_info[:2] >= ( 2, 4 )
-
- #genbank_to_bed
- class Region:
- def __init__( self ):
- self.qualifiers = {}
- self.start = None
- self.end = None
- self.strand = '+'
- def set_coordinates_by_location( self, location ):
- location = location.strip().lower().replace( '..', ',' )
- if "complement(" in location: #if part of the sequence is on the negative strand, it all is?
- self.strand = '-' #default of + strand
- for remove_text in ["join(", "order(", "complement(", ")"]:
- location = location.replace( remove_text, "" )
- for number in location.split( ',' ):
- number = number.strip('\n\r\t <>,()')
- if number:
- if "^" in number:
- #a single point
- #check that this is correct for points, ie: 413/NC_005027.gbk: misc_feature 6636286^6636287 ===> 6636285,6636286
- end = int( number.split( '^' )[0] )
- start = end - 1
- else:
- end = int( number )
- start = end - 1 #match BED coordinates
- if self.start is None or start < self.start:
- self.start = start
- if self.end is None or end > self.end:
- self.end = end
-
- class GenBankFeatureParser:
- """Parses Features from Single Locus GenBank file"""
- def __init__( self, fh, features_list = [] ):
- self.fh = fh
- self.features = {}
- fh.seek(0)
- in_features = False
- last_feature_name = None
- base_indent = 0
- last_indent = 0
- last_attr_name = None
- for line in fh:
- if not in_features and line.startswith('FEATURES'):
- in_features = True
- continue
- if in_features:
- lstrip = line.lstrip()
- if line and lstrip == line:
- break #end of feature block
- cur_indent = len( line ) - len( lstrip )
- if last_feature_name is None:
- base_indent = cur_indent
- if cur_indent == base_indent:
- #a new feature
- last_attr_name = None
- fields = lstrip.split( None, 1 )
- last_feature_name = fields[0].strip()
- if not features_list or ( features_list and last_feature_name in features_list ):
- if last_feature_name not in self.features:
- self.features[last_feature_name] = []
- region = Region()
- region.set_coordinates_by_location( fields[1] )
- self.features[last_feature_name].append( region )
- else:
- #add info to last known feature
- line = line.strip()
- if line.startswith( '/' ):
- fields = line[1:].split( '=', 1 )
- if len( fields ) == 2:
- last_attr_name, content = fields
- else:
- #No data
- last_attr_name = line[1:]
- content = ""
- content = content.strip( '"' )
- if last_attr_name not in self.features[last_feature_name][-1].qualifiers:
- self.features[last_feature_name][-1].qualifiers[last_attr_name] = []
- self.features[last_feature_name][-1].qualifiers[last_attr_name].append( content )
- elif last_attr_name is None and last_feature_name:
- # must still be working on location
- self.features[last_feature_name][-1].set_coordinates_by_location( line )
- else:
- #continuation of multi-line qualifier content
- if last_feature_name.lower() in ['translation']:
- self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s%s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )
- else:
- self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s %s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )
-
-
- def get_features_by_type( self, feature_type ):
- if feature_type not in self.features:
- return []
- else:
- return self.features[feature_type]
-
- # Parse A GenBank file and return arrays of BED regions for the corresponding features
- def get_bed_from_genbank(gb_file, chrom, feature_list):
- genbank_parser = GenBankFeatureParser( open( gb_file ) )
- features = {}
- for feature_type in feature_list:
- features[feature_type]=[]
- for feature in genbank_parser.get_features_by_type( feature_type ):
- name = ""
- for name_tag in ['gene', 'locus_tag', 'db_xref']:
- if name_tag in feature.qualifiers:
- if name: name = name + ";"
- name = name + feature.qualifiers[name_tag][0].replace(" ","_")
- if not name:
- name = "unknown"
-
- features[feature_type].append( "%s\t%s\t%s\t%s\t%s\t%s" % ( chrom, feature.start, feature.end, name, 0, feature.strand ) )#append new bed field here
- return features
-
- #geneMark to bed
- import sys
-
- #converts GeneMarkHMM to bed
- #returns an array of bed regions
- def get_bed_from_GeneMark(geneMark_filename, chr):
- orfs = open(geneMark_filename).readlines()
- while True:
- line = orfs.pop(0).strip()
- if line.startswith("--------"):
- orfs.pop(0)
- break
- orfs = "".join(orfs)
- ctr = 0
- regions = []
- for block in orfs.split("\n\n"):
- if block.startswith("List of Regions of interest"): break
- best_block = {'start':0,'end':0,'strand':'+','avg_prob':-sys.maxint,'start_prob':-sys.maxint,'name':'DNE'}
- ctr+=1
- ctr2=0
- for line in block.split("\n"):
- ctr2+=1
- fields = line.split()
- start = int(fields.pop(0))-1
- end = int(fields.pop(0))
- strand = fields.pop(0)
- if strand == 'complement':
- strand = "-"
- else:
- strand = "+"
- frame = fields.pop(0)
- frame = frame + " " + fields.pop(0)
- avg_prob = float(fields.pop(0))
- try:
- start_prob = float(fields.pop(0))
- except:
- start_prob = 0
- name = "orf_"+str(ctr)+"_"+str(ctr2)
- if avg_prob >= best_block['avg_prob']:
- if start_prob > best_block['start_prob']:
- best_block = {'start':start,'end':end,'strand':strand,'avg_prob':avg_prob,'start_prob':start_prob,'name':name}
- regions.append(chr+"\t"+str(best_block['start'])+"\t"+str(best_block['end'])+"\t"+best_block['name']+"\t"+str(int(best_block['avg_prob']*1000))+"\t"+best_block['strand'])
- return regions
-
- #geneMarkHMM to bed
- #converts GeneMarkHMM to bed
- #returns an array of bed regions
- def get_bed_from_GeneMarkHMM(geneMarkHMM_filename, chr):
- orfs = open(geneMarkHMM_filename).readlines()
- while True:
- line = orfs.pop(0).strip()
- if line == "Predicted genes":
- orfs.pop(0)
- orfs.pop(0)
- break
- regions = []
- for line in orfs:
- fields = line.split()
- name = "gene_number_"+fields.pop(0)
- strand = fields.pop(0)
- start = fields.pop(0)
- if start.startswith("<"): start = 1
- start = int(start)-1
- end = fields.pop(0)
- if end.startswith(">"): end = end[1:]
- end = int(end)
- score = 0 # no scores provided
- regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(score)+"\t"+strand)
- return regions
-
- #glimmer3 to bed
- #converts glimmer3 to bed, doing some linear scaling (probably not correct?) on scores
- #returns an array of bed regions
- def get_bed_from_glimmer3(glimmer3_filename, chr):
- max_score = -sys.maxint
- min_score = sys.maxint
- orfs = []
- for line in open(glimmer3_filename).readlines():
- if line.startswith(">"): continue
- fields = line.split()
- name = fields.pop(0)
- start = int(fields.pop(0))
- end = int(fields.pop(0))
- if int(fields.pop(0))<0:
- strand = "-"
- temp = start
- start = end
- end = temp
- else:
- strand = "+"
- start = start - 1
- score = (float(fields.pop(0)))
- if score > max_score: max_score = score
- if score < min_score: min_score = score
- orfs.append((chr,start,end,name,score,strand))
-
- delta = 0
- if min_score < 0: delta = min_score * -1
- regions = []
- for (chr,start,end,name,score,strand) in orfs:
- #need to cast to str because was having the case where 1000.0 was rounded to 999 by int, some sort of precision bug?
- my_score = int(float(str( ( (score+delta) * (1000-0-(min_score+delta)) ) / ( (max_score + delta) + 0 ))))
-
- regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(my_score)+"\t"+strand)
- return regions