util.py - This is a Python script that converts various pre…

/scripts/microbes/util.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 224 lines · 188 code · 14 blank · 22 comment · 63 complexity · 675d1595ca855dd936b404f21dc6d141 MD5 · raw file

#!/usr/bin/env python

#Dan Blankenberg



import sys



assert sys.version_info[:2] >= ( 2, 4 )



#genbank_to_bed

class Region:

    def __init__( self ):

        self.qualifiers = {}

        self.start = None

        self.end = None

        self.strand = '+'

    def set_coordinates_by_location( self, location ):

        location = location.strip().lower().replace( '..', ',' )

        if "complement(" in location: #if part of the sequence is on the negative strand, it all is?

            self.strand = '-' #default of + strand

        for remove_text in ["join(", "order(", "complement(", ")"]:

            location = location.replace( remove_text, "" )

        for number in location.split( ',' ):

            number = number.strip('\n\r\t <>,()')

            if number:

                if "^" in number:

                    #a single point

                    #check that this is correct for points, ie: 413/NC_005027.gbk:     misc_feature    6636286^6636287  ===> 6636285,6636286

                    end = int( number.split( '^' )[0] )

                    start = end - 1

                else:

                    end = int( number )

                    start = end - 1 #match BED coordinates

                if self.start is None or start < self.start:

                    self.start = start

                if self.end is None or end > self.end:

                    self.end = end



class GenBankFeatureParser:

    """Parses Features from Single Locus GenBank file"""

    def __init__( self, fh, features_list = [] ):

        self.fh = fh

        self.features = {}

        fh.seek(0)

        in_features = False

        last_feature_name = None

        base_indent = 0

        last_indent = 0

        last_attr_name = None

        for line in fh:

            if not in_features and line.startswith('FEATURES'):

                in_features = True

                continue

            if in_features:

                lstrip = line.lstrip()

                if line and lstrip == line:

                    break #end of feature block

                cur_indent = len( line ) - len( lstrip )

                if last_feature_name is None:

                    base_indent = cur_indent

                if cur_indent == base_indent:

                    #a new feature

                    last_attr_name = None

                    fields = lstrip.split( None, 1 )

                    last_feature_name = fields[0].strip()

                    if not features_list or ( features_list and last_feature_name in features_list ):

                        if last_feature_name not in self.features:

                            self.features[last_feature_name] = []

                        region = Region()

                        region.set_coordinates_by_location( fields[1] )

                        self.features[last_feature_name].append( region )

                else:

                    #add info to last known feature

                    line = line.strip()

                    if line.startswith( '/' ):

                        fields = line[1:].split( '=', 1 )

                        if len( fields ) == 2:

                            last_attr_name, content = fields

                        else:

                            #No data 

                            last_attr_name = line[1:]

                            content = ""

                        content = content.strip( '"' )

                        if last_attr_name not in self.features[last_feature_name][-1].qualifiers:

                            self.features[last_feature_name][-1].qualifiers[last_attr_name] = []

                        self.features[last_feature_name][-1].qualifiers[last_attr_name].append( content )

                    elif last_attr_name is None and last_feature_name:

                        # must still be working on location

                        self.features[last_feature_name][-1].set_coordinates_by_location( line )

                    else:

                        #continuation of multi-line qualifier content

                        if last_feature_name.lower() in ['translation']:

                            self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s%s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )

                        else:

                            self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s %s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )

                    

                

    def get_features_by_type( self, feature_type ):

        if feature_type not in self.features:

            return []

        else:

            return self.features[feature_type]



# Parse A GenBank file and return arrays of BED regions for the corresponding features

def get_bed_from_genbank(gb_file, chrom, feature_list):

    genbank_parser = GenBankFeatureParser( open( gb_file ) )

    features = {}

    for feature_type in feature_list:

        features[feature_type]=[]

        for feature in genbank_parser.get_features_by_type( feature_type ):

            name = ""

            for name_tag in ['gene', 'locus_tag', 'db_xref']:

                if name_tag in feature.qualifiers:

                    if name: name = name + ";"

                    name = name + feature.qualifiers[name_tag][0].replace(" ","_")

            if not name:

                name = "unknown"

            

            features[feature_type].append( "%s\t%s\t%s\t%s\t%s\t%s" % ( chrom, feature.start, feature.end, name, 0, feature.strand ) )#append new bed field here

    return features



#geneMark to bed

import sys



#converts GeneMarkHMM to bed

#returns an array of bed regions

def get_bed_from_GeneMark(geneMark_filename, chr):

    orfs = open(geneMark_filename).readlines()

    while True:

        line = orfs.pop(0).strip()

        if line.startswith("--------"):

            orfs.pop(0)

            break

    orfs = "".join(orfs)

    ctr = 0

    regions = []

    for block in orfs.split("\n\n"):

        if block.startswith("List of Regions of interest"): break

        best_block = {'start':0,'end':0,'strand':'+','avg_prob':-sys.maxint,'start_prob':-sys.maxint,'name':'DNE'}

        ctr+=1

        ctr2=0

        for line in block.split("\n"):

            ctr2+=1

            fields = line.split()

            start = int(fields.pop(0))-1

            end = int(fields.pop(0))

            strand = fields.pop(0)

            if strand == 'complement':

                strand = "-"

            else:

                strand = "+"

            frame = fields.pop(0)

            frame = frame + " " + fields.pop(0)

            avg_prob = float(fields.pop(0))

            try:

                start_prob = float(fields.pop(0))

            except:

                start_prob = 0

            name = "orf_"+str(ctr)+"_"+str(ctr2)

            if avg_prob >= best_block['avg_prob']:

                if start_prob > best_block['start_prob']:

                    best_block = {'start':start,'end':end,'strand':strand,'avg_prob':avg_prob,'start_prob':start_prob,'name':name}

        regions.append(chr+"\t"+str(best_block['start'])+"\t"+str(best_block['end'])+"\t"+best_block['name']+"\t"+str(int(best_block['avg_prob']*1000))+"\t"+best_block['strand'])

    return regions



#geneMarkHMM to bed

#converts GeneMarkHMM to bed

#returns an array of bed regions

def get_bed_from_GeneMarkHMM(geneMarkHMM_filename, chr):

    orfs = open(geneMarkHMM_filename).readlines()

    while True:

        line = orfs.pop(0).strip()

        if line == "Predicted genes": 

            orfs.pop(0)

            orfs.pop(0)

            break

    regions = []

    for line in orfs:

        fields = line.split()

        name = "gene_number_"+fields.pop(0)

        strand = fields.pop(0)

        start = fields.pop(0)

        if start.startswith("<"): start = 1

        start = int(start)-1

        end = fields.pop(0)

        if end.startswith(">"): end = end[1:]

        end = int(end)

        score = 0 # no scores provided

        regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(score)+"\t"+strand)

    return regions



#glimmer3 to bed

#converts glimmer3 to bed, doing some linear scaling (probably not correct?) on scores 

#returns an array of bed regions

def get_bed_from_glimmer3(glimmer3_filename, chr):

    max_score = -sys.maxint

    min_score = sys.maxint

    orfs = []

    for line in open(glimmer3_filename).readlines():

        if line.startswith(">"): continue

        fields = line.split()

        name = fields.pop(0)

        start = int(fields.pop(0))

        end = int(fields.pop(0))

        if int(fields.pop(0))<0:

            strand = "-"

            temp = start

            start = end

            end = temp

        else:

            strand = "+"

        start = start - 1

        score = (float(fields.pop(0)))

        if score > max_score: max_score = score

        if score < min_score: min_score = score

        orfs.append((chr,start,end,name,score,strand))

    

    delta = 0

    if min_score < 0: delta = min_score * -1

    regions = []

    for (chr,start,end,name,score,strand) in orfs:

        #need to cast to str because was having the case where 1000.0 was rounded to 999 by int, some sort of precision bug?

        my_score = int(float(str( ( (score+delta) * (1000-0-(min_score+delta)) ) / ( (max_score + delta) + 0 ))))

        

        regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(my_score)+"\t"+strand)

    return regions
Summary ✨

This is a Python script that converts various prediction tools’ outputs into BED format, which is a simple text-based format for representing genomic regions. The script can handle outputs from GeneMark, GeneMarkS, Glimmer3, and Infernal. It extracts the relevant information from each output file and formats it in a consistent way, resulting in a BED file that can be easily analyzed or visualized using tools like UCSC Genome Browser or IGV.
Tech Fingerprint

Standard Library: OS Interaction
Alerts (4)

'def' Ensure functions have docstrings for documentation
15
'= []' Avoid mutable defaults like '= []' or '= {}'; use None and initialize inside
39
Complexity hotspot; lines 64 to 65 (total complexity: 4)
64 65