interval_maf_to_merged_fasta.py - This Python script extrac…

/tools/maf/interval_maf_to_merged_fasta.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 196 lines · 126 code · 29 blank · 41 comment · 51 complexity · 50444f9fbbcc2758b296d6be900b3da1 MD5 · raw file

#!/usr/bin/env python

"""
Reads an interval or gene BED and a MAF Source.
Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates

Alignment blocks are layered ontop of each other based upon score.

usage: %prog maf_file [options]
   -d, --dbkey=d: Database key, ie hg17
   -c, --chromCol=c: Column of Chr
   -s, --startCol=s: Column of Start
   -e, --endCol=e: Column of End
   -S, --strandCol=S: Column of Strand
   -G, --geneBED: Input is a Gene BED file, process and join exons as one region
   -t, --mafSourceType=t: Type of MAF source to use
   -m, --mafSource=m: Path of source MAF file, if not using cached version
   -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
   -i, --interval_file=i:       Input interval file
   -o, --output_file=o:      Output MAF file
   -p, --species=p: Species to include in output
   -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.
   -z, --mafIndexFileDir=z: Directory of local maf_index.loc file

usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR
"""

#Dan Blankenberg
from galaxy import eggs
from galaxy.tools.util import maf_utilities
import pkg_resources; pkg_resources.require( "bx-python" )
from bx.cookbook import doc_optparse
import bx.intervals.io
import sys

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg ):
    sys.stderr.write( msg )
    sys.exit()

def __main__():
    
    #Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    mincols = 0
    strand_col = -1
    
    if options.dbkey:
        primary_species = options.dbkey
    else:
        primary_species = None
    if primary_species in [None, "?", "None"]:
        stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
    
    include_primary = True
    secondary_species = maf_utilities.parse_species_option( options.species )
    if secondary_species:
        species = list( secondary_species ) # make copy of species list
        if primary_species in secondary_species:
            secondary_species.remove( primary_species )
        else:
            include_primary = False
    else:
        species = None
    
    if options.interval_file:
        interval_file = options.interval_file
    else: 
        stop_err( "Input interval file has not been specified." )
    
    if options.output_file:
        output_file = options.output_file
    else: 
        stop_err( "Output file has not been specified." )
    
    if not options.geneBED:
        if options.chromCol:
            chr_col = int( options.chromCol ) - 1
        else: 
            stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
        
        if options.startCol:
            start_col = int( options.startCol ) - 1
        else: 
            stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
        
        if options.endCol:
            end_col = int( options.endCol ) - 1
        else: 
            stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
        
        if options.strandCol:
            strand_col = int( options.strandCol ) - 1
    
    mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir
    
    overwrite_with_gaps = True
    if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false':
        overwrite_with_gaps = False
    
    #Finish parsing command line
        
    #get index for mafs based on type 
    index = index_filename = None
    #using specified uid for locally cached
    if options.mafSourceType.lower() in ["cached"]:
        index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile )
        if index is None:
            stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) )
    elif options.mafSourceType.lower() in ["user"]:
        #index maf for use here, need to remove index_file when finished
        index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] )
        if index is None:
            stop_err( "Your MAF file appears to be malformed." )
    else:
        stop_err( "Invalid MAF source type specified." )
    
    #open output file
    output = open( output_file, "w" )
    
    if options.geneBED:
        region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() )
    else:
        region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) )
    
    #Step through intervals
    regions_extracted = 0
    line_count = 0
    for line_count, line in region_enumerator:
        try:
            if options.geneBED: #Process as Gene BED
                try:
                    starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line )
                    #create spliced alignment object
                    alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
                    primary_name = secondary_name = fields[3]
                    alignment_strand = fields[5]
                except Exception, e:
                    print "Error loading exon positions from input line %i: %s" % ( line_count, e )
                    continue
            else: #Process as standard intervals
                try:
                    #create spliced alignment object
                    alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
                    primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end )
                    secondary_name = ""
                    alignment_strand = line.strand
                except Exception, e:
                    print "Error loading region positions from input line %i: %s" % ( line_count, e )
                    continue
            
            #Write alignment to output file
            #Output primary species first, if requested
            if include_primary:
                output.write( ">%s.%s\n" %( primary_species, primary_name ) )
                if alignment_strand == "-":
                    output.write( alignment.get_sequence_reverse_complement( primary_species ) )
                else:
                    output.write( alignment.get_sequence( primary_species ) )
                output.write( "\n" )
            #Output all remainging species
            for spec in secondary_species or alignment.get_species_names( skip = primary_species ):
                if secondary_name:
                    output.write( ">%s.%s\n" % ( spec, secondary_name ) )
                else:
                    output.write( ">%s\n" % ( spec ) )
                if alignment_strand == "-":
                    output.write( alignment.get_sequence_reverse_complement( spec ) )
                else:
                    output.write( alignment.get_sequence( spec ) )
                output.write( "\n" )
            
            output.write( "\n" )
            
            regions_extracted += 1
        
        except Exception, e:
            print "Unexpected error from input line %i: %s" % ( line_count, e )
            continue
    
    #close output file
    output.close()
    
    #remove index file if created during run
    maf_utilities.remove_temp_index_file( index_filename )
    
    #Print message about success for user
    if regions_extracted > 0:
        print "%i regions were processed successfully." % ( regions_extracted )
    else:
        print "No regions were processed successfully."
        if line_count > 0 and options.geneBED:
            print "This tool requires your input file to conform to the 12 column BED standard."

if __name__ == "__main__": __main__()
Summary ✨

This Python script extracts regions from a multiple alignment file and writes them to a new file, one region per line. It can process either standard interval files or Gene BED files as input.
Tech Fingerprint

Alerts (9)

'def' Ensure functions have docstrings for documentation
38
'list(' Avoid unnecessary list conversions; use generators where possible
59
'open(' Use 'with open()' to ensure Files are properly closed
120 123 125
Complexity hotspot; lines 130 to 133 (total complexity: 4)
130 131 132 133