PageRenderTime 51ms CodeModel.GetById 16ms app.highlight 29ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/maf/interval_maf_to_merged_fasta.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 196 lines | 140 code | 21 blank | 35 comment | 25 complexity | 50444f9fbbcc2758b296d6be900b3da1 MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4Reads an interval or gene BED and a MAF Source.
  5Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates
  6
  7Alignment blocks are layered ontop of each other based upon score.
  8
  9usage: %prog maf_file [options]
 10   -d, --dbkey=d: Database key, ie hg17
 11   -c, --chromCol=c: Column of Chr
 12   -s, --startCol=s: Column of Start
 13   -e, --endCol=e: Column of End
 14   -S, --strandCol=S: Column of Strand
 15   -G, --geneBED: Input is a Gene BED file, process and join exons as one region
 16   -t, --mafSourceType=t: Type of MAF source to use
 17   -m, --mafSource=m: Path of source MAF file, if not using cached version
 18   -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
 19   -i, --interval_file=i:       Input interval file
 20   -o, --output_file=o:      Output MAF file
 21   -p, --species=p: Species to include in output
 22   -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.
 23   -z, --mafIndexFileDir=z: Directory of local maf_index.loc file
 24
 25usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR
 26"""
 27
 28#Dan Blankenberg
 29from galaxy import eggs
 30from galaxy.tools.util import maf_utilities
 31import pkg_resources; pkg_resources.require( "bx-python" )
 32from bx.cookbook import doc_optparse
 33import bx.intervals.io
 34import sys
 35
 36assert sys.version_info[:2] >= ( 2, 4 )
 37
 38def stop_err( msg ):
 39    sys.stderr.write( msg )
 40    sys.exit()
 41
 42def __main__():
 43    
 44    #Parse Command Line
 45    options, args = doc_optparse.parse( __doc__ )
 46    mincols = 0
 47    strand_col = -1
 48    
 49    if options.dbkey:
 50        primary_species = options.dbkey
 51    else:
 52        primary_species = None
 53    if primary_species in [None, "?", "None"]:
 54        stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
 55    
 56    include_primary = True
 57    secondary_species = maf_utilities.parse_species_option( options.species )
 58    if secondary_species:
 59        species = list( secondary_species ) # make copy of species list
 60        if primary_species in secondary_species:
 61            secondary_species.remove( primary_species )
 62        else:
 63            include_primary = False
 64    else:
 65        species = None
 66    
 67    if options.interval_file:
 68        interval_file = options.interval_file
 69    else: 
 70        stop_err( "Input interval file has not been specified." )
 71    
 72    if options.output_file:
 73        output_file = options.output_file
 74    else: 
 75        stop_err( "Output file has not been specified." )
 76    
 77    if not options.geneBED:
 78        if options.chromCol:
 79            chr_col = int( options.chromCol ) - 1
 80        else: 
 81            stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
 82        
 83        if options.startCol:
 84            start_col = int( options.startCol ) - 1
 85        else: 
 86            stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
 87        
 88        if options.endCol:
 89            end_col = int( options.endCol ) - 1
 90        else: 
 91            stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
 92        
 93        if options.strandCol:
 94            strand_col = int( options.strandCol ) - 1
 95    
 96    mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir
 97    
 98    overwrite_with_gaps = True
 99    if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false':
100        overwrite_with_gaps = False
101    
102    #Finish parsing command line
103        
104    #get index for mafs based on type 
105    index = index_filename = None
106    #using specified uid for locally cached
107    if options.mafSourceType.lower() in ["cached"]:
108        index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile )
109        if index is None:
110            stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) )
111    elif options.mafSourceType.lower() in ["user"]:
112        #index maf for use here, need to remove index_file when finished
113        index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] )
114        if index is None:
115            stop_err( "Your MAF file appears to be malformed." )
116    else:
117        stop_err( "Invalid MAF source type specified." )
118    
119    #open output file
120    output = open( output_file, "w" )
121    
122    if options.geneBED:
123        region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() )
124    else:
125        region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) )
126    
127    #Step through intervals
128    regions_extracted = 0
129    line_count = 0
130    for line_count, line in region_enumerator:
131        try:
132            if options.geneBED: #Process as Gene BED
133                try:
134                    starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line )
135                    #create spliced alignment object
136                    alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
137                    primary_name = secondary_name = fields[3]
138                    alignment_strand = fields[5]
139                except Exception, e:
140                    print "Error loading exon positions from input line %i: %s" % ( line_count, e )
141                    continue
142            else: #Process as standard intervals
143                try:
144                    #create spliced alignment object
145                    alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
146                    primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end )
147                    secondary_name = ""
148                    alignment_strand = line.strand
149                except Exception, e:
150                    print "Error loading region positions from input line %i: %s" % ( line_count, e )
151                    continue
152            
153            #Write alignment to output file
154            #Output primary species first, if requested
155            if include_primary:
156                output.write( ">%s.%s\n" %( primary_species, primary_name ) )
157                if alignment_strand == "-":
158                    output.write( alignment.get_sequence_reverse_complement( primary_species ) )
159                else:
160                    output.write( alignment.get_sequence( primary_species ) )
161                output.write( "\n" )
162            #Output all remainging species
163            for spec in secondary_species or alignment.get_species_names( skip = primary_species ):
164                if secondary_name:
165                    output.write( ">%s.%s\n" % ( spec, secondary_name ) )
166                else:
167                    output.write( ">%s\n" % ( spec ) )
168                if alignment_strand == "-":
169                    output.write( alignment.get_sequence_reverse_complement( spec ) )
170                else:
171                    output.write( alignment.get_sequence( spec ) )
172                output.write( "\n" )
173            
174            output.write( "\n" )
175            
176            regions_extracted += 1
177        
178        except Exception, e:
179            print "Unexpected error from input line %i: %s" % ( line_count, e )
180            continue
181    
182    #close output file
183    output.close()
184    
185    #remove index file if created during run
186    maf_utilities.remove_temp_index_file( index_filename )
187    
188    #Print message about success for user
189    if regions_extracted > 0:
190        print "%i regions were processed successfully." % ( regions_extracted )
191    else:
192        print "No regions were processed successfully."
193        if line_count > 0 and options.geneBED:
194            print "This tool requires your input file to conform to the 12 column BED standard."
195
196if __name__ == "__main__": __main__()