/tools/maf/interval_maf_to_merged_fasta.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 196 lines · 126 code · 29 blank · 41 comment · 51 complexity · 50444f9fbbcc2758b296d6be900b3da1 MD5 · raw file
- #!/usr/bin/env python
- """
- Reads an interval or gene BED and a MAF Source.
- Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates
- Alignment blocks are layered ontop of each other based upon score.
- usage: %prog maf_file [options]
- -d, --dbkey=d: Database key, ie hg17
- -c, --chromCol=c: Column of Chr
- -s, --startCol=s: Column of Start
- -e, --endCol=e: Column of End
- -S, --strandCol=S: Column of Strand
- -G, --geneBED: Input is a Gene BED file, process and join exons as one region
- -t, --mafSourceType=t: Type of MAF source to use
- -m, --mafSource=m: Path of source MAF file, if not using cached version
- -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
- -i, --interval_file=i: Input interval file
- -o, --output_file=o: Output MAF file
- -p, --species=p: Species to include in output
- -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.
- -z, --mafIndexFileDir=z: Directory of local maf_index.loc file
- usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR
- """
- #Dan Blankenberg
- from galaxy import eggs
- from galaxy.tools.util import maf_utilities
- import pkg_resources; pkg_resources.require( "bx-python" )
- from bx.cookbook import doc_optparse
- import bx.intervals.io
- import sys
- assert sys.version_info[:2] >= ( 2, 4 )
- def stop_err( msg ):
- sys.stderr.write( msg )
- sys.exit()
- def __main__():
-
- #Parse Command Line
- options, args = doc_optparse.parse( __doc__ )
- mincols = 0
- strand_col = -1
-
- if options.dbkey:
- primary_species = options.dbkey
- else:
- primary_species = None
- if primary_species in [None, "?", "None"]:
- stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
-
- include_primary = True
- secondary_species = maf_utilities.parse_species_option( options.species )
- if secondary_species:
- species = list( secondary_species ) # make copy of species list
- if primary_species in secondary_species:
- secondary_species.remove( primary_species )
- else:
- include_primary = False
- else:
- species = None
-
- if options.interval_file:
- interval_file = options.interval_file
- else:
- stop_err( "Input interval file has not been specified." )
-
- if options.output_file:
- output_file = options.output_file
- else:
- stop_err( "Output file has not been specified." )
-
- if not options.geneBED:
- if options.chromCol:
- chr_col = int( options.chromCol ) - 1
- else:
- stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
-
- if options.startCol:
- start_col = int( options.startCol ) - 1
- else:
- stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
-
- if options.endCol:
- end_col = int( options.endCol ) - 1
- else:
- stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
-
- if options.strandCol:
- strand_col = int( options.strandCol ) - 1
-
- mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir
-
- overwrite_with_gaps = True
- if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false':
- overwrite_with_gaps = False
-
- #Finish parsing command line
-
- #get index for mafs based on type
- index = index_filename = None
- #using specified uid for locally cached
- if options.mafSourceType.lower() in ["cached"]:
- index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile )
- if index is None:
- stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) )
- elif options.mafSourceType.lower() in ["user"]:
- #index maf for use here, need to remove index_file when finished
- index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] )
- if index is None:
- stop_err( "Your MAF file appears to be malformed." )
- else:
- stop_err( "Invalid MAF source type specified." )
-
- #open output file
- output = open( output_file, "w" )
-
- if options.geneBED:
- region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() )
- else:
- region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) )
-
- #Step through intervals
- regions_extracted = 0
- line_count = 0
- for line_count, line in region_enumerator:
- try:
- if options.geneBED: #Process as Gene BED
- try:
- starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line )
- #create spliced alignment object
- alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
- primary_name = secondary_name = fields[3]
- alignment_strand = fields[5]
- except Exception, e:
- print "Error loading exon positions from input line %i: %s" % ( line_count, e )
- continue
- else: #Process as standard intervals
- try:
- #create spliced alignment object
- alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
- primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end )
- secondary_name = ""
- alignment_strand = line.strand
- except Exception, e:
- print "Error loading region positions from input line %i: %s" % ( line_count, e )
- continue
-
- #Write alignment to output file
- #Output primary species first, if requested
- if include_primary:
- output.write( ">%s.%s\n" %( primary_species, primary_name ) )
- if alignment_strand == "-":
- output.write( alignment.get_sequence_reverse_complement( primary_species ) )
- else:
- output.write( alignment.get_sequence( primary_species ) )
- output.write( "\n" )
- #Output all remainging species
- for spec in secondary_species or alignment.get_species_names( skip = primary_species ):
- if secondary_name:
- output.write( ">%s.%s\n" % ( spec, secondary_name ) )
- else:
- output.write( ">%s\n" % ( spec ) )
- if alignment_strand == "-":
- output.write( alignment.get_sequence_reverse_complement( spec ) )
- else:
- output.write( alignment.get_sequence( spec ) )
- output.write( "\n" )
-
- output.write( "\n" )
-
- regions_extracted += 1
-
- except Exception, e:
- print "Unexpected error from input line %i: %s" % ( line_count, e )
- continue
-
- #close output file
- output.close()
-
- #remove index file if created during run
- maf_utilities.remove_temp_index_file( index_filename )
-
- #Print message about success for user
- if regions_extracted > 0:
- print "%i regions were processed successfully." % ( regions_extracted )
- else:
- print "No regions were processed successfully."
- if line_count > 0 and options.geneBED:
- print "This tool requires your input file to conform to the 12 column BED standard."
- if __name__ == "__main__": __main__()