/tools/maf/interval_maf_to_merged_fasta.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 196 lines · 126 code · 29 blank · 41 comment · 51 complexity · 50444f9fbbcc2758b296d6be900b3da1 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Reads an interval or gene BED and a MAF Source.
  4. Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates
  5. Alignment blocks are layered ontop of each other based upon score.
  6. usage: %prog maf_file [options]
  7. -d, --dbkey=d: Database key, ie hg17
  8. -c, --chromCol=c: Column of Chr
  9. -s, --startCol=s: Column of Start
  10. -e, --endCol=e: Column of End
  11. -S, --strandCol=S: Column of Strand
  12. -G, --geneBED: Input is a Gene BED file, process and join exons as one region
  13. -t, --mafSourceType=t: Type of MAF source to use
  14. -m, --mafSource=m: Path of source MAF file, if not using cached version
  15. -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
  16. -i, --interval_file=i: Input interval file
  17. -o, --output_file=o: Output MAF file
  18. -p, --species=p: Species to include in output
  19. -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.
  20. -z, --mafIndexFileDir=z: Directory of local maf_index.loc file
  21. usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR
  22. """
  23. #Dan Blankenberg
  24. from galaxy import eggs
  25. from galaxy.tools.util import maf_utilities
  26. import pkg_resources; pkg_resources.require( "bx-python" )
  27. from bx.cookbook import doc_optparse
  28. import bx.intervals.io
  29. import sys
  30. assert sys.version_info[:2] >= ( 2, 4 )
  31. def stop_err( msg ):
  32. sys.stderr.write( msg )
  33. sys.exit()
  34. def __main__():
  35. #Parse Command Line
  36. options, args = doc_optparse.parse( __doc__ )
  37. mincols = 0
  38. strand_col = -1
  39. if options.dbkey:
  40. primary_species = options.dbkey
  41. else:
  42. primary_species = None
  43. if primary_species in [None, "?", "None"]:
  44. stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
  45. include_primary = True
  46. secondary_species = maf_utilities.parse_species_option( options.species )
  47. if secondary_species:
  48. species = list( secondary_species ) # make copy of species list
  49. if primary_species in secondary_species:
  50. secondary_species.remove( primary_species )
  51. else:
  52. include_primary = False
  53. else:
  54. species = None
  55. if options.interval_file:
  56. interval_file = options.interval_file
  57. else:
  58. stop_err( "Input interval file has not been specified." )
  59. if options.output_file:
  60. output_file = options.output_file
  61. else:
  62. stop_err( "Output file has not been specified." )
  63. if not options.geneBED:
  64. if options.chromCol:
  65. chr_col = int( options.chromCol ) - 1
  66. else:
  67. stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
  68. if options.startCol:
  69. start_col = int( options.startCol ) - 1
  70. else:
  71. stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
  72. if options.endCol:
  73. end_col = int( options.endCol ) - 1
  74. else:
  75. stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
  76. if options.strandCol:
  77. strand_col = int( options.strandCol ) - 1
  78. mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir
  79. overwrite_with_gaps = True
  80. if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false':
  81. overwrite_with_gaps = False
  82. #Finish parsing command line
  83. #get index for mafs based on type
  84. index = index_filename = None
  85. #using specified uid for locally cached
  86. if options.mafSourceType.lower() in ["cached"]:
  87. index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile )
  88. if index is None:
  89. stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) )
  90. elif options.mafSourceType.lower() in ["user"]:
  91. #index maf for use here, need to remove index_file when finished
  92. index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] )
  93. if index is None:
  94. stop_err( "Your MAF file appears to be malformed." )
  95. else:
  96. stop_err( "Invalid MAF source type specified." )
  97. #open output file
  98. output = open( output_file, "w" )
  99. if options.geneBED:
  100. region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() )
  101. else:
  102. region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) )
  103. #Step through intervals
  104. regions_extracted = 0
  105. line_count = 0
  106. for line_count, line in region_enumerator:
  107. try:
  108. if options.geneBED: #Process as Gene BED
  109. try:
  110. starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line )
  111. #create spliced alignment object
  112. alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
  113. primary_name = secondary_name = fields[3]
  114. alignment_strand = fields[5]
  115. except Exception, e:
  116. print "Error loading exon positions from input line %i: %s" % ( line_count, e )
  117. continue
  118. else: #Process as standard intervals
  119. try:
  120. #create spliced alignment object
  121. alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
  122. primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end )
  123. secondary_name = ""
  124. alignment_strand = line.strand
  125. except Exception, e:
  126. print "Error loading region positions from input line %i: %s" % ( line_count, e )
  127. continue
  128. #Write alignment to output file
  129. #Output primary species first, if requested
  130. if include_primary:
  131. output.write( ">%s.%s\n" %( primary_species, primary_name ) )
  132. if alignment_strand == "-":
  133. output.write( alignment.get_sequence_reverse_complement( primary_species ) )
  134. else:
  135. output.write( alignment.get_sequence( primary_species ) )
  136. output.write( "\n" )
  137. #Output all remainging species
  138. for spec in secondary_species or alignment.get_species_names( skip = primary_species ):
  139. if secondary_name:
  140. output.write( ">%s.%s\n" % ( spec, secondary_name ) )
  141. else:
  142. output.write( ">%s\n" % ( spec ) )
  143. if alignment_strand == "-":
  144. output.write( alignment.get_sequence_reverse_complement( spec ) )
  145. else:
  146. output.write( alignment.get_sequence( spec ) )
  147. output.write( "\n" )
  148. output.write( "\n" )
  149. regions_extracted += 1
  150. except Exception, e:
  151. print "Unexpected error from input line %i: %s" % ( line_count, e )
  152. continue
  153. #close output file
  154. output.close()
  155. #remove index file if created during run
  156. maf_utilities.remove_temp_index_file( index_filename )
  157. #Print message about success for user
  158. if regions_extracted > 0:
  159. print "%i regions were processed successfully." % ( regions_extracted )
  160. else:
  161. print "No regions were processed successfully."
  162. if line_count > 0 and options.geneBED:
  163. print "This tool requires your input file to conform to the 12 column BED standard."
  164. if __name__ == "__main__": __main__()