/tools/extract/extract_genomic_dna.py

https://bitbucket.org/cistrome/cistrome-harvard/

#!/usr/bin/env python
"""
usage: %prog $input $out_file1
    -1, --cols=N,N,N,N,N: Columns for chrom, start, end, strand, and (optionally) name in input file
    -d, --dbkey=N: Genome build of input file
    -o, --output_format=N: the data type of the output file
    -g, --GALAXY_DATA_INDEX_DIR=N: the directory containing alignseq.loc
    -I, --interpret_features: if true, complete features are interpreted when input is GFF
    -F, --fasta=<genomic_sequences>: genomic sequences to use for extraction
    -G, --gff: when the input/output file is interval, coordinates are treated as GFF format (1-based, half-open) rather than 'traditional' 0-based, closed format.
"""
from galaxy import eggs
import pkg_resources
pkg_resources.require( "bx-python" )
import sys, string, os, re, tempfile, subprocess
from bx.cookbook import doc_optparse
from bx.intervals.io import Header, Comment
import bx.seq.nib
import bx.seq.twobit
from galaxy.tools.util.galaxyops import *
from galaxy.datatypes.util import gff_util

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg ):
    sys.stderr.write( msg )
    sys.exit()

def reverse_complement( s ):
    complement_dna = { "A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n" }
    reversed_s = []
    for i in s:
        reversed_s.append( complement_dna[i] )
    reversed_s.reverse()
    return "".join( reversed_s )
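
# Example (illustrative): reverse_complement( "ATGCn" ) returns "nGCAT"; any
# character not present in the complement_dna table above raises a KeyError.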

def check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ):
    seq_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR
    seq_path = ''
    for line in open( seq_file ):
        line = line.rstrip( '\r\n' )
        if line and not line.startswith( "#" ) and line.startswith( 'seq' ):
            fields = line.split( '\t' )
            if len( fields ) < 3:
                continue
            if fields[1] == dbkey:
                seq_path = fields[2].strip()
                break
    return seq_path
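
# check_seq_file() expects tab-separated alignseq.loc lines of the form
#   seq<TAB><dbkey><TAB><path to nib directory or 2bit file>
# e.g. (hypothetical path):  seq  hg18  /depot/data/genomes/hg18/seq
# and returns the path for the requested build, or '' if none is found.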

def __main__():
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse( __doc__ )
    try:
        if len( options.cols.split( ',' ) ) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg( options.cols )
        else:
            # GFF file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except:
        doc_optparse.exception()
    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}
    twobits = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )

            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
            tmp_stderr = open( tmp_name, 'wb' )
            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open( tmp_name, 'rb' )
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read( buffsize )
                    if not stderr or len( stderr ) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception, stderr
        except Exception, e:
            stop_err( 'Error running faToTwoBit. ' + str( e ) )
    else:
        seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
        if not os.path.exists( seq_path ):
            # If this occurs, we need to fix the metadata validator.
            stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey )
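
    # Note: faToTwoBit (invoked above when a FASTA file is supplied) is the UCSC
    # utility that packs FASTA sequences into a .2bit file; because the command
    # is run through the shell, the binary is assumed to be on the PATH.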

    #
    # Fetch sequences.
    #

    # Get feature's line(s).
    def get_lines( feature ):
        if isinstance( feature, gff_util.GFFFeature ):
            return feature.lines()
        else:
            return [ feature.rstrip( '\r\n' ) ]

    skipped_lines = 0
    first_invalid_line = 0
    invalid_lines = []
    fout = open( output_filename, "w" )
    warnings = []
    warning = ''
    twobitfile = None
    file_iterator = open( input_filename )
    if gff_format and interpret_features:
        file_iterator = gff_util.GFFReaderWrapper( file_iterator, fix_strand=False )
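    # When --gff and --interpret_features are both set, GFFReaderWrapper yields
    # whole GFFFeature objects (grouping a feature's lines) rather than raw text
    # lines, which is what allows the per-interval sequence stitching below.
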
    line_count = 1
    for feature in file_iterator:
        # Ignore comments, headers.
        if isinstance( feature, ( Header, Comment ) ):
            line_count += 1
            continue

        name = ""
        if gff_format and interpret_features:
            # Processing features.
            gff_util.convert_gff_coords_to_bed( feature )
            chrom = feature.chrom
            start = feature.start
            end = feature.end
            strand = feature.strand
        else:
            # Processing lines, either interval or GFF format.
            line = feature.rstrip( '\r\n' )
            if line and not line.startswith( "#" ):
                fields = line.split( '\t' )
                try:
                    chrom = fields[chrom_col]
                    start = int( fields[start_col] )
                    end = int( fields[end_col] )
                    if name_col:
                        name = fields[name_col]
                    if gff_format:
                        start, end = gff_util.convert_gff_coords_to_bed( [ start, end ] )
                    if includes_strand_col:
                        strand = fields[strand_col]
                except:
                    warning = "Invalid chrom, start or end column values. "
                    warnings.append( warning )
                    if not invalid_lines:
                        invalid_lines = get_lines( feature )
                        first_invalid_line = line_count
                    skipped_lines += len( invalid_lines )
                    continue
                if start > end:
                    warning = "Invalid interval, start '%d' > end '%d'. " % ( start, end )
                    warnings.append( warning )
                    if not invalid_lines:
                        invalid_lines = get_lines( feature )
                        first_invalid_line = line_count
                    skipped_lines += len( invalid_lines )
                    continue
                if strand not in [ '+', '-' ]:
                    strand = '+'
                sequence = ''
            else:
                continue

        # Open sequence file and get sequence for feature/interval.
        if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ):
            # TODO: improve support for GFF-nib interaction.
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                nibs[chrom] = nib = bx.seq.nib.NibFile( file( "%s/%s.nib" % ( seq_path, chrom ) ) )
            try:
                sequence = nib.get( start, end - start )
            except Exception, e:
                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % ( start, end - start, dbkey )
                warnings.append( warning )
                if not invalid_lines:
                    invalid_lines = get_lines( feature )
                    first_invalid_line = line_count
                skipped_lines += len( invalid_lines )
                continue
        elif seq_path and os.path.isfile( seq_path ):
            if not twobitfile:
                twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
            try:
                if options.gff and interpret_features:
                    # Create sequence from intervals within a feature.
                    sequence = ''
                    for interval in feature.intervals:
                        sequence += twobitfile[interval.chrom][interval.start:interval.end]
                else:
                    sequence = twobitfile[chrom][start:end]
            except:
                warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % ( start, end - start, chrom )
                warnings.append( warning )
                if not invalid_lines:
                    invalid_lines = get_lines( feature )
                    first_invalid_line = line_count
                skipped_lines += len( invalid_lines )
                continue
        else:
            warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey )
            warnings.append( warning )
            if not invalid_lines:
                invalid_lines = get_lines( feature )
                first_invalid_line = line_count
            skipped_lines += len( invalid_lines )
            continue
        if sequence == '':
            warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " \
                % ( chrom, start, end, dbkey )
            warnings.append( warning )
            if not invalid_lines:
                invalid_lines = get_lines( feature )
                first_invalid_line = line_count
            skipped_lines += len( invalid_lines )
            continue
        if includes_strand_col and strand == "-":
            sequence = reverse_complement( sequence )
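
        # Write the result. In "fasta" mode the header is
        # ">dbkey_chrom_start_end_strand[ name]" and the sequence is wrapped at
        # 50 bases per line; in "interval" mode the original columns (or a
        # GFF/GTF-style line) are written back out with the sequence appended.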
        if output_format == "fasta":
            l = len( sequence )
            c = 0
            if gff_format:
                start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] )
            fields = [ dbkey, str( chrom ), str( start ), str( end ), strand ]
            meta_data = "_".join( fields )
            if name.strip():
                fout.write( ">%s %s\n" % ( meta_data, name ) )
            else:
                fout.write( ">%s\n" % meta_data )
            while c < l:
                b = min( c + 50, l )
                fout.write( "%s\n" % str( sequence[c:b] ) )
                c = b
        else: # output_format == "interval"
            if gff_format and interpret_features:
                # TODO: need better GFF Reader to capture all information needed
                # to produce this line.
                meta_data = "\t".join(
                    [ feature.chrom, "galaxy_extract_genomic_dna", "interval",
                      str( feature.start ), str( feature.end ), feature.score, feature.strand,
                      ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] )
            else:
                meta_data = "\t".join( fields )
            if gff_format:
                format_str = "%s seq \"%s\";\n"
            else:
                format_str = "%s\t%s\n"
            fout.write( format_str % ( meta_data, str( sequence ) ) )

        # Update line count.
        if isinstance( feature, gff_util.GFFFeature ):
            line_count += len( feature.intervals )
        else:
            line_count += 1

    fout.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len( warnings )
        warn_msg += warnings[0]
        print warn_msg
    if skipped_lines:
        # Error message includes up to the first 10 skipped lines.
        print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, '\n'.join( invalid_lines[:10] ) )

    # Clean up temp file.
    if fasta_file:
        os.remove( seq_path )
        os.remove( tmp_name )

if __name__ == "__main__": __main__()