PageRenderTime 27ms CodeModel.GetById 1ms app.highlight 22ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/extract/extract_genomic_dna.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 300 lines | 287 code | 3 blank | 10 comment | 3 complexity | 082d97d0fe95153543a1412e3c1945a1 MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3usage: %prog $input $out_file1
  4    -1, --cols=N,N,N,N,N: Columns for start, end, strand in input file
  5    -d, --dbkey=N: Genome build of input file
  6    -o, --output_format=N: the data type of the output file
  7    -g, --GALAXY_DATA_INDEX_DIR=N: the directory containing alignseq.loc
  8    -I, --interpret_features: if true, complete features are interpreted when input is GFF 
  9    -F, --fasta=<genomic_sequences>: genomic sequences to use for extraction
 10    -G, --gff: input and output file, when it is interval, coordinates are treated as GFF format (1-based, half-open) rather than 'traditional' 0-based, closed format.
 11"""
 12from galaxy import eggs
 13import pkg_resources
 14pkg_resources.require( "bx-python" )
 15import sys, string, os, re, tempfile, subprocess
 16from bx.cookbook import doc_optparse
 17from bx.intervals.io import Header, Comment
 18import bx.seq.nib
 19import bx.seq.twobit
 20from galaxy.tools.util.galaxyops import *
 21from galaxy.datatypes.util import gff_util
 22
 23assert sys.version_info[:2] >= ( 2, 4 )
 24    
 25def stop_err( msg ):
 26    sys.stderr.write( msg )
 27    sys.exit()
 28
 29def reverse_complement( s ):
 30    complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" }
 31    reversed_s = []
 32    for i in s:
 33        reversed_s.append( complement_dna[i] )
 34    reversed_s.reverse()
 35    return "".join( reversed_s )
 36
 37def check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ):
 38    seq_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR
 39    seq_path = ''
 40    for line in open( seq_file ):
 41        line = line.rstrip( '\r\n' )
 42        if line and not line.startswith( "#" ) and line.startswith( 'seq' ):
 43            fields = line.split( '\t' )
 44            if len( fields ) < 3:
 45                continue
 46            if fields[1] == dbkey:
 47                seq_path = fields[2].strip()
 48                break
 49    return seq_path
 50        
 51def __main__():
 52    #
 53    # Parse options, args.
 54    #
 55    options, args = doc_optparse.parse( __doc__ )
 56    try:
 57        if len(options.cols.split(',')) == 5:
 58            # BED file
 59            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg( options.cols )
 60        else:
 61            # gff file
 62            chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
 63            name_col = False
 64        dbkey = options.dbkey
 65        output_format = options.output_format
 66        gff_format = options.gff
 67        interpret_features = options.interpret_features
 68        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
 69        fasta_file = options.fasta
 70        input_filename, output_filename = args
 71    except:
 72        doc_optparse.exception()
 73
 74    includes_strand_col = strand_col >= 0
 75    strand = None
 76    nibs = {}
 77    twobits = {}
 78        
 79    #
 80    # Set path to sequence data.
 81    #
 82    if fasta_file:
 83        # Need to create 2bit file from fasta file.
 84        try:
 85            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
 86            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )
 87        
 88            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
 89            tmp_stderr = open( tmp_name, 'wb' )
 90            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
 91            returncode = proc.wait()
 92            tmp_stderr.close()
 93
 94            # Get stderr, allowing for case where it's very large.
 95            tmp_stderr = open( tmp_name, 'rb' )
 96            stderr = ''
 97            buffsize = 1048576
 98            try:
 99                while True:
100                    stderr += tmp_stderr.read( buffsize )
101                    if not stderr or len( stderr ) % buffsize != 0:
102                        break
103            except OverflowError:
104                pass
105            tmp_stderr.close()
106
107            # Error checking.
108            if returncode != 0:
109                raise Exception, stderr
110        except Exception, e:
111            stop_err( 'Error running faToTwoBit. ' + str( e ) )
112    else:
113        seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
114        if not os.path.exists( seq_path ):
115            # If this occurs, we need to fix the metadata validator.
116            stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey )
117    
118    #
119    # Fetch sequences.
120    #
121    
122    # Get feature's line(s).
123    def get_lines( feature ):
124        if isinstance( feature, gff_util.GFFFeature ):
125            return feature.lines()
126        else:
127            return [ feature.rstrip( '\r\n' ) ]
128    
129    skipped_lines = 0
130    first_invalid_line = 0
131    invalid_lines = []
132    fout = open( output_filename, "w" )
133    warnings = []
134    warning = ''
135    twobitfile = None
136    file_iterator = open( input_filename )
137    if gff_format and interpret_features:
138        file_iterator = gff_util.GFFReaderWrapper( file_iterator, fix_strand=False )
139    line_count = 1
140    for feature in file_iterator:
141        # Ignore comments, headers.
142        if isinstance( feature, ( Header, Comment ) ):
143            line_count += 1
144            continue
145
146        name = ""
147        if gff_format and interpret_features:
148            # Processing features.
149            gff_util.convert_gff_coords_to_bed( feature )
150            chrom = feature.chrom
151            start = feature.start
152            end = feature.end
153            strand = feature.strand
154        else:
155            # Processing lines, either interval or GFF format.
156            line = feature.rstrip( '\r\n' )
157            if line and not line.startswith( "#" ):
158                fields = line.split( '\t' )
159                try:
160                    chrom = fields[chrom_col]
161                    start = int( fields[start_col] )
162                    end = int( fields[end_col] )
163                    if name_col:
164                        name = fields[name_col]
165                    if gff_format:
166                        start, end = gff_util.convert_gff_coords_to_bed( [start, end] )
167                    if includes_strand_col:
168                        strand = fields[strand_col]
169                except:
170                    warning = "Invalid chrom, start or end column values. "
171                    warnings.append( warning )
172                    if not invalid_lines:
173                        invalid_lines = get_lines( feature )
174                        first_invalid_line = line_count
175                    skipped_lines += len( invalid_lines )
176                    continue
177                if start > end:
178                    warning = "Invalid interval, start '%d' > end '%d'.  " % ( start, end )
179                    warnings.append( warning )
180                    if not invalid_lines:
181                        invalid_lines = get_lines( feature )
182                        first_invalid_line = line_count
183                    skipped_lines += len( invalid_lines )
184                    continue
185
186                if strand not in ['+', '-']:
187                    strand = '+'
188                sequence = ''
189            else:
190                continue
191
192        # Open sequence file and get sequence for feature/interval. 
193        if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ):
194            # TODO: improve support for GFF-nib interaction.
195            if chrom in nibs:
196                nib = nibs[chrom]
197            else:
198                nibs[chrom] = nib = bx.seq.nib.NibFile( file( "%s/%s.nib" % ( seq_path, chrom ) ) )
199            try:
200                sequence = nib.get( start, end-start )
201            except Exception, e:
202                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey )
203                warnings.append( warning )
204                if not invalid_lines:
205                    invalid_lines = get_lines( feature )
206                    first_invalid_line = line_count
207                skipped_lines += len( invalid_lines )
208                continue
209        elif seq_path and os.path.isfile( seq_path ):
210            if not(twobitfile):
211                twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
212            try:
213                if options.gff and interpret_features:
214                    # Create sequence from intervals within a feature.
215                    sequence = ''
216                    for interval in feature.intervals:
217                        sequence += twobitfile[interval.chrom][interval.start:interval.end]
218                else:
219                    sequence = twobitfile[chrom][start:end]
220            except:
221                warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " %( start, end-start, chrom )
222                warnings.append( warning )
223                if not invalid_lines:
224                    invalid_lines = get_lines( feature )
225                    first_invalid_line = line_count
226                skipped_lines += len( invalid_lines )
227                continue
228        else:
229            warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey )
230            warnings.append( warning )
231            if not invalid_lines:
232                invalid_lines = get_lines( feature )
233                first_invalid_line = line_count
234            skipped_lines += len( invalid_lines )
235            continue
236        if sequence == '':
237            warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " \
238                        % ( chrom, start, end, dbkey )
239            warnings.append( warning )
240            if not invalid_lines:
241                invalid_lines = get_lines( feature )
242                first_invalid_line = line_count
243            skipped_lines += len( invalid_lines )
244            continue
245        if includes_strand_col and strand == "-":
246            sequence = reverse_complement( sequence )
247
248        if output_format == "fasta" :
249            l = len( sequence )
250            c = 0
251            if gff_format:
252                start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] )
253            fields = [dbkey, str( chrom ), str( start ), str( end ), strand]
254            meta_data = "_".join( fields )
255            if name.strip():
256                fout.write( ">%s %s\n" % (meta_data, name) )
257            else:
258                fout.write( ">%s\n" % meta_data )
259            while c < l:
260                b = min( c + 50, l )
261                fout.write( "%s\n" % str( sequence[c:b] ) )
262                c = b
263        else: # output_format == "interval"
264            if gff_format and interpret_features:
265                # TODO: need better GFF Reader to capture all information needed
266                # to produce this line.
267                meta_data = "\t".join( 
268                                [feature.chrom, "galaxy_extract_genomic_dna", "interval", \
269                                 str( feature.start ), str( feature.end ), feature.score, feature.strand,
270                                 ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] )
271            else:
272                meta_data = "\t".join( fields )
273            if gff_format:
274                format_str = "%s seq \"%s\";\n"
275            else:
276                format_str = "%s\t%s\n"
277            fout.write( format_str % ( meta_data, str( sequence ) ) )
278            
279        # Update line count.
280        if isinstance( feature, gff_util.GFFFeature ):
281            line_count += len( feature.intervals )
282        else:
283            line_count += 1
284
285    fout.close()
286
287    if warnings:
288        warn_msg = "%d warnings, 1st is: " % len( warnings )
289        warn_msg += warnings[0]
290        print warn_msg
291    if skipped_lines:
292        # Error message includes up to the first 10 skipped lines.
293        print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, '\n'.join( invalid_lines[:10] ) )
294        
295    # Clean up temp file.
296    if fasta_file:
297        os.remove( seq_path )
298        os.remove( tmp_name )
299
300if __name__ == "__main__": __main__()