PageRenderTime 44ms CodeModel.GetById 23ms app.highlight 17ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/metag_tools/short_reads_trim_seq.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 234 lines | 215 code | 12 blank | 7 comment | 58 complexity | bb27dceaddd51b02ed901900a39c6ab5 MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3trim reads based on the quality scores
  4input: read file and quality score file
  5output: trimmed read file
  6"""
  7
  8import os, sys, math, tempfile, re
  9
 10assert sys.version_info[:2] >= ( 2, 4 )
 11
 12def stop_err( msg ):
 13    sys.stderr.write( "%s\n" % msg )
 14    sys.exit()
 15
 16def append_to_outfile( outfile_name, seq_title, segments ):
 17    segments = segments.split( ',' )
 18    if len( segments ) > 1:
 19        outfile = open( outfile_name, 'a' )
 20        for i in range( len( segments ) ):
 21            outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) )
 22        outfile.close()
 23    elif segments[0]:
 24        outfile = open( outfile_name, 'a' )
 25        outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) )
 26        outfile.close()
 27
 28def trim_seq( seq, score, arg, trim_score, threshold ):
 29    seq_method = '454'
 30    trim_pos = 0
 31    # trim after a certain position
 32    if arg.isdigit():
 33        keep_homopolymers = False
 34        trim_pos = int( arg )    
 35        if trim_pos > 0 and trim_pos < len( seq ):
 36            seq = seq[0:trim_pos]
 37    else:
 38        keep_homopolymers = arg=='yes'
 39        
 40    new_trim_seq = ''
 41    max_segment = 0
 42
 43    for i in range( len( seq ) ):
 44        if i >= len( score ):
 45            score.append(-1)   
 46        if int( score[i] ) >= trim_score:
 47            pass_nuc = seq[ i:( i + 1 ) ]
 48        else:
 49            if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ):
 50                pass_nuc = seq[ i:( i + 1 ) ]
 51            else:
 52                pass_nuc = ' '    
 53        new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc )
 54        # find the max substrings
 55        segments = new_trim_seq.split()
 56        max_segment = ''
 57        len_max_segment = 0
 58        if threshold == 0:
 59            for seg in segments:
 60                if len_max_segment < len( seg ):
 61                    max_segment = '%s,' % seg
 62                    len_max_segment = len( seg )
 63                elif len_max_segment == len( seg ):
 64                    max_segment = '%s%s,' % ( max_segment, seg )
 65        else:
 66            for seg in segments:
 67                if len( seg ) >= threshold:
 68                    max_segment = '%s%s,' % ( max_segment, seg )
 69    return max_segment[ 0:-1 ]
 70
 71def __main__():
 72    
 73    try:
 74        threshold_trim = int( sys.argv[1].strip() )
 75    except:
 76        stop_err( "Minimal quality score must be numeric." )
 77    try:
 78        threshold_report = int( sys.argv[2].strip() )
 79    except:
 80        stop_err( "Minimal length of trimmed reads must be numeric." )
 81    outfile_seq_name = sys.argv[3].strip()
 82    infile_seq_name = sys.argv[4].strip()
 83    infile_score_name = sys.argv[5].strip()
 84    arg = sys.argv[6].strip()
 85
 86    seq_infile_name = infile_seq_name
 87    score_infile_name = infile_score_name
 88    
 89
 90    # Determine quailty score format: tabular or fasta format within the first 100 lines
 91    seq_method = None
 92    data_type = None
 93    for i, line in enumerate( file( score_infile_name ) ):
 94        line = line.rstrip( '\r\n' )
 95        if not line or line.startswith( '#' ):
 96            continue
 97        if data_type == None:
 98            if line.startswith( '>' ):
 99                data_type = 'fasta'
100                continue
101            elif len( line.split( '\t' ) ) > 0:
102                fields = line.split()
103                for score in fields:
104                    try:
105                        int( score )
106                        data_type = 'tabular'
107                        seq_method = 'solexa'
108                        break
109                    except:
110                        break
111        elif data_type == 'fasta':
112            fields = line.split()
113            for score in fields:
114                try: 
115                    int( score )
116                    seq_method = '454'
117                    break
118                except:
119                    break
120        if i == 100:
121            break
122
123    if data_type is None:
124        stop_err( 'This tool can only use fasta data or tabular data.' ) 
125    if seq_method is None:
126        stop_err( 'Invalid data for fasta format.')
127    
128    if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ):
129        seq = None
130        score = None
131        score_found = False
132        
133        score_file = open( score_infile_name, 'r' )
134
135        for i, line in enumerate( open( seq_infile_name ) ):
136            line = line.rstrip( '\r\n' )
137            if not line or line.startswith( '#' ):
138                continue
139            if line.startswith( '>' ):
140                if seq:
141                    scores = []
142                    if data_type == 'fasta':
143                        score = None
144                        score_found = False
145                        score_line = 'start'
146                        while not score_found and score_line:
147                            score_line = score_file.readline().rstrip( '\r\n' )
148                            if not score_line or score_line.startswith( '#' ):
149                                continue
150                            if score_line.startswith( '>' ):
151                                if score:
152                                    scores = score.split()
153                                    score_found = True    
154                                score = None
155                            else:
156                                for val in score_line.split():
157                                    try:
158                                        int( val ) 
159                                    except:
160                                        score_file.close()
161                                        stop_err( "Non-numerical value '%s' in score file." % val )
162                                if not score:
163                                    score = score_line
164                                else:
165                                    score = '%s %s' % ( score, score_line )                                        
166                    elif data_type == 'tabular':
167                        score = score_file.readline().rstrip('\r\n')
168                        loc = score.split( '\t' )
169                        for base in loc:
170                            nuc_error = base.split()
171                            try:
172                                nuc_error[0] = int( nuc_error[0] )
173                                nuc_error[1] = int( nuc_error[1] )
174                                nuc_error[2] = int( nuc_error[2] )
175                                nuc_error[3] = int( nuc_error[3] )
176                                big = max( nuc_error )
177                            except:
178                                score_file.close()
179                                stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
180                            scores.append( big )
181                    if scores:
182                        new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
183                        append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )  
184                                
185                seq_title = line
186                seq = None
187            else:
188                if not seq:
189                    seq = line
190                else:
191                    seq = "%s%s" % ( seq, line )
192        if seq:
193            scores = []
194            if data_type == 'fasta':
195                score = None
196                while score_line:
197                    score_line = score_file.readline().rstrip( '\r\n' )
198                    if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ):
199                        continue
200                    for val in score_line.split():
201                        try:
202                            int( val )
203                        except:
204                            score_file.close()
205                            stop_err( "Non-numerical value '%s' in score file." % val )
206                    if not score:
207                        score = score_line
208                    else:
209                        score = "%s %s" % ( score, score_line ) 
210                if score: 
211                    scores = score.split()
212            elif data_type == 'tabular':
213                score = score_file.readline().rstrip('\r\n')
214                loc = score.split( '\t' )
215                for base in loc:
216                    nuc_error = base.split()
217                    try:
218                        nuc_error[0] = int( nuc_error[0] )
219                        nuc_error[1] = int( nuc_error[1] )
220                        nuc_error[2] = int( nuc_error[2] )
221                        nuc_error[3] = int( nuc_error[3] )
222                        big = max( nuc_error )
223                    except:
224                        score_file.close()
225                        stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
226                    scores.append( big )
227            if scores:
228                new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
229                append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )  
230        score_file.close()
231    else:
232        stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) )    
233
234if __name__ == "__main__": __main__()