PageRenderTime 53ms CodeModel.GetById 12ms app.highlight 36ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/metag_tools/short_reads_figure_high_quality_length.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 165 lines | 153 code | 10 blank | 2 comment | 33 complexity | b8288050a3e8a7ca6783600340bba3db MD5 | raw file
  1#!/usr/bin/env python
  2
  3import os, sys, math, tempfile, zipfile, re
  4from rpy import *
  5
  6assert sys.version_info[:2] >= ( 2, 4 )
  7
  8def stop_err( msg ):
  9    sys.stderr.write( "%s\n" % msg )
 10    sys.exit()
 11
 12def unzip( filename ):
 13    zip_file = zipfile.ZipFile( filename, 'r' )
 14    tmpfilename = tempfile.NamedTemporaryFile().name
 15    for name in zip_file.namelist():
 16        file( tmpfilename, 'a' ).write( zip_file.read( name ) )
 17    zip_file.close()
 18    return tmpfilename
 19
 20def __main__():
 21    infile_score_name = sys.argv[1].strip()
 22    outfile_R_name = sys.argv[2].strip()
 23    
 24    try:
 25        score_threshold = int( sys.argv[3].strip() )
 26    except:
 27        stop_err( 'Threshold for quality score must be numerical.' )
 28
 29    infile_is_zipped = False
 30    if zipfile.is_zipfile( infile_score_name ):
 31        infile_is_zipped = True
 32        infile_name = unzip( infile_score_name )
 33    else:
 34        infile_name = infile_score_name
 35
 36    # detect whether it's tabular or fasta format
 37    seq_method = None
 38    data_type = None
 39    for i, line in enumerate( file( infile_name ) ):
 40        line = line.rstrip( '\r\n' )
 41        if not line or line.startswith( '#' ):
 42            continue
 43        if data_type == None:
 44            if line.startswith( '>' ):
 45                data_type = 'fasta'
 46                continue
 47            elif len( line.split( '\t' ) ) > 0:
 48                fields = line.split()
 49                for score in fields:
 50                    try:
 51                        int( score )
 52                        data_type = 'tabular'
 53                        seq_method = 'solexa'
 54                        break
 55                    except:
 56                        break
 57        elif data_type == 'fasta':
 58            fields = line.split()
 59            for score in fields:
 60                try: 
 61                    int( score )
 62                    seq_method = '454'
 63                    break
 64                except:
 65                    break
 66        if i == 100:
 67            break
 68
 69    if data_type is None:
 70        stop_err( 'This tool can only use fasta data or tabular data.' ) 
 71    if seq_method is None:
 72        stop_err( 'Invalid data for fasta format.')
 73 
 74    cont_high_quality = []
 75    invalid_lines = 0
 76    invalid_scores = 0                       
 77    if seq_method == 'solexa':
 78        for i, line in enumerate( open( infile_name ) ):
 79            line = line.rstrip( '\r\n' )
 80            if not line or line.startswith( '#' ):
 81                continue
 82            locs = line.split( '\t' )
 83            for j, base in enumerate( locs ):
 84                nuc_errors = base.split()
 85                try:
 86                    nuc_errors[0] = int( nuc_errors[0] )
 87                    nuc_errors[1] = int( nuc_errors[1] )
 88                    nuc_errors[2] = int( nuc_errors[2] )
 89                    nuc_errors[3] = int( nuc_errors[3] )
 90                    big = max( nuc_errors )
 91                except:
 92                    invalid_scores += 1
 93                    big = 0
 94                if j == 0:
 95                    cont_high_quality.append(1)
 96                else:
 97                    if big >= score_threshold:
 98                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
 99                    else:
100                        cont_high_quality.append(1)
101    else: # seq_method == '454'
102        tmp_score = ''
103        for i, line in enumerate( open( infile_name ) ):
104            line = line.rstrip( '\r\n' )
105            if not line or line.startswith( '#' ):
106                continue
107            if line.startswith( '>' ):
108                if len( tmp_score ) > 0:
109                    locs = tmp_score.split()
110                    for j, base in enumerate( locs ):
111                        try:
112                            base = int( base )
113                        except:
114                            invalid_scores += 1
115                            base = 0
116                        if j == 0:
117                            cont_high_quality.append(1)
118                        else:
119                            if base >= score_threshold:
120                                cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
121                            else:
122                                cont_high_quality.append(1)
123                tmp_score = ''
124            else:
125                tmp_score = "%s %s" % ( tmp_score, line )
126        if len( tmp_score ) > 0:
127            locs = tmp_score.split()
128            for j, base in enumerate( locs ):
129                try:
130                    base = int( base )
131                except:
132                    invalid_scores += 1
133                    base = 0
134                if j == 0:
135                    cont_high_quality.append(1)
136                else:
137                    if base >= score_threshold:
138                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
139                    else:
140                        cont_high_quality.append(1)
141
142    # generate pdf figures
143    cont_high_quality = array ( cont_high_quality )
144    outfile_R_pdf = outfile_R_name 
145    r.pdf( outfile_R_pdf )
146    title = "Histogram of continuous high quality scores"
147    xlim_range = [ 1, max( cont_high_quality ) ]
148    nclass = max( cont_high_quality )
149    if nclass > 100:
150        nclass = 100
151    r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
152    r.dev_off()    
153
154    if infile_is_zipped and os.path.exists( infile_name ):
155        # Need to delete temporary file created when we unzipped the infile archive
156        os.remove( infile_name )
157
158    if invalid_lines > 0: 
159        print 'Skipped %d invalid lines. ' % invalid_lines
160    if invalid_scores > 0:
161        print 'Skipped %d invalid scores. ' % invalid_scores
162
163    r.quit( save="no" )
164
165if __name__=="__main__":__main__()