/tools/metag_tools/short_reads_figure_high_quality_length.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 165 lines · 146 code · 15 blank · 4 comment · 54 complexity · b8288050a3e8a7ca6783600340bba3db MD5 · raw file
- #!/usr/bin/env python
- import os, sys, math, tempfile, zipfile, re
- from rpy import *
- assert sys.version_info[:2] >= ( 2, 4 )
- def stop_err( msg ):
- sys.stderr.write( "%s\n" % msg )
- sys.exit()
- def unzip( filename ):
- zip_file = zipfile.ZipFile( filename, 'r' )
- tmpfilename = tempfile.NamedTemporaryFile().name
- for name in zip_file.namelist():
- file( tmpfilename, 'a' ).write( zip_file.read( name ) )
- zip_file.close()
- return tmpfilename
- def __main__():
- infile_score_name = sys.argv[1].strip()
- outfile_R_name = sys.argv[2].strip()
-
- try:
- score_threshold = int( sys.argv[3].strip() )
- except:
- stop_err( 'Threshold for quality score must be numerical.' )
- infile_is_zipped = False
- if zipfile.is_zipfile( infile_score_name ):
- infile_is_zipped = True
- infile_name = unzip( infile_score_name )
- else:
- infile_name = infile_score_name
- # detect whether it's tabular or fasta format
- seq_method = None
- data_type = None
- for i, line in enumerate( file( infile_name ) ):
- line = line.rstrip( '\r\n' )
- if not line or line.startswith( '#' ):
- continue
- if data_type == None:
- if line.startswith( '>' ):
- data_type = 'fasta'
- continue
- elif len( line.split( '\t' ) ) > 0:
- fields = line.split()
- for score in fields:
- try:
- int( score )
- data_type = 'tabular'
- seq_method = 'solexa'
- break
- except:
- break
- elif data_type == 'fasta':
- fields = line.split()
- for score in fields:
- try:
- int( score )
- seq_method = '454'
- break
- except:
- break
- if i == 100:
- break
- if data_type is None:
- stop_err( 'This tool can only use fasta data or tabular data.' )
- if seq_method is None:
- stop_err( 'Invalid data for fasta format.')
-
- cont_high_quality = []
- invalid_lines = 0
- invalid_scores = 0
- if seq_method == 'solexa':
- for i, line in enumerate( open( infile_name ) ):
- line = line.rstrip( '\r\n' )
- if not line or line.startswith( '#' ):
- continue
- locs = line.split( '\t' )
- for j, base in enumerate( locs ):
- nuc_errors = base.split()
- try:
- nuc_errors[0] = int( nuc_errors[0] )
- nuc_errors[1] = int( nuc_errors[1] )
- nuc_errors[2] = int( nuc_errors[2] )
- nuc_errors[3] = int( nuc_errors[3] )
- big = max( nuc_errors )
- except:
- invalid_scores += 1
- big = 0
- if j == 0:
- cont_high_quality.append(1)
- else:
- if big >= score_threshold:
- cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
- else:
- cont_high_quality.append(1)
- else: # seq_method == '454'
- tmp_score = ''
- for i, line in enumerate( open( infile_name ) ):
- line = line.rstrip( '\r\n' )
- if not line or line.startswith( '#' ):
- continue
- if line.startswith( '>' ):
- if len( tmp_score ) > 0:
- locs = tmp_score.split()
- for j, base in enumerate( locs ):
- try:
- base = int( base )
- except:
- invalid_scores += 1
- base = 0
- if j == 0:
- cont_high_quality.append(1)
- else:
- if base >= score_threshold:
- cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
- else:
- cont_high_quality.append(1)
- tmp_score = ''
- else:
- tmp_score = "%s %s" % ( tmp_score, line )
- if len( tmp_score ) > 0:
- locs = tmp_score.split()
- for j, base in enumerate( locs ):
- try:
- base = int( base )
- except:
- invalid_scores += 1
- base = 0
- if j == 0:
- cont_high_quality.append(1)
- else:
- if base >= score_threshold:
- cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
- else:
- cont_high_quality.append(1)
- # generate pdf figures
- cont_high_quality = array ( cont_high_quality )
- outfile_R_pdf = outfile_R_name
- r.pdf( outfile_R_pdf )
- title = "Histogram of continuous high quality scores"
- xlim_range = [ 1, max( cont_high_quality ) ]
- nclass = max( cont_high_quality )
- if nclass > 100:
- nclass = 100
- r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
- r.dev_off()
- if infile_is_zipped and os.path.exists( infile_name ):
- # Need to delete temporary file created when we unzipped the infile archive
- os.remove( infile_name )
- if invalid_lines > 0:
- print 'Skipped %d invalid lines. ' % invalid_lines
- if invalid_scores > 0:
- print 'Skipped %d invalid scores. ' % invalid_scores
- r.quit( save="no" )
- if __name__=="__main__":__main__()