short_reads_figure_high_quality_length.py

/tools/metag_tools/short_reads_figure_high_quality_length.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 165 lines · 146 code · 15 blank · 4 comment · 54 complexity · b8288050a3e8a7ca6783600340bba3db MD5 · raw file


#!/usr/bin/env python

import os, sys, math, tempfile, zipfile, re
from rpy import *

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()

def unzip( filename ):
    zip_file = zipfile.ZipFile( filename, 'r' )
    tmpfilename = tempfile.NamedTemporaryFile().name
    for name in zip_file.namelist():
        file( tmpfilename, 'a' ).write( zip_file.read( name ) )
    zip_file.close()
    return tmpfilename

def __main__():
    infile_score_name = sys.argv[1].strip()
    outfile_R_name = sys.argv[2].strip()
    
    try:
        score_threshold = int( sys.argv[3].strip() )
    except:
        stop_err( 'Threshold for quality score must be numerical.' )

    infile_is_zipped = False
    if zipfile.is_zipfile( infile_score_name ):
        infile_is_zipped = True
        infile_name = unzip( infile_score_name )
    else:
        infile_name = infile_score_name

    # detect whether it's tabular or fasta format
    seq_method = None
    data_type = None
    for i, line in enumerate( file( infile_name ) ):
        line = line.rstrip( '\r\n' )
        if not line or line.startswith( '#' ):
            continue
        if data_type == None:
            if line.startswith( '>' ):
                data_type = 'fasta'
                continue
            elif len( line.split( '\t' ) ) > 0:
                fields = line.split()
                for score in fields:
                    try:
                        int( score )
                        data_type = 'tabular'
                        seq_method = 'solexa'
                        break
                    except:
                        break
        elif data_type == 'fasta':
            fields = line.split()
            for score in fields:
                try: 
                    int( score )
                    seq_method = '454'
                    break
                except:
                    break
        if i == 100:
            break

    if data_type is None:
        stop_err( 'This tool can only use fasta data or tabular data.' ) 
    if seq_method is None:
        stop_err( 'Invalid data for fasta format.')
 
    cont_high_quality = []
    invalid_lines = 0
    invalid_scores = 0                       
    if seq_method == 'solexa':
        for i, line in enumerate( open( infile_name ) ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            locs = line.split( '\t' )
            for j, base in enumerate( locs ):
                nuc_errors = base.split()
                try:
                    nuc_errors[0] = int( nuc_errors[0] )
                    nuc_errors[1] = int( nuc_errors[1] )
                    nuc_errors[2] = int( nuc_errors[2] )
                    nuc_errors[3] = int( nuc_errors[3] )
                    big = max( nuc_errors )
                except:
                    invalid_scores += 1
                    big = 0
                if j == 0:
                    cont_high_quality.append(1)
                else:
                    if big >= score_threshold:
                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
                    else:
                        cont_high_quality.append(1)
    else: # seq_method == '454'
        tmp_score = ''
        for i, line in enumerate( open( infile_name ) ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            if line.startswith( '>' ):
                if len( tmp_score ) > 0:
                    locs = tmp_score.split()
                    for j, base in enumerate( locs ):
                        try:
                            base = int( base )
                        except:
                            invalid_scores += 1
                            base = 0
                        if j == 0:
                            cont_high_quality.append(1)
                        else:
                            if base >= score_threshold:
                                cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
                            else:
                                cont_high_quality.append(1)
                tmp_score = ''
            else:
                tmp_score = "%s %s" % ( tmp_score, line )
        if len( tmp_score ) > 0:
            locs = tmp_score.split()
            for j, base in enumerate( locs ):
                try:
                    base = int( base )
                except:
                    invalid_scores += 1
                    base = 0
                if j == 0:
                    cont_high_quality.append(1)
                else:
                    if base >= score_threshold:
                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
                    else:
                        cont_high_quality.append(1)

    # generate pdf figures
    cont_high_quality = array ( cont_high_quality )
    outfile_R_pdf = outfile_R_name 
    r.pdf( outfile_R_pdf )
    title = "Histogram of continuous high quality scores"
    xlim_range = [ 1, max( cont_high_quality ) ]
    nclass = max( cont_high_quality )
    if nclass > 100:
        nclass = 100
    r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
    r.dev_off()    

    if infile_is_zipped and os.path.exists( infile_name ):
        # Need to delete temporary file created when we unzipped the infile archive
        os.remove( infile_name )

    if invalid_lines > 0: 
        print 'Skipped %d invalid lines. ' % invalid_lines
    if invalid_scores > 0:
        print 'Skipped %d invalid scores. ' % invalid_scores

    r.quit( save="no" )

if __name__=="__main__":__main__()

Summary ✨

This Python script generates a histogram of continuous high quality scores from a fasta or tabular file. It takes three arguments: the input score file, the output R file, and the threshold for quality score. The script detects whether the input file is zipped or not and unzips it if necessary. Then, it reads the input file line by line and splits each line into fields based on tabs. If the data format is fasta, it checks whether the first field is a header (starts with ‘>’) and if so, it assumes that the quality scores are in the second field. If the data format is tabular, it assumes that the quality scores are in the first column. The script then converts each quality score to an integer and adds it to a list of continuous high quality scores. Finally, it generates a histogram of these scores using R’s “hist” function and saves the output as a PDF file.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (13)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
4
'def' Ensure functions have docstrings for documentation
8 12
'zipfile.ZipFile(' Potential decompression bomb vulnerability in Python code if input is untrusted; ensure to limit the number of bytes read.
13
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
26 55 64 91 113 131
'== None' Use 'is' for None comparisons (e.g., x is None)
43
'open(' Use 'with open()' to ensure Files are properly closed
78 103