solid_qual_stats.py - This is a Python script that calculat…

/tools/solid_tools/solid_qual_stats.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 140 lines · 114 code · 24 blank · 2 comment · 41 complexity · 5560d37474ff2b3d8ebf369c696fb229 MD5 · raw file


#!/usr/bin/env python
#Guruprasad Ananda

import sys, os, zipfile, tempfile

# Upper bound for quality score values; __main__ uses it to size each
# per-position count array.
QUAL_UPPER_BOUND = 41
# Presumably the lowest expected quality score; it is not referenced anywhere
# else in the visible part of this file -- TODO confirm before relying on it.
QUAL_LOWER_BOUND = 1

def stop_err( msg ):
    """Report a fatal error and terminate the script.

    Writes msg followed by a newline to standard error, then exits with
    status 1 so that the calling shell or workflow engine sees a failure.
    (A bare sys.exit() exits with status 0, i.e. reports success.)

    msg -- the text to report; it is formatted with "%s".

    Raises SystemExit with code 1; never returns.
    """
    sys.stderr.write( "%s\n" % msg )
    sys.exit( 1 )
 
def unzip( filename ):
    """Concatenate every member of a zip archive into one temporary file.

    filename -- path of the zip archive to read.

    Returns the path of a newly created temporary file that holds the
    members' contents, in archive order.  The file is not removed
    automatically; the caller is responsible for deleting it.

    Raises whatever zipfile raises for an unreadable or corrupt archive; in
    that case the temporary file is removed before the error propagates.
    """
    # mkstemp() creates the file atomically and leaves it on disk.  Taking
    # only the .name of a NamedTemporaryFile() lets the file be deleted as
    # soon as that object is collected, so the path would have to be
    # re-created by name afterwards.
    fd, tmpfilename = tempfile.mkstemp()
    out = os.fdopen( fd, 'wb' )  # binary: member bytes are copied untouched
    complete = False
    try:
        zip_file = zipfile.ZipFile( filename, 'r' )
        try:
            for name in zip_file.namelist():
                member = zip_file.open( name )
                try:
                    # Copy in bounded chunks so a large member never has to
                    # fit in memory all at once.
                    for chunk in iter( lambda: member.read( 1024 * 1024 ), b'' ):
                        out.write( chunk )
                finally:
                    member.close()
        finally:
            zip_file.close()
        complete = True
    finally:
        out.close()
        if not complete:
            os.remove( tmpfilename )
    return tmpfilename
   
def _round_half_up( value ):
    """Round a non-negative number to the nearest int, halves going up.

    Python 2's round() rounds halves away from zero while Python 3 rounds
    them to the nearest even number; spelling the rule out keeps the
    quartile and median ranks identical under both interpreters.
    """
    return int( value + 0.5 )


def _score_lines( path ):
    """Yield the stripped data lines of path.

    Blank lines and lines starting with "#" or ">" are skipped.
    """
    with open( path ) as handle:
        for line in handle:
            line = line.strip()
            if not line or line.startswith( "#" ) or line.startswith( ">" ):
                continue
            yield line


def _score_at_rank( nonzero, rank ):
    """Return the score of the rank-th smallest observation (1-based).

    nonzero -- (score, count) pairs in ascending score order, counts > 0.
    Returns None when rank exceeds the number of observations.
    """
    running = 0
    for score, cnt in nonzero:
        running += cnt
        if running >= rank:
            return score
    return None


def _column_stats( counts ):
    """Compute the summary statistics of one read position.

    counts -- list where counts[s] is the number of times score s was seen.

    Returns the tuple (total, lowest, highest, sum, mean, Q1, median, Q3,
    IQR, left whisker, right whisker), or None when the column is empty.
    """
    total = sum( counts )
    if total == 0:
        return None

    nonzero = [ ( score, cnt ) for score, cnt in enumerate( counts ) if cnt ]
    lowest = nonzero[0][0]
    highest = nonzero[-1][0]
    qsum = float( sum( score * cnt for score, cnt in nonzero ) )
    mean = qsum / total

    # Ranks are clamped to [1, total]: with a single observation the raw Q3
    # rank is 2, which no observation can reach.
    q1_rank = max( 1, _round_half_up( ( total + 1 ) / 4.0 ) )
    q3_rank = min( total, _round_half_up( ( total + 1 ) * 3 / 4.0 ) )
    med_rank = max( 1, _round_half_up( total / 2.0 ) )

    q1 = _score_at_rank( nonzero, q1_rank )
    q3 = _score_at_rank( nonzero, q3_rank )
    median = _score_at_rank( nonzero, med_rank )
    if total % 2 == 0:
        # Even number of observations: average the two middle ones.
        median = ( median + _score_at_rank( nonzero, med_rank + 1 ) ) / 2.0

    iqr = q3 - q1
    left_whisker = max( q1 - 1.5 * iqr, lowest )
    right_whisker = min( q3 + 1.5 * iqr, highest )
    return ( total, lowest, highest, qsum, mean, q1, median, q3, iqr,
             left_whisker, right_whisker )


def _write_stats( infile_name, fout ):
    """Write the per-position statistics table for infile_name to fout.

    Returns (invalid_lines, invalid_positions): the number of reads counted
    as invalid and the number of positions for which no statistics could be
    computed.
    """
    # Pass 1: sniff the read length from the first fully numeric reads.
    readlen = None
    invalid_lines = 0
    sampled = 0
    length_note_shown = False
    for line in _score_lines( infile_name ):
        elems = line.split()
        try:
            for item in elems:
                int( item )
        except ValueError:
            invalid_lines += 1
            continue
        if readlen is None:
            readlen = len( elems )
        elif len( elems ) != readlen and not length_note_shown:
            sys.stdout.write( "Note: Reads in the input dataset are of variable lengths.\n" )
            length_note_shown = True
        sampled += 1
        if sampled > 10:
            break

    fout.write( "column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\n" )
    if readlen is None:
        # No fully numeric read exists, so there are no columns to report;
        # every data line has already been counted as invalid above.
        return invalid_lines, 0

    # Pass 2: tally how often each score occurs at each read position.
    # One slot per score from 0 through QUAL_UPPER_BOUND inclusive.
    position_counts = [ [0] * ( QUAL_UPPER_BOUND + 1 ) for _ in range( readlen ) ]
    for line in _score_lines( infile_name ):
        elems = line.split()
        if len( elems ) != readlen:
            invalid_lines += 1
            continue
        for pos, item in enumerate( elems ):
            try:
                score = int( item )
            except ValueError:
                continue  # a non-numeric token is ignored on its own
            # The explicit range check keeps negative scores from wrapping
            # around (list[-1]) and being tallied as high scores.
            if 0 <= score <= QUAL_UPPER_BOUND:
                position_counts[pos][score] += 1

    invalid_positions = 0
    for pos, counts in enumerate( position_counts ):
        stats = _column_stats( counts )
        if stats is None:
            invalid_positions += 1
            fout.write( "%s\t%s\n" % ( pos + 1, '\t'.join( ['NA'] * 11 ) ) )
        else:
            fout.write( "%s\n" % '\t'.join( [ str( value ) for value in ( pos + 1, ) + stats ] ) )
    return invalid_lines, invalid_positions


def __main__():
    """Compute per-position quality score statistics for a file of reads.

    Command line: sys.argv[1] is the input score file (optionally a zip
    archive), sys.argv[2] is the output file.  The output is a tab-separated
    table with one row per read position holding count, min, max, sum, mean,
    Q1, median, Q3, IQR and the left/right whiskers; positions without any
    usable score get a row of NA values.

    Reads whose length differs from the detected read length are skipped.
    Tokens that are not integers, or that fall outside 0..QUAL_UPPER_BOUND,
    are ignored individually.  A summary of skipped reads and positions is
    printed to standard output.
    """
    infile_score_name = sys.argv[1].strip()
    outfile_name = sys.argv[2].strip()

    infile_is_zipped = zipfile.is_zipfile( infile_score_name )
    if infile_is_zipped:
        infile_name = unzip( infile_score_name )
    else:
        infile_name = infile_score_name

    try:
        # 'w' creates or truncates the output; the former 'r+w' mode is
        # rejected by Python 3 and never truncated stale content.
        fout = open( outfile_name, 'w' )
        try:
            invalid_lines, invalid_positions = _write_stats( infile_name, fout )
        finally:
            fout.close()
    finally:
        if infile_is_zipped:
            # The unzipped copy is a private temporary file; do not leak it.
            try:
                os.remove( infile_name )
            except OSError:
                pass

    if invalid_lines:
        sys.stdout.write( "Skipped %d reads as invalid.\n" % invalid_lines )
    if invalid_positions:
        sys.stdout.write( "Skipped stats computation for %d read positions.\n" % invalid_positions )
        
# Run the tool only when this file is executed as a script, not on import.
if __name__=="__main__":
    __main__()

Summary ✨

This is a Python script that calculates per-position statistics for the quality scores of reads in a SOLiD-style quality file: one read per line, given as whitespace-separated integer scores, with blank lines and lines starting with "#" or ">" ignored. It takes two arguments: the input quality file and the output file where the results will be written. The script first checks whether the input is a zip archive and, if so, unzips it to a temporary file. It samples the first reads to determine the read length, then reads the file line by line, splits each line into elements, and converts them to integers; reads whose length differs from the detected read length are skipped and reported as invalid. For each column (read position) it then calculates the number of bases found in that column, the lowest and highest quality score values, the sum and mean of the scores, the 1st quartile (Q1), median, and 3rd quartile (Q3), the interquartile range (IQR), and the left and right whiskers (lW, rW). Finally, it writes the results to the output file as tab-separated rows, one per position.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (12)

'def' Ensure functions have docstrings for documentation
9 13
'zipfile.ZipFile(' Potential decompression bomb vulnerability in Python code if input is untrusted; ensure to limit the number of bytes read.
14
'open(' Use 'with open()' to ensure Files are properly closed
24
Complexity hotspot; line 38 (total complexity: 3)
38
Complexity hotspot; line 58 (total complexity: 3)
58
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
71 127
'try:' Ensure try blocks have corresponding except or finally blocks
88
Complexity hotspot; lines 108 to 110 (total complexity: 3)
108 109 110