short_reads_figure_score.py - This Python script generates …

/tools/metag_tools/short_reads_figure_score.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 248 lines · 210 code · 21 blank · 17 comment · 71 complexity · d20c0e6d1e959fcd5374fa7fc82bf077 MD5 · raw file


#!/usr/bin/env python
"""
boxplot:
- box: first quartile and third quartile
- line inside the box: median
- outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
           IQR = third quartile - first quartile
- The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
"""

import os, sys, math, tempfile, re
from rpy import *

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()

def merge_to_20_datapoints( score ):
    number_of_points = 20
    read_length = len( score )
    step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )
    scores = []
    point = 1
    point_sum = 0
    step_average = 0
    score_points = 0
    
    for i in xrange( 1, read_length ):
        if i < ( point * step ):
            point_sum += int( score[i] )
            step_average += 1
        else:
            point_avg = point_sum * 1.0 / step_average
            scores.append( point_avg )
            point += 1
            point_sum = 0
            step_average = 0                       
    if step_average > 0:
        point_avg = point_sum * 1.0 / step_average
        scores.append( point_avg )
    if len( scores ) > number_of_points:
        last_avg = 0
        for j in xrange( number_of_points - 1, len( scores ) ):
            last_avg += scores[j]
        last_avg = last_avg / ( len(scores) - number_of_points + 1 )
    else:    
        last_avg = scores[-1]
    score_points = []
    for k in range( number_of_points - 1 ):
        score_points.append( scores[k] )
    score_points.append( last_avg )
    return score_points

def __main__():

    invalid_lines = 0

    infile_score_name = sys.argv[1].strip()
    outfile_R_name = sys.argv[2].strip()

    infile_name = infile_score_name

    # Determine tabular or fasta format within the first 100 lines
    seq_method = None
    data_type = None
    for i, line in enumerate( file( infile_name ) ):
        line = line.rstrip( '\r\n' )
        if not line or line.startswith( '#' ):
            continue
        if data_type == None:
            if line.startswith( '>' ):
                data_type = 'fasta'
                continue
            elif len( line.split( '\t' ) ) > 0:
                fields = line.split()
                for score in fields:
                    try:
                        int( score )
                        data_type = 'tabular'
                        seq_method = 'solexa'
                        break
                    except:
                        break
        elif data_type == 'fasta':
            fields = line.split()
            for score in fields:
                try: 
                    int( score )
                    seq_method = '454'
                    break
                except:
                    break
        if i == 100:
            break

    if data_type is None:
        stop_err( 'This tool can only use fasta data or tabular data.' ) 
    if seq_method is None:
        stop_err( 'Invalid data for fasta format.')

    # Determine fixed length or variable length within the first 100 lines
    read_length = 0
    variable_length = False
    if seq_method == 'solexa':
        for i, line in enumerate( file( infile_name ) ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            scores = line.split('\t')
            if read_length == 0:
                read_length = len( scores )
            if read_length != len( scores ):
                variable_length = True
                break
            if i == 100:
                break
    elif seq_method == '454':
        score = ''
        for i, line in enumerate( file( infile_name ) ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            if line.startswith( '>' ):
                if len( score ) > 0:
                    score = score.split()
                    if read_length == 0:
                        read_length = len( score )
                    if read_length != len( score ):
                        variable_length = True
                        break
                score = ''
            else:
                score = score + ' ' + line
            if i == 100:
                break

    if variable_length:
        number_of_points = 20
    else:
        number_of_points = read_length
    read_length_threshold = 100 # minimal read length for 454 file
    score_points = []   
    score_matrix = []
    invalid_scores = 0   

    if seq_method == 'solexa':
        for i, line in enumerate( open( infile_name ) ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            tmp_array = []
            scores = line.split( '\t' )
            for bases in scores:
                nuc_errors = bases.split()
                try:
                    nuc_errors[0] = int( nuc_errors[0] )
                    nuc_errors[1] = int( nuc_errors[1] )
                    nuc_errors[2] = int( nuc_errors[2] )
                    nuc_errors[3] = int( nuc_errors[3] )
                    big = max( nuc_errors )
                except:
                    #print 'Invalid numbers in the file. Skipped.'
                    invalid_scores += 1
                    big = 0
                tmp_array.append( big )                        
            score_points.append( tmp_array )
    elif seq_method == '454':
        # skip the last fasta sequence
        score = ''
        for i, line in enumerate( open( infile_name ) ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            if line.startswith( '>' ):
                if len( score ) > 0:
                    score = ['0'] + score.split()
                    read_length = len( score )
                    tmp_array = []
                    if not variable_length:
                        score.pop(0)
                        score_points.append( score )
                        tmp_array = score
                    elif read_length > read_length_threshold:
                        score_points_tmp = merge_to_20_datapoints( score )
                        score_points.append( score_points_tmp )
                        tmp_array = score_points_tmp
                score = ''
            else:
                score = "%s %s" % ( score, line )
        if len( score ) > 0:
            score = ['0'] + score.split()
            read_length = len( score )
            if not variable_length:
                score.pop(0)
                score_points.append( score )
            elif read_length > read_length_threshold:
                score_points_tmp = merge_to_20_datapoints( score )
                score_points.append( score_points_tmp )
                tmp_array = score_points_tmp

    # reverse the matrix, for R
    for i in range( number_of_points - 1 ):
        tmp_array = []
        for j in range( len( score_points ) ):
            try:
                tmp_array.append( int( score_points[j][i] ) )
            except:
                invalid_lines += 1
        score_matrix.append( tmp_array )

    # generate pdf figures
    #outfile_R_pdf = outfile_R_name 
    #r.pdf( outfile_R_pdf )
    outfile_R_png = outfile_R_name
    r.bitmap( outfile_R_png )
    
    title = "boxplot of quality scores"
    empty_score_matrix_columns = 0
    for i, subset in enumerate( score_matrix ):
        if not subset:
            empty_score_matrix_columns += 1
            score_matrix[i] = [0]
            
    if not variable_length:
        r.boxplot( score_matrix, xlab="location in read length", main=title )
    else:
        r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
        x_old_range = []
        x_new_range = []
        step = read_length_threshold / number_of_points 
        for i in xrange( 0, read_length_threshold, step ):
            x_old_range.append( ( i / step ) )
            x_new_range.append( i )
        r.axis( 1, x_old_range, x_new_range )
    r.dev_off()

    if invalid_scores > 0:
        print 'Skipped %d invalid scores. ' % invalid_scores
    if invalid_lines > 0:
        print 'Skipped %d invalid lines. ' % invalid_lines
    if empty_score_matrix_columns > 0:
        print '%d missing scores in score_matrix. ' % empty_score_matrix_columns

    r.quit(save = "no")

if __name__=="__main__":__main__()

Summary ✨

This Python script generates a boxplot of quality scores from a FASTQ file using R. It first reads the FASTQ file and extracts the quality scores, then converts them to integers and stores them in a matrix. The matrix is then passed to R’s boxplot function to generate the plot. If the read length is not fixed, the script will divide the read length into 20 equal parts and label each part with its position within the read.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (10)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
12
'def' Ensure functions have docstrings for documentation
16 20
'== None' Use 'is' for None comparisons (e.g., x is None)
72
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
84 93 163 209
'open(' Use 'with open()' to ensure Files are properly closed
149 172