fastq_gen_conv.py - This Python script converts a FASTQ fil…

/tools/next_gen_conversion/fastq_gen_conv.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 172 lines · 135 code · 5 blank · 32 comment · 60 complexity · ab0cdcd1329846ec45f0279ec31d510e MD5 · raw file


"""
Converts any type of FASTQ file to Sanger type  and makes small adjustments if necessary.

usage: %prog [options]
   -i, --input=i: Input FASTQ candidate file
   -r, --origType=r: Original type
   -a, --allOrNot=a: Whether or not to check all blocks
   -b, --blocks=b: Number of blocks to check
   -o, --output=o: Output file

usage: %prog input_file oroutput_file
"""

import math, sys
from galaxy import eggs
import pkg_resources; pkg_resources.require( "bx-python" )
from bx.cookbook import doc_optparse

def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()
    
def all_bases_valid(seq):
    """Confirm that the sequence contains only bases"""
    valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N']
    for base in seq:
        if base not in valid_bases:
            return False
    return True

def __main__():
    #Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    orig_type = options.origType
    if orig_type == 'sanger' and options.allOrNot == 'not':
        max_blocks = int(options.blocks)
    else:
        max_blocks = -1
    fin = file(options.input, 'r')
    fout = file(options.output, 'w')
    range_min = 1000
    range_max = -5
    block_num = 0
    bad_blocks = 0
    base_len = -1
    line_count = 0
    lines = []
    line = fin.readline()
    while line:
        if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks:
            fout.write(line)
            if line_count % 4 == 0:
                block_num += 1
            line_count += 1
        elif line.strip():
            # the line that starts a block, with a name
            if line_count % 4 == 0 and line.startswith('@'):
                lines.append(line)
            else:
                # if we expect a sequence of bases
                if line_count % 4 == 1 and all_bases_valid(line.strip()):
                    lines.append(line)
                    base_len = len(line.strip())
                # if we expect the second name line
                elif line_count % 4 == 2 and line.startswith('+'):
                    lines.append(line)
                # if we expect a sequence of qualities and it's the expected length
                elif line_count % 4 == 3:
                    split_line = line.strip().split()
                    # decimal qualities
                    if len(split_line) == base_len:
                        # convert
                        phred_list = []
                        for ch in split_line:
                            int_ch = int(ch)
                            if int_ch < range_min:
                                range_min = int_ch
                            if int_ch > range_max:
                                range_max = int_ch
                            if int_ch >= 0 and int_ch <= 93:
                                phred_list.append(chr(int_ch + 33))
                        # make sure we haven't lost any quality values
                        if len(phred_list) == base_len:
                            # print first three lines
                            for l in lines:
                                fout.write(l)
                            # print converted quality line
                            fout.write(''.join(phred_list))
                            # reset
                            lines = []
                            base_len = -1
                        # abort if so
                        else:
                            bad_blocks += 1
                            lines = []
                            base_len = -1
                    # ascii qualities
                    elif len(split_line[0]) == base_len:
                        qualities = []
                        # print converted quality line
                        if orig_type == 'illumina':
                            for c in line.strip():
                                if ord(c) - 64 < range_min:
                                    range_min = ord(c) - 64
                                if ord(c) - 64 > range_max:
                                    range_max = ord(c) - 64
                                if ord(c) < 64 or ord(c) > 126:
                                    bad_blocks += 1
                                    base_len = -1
                                    lines = []
                                    break
                                else:
                                    qualities.append( chr( ord(c) - 31 ) )
                            quals = ''.join(qualities)
                        elif orig_type == 'solexa':
                            for c in line.strip():
                                if ord(c) - 64 < range_min:
                                    range_min = ord(c) - 64
                                if ord(c) - 64 > range_max:
                                    range_max = ord(c) - 64
                                if ord(c) < 59 or ord(c) > 126:
                                    bad_blocks += 1
                                    base_len = -1
                                    lines = []
                                    break
                                else:
                                    p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) )
                                    qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) )
                            quals = ''.join(qualities)
                        else:  # 'sanger'
                            for c in line.strip():
                                if ord(c) - 33 < range_min:
                                    range_min = ord(c) - 33
                                if ord(c) - 33 > range_max:
                                    range_max = ord(c) - 33
                                if ord(c) < 33 or ord(c) > 126:
                                    bad_blocks += 1
                                    base_len = -1
                                    lines = []
                                    break
                                else:
                                    qualities.append(c)
                            quals = ''.join(qualities)
                        # make sure we don't have bad qualities
                        if len(quals) == base_len:
                            # print first three lines
                            for l in lines:
                                fout.write(l)
                            # print out quality line
                            fout.write(quals+'\n')
                        # reset
                        lines = []
                        base_len = -1
                    else:
                        bad_blocks += 1
                        base_len = -1
                        lines = []
                    # mark the successful end of a block
                    block_num += 1
            line_count += 1
        line = fin.readline()
    fout.close()
    fin.close()
    if range_min != 1000 and range_min != -5:
        outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max)
    else:
        outmsg = ''
    if bad_blocks > 0:
        outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks)
    sys.stdout.write(outmsg)

if __name__=="__main__": __main__()

Summary ✨

This Python script converts a FASTQ file to a Sanger-encoded quality score file. It takes two arguments: the input file and the output file. The script reads the input file line by line, checks the quality scores for each read, and writes the converted quality scores to the output file. If any bad blocks are found during the conversion process, the script skips them and reports their number at the end of the conversion process.

Tech Fingerprint

Alerts (3)

'def' Ensure functions have docstrings for documentation
19
Complexity hotspot; lines 49 to 50 (total complexity: 6)
49 50