/tools/next_gen_conversion/solid2fastq.py
Python | 209 lines | 205 code | 3 blank | 1 comment | 3 complexity | 41a0372b075cb8f65df81ac8bbb6b8dd MD5 | raw file
1#!/usr/bin/env python 2 3import sys 4import string 5import optparse 6import tempfile 7import sqlite3 8 9def stop_err( msg ): 10 sys.stderr.write( msg ) 11 sys.exit() 12 13def solid2sanger( quality_string, min_qual = 0 ): 14 sanger = "" 15 quality_string = quality_string.rstrip( " " ) 16 for qv in quality_string.split(" "): 17 try: 18 if int( qv ) < 0: 19 qv = '0' 20 if int( qv ) < min_qual: 21 return False 22 break 23 sanger += chr( int( qv ) + 33 ) 24 except: 25 pass 26 return sanger 27 28def Translator(frm='', to='', delete='', keep=None): 29 allchars = string.maketrans('','') 30 if len(to) == 1: 31 to = to * len(frm) 32 trans = string.maketrans(frm, to) 33 if keep is not None: 34 delete = allchars.translate(allchars, keep.translate(allchars, delete)) 35 def callable(s): 36 return s.translate(trans, delete) 37 return callable 38 39def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ): 40 41 # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter, 42 # which can have three values: fastq, txt, and db 43 # fastq = fastq format 44 # txt = space delimited format with defline, reads, and qvs 45 # dp = dump data into sqlite3 db. 46 # IMPORTNAT! If out = db two optins must be provided: 47 # 1. f_out must be a db connection object initialized with sqlite3.connect() 48 # 2. table_name must be provided 49 50 if out == 'db': 51 cursor = f_out.cursor() 52 sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name 53 cursor.execute(sql) 54 55 lines = [] 56 line = " " 57 while line: 58 for f in [ f_reads, f_qual ]: 59 line = f.readline().rstrip( '\n\r' ) 60 while line.startswith( '#' ): 61 line = f.readline().rstrip( '\n\r' ) 62 lines.append( line ) 63 64 if lines[0].startswith( '>' ): 65 defline = lines[0][1:] 66 if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ): 67 defline = defline[ : len( defline )-3 ] 68 69 else: 70 71 if trim_first_base: 72 lines[0] = lines[0][1:] 73 if double_encode: 74 de = Translator(frm="0123.", to="ACGTN") 75 lines[0] = de(lines[0]) 76 qual = solid2sanger( lines[1], int( min_qual ) ) 77 if qual: 78 if out == 'fastq': 79 f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) 80 if out == 'txt': 81 f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) ) 82 if out == 'db': 83 cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) ) 84 lines = [] 85 86def main(): 87 88 usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]" 89 parser = optparse.OptionParser(usage=usage) 90 91 92 parser.add_option( 93 '--fr','--f_reads', 94 metavar="F3_CSFASTA_FILE", 95 dest='fr', 96 help='Name of F3 file with color space reads') 97 98 parser.add_option( 99 '--fq','--f_qual', 100 metavar="F3_QUAL_FILE", 101 dest='fq', 102 help='Name of F3 file with color quality values') 103 104 parser.add_option( 105 '--fout','--f3_fastq_output', 106 metavar="F3_OUTPUT", 107 dest='fout', 108 help='Name for F3 output file') 109 110 parser.add_option( 111 '--rr','--r_reads', 112 metavar="R3_CSFASTA_FILE", 113 dest='rr', 114 default = False, 115 help='Name of R3 file with color space reads') 116 117 parser.add_option( 118 '--rq','--r_qual', 119 metavar="R3_QUAL_FILE", 120 dest='rq', 121 default = False, 122 help='Name of R3 file with color quality values') 123 124 parser.add_option( 125 '--rout', 126 metavar="R3_OUTPUT", 127 dest='rout', 128 help='Name for F3 output file') 129 130 parser.add_option( 131 '-q','--min_qual', 132 dest='min_qual', 133 default = '-1000', 134 help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000') 135 136 parser.add_option( 137 '-t','--trim_name', 138 dest='trim_name', 139 action='store_true', 140 default = False, 141 help='Trim _R3 and _F3 off read names. Default is False') 142 143 parser.add_option( 144 '-f','--trim_first_base', 145 dest='trim_first_base', 146 action='store_true', 147 default = False, 148 help='Remove the first base of reads in color-space. Default is False') 149 150 parser.add_option( 151 '-d','--double_encode', 152 dest='de', 153 action='store_true', 154 default = False, 155 help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False') 156 157 options, args = parser.parse_args() 158 159 if not ( options.fout and options.fr and options.fq ): 160 parser.error(""" 161 One or more of the three required paremetrs is missing: 162 (1) --fr F3.csfasta file 163 (2) --fq F3.qual file 164 (3) --fout name of output file 165 Use --help for more info 166 """) 167 168 fr = open ( options.fr , 'r' ) 169 fq = open ( options.fq , 'r' ) 170 f_out = open ( options.fout , 'w' ) 171 172 if options.rr and options.rq: 173 rr = open ( options.rr , 'r' ) 174 rq = open ( options.rq , 'r' ) 175 if not options.rout: 176 parser.error("Provide the name for f3 output using --rout option. Use --help for more info") 177 r_out = open ( options.rout, 'w' ) 178 179 db = tempfile.NamedTemporaryFile() 180 181 try: 182 con = sqlite3.connect(db.name) 183 cur = con.cursor() 184 except: 185 stop_err('Cannot connect to %s\n') % db.name 186 187 188 merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" ) 189 merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" ) 190 cur.execute('create index f3_name on f3( name )') 191 cur.execute('create index r3_name on r3( name )') 192 193 cur.execute('select * from r3,f3 where f3.name = r3.name') 194 for item in cur: 195 f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) ) 196 r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) ) 197 198 199 else: 200 merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual ) 201 202 203 204 f_out.close() 205 206if __name__ == "__main__": 207 main() 208 209