PageRenderTime 17ms CodeModel.GetById 1ms app.highlight 13ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/next_gen_conversion/solid2fastq.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 214 lines | 210 code | 3 blank | 1 comment | 3 complexity | df0c875e235646ef0629c19128f5eed6 MD5 | raw file
  1#!/usr/bin/env python
  2
  3import sys
  4import string
  5import optparse
  6import tempfile
  7import sqlite3
  8
  9def stop_err( msg ):
 10    sys.stderr.write( msg )
 11    sys.exit()
 12    
 13def solid2sanger( quality_string, min_qual = 0 ):
 14    sanger = ""
 15    quality_string = quality_string.rstrip( " " )
 16    for qv in quality_string.split(" "):
 17        try:
 18            if int( qv ) < 0:
 19                qv = '0'
 20            if int( qv ) < min_qual:
 21                return False
 22                break
 23            sanger += chr( int( qv ) + 33 )
 24        except:
 25            pass    
 26    return sanger
 27
 28def Translator(frm='', to='', delete='', keep=None):
 29    allchars = string.maketrans('','')
 30    if len(to) == 1:
 31        to = to * len(frm)
 32    trans = string.maketrans(frm, to)
 33    if keep is not None:
 34        delete = allchars.translate(allchars, keep.translate(allchars, delete))
 35    def callable(s):
 36        return s.translate(trans, delete)
 37    return callable
 38    
 39def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ):
 40 
 41    # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter,
 42    # which can have three values: fastq, txt, and db
 43    # fastq = fastq format
 44    # txt = space delimited format with defline, reads, and qvs
 45    # dp = dump data into sqlite3 db. 
 46    # IMPORTNAT! If out = db two optins must be provided:
 47    #   1. f_out must be a db connection object initialized with sqlite3.connect()
 48    #   2. table_name must be provided
 49    
 50    if out == 'db':
 51        cursor = f_out.cursor()
 52        sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name
 53        cursor.execute(sql)
 54    
 55    lines = []
 56    line = " "
 57    while line:
 58        for f in [ f_reads, f_qual ]:
 59            line = f.readline().rstrip( '\n\r' )
 60            while line.startswith( '#' ):
 61                line = f.readline().rstrip( '\n\r' )
 62            lines.append( line )
 63    
 64            
 65        if lines[0].startswith( '>' ) and lines[1].startswith( '>' ):
 66            
 67            if lines[0] != lines[1]:
 68                stop_err('Files reads and quality score files are out of sync and likely corrupted. Please, check your input data')
 69            
 70            defline = lines[0][1:]
 71            if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ):
 72                defline = defline[ : len( defline )-3 ]
 73                
 74        elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ):
 75
 76            if trim_first_base:
 77                lines[0] = lines[0][1:]
 78            if double_encode:
 79                de = Translator(frm="0123.", to="ACGTN")
 80                lines[0] = de(lines[0])
 81            qual = solid2sanger( lines[1], int( min_qual ) )
 82            if qual:
 83                if out == 'fastq':
 84                    f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) 
 85                if out == 'txt':
 86                    f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) )
 87                if out == 'db':
 88                    cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) )     
 89        lines = []
 90
 91def main():
 92
 93    usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]"
 94    parser = optparse.OptionParser(usage=usage)
 95    
 96        
 97    parser.add_option(
 98        '--fr','--f_reads',
 99        metavar="F3_CSFASTA_FILE",
100        dest='fr',
101        help='Name of F3 file with color space reads')
102        
103    parser.add_option(
104        '--fq','--f_qual',
105        metavar="F3_QUAL_FILE",
106        dest='fq',
107        help='Name of F3 file with color quality values')
108        
109    parser.add_option(
110        '--fout','--f3_fastq_output',
111        metavar="F3_OUTPUT",
112        dest='fout',
113        help='Name for F3 output file')
114                
115    parser.add_option(
116        '--rr','--r_reads',
117        metavar="R3_CSFASTA_FILE",
118        dest='rr',
119        default = False,
120        help='Name of R3 file with color space reads')
121        
122    parser.add_option(
123        '--rq','--r_qual',
124        metavar="R3_QUAL_FILE",
125        dest='rq',
126        default = False,
127        help='Name of R3 file with color quality values')
128        
129    parser.add_option(
130        '--rout',
131        metavar="R3_OUTPUT",
132        dest='rout',
133        help='Name for F3 output file')
134    
135    parser.add_option(
136        '-q','--min_qual',
137        dest='min_qual',
138        default = '-1000',
139        help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')
140        
141    parser.add_option(
142        '-t','--trim_name',
143        dest='trim_name',
144        action='store_true',
145        default = False,
146        help='Trim _R3 and _F3 off read names. Default is False')
147    
148    parser.add_option(
149        '-f','--trim_first_base',
150        dest='trim_first_base',
151        action='store_true',
152        default = False,
153        help='Remove the first base of reads in color-space. Default is False')
154        
155    parser.add_option(
156        '-d','--double_encode',
157        dest='de',
158        action='store_true',
159        default = False,
160        help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')
161    
162    options, args = parser.parse_args()
163    
164    if not ( options.fout and options.fr and options.fq ):
165        parser.error("""
166        One or more of the three required paremetrs is missing:
167        (1) --fr F3.csfasta file
168        (2) --fq F3.qual file
169        (3) --fout name of output file
170        Use --help for more info
171        """)
172
173    fr =  open ( options.fr , 'r' ) 
174    fq =  open ( options.fq , 'r' ) 
175    f_out = open ( options.fout , 'w' )
176    
177    if options.rr and options.rq:
178        rr =  open ( options.rr , 'r' ) 
179        rq =  open ( options.rq , 'r' ) 
180        if not options.rout:
181            parser.error("Provide the name for f3 output using --rout option. Use --help for more info")
182        r_out = open ( options.rout, 'w' )
183    
184        db = tempfile.NamedTemporaryFile()
185        
186        try:
187            con = sqlite3.connect(db.name)
188            cur = con.cursor()
189        except:
190            stop_err('Cannot connect to %s\n') % db.name
191    
192         
193        merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" )
194        merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" )
195        cur.execute('create index f3_name on f3( name )')
196        cur.execute('create index r3_name on r3( name )')
197         
198        cur.execute('select * from f3,r3 where f3.name = r3.name')
199        for item in cur:
200            f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) )
201            r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) )
202        
203            
204    else:
205        merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual )
206        
207    
208      
209    f_out.close()
210
211if __name__ == "__main__":
212    main()
213    
214