PageRenderTime 20ms CodeModel.GetById 14ms app.highlight 4ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/fastq/fastq_paired_end_deinterlacer.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 66 lines | 52 code | 11 blank | 3 comment | 11 complexity | e70b9161a93788830811a83b80ffdea4 MD5 | raw file
 1#Florent Angly
 2import sys
 3from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner
 4
 5def main():
 6    input_filename   = sys.argv[1]
 7    input_type       = sys.argv[2] or 'sanger'
 8    mate1_filename   = sys.argv[3]
 9    mate2_filename   = sys.argv[4]
10    single1_filename = sys.argv[5]
11    single2_filename = sys.argv[6]
12
13    type        = input_type
14    input       = fastqNamedReader( open( input_filename, 'rb' ), format = type  )
15    mate1_out   = fastqWriter( open( mate1_filename, 'wb' ), format = type )
16    mate2_out   = fastqWriter( open( mate2_filename, 'wb' ), format = type )
17    single1_out = fastqWriter( open( single1_filename, 'wb' ), format = type )
18    single2_out = fastqWriter( open( single2_filename, 'wb' ), format = type )
19    joiner      = fastqJoiner( type )
20
21    i = None
22    skip_count = 0
23    found = {}
24    for i, read in enumerate( fastqReader( open( input_filename, 'rb' ), format = type ) ):
25     
26        if read.identifier in found:
27            del found[read.identifier]
28            continue
29
30        mate1 = input.get( read.identifier )
31
32        mate2 = input.get( joiner.get_paired_identifier( mate1 ) )
33
34        if mate2:
35            # This is a mate pair
36            found[mate2.identifier] = None
37            if joiner.is_first_mate( mate1 ):
38                mate1_out.write( mate1 )
39                mate2_out.write( mate2 )
40            else:
41                mate1_out.write( mate2 )
42                mate2_out.write( mate1 )
43        else:
44            # This is a single
45            skip_count += 1
46            if joiner.is_first_mate( mate1 ):
47                single1_out.write( mate1 )
48            else:
49                single2_out.write( mate1 )
50
51    if i is None:
52        print "Your input file contained no valid FASTQ sequences."
53    else:
54        if skip_count:
55            print 'There were %i reads with no mate.' % skip_count
56        print 'De-interlaced %s pairs of sequences.' % ( (i - skip_count + 1)/2 )
57
58    input.close()
59    mate1_out.close()
60    mate2_out.close()
61    single1_out.close()
62    single2_out.close()
63
64 
# Script entry point: run the tool only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()