PageRenderTime 14ms CodeModel.GetById 1ms app.highlight 9ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/fastq/fastq_paired_end_interlacer.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 59 lines | 47 code | 9 blank | 3 comment | 13 complexity | 33dd533483d53744a7b82af3d3bbd7db MD5 | raw file
 1#Florent Angly
 2import sys
 3from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner
 4
 5def main():
 6    mate1_filename   = sys.argv[1]
 7    mate1_type       = sys.argv[2] or 'sanger'
 8    mate2_filename   = sys.argv[3]
 9    mate2_type       = sys.argv[4] or 'sanger'
10    outfile_pairs    = sys.argv[5]
11    outfile_singles = sys.argv[6]
12
13    if mate1_type != mate2_type:
14        print "WARNING: You are trying to interlace files of two different types: %s and %s." % ( mate1_type, mate2_type )
15        return
16
17    type = mate1_type
18    joiner = fastqJoiner( type )
19    out_pairs = fastqWriter( open( outfile_pairs, 'wb' ), format = type )
20    out_singles = fastqWriter( open( outfile_singles, 'wb' ), format = type )
21
22    # Pairs + singles present in mate1
23    nof_singles = 0
24    nof_pairs   = 0
25    mate2_input = fastqNamedReader( open( mate2_filename, 'rb' ), format = type )
26    i = None
27    for i, mate1 in enumerate( fastqReader( open( mate1_filename, 'rb' ), format = type ) ):
28        mate2 = mate2_input.get( joiner.get_paired_identifier( mate1 ) )
29        if mate2:
30            out_pairs.write( mate1 )
31            out_pairs.write( mate2 )
32            nof_pairs += 1
33        else:
34            out_singles.write( mate1 )
35            nof_singles += 1
36
37    # Singles present in mate2
38    mate1_input = fastqNamedReader( open( mate1_filename, 'rb' ), format = type )
39    j = None
40    for j, mate2 in enumerate( fastqReader( open( mate2_filename, 'rb' ), format = type ) ):
41        mate1 = mate1_input.get( joiner.get_paired_identifier( mate2 ) )
42        if not mate1:
43            out_singles.write( mate2 )
44            nof_singles += 1
45
46    if (i is None) and (j is None):
47        print "Your input files contained no valid FASTQ sequences."
48    else:
49        print 'There were %s single reads.' % ( nof_singles )
50        print 'Interlaced %s pairs of sequences.' % ( nof_pairs )
51
52    mate1_input.close()
53    mate2_input.close()
54    out_pairs.close()
55    out_singles.close()
56
57 
# Script entry point: run the interlacer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()