PageRenderTime 27ms CodeModel.GetById 2ms app.highlight 19ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/metag_tools/convert_SOLiD_color2nuc.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 89 lines | 77 code | 6 blank | 6 comment | 1 complexity | daa5194fc8cc37263b6da9d47c2507d1 MD5 | raw file
 1#!/usr/bin/env python
 2"""
convert SOLiD color-space data to nucleotide sequence
 4example: T011213122200221123032111221021210131332222101
 5         TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
 6"""
 7
 8import sys, os
 9
def stop_err(msg):
    """Report a fatal error and terminate the program.

    Writes ``msg`` followed by a newline to standard error, then raises
    ``SystemExit`` with status 1 so that callers of the script (shells,
    workflow engines) can detect the failure from the exit code and not
    only from the presence of stderr output.

    Parameters:
        msg: the error text to print (a string).

    Raises:
        SystemExit: always, with exit status 1.
    """
    sys.stderr.write(msg + '\n')
    # A bare sys.exit() would exit with status 0, i.e. report success.
    sys.exit(1)
15    
def color2base(color_seq):
    """Decode a SOLiD color-space read into nucleotides.

    ``color_seq`` is a leading nucleotide (A, C, G or T, any case) followed
    by color digits 0-3. Each digit, combined with the previously decoded
    base, determines the next base.

    Returns a tuple ``(leading_base, decoded)`` where ``leading_base`` is
    the upper-cased first character and ``decoded`` is the string of bases
    obtained from the digits (the leading base itself is not included).

    Calls ``stop_err`` if the first character is not one of the four
    nucleotides or if any following character is not 0, 1, 2 or 3.
    """
    # Column of each base inside the transition rows below.
    base_column = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # transitions[color][column of previous base] -> next base
    transitions = {
        '0': 'ACGT',
        '1': 'CATG',
        '2': 'GTAC',
        '3': 'TGCA',
    }

    seq_prefix = color_seq[0].upper()
    if seq_prefix not in base_column:
        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' % seq_prefix)

    decoded = []
    current = seq_prefix
    for color in color_seq[1:]:
        if color not in transitions:
            stop_err('Expect digits (0, 1, 2, 3) in the color-cading data. File contains numbers other than the set.\nThe file contains a %s' % color)

        # The base just decoded becomes the context for the next color.
        current = transitions[color][base_column[current]]
        decoded.append(current)

    return seq_prefix, ''.join(decoded)
45
def __main__():
    """Convert a file of SOLiD color-space reads to nucleotide sequences.

    Command-line arguments (taken from ``sys.argv``):
        1. input file name. Lines starting with '>' are record titles,
           blank lines and lines starting with '#' are skipped, and every
           other line is appended to the current record's color sequence.
        2. 'yes' (case-insensitive) to keep the leading nucleotide in the
           output sequence; any other value drops it.
        3. output file name (overwritten).

    For every record that has sequence data, the title line is written
    followed by the decoded sequence on a single line. Titles with no
    sequence data are skipped.

    Calls ``stop_err`` when arguments are missing or when sequence data
    appears before the first title line.
    """
    if len(sys.argv) < 4:
        stop_err('Usage: %s <input_file> <keep_prefix: yes|no> <output_file>' % sys.argv[0])

    infilename = sys.argv[1]
    keep_prefix = sys.argv[2].lower() == 'yes'
    outfilename = sys.argv[3]

    # Context managers guarantee both files are closed even when a read is
    # rejected part-way through; open() works on Python 2 and 3 alike.
    with open(outfilename, 'w') as outfile:
        with open(infilename) as infile:
            title = None
            chunks = []
            for line in infile:
                line = line.rstrip('\r\n')

                if not line or line.startswith('#'):
                    continue

                if line.startswith('>'):
                    # A new title closes the previous record, if it had data.
                    if chunks:
                        _write_record(outfile, title, ''.join(chunks), keep_prefix)
                    title = line
                    chunks = []
                else:
                    if title is None:
                        stop_err('Sequence data found before the first title line (a line starting with ">").')
                    chunks.append(line)

            # Flush the final record, which has no following title line.
            if chunks:
                _write_record(outfile, title, ''.join(chunks), keep_prefix)


def _write_record(outfile, title, color_seq, keep_prefix):
    """Decode one color-space record and write its title and sequence lines."""
    prefix, nuc_seq = color2base(color_seq)
    if keep_prefix:
        nuc_seq = prefix + nuc_seq
    outfile.write(title + '\n')
    outfile.write(nuc_seq + '\n')
88    
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    __main__()