/tools/metag_tools/convert_SOLiD_color2nuc.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 89 lines · 56 code · 27 blank · 6 comment · 13 complexity · daa5194fc8cc37263b6da9d47c2507d1 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. convert SOLiD color-space data to nucleotide sequence
  4. example: T011213122200221123032111221021210131332222101
  5. TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
  6. """
  7. import sys, os
  8. def stop_err(msg):
  9. sys.stderr.write(msg)
  10. sys.stderr.write('\n')
  11. sys.exit()
  12. def color2base(color_seq):
  13. first_nuc = ['A','C','G','T']
  14. code_matrix = {}
  15. code_matrix['0'] = ['A','C','G','T']
  16. code_matrix['1'] = ['C','A','T','G']
  17. code_matrix['2'] = ['G','T','A','C']
  18. code_matrix['3'] = ['T','G','C','A']
  19. overlap_nuc = ''
  20. nuc_seq = ''
  21. seq_prefix = prefix = color_seq[0].upper()
  22. color_seq = color_seq[1:]
  23. if not (seq_prefix in first_nuc):
  24. stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' %seq_prefix )
  25. for code in color_seq:
  26. if not (code in ['0','1','2','3']):
  27. stop_err('Expect digits (0, 1, 2, 3) in the color-cading data. File contains numbers other than the set.\nThe file contains a %s' %code)
  28. second_nuc = code_matrix[code]
  29. overlap_nuc = second_nuc[first_nuc.index(prefix)]
  30. nuc_seq += overlap_nuc
  31. prefix = overlap_nuc
  32. return seq_prefix, nuc_seq
  33. def __main__():
  34. infilename = sys.argv[1]
  35. keep_prefix = sys.argv[2].lower()
  36. outfilename = sys.argv[3]
  37. outfile = open(outfilename,'w')
  38. prefix = ''
  39. color_seq = ''
  40. for i, line in enumerate(file(infilename)):
  41. line = line.rstrip('\r\n')
  42. if not line: continue
  43. if line.startswith("#"): continue
  44. if line.startswith(">"):
  45. if color_seq:
  46. prefix, nuc_seq = color2base(color_seq)
  47. if keep_prefix == 'yes':
  48. nuc_seq = prefix + nuc_seq
  49. outfile.write(title+'\n')
  50. outfile.write(nuc_seq+'\n')
  51. title = line
  52. color_seq = ''
  53. else:
  54. color_seq += line
  55. if color_seq:
  56. prefix, nuc_seq = color2base(color_seq)
  57. if keep_prefix == 'yes':
  58. nuc_seq = prefix + nuc_seq
  59. outfile.write(title+'\n')
  60. outfile.write(nuc_seq+'\n')
  61. outfile.close()
  62. if __name__=='__main__': __main__()