/tools/metag_tools/mapping_to_ucsc.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 204 lines · 181 code · 15 blank · 8 comment · 70 complexity · 8638c06e8a7a51e875b4e8b4bc45e807 MD5 · raw file

  1. #!/usr/bin/env python
  2. from galaxy import eggs
  3. import sys, tempfile, os
  4. assert sys.version_info[:2] >= (2.4)
  5. def stop_err(msg):
  6. sys.stderr.write(msg)
  7. sys.exit()
  8. def main():
  9. out_fname = sys.argv[1]
  10. in_fname = sys.argv[2]
  11. chr_col = int(sys.argv[3])-1
  12. coord_col = int(sys.argv[4])-1
  13. track_type = sys.argv[5]
  14. if track_type == 'coverage' or track_type == 'both':
  15. coverage_col = int(sys.argv[6])-1
  16. cname = sys.argv[7]
  17. cdescription = sys.argv[8]
  18. ccolor = sys.argv[9].replace('-',',')
  19. cvisibility = sys.argv[10]
  20. if track_type == 'snp' or track_type == 'both':
  21. if track_type == 'both':
  22. j = 5
  23. else:
  24. j = 0
  25. #sname = sys.argv[7+j]
  26. sdescription = sys.argv[6+j]
  27. svisibility = sys.argv[7+j]
  28. #ref_col = int(sys.argv[10+j])-1
  29. read_col = int(sys.argv[8+j])-1
  30. # Sort the input file based on chromosome (alphabetically) and start co-ordinates (numerically)
  31. sorted_infile = tempfile.NamedTemporaryFile()
  32. try:
  33. os.system("sort -k %d,%d -k %dn -o %s %s" %(chr_col+1,chr_col+1,coord_col+1,sorted_infile.name,in_fname))
  34. except Exception, exc:
  35. stop_err( 'Initialization error -> %s' %str(exc) )
  36. #generate chr list
  37. sorted_infile.seek(0)
  38. chr_vals = []
  39. for line in file( sorted_infile.name ):
  40. line = line.strip()
  41. if not(line):
  42. continue
  43. try:
  44. fields = line.split('\t')
  45. chr = fields[chr_col]
  46. if chr not in chr_vals:
  47. chr_vals.append(chr)
  48. except:
  49. pass
  50. if not(chr_vals):
  51. stop_err("Skipped all lines as invalid.")
  52. if track_type == 'coverage' or track_type == 'both':
  53. if track_type == 'coverage':
  54. fout = open( out_fname, "w" )
  55. else:
  56. fout = tempfile.NamedTemporaryFile()
  57. fout.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
  58. % ( cname, cdescription, ccolor, cvisibility ))
  59. if track_type == 'snp' or track_type == 'both':
  60. fout_a = tempfile.NamedTemporaryFile()
  61. fout_t = tempfile.NamedTemporaryFile()
  62. fout_g = tempfile.NamedTemporaryFile()
  63. fout_c = tempfile.NamedTemporaryFile()
  64. fout_ref = tempfile.NamedTemporaryFile()
  65. fout_a.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
  66. % ( "Track A", sdescription, '255,0,0', svisibility ))
  67. fout_t.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
  68. % ( "Track T", sdescription, '0,255,0', svisibility ))
  69. fout_g.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
  70. % ( "Track G", sdescription, '0,0,255', svisibility ))
  71. fout_c.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
  72. % ( "Track C", sdescription, '255,0,255', svisibility ))
  73. sorted_infile.seek(0)
  74. for line in file( sorted_infile.name ):
  75. line = line.strip()
  76. if not(line):
  77. continue
  78. try:
  79. fields = line.split('\t')
  80. chr = fields[chr_col]
  81. start = int(fields[coord_col])
  82. assert start > 0
  83. except:
  84. continue
  85. try:
  86. ind = chr_vals.index(chr) #encountered chr for the 1st time
  87. del chr_vals[ind]
  88. prev_start = ''
  89. header = "variableStep chrom=%s\n" %(chr)
  90. if track_type == 'coverage' or track_type == 'both':
  91. coverage = int(fields[coverage_col])
  92. line1 = "%s\t%s\n" %(start,coverage)
  93. fout.write("%s%s" %(header,line1))
  94. if track_type == 'snp' or track_type == 'both':
  95. a = t = g = c = 0
  96. fout_a.write("%s" %(header))
  97. fout_t.write("%s" %(header))
  98. fout_g.write("%s" %(header))
  99. fout_c.write("%s" %(header))
  100. try:
  101. #ref_nt = fields[ref_col].capitalize()
  102. read_nt = fields[read_col].capitalize()
  103. try:
  104. nt_ind = ['A','T','G','C'].index(read_nt)
  105. if nt_ind == 0:
  106. a+=1
  107. elif nt_ind == 1:
  108. t+=1
  109. elif nt_ind == 2:
  110. g+=1
  111. else:
  112. c+=1
  113. except ValueError:
  114. pass
  115. except:
  116. pass
  117. prev_start = start
  118. except ValueError:
  119. if start != prev_start:
  120. if track_type == 'coverage' or track_type == 'both':
  121. coverage = int(fields[coverage_col])
  122. fout.write("%s\t%s\n" %(start,coverage))
  123. if track_type == 'snp' or track_type == 'both':
  124. if a:
  125. fout_a.write("%s\t%s\n" %(prev_start,a))
  126. if t:
  127. fout_t.write("%s\t%s\n" %(prev_start,t))
  128. if g:
  129. fout_g.write("%s\t%s\n" %(prev_start,g))
  130. if c:
  131. fout_c.write("%s\t%s\n" %(prev_start,c))
  132. a = t = g = c = 0
  133. try:
  134. #ref_nt = fields[ref_col].capitalize()
  135. read_nt = fields[read_col].capitalize()
  136. try:
  137. nt_ind = ['A','T','G','C'].index(read_nt)
  138. if nt_ind == 0:
  139. a+=1
  140. elif nt_ind == 1:
  141. t+=1
  142. elif nt_ind == 2:
  143. g+=1
  144. else:
  145. c+=1
  146. except ValueError:
  147. pass
  148. except:
  149. pass
  150. prev_start = start
  151. else:
  152. if track_type == 'snp' or track_type == 'both':
  153. try:
  154. #ref_nt = fields[ref_col].capitalize()
  155. read_nt = fields[read_col].capitalize()
  156. try:
  157. nt_ind = ['A','T','G','C'].index(read_nt)
  158. if nt_ind == 0:
  159. a+=1
  160. elif nt_ind == 1:
  161. t+=1
  162. elif nt_ind == 2:
  163. g+=1
  164. else:
  165. c+=1
  166. except ValueError:
  167. pass
  168. except:
  169. pass
  170. if track_type == 'snp' or track_type == 'both':
  171. if a:
  172. fout_a.write("%s\t%s\n" %(prev_start,a))
  173. if t:
  174. fout_t.write("%s\t%s\n" %(prev_start,t))
  175. if g:
  176. fout_g.write("%s\t%s\n" %(prev_start,g))
  177. if c:
  178. fout_c.write("%s\t%s\n" %(prev_start,c))
  179. fout_a.seek(0)
  180. fout_g.seek(0)
  181. fout_t.seek(0)
  182. fout_c.seek(0)
  183. if track_type == 'snp':
  184. os.system("cat %s %s %s %s >> %s" %(fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))
  185. elif track_type == 'both':
  186. fout.seek(0)
  187. os.system("cat %s %s %s %s %s | cat > %s" %(fout.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))
  188. if __name__ == "__main__":
  189. main()