/tools/human_genome_variation/senatag.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 243 lines · 171 code · 53 blank · 19 comment · 43 complexity · 572347b65afb4b849302dd885a7c3fcc MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. This tool takes the following file pairs as input:
  4. a) input_snp : A file with identifiers for SNPs (one on each line)
  5. b) ldfile : A file where each line has the following
  6. snp list
  7. where "snp" is an identifier for one SNP and the "list" is a
  8. comma separated list of all the other snps that are in LD with
  9. it (as per some threshold of rsquare)
  10. The output is a set of tag SNPs for the given datasets
  11. The algorithm is as follows:
  12. a) Construct a graph for each population, where each node is a SNP and two nodes
  13. are connected using an edge iff they are in LD.
  14. b) For each SNP, count the total number of connected nodes, which have not yet
  15. been visited.
  16. c) Find the SNP with the highest count and assign it to be a tag SNP.
  17. d) Mark that SNP and all the snps connected to it as "visited". This should be
  18. done for each population.
  19. e) Continue steps b-e until all SNPs, in all populations have been visited.
  20. """
  21. from sys import argv, stderr, exit
  22. from getopt import getopt, GetoptError
  23. import os
  24. import heapq
  25. __author__ = "Aakrosh Ratan"
  26. __email__ = "ratan@bx.psu.edu"
  27. # do we want the debug information to be printed?
  28. debug_flag = False
  29. class node:
  30. def __init__(self, name):
  31. self.name = name
  32. self.edges = []
  33. self.visited = False
  34. # return the number of nodes connected to this node, that have yet to be
  35. # visited
  36. def num_not_visited(self):
  37. num = 0
  38. for n in self.edges:
  39. if n.visited == False: num += 1
  40. return num
  41. def __cmp__(self, other):
  42. return other.num_not_visited() - self.num_not_visited()
  43. def __str__(self):
  44. return self.name
  45. class graph:
  46. def __init__(self):
  47. self.nodes = {}
  48. def __str__(self):
  49. string = ""
  50. for n1 in self.nodes.values():
  51. n2s = [x.name for x in n1.edges]
  52. string += "%s %s\n" % (n1.name, ",".join(n2s))
  53. return string[:-1]
  54. def add_node(self, n):
  55. self.nodes[n.name] = n
  56. def add_edges(self, n1, n2):
  57. assert n1.name in self.nodes
  58. assert n2.name in self.nodes
  59. n1.edges.append(n2)
  60. n2.edges.append(n1)
  61. def check_graph(self):
  62. for n in self.nodes.values():
  63. ms = [x for x in n.edges]
  64. for m in ms:
  65. if n not in m.edges:
  66. print >> stderr, "check : %s - %s" % (n,m)
  67. def construct_graph(ldfile, snpfile):
  68. # construct the initial graph. add all the SNPs as nodes
  69. g = graph()
  70. file = open(snpfile, "r")
  71. for line in file:
  72. # ignore empty lines and add the remainder to the graph
  73. if len(line.strip()) == 0: continue
  74. n = node(line.strip())
  75. g.add_node(n)
  76. file.close()
  77. print >> stderr, "Added %d nodes to a graph" % len(g.nodes)
  78. # now add all the edges
  79. file = open(ldfile, "r")
  80. for line in file:
  81. tokens = line.split()
  82. assert len(tokens) == 2
  83. # if this node is in the graph, then we need to construct an edge from
  84. # this node to all the nodes which are highly related to it
  85. if tokens[0] in g.nodes:
  86. n1 = g.nodes[tokens[0]]
  87. n2s = [g.nodes[x] for x in tokens[1].split(",")]
  88. for n2 in n2s:
  89. g.add_edges(n1, n2)
  90. file.close()
  91. print >> stderr, "Added all edges to the graph"
  92. return g
  93. def check_output(g, tagsnps):
  94. # find all the nodes in the graph
  95. allsnps = [x.name for x in g.nodes.values()]
  96. # find the nodes that are covered by our tagsnps
  97. mysnps = [x.name for x in tagsnps]
  98. for n in tagsnps:
  99. for m in n.edges:
  100. mysnps.append(m.name)
  101. mysnps = list(set(mysnps))
  102. if set(allsnps) != set(mysnps):
  103. diff = list(set(allsnps) - set(mysnps))
  104. print >> stderr, "%s are not covered" % ",".join(diff)
  105. def main(ldfile, snpsfile, required, excluded):
  106. # construct the graph
  107. g = construct_graph(ldfile, snpsfile)
  108. if debug_flag == True: g.check_graph()
  109. tagsnps = []
  110. neighbors = {}
  111. # take care of the SNPs that are required to be TagSNPs
  112. for s in required:
  113. t = g.nodes[s]
  114. t.visited = True
  115. ns = []
  116. for n in t.edges:
  117. if n.visited == False: ns.append(n.name)
  118. n.visited = True
  119. tagsnps.append(t)
  120. neighbors[t.name] = list(set(ns))
  121. # find the tag SNPs for this graph
  122. data = [x for x in g.nodes.values()]
  123. heapq.heapify(data)
  124. while data:
  125. s = heapq.heappop(data)
  126. if s.visited == True or s.name in excluded: continue
  127. s.visited = True
  128. ns = []
  129. for n in s.edges:
  130. if n.visited == False: ns.append(n.name)
  131. n.visited = True
  132. tagsnps.append(s)
  133. neighbors[s.name] = list(set(ns))
  134. heapq.heapify(data)
  135. for s in tagsnps:
  136. if len(neighbors[s.name]) > 0:
  137. print "%s\t%s" % (s, ",".join(neighbors[s.name]))
  138. continue
  139. print s
  140. if debug_flag == True: check_output(g, tagsnps)
  141. def read_list(filename):
  142. assert os.path.exists(filename) == True
  143. file = open(filename, "r")
  144. list = {}
  145. for line in file:
  146. list[line.strip()] = 1
  147. file.close()
  148. return list
  149. def usage():
  150. f = stderr
  151. print >> f, "usage:"
  152. print >> f, "senatag [options] neighborhood.txt inputsnps.txt"
  153. print >> f, "where inputsnps.txt is a file of snps from one population"
  154. print >> f, "where neighborhood.txt is neighborhood details for the pop."
  155. print >> f, "where the options are:"
  156. print >> f, "-h,--help : print usage and quit"
  157. print >> f, "-d,--debug: print debug information"
  158. print >> f, "-e,--excluded : file with names of SNPs that cannot be TagSNPs"
  159. print >> f, "-r,--required : file with names of SNPs that should be TagSNPs"
  160. if __name__ == "__main__":
  161. try:
  162. opts, args = getopt(argv[1:], "hdr:e:",\
  163. ["help", "debug", "required=", "excluded="])
  164. except GetoptError, err:
  165. print str(err)
  166. usage()
  167. exit(2)
  168. required = {}
  169. excluded = {}
  170. for o, a in opts:
  171. if o in ("-h", "--help"):
  172. usage()
  173. exit()
  174. elif o in ("-d", "--debug"):
  175. debug_flag = True
  176. elif o in ("-r", "--required"):
  177. required = read_list(a)
  178. elif o in ("-e", "--excluded"):
  179. excluded = read_list(a)
  180. else:
  181. assert False, "unhandled option"
  182. if len(args) != 2:
  183. usage()
  184. exit(3)
  185. assert os.path.exists(args[0]) == True
  186. assert os.path.exists(args[1]) == True
  187. main(args[0], args[1], required, excluded)