senatag.py - This is a Python script that takes two input f…

/tools/human_genome_variation/senatag.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 243 lines · 171 code · 53 blank · 19 comment · 43 complexity · 572347b65afb4b849302dd885a7c3fcc MD5 · raw file


#!/usr/bin/env python

"""
This tool takes the following file pairs as input:
a) input_snp  : A file with identifiers for SNPs (one on each line)
b) ldfile     : A file where each line has  the following
                snp     list
                where "snp" is an identifier for one SNP and the "list" is a 
                comma separated list of all the other snps that are in LD with
                it (as per some threshold of rsquare)

The output is a set of tag SNPs for the given datasets

The algorithm is as follows:

a) Construct a graph for each population, where each node is a SNP and two nodes
are connected using an edge iff they are in LD.
b) For each SNP, count the total number of connected nodes, which have not yet
been visited.
c) Find the SNP with the highest count and assign it to be a tag SNP.
d) Mark that SNP and all the snps connected to it as "visited". This should be
done for each population.
e) Continue steps b-e until all SNPs, in all populations have been visited.
"""

from sys import argv, stderr, exit
from getopt import getopt, GetoptError

import os
import heapq

__author__ = "Aakrosh Ratan"
__email__  = "ratan@bx.psu.edu"

# do we want the debug information to be printed?
debug_flag = False

class node:
    def __init__(self, name):
        self.name    = name
        self.edges   = []
        self.visited = False

    # return the number of nodes connected to this node, that have yet to be
    # visited
    def num_not_visited(self):
        num = 0
        for n in self.edges:
            if n.visited == False: num += 1
        return num 

    def __cmp__(self, other):
        return other.num_not_visited() - self.num_not_visited()

    def __str__(self):  
        return self.name

class graph:
    def __init__(self):
        self.nodes = {}

    def __str__(self):
        string = ""
        for n1 in self.nodes.values():
            n2s = [x.name for x in n1.edges]
            string += "%s %s\n" % (n1.name, ",".join(n2s))
        return string[:-1]

    def add_node(self, n):
        self.nodes[n.name] = n

    def add_edges(self, n1, n2):
        assert n1.name in self.nodes
        assert n2.name in self.nodes
        n1.edges.append(n2)
        n2.edges.append(n1)

    def check_graph(self):
        for n in self.nodes.values():
            ms = [x for x in n.edges]
            for m in ms:
                if n not in m.edges:
                    print >> stderr, "check : %s - %s" % (n,m)

def construct_graph(ldfile, snpfile):
    # construct the initial graph. add all the SNPs as nodes
    g = graph()
    file = open(snpfile, "r")

    for line in file:
        # ignore empty lines and add the remainder to the graph
        if len(line.strip()) == 0: continue
        n = node(line.strip())           
        g.add_node(n)

    file.close()
    print >> stderr, "Added %d nodes to a graph" % len(g.nodes)
  
    # now add all the edges
    file   = open(ldfile, "r")

    for line in file:
        tokens = line.split()
        assert len(tokens) == 2

        # if this node is in the graph, then we need to construct an edge from
        # this node to all the nodes which are highly related to it
        if tokens[0] in g.nodes:
            n1  = g.nodes[tokens[0]]
            n2s = [g.nodes[x] for x in tokens[1].split(",")]

            for n2 in n2s:
                g.add_edges(n1, n2)

    file.close()
    print >> stderr, "Added all edges to the graph"

    return g
     
def check_output(g, tagsnps):
    # find all the nodes in the graph 
    allsnps = [x.name for x in g.nodes.values()]

    # find the nodes that are covered by our tagsnps
    mysnps = [x.name for x in tagsnps]

    for n in tagsnps:
        for m in n.edges:
                mysnps.append(m.name)

    mysnps = list(set(mysnps))

    if set(allsnps) != set(mysnps):
        diff = list(set(allsnps) - set(mysnps))
        print >> stderr, "%s are not covered" % ",".join(diff)

def main(ldfile, snpsfile, required, excluded):
    # construct the graph
    g = construct_graph(ldfile, snpsfile)
    if debug_flag == True: g.check_graph()

    tagsnps   = []
    neighbors = {}

    # take care of the SNPs that are required to be TagSNPs
    for s in required:
        t = g.nodes[s]

        t.visited = True
        ns = []
 
        for n in t.edges:
            if n.visited == False: ns.append(n.name)
            n.visited = True 
        
        tagsnps.append(t)
        neighbors[t.name] = list(set(ns))

    # find the tag SNPs for this graph
    data = [x for x in g.nodes.values()]
    heapq.heapify(data)

    while data:
        s = heapq.heappop(data)

        if s.visited == True or s.name in excluded: continue

        s.visited = True
        ns = []

        for n in s.edges:
            if n.visited == False: ns.append(n.name)
            n.visited = True
            
        tagsnps.append(s)
        neighbors[s.name] = list(set(ns))

        heapq.heapify(data)

    for s in tagsnps:
        if len(neighbors[s.name]) > 0: 
            print "%s\t%s" % (s, ",".join(neighbors[s.name]))
            continue
        print s
        
    if debug_flag == True: check_output(g, tagsnps) 
       
def read_list(filename):
    assert os.path.exists(filename) == True
    file = open(filename, "r")
    list = {}

    for line in file:
        list[line.strip()] = 1

    file.close()
    return list
           
def usage():
    f = stderr
    print >> f, "usage:"
    print >> f, "senatag [options] neighborhood.txt inputsnps.txt"
    print >> f, "where inputsnps.txt is a file of snps from one population"
    print >> f, "where neighborhood.txt is neighborhood details for the pop."
    print >> f, "where the options are:"
    print >> f, "-h,--help : print usage and quit"
    print >> f, "-d,--debug: print debug information"
    print >> f, "-e,--excluded : file with names of SNPs that cannot be TagSNPs"
    print >> f, "-r,--required : file with names of SNPs that should be TagSNPs"

if __name__ == "__main__":
    try:
        opts, args = getopt(argv[1:], "hdr:e:",\
                     ["help", "debug", "required=", "excluded="])
    except GetoptError, err:
        print str(err)
        usage()
        exit(2) 

    required = {}
    excluded = {}

    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            exit()
        elif o in ("-d", "--debug"):
            debug_flag = True
        elif o in ("-r", "--required"):
            required = read_list(a)
        elif o in ("-e", "--excluded"):
            excluded = read_list(a)
        else:
            assert False, "unhandled option"

    if len(args) != 2:
        usage()
        exit(3)

    assert os.path.exists(args[0]) == True
    assert os.path.exists(args[1]) == True
    
    main(args[0], args[1], required, excluded)

Summary ✨

This is a Python script that takes two input files: neighborhood.txt and inputsnps.txt. It uses the information in these files to identify TagSNPs, which are SNPs that are likely to be used as markers for genetic variation in a population. The script first constructs a graph of SNPs based on their pairwise relationships, then identifies TagSNPs by finding the nodes with the highest degree (i.e., the most neighbors) and ensuring that they have no excluded SNPs as neighbors. It also takes into account any required SNPs and any excluded SNPs. The output is a list of TagSNPs, along with their neighbors in the graph.

Tech Fingerprint

Standard Library: OS Interaction

Alerts (20)

'def' Ensure functions have docstrings for documentation
46 69 72 78 85 120 137 188 199
Complexity hotspot; lines 79 to 82 (total complexity: 4)
79 80 81 82
'open(' Use 'with open()' to ensure Files are properly closed
88 100 190
'list(' Avoid unnecessary list conversions; use generators where possible
131 134 157 176