geneinfo.py | searchcode

/geneinfo.py

https://bitbucket.org/dalexander/geneinfo
Python | 207 lines | 163 code | 24 blank | 20 comment | 36 complexity | 9d20749cb63e05cee91983cbcc4e8a26 MD5 | raw file

#!/bin/env python

# use python 2.6 or higher!
import os, sys, MySQLdb, re, string, time
from optparse import OptionParser
from interval_tree import *

localdb  = dict(user="genome", host="localhost",      db="hg19")

snpPattern = re.compile("(rs\d+)")

class SNP:
    def __init__(self,name,chrom,bp):
        self.name=name
        self.chrom=chrom
        self.bp=bp
    @property
    def shortChrom(self):
        assert self.chrom[:3]=="chr"
        return self.chrom[3:]
    def __repr__(self):
        return "%s: %s %d", (self.name,self.chrom,self.bp)


class Gene:
    def __init__(self,name,chrom,txStart,txEnd,uniqueId):
        self.name=name
        self.chrom=chrom
        self.txStart=txStart
        self.txEnd=txEnd
        self.uniqueId = uniqueId
        self.midpoint = (txStart+txEnd)/2
    @property
    def shortChrom(self):
        assert self.chrom[:3]=="chr"
        return self.chrom[3:]
    def __lt__(self,other):
        return (self.chrom,self.txStart)<(other.chrom,other.txStart)
    def __repr__(self):
        return "%d (%s): %s (%d-%d)" % (self.uniqueId, self.name, self.chrom, self.txStart, self.txEnd)


def getRsNumber(snpName):
    assert snpPattern.match(snpName)
    return int(snpName[2:])

def dbLookupSNP(cursor,snpName):
    '''returns SNP(presentSnpName, presentChr, presentBp)'''
    # find new name, if this name is deprecated
    presentSnpName=None
    rsNumber=getRsNumber(snpName)
    cursor.execute("SELECT rsCurrent FROM rsmergearch where rsHigh=%d" % rsNumber)
    row = cursor.fetchone()
    if (row==None):
        presentSnpName=snpName
    else:
        rsPresent = row[0]
        presentSnpName = "rs" + str(rsPresent)
    # query dbSNP to find chr, bp
    cursor.execute("SELECT chrom, chromStart, chromEnd FROM snp130 where name='%s'"
                   % presentSnpName)
    row = cursor.fetchone()
    if row==None:
        return None
    else:
        chrom=row[0]
        bp=row[1]
        return SNP(presentSnpName,chrom,bp)
    

def dbLookupGenes(cursor):
    '''returns list of all genes in knownCanonical'''
    queryString="SELECT protein, chrom, chromStart, chromEnd, clusterId " \
                "from knownCanonical ORDER BY chrom, chromStart";
    cursor.execute(queryString)
    rows = cursor.fetchall()
    genes=[]
    for row in rows:
        (name,chrom,start,end,uniqueId)=row
        genes.append(Gene(name,chrom,int(start),int(end),int(uniqueId)))
    return genes


def findGeneForSNP(chromIntervalTrees, snp):

    def argmin(lst):
        return lst.index(min(lst))

    def nearestInterval(point, intervals):
        midpoints = [(i.start+i.stop)/2 for i in intervals]
        idx_of_nearest=argmin([abs(point-mp) for mp in midpoints])
        return intervals[idx_of_nearest]

    if not snp:
        return None
    tree=chromIntervalTrees[snp.chrom]
    intervals=tree.find(snp.bp,snp.bp)
    if intervals==[]:
        return None
    else:
        bestInterval=nearestInterval(snp.bp,intervals)
        return bestInterval.gene


parser = OptionParser(usage="Usage: %prog [options] inputfile")
parser.add_option("-f", "--fudgebp", dest="fudgeBP",
                  type="int", default=20000, metavar="BASEPAIRS",
                  help="# of BP outside of genes that program will search")
def usage():
    parser.print_help()

if __name__=="__main__":
    # read cmd line
    (options, args) = parser.parse_args()
    if len(args) != 1:
        usage()
        sys.exit(1)
    print "Connecting to database."
    conn = MySQLdb.connect(**localdb)
    cursor = conn.cursor()

    snpMap={}
    # 1+2
    print "Looking up position information for SNPs."
    snpMap = {}
    inputFile=open(args[0])
    for line in inputFile:
        m = snpPattern.search(line)
        if m:
            name = m.group(1)
            foundSNP=dbLookupSNP(cursor,name)
            if not foundSNP:
                print "Couldn't find SNP %s"%name
            snpMap[name]=foundSNP
            if len(snpMap)%10000==0:
                print "%d SNPs so far"%len(snpMap) 
    
    # 3+4
    print "Loading gene table."
    genesList = dbLookupGenes(cursor)

    # get unique chromosome names
    chroms = set([gene.chrom for gene in genesList])
    chromIntervalTrees={}

    # for each chromosome, build intervals and intervaltree
    for currentChrom in chroms:
        chromGenes =  sorted([gene for gene in genesList if gene.chrom==currentChrom])
        # Using knownCanonical was *supposed* to solve the problem of
        # overlapping transcripts, by choosing one among every
        # overlapping set.  It turns out that there are still
        # overlapping and even nested 'genes'.  We can't have this, so
        # here we identify every pair of overlapping genes, and choose
        # the bigger one.
        rawIntervals=[]
        for gene in chromGenes:
            i=Interval(gene.txStart,gene.txEnd)
            i.gene=gene
            rawIntervals.append(i)
        nonOverlappingIntervals=set()
        for i in rawIntervals:
            J = [j for j in nonOverlappingIntervals if i.overlaps(j)]
            assert len(J)<=2
            J.append(i)
            biggest=max(J, key=lambda j: j.size())
            new_interval=Interval(min(j.start for j in J), max(j.stop for j in J))
            new_interval.gene=biggest.gene
            nonOverlappingIntervals-=set(J)
            nonOverlappingIntervals.add(new_interval)
        chromGenes=sorted([i.gene for i in nonOverlappingIntervals])
        # Now build the interval tree that we will actually use
        chromIntervals=[]
        for gene in chromGenes:
            i = Interval(gene.txStart-options.fudgeBP, gene.txEnd+options.fudgeBP)
            i.gene=gene
            chromIntervals.append(i)
        chromIntervalTree=IntervalTree(chromIntervals)
        chromIntervalTrees[currentChrom] = chromIntervalTree
    # write cross-reference file
    print "Writing output files."
    xrefFile = open("GenesXref.txt", "w")
    xrefFile.writelines("%d\t%s\t%s\t%d\n" % (gene.uniqueId, gene.name, gene.shortChrom, gene.midpoint)
                        for gene in genesList)
    xrefFile.close()
    # write main output file
    outputFile = open("SNP_def.genes", "w") 
    outputFile.writelines(["2\n", str(len(snpMap))+"\n"])
    inputFile  = open(args[0])
    for line in inputFile:
        fmt="%16s, %8s, %10d, %8s\n"
        m = snpPattern.search(line)
        if m:
            name = m.group(1)
            snp = snpMap[name]
            gene = findGeneForSNP(chromIntervalTrees, snp)
            if not snp:
                outputFile.write(fmt % (name,"M",99999,"0"))
            elif not gene:
                outputFile.write(fmt % (snp.name, snp.shortChrom, snp.bp, "0"))
            else:
                outputFile.write(fmt % (snp.name, snp.shortChrom, snp.bp, str(gene.uniqueId)))
        else:
            if "," in line:
                fields = line.split(",")
                snp = fields[1]
                outputFile.write(fmt % (snp, "M", 99999, "0"))