/geneinfo.py
Python | 207 lines | 163 code | 24 blank | 20 comment | 36 complexity | 9d20749cb63e05cee91983cbcc4e8a26 MD5 | raw file
- #!/bin/env python
- # use python 2.6 or higher!
- import os, sys, MySQLdb, re, string, time
- from optparse import OptionParser
- from interval_tree import *
- localdb = dict(user="genome", host="localhost", db="hg19")
- snpPattern = re.compile("(rs\d+)")
- class SNP:
- def __init__(self,name,chrom,bp):
- self.name=name
- self.chrom=chrom
- self.bp=bp
- @property
- def shortChrom(self):
- assert self.chrom[:3]=="chr"
- return self.chrom[3:]
- def __repr__(self):
- return "%s: %s %d", (self.name,self.chrom,self.bp)
- class Gene:
- def __init__(self,name,chrom,txStart,txEnd,uniqueId):
- self.name=name
- self.chrom=chrom
- self.txStart=txStart
- self.txEnd=txEnd
- self.uniqueId = uniqueId
- self.midpoint = (txStart+txEnd)/2
- @property
- def shortChrom(self):
- assert self.chrom[:3]=="chr"
- return self.chrom[3:]
- def __lt__(self,other):
- return (self.chrom,self.txStart)<(other.chrom,other.txStart)
- def __repr__(self):
- return "%d (%s): %s (%d-%d)" % (self.uniqueId, self.name, self.chrom, self.txStart, self.txEnd)
- def getRsNumber(snpName):
- assert snpPattern.match(snpName)
- return int(snpName[2:])
- def dbLookupSNP(cursor,snpName):
- '''returns SNP(presentSnpName, presentChr, presentBp)'''
- # find new name, if this name is deprecated
- presentSnpName=None
- rsNumber=getRsNumber(snpName)
- cursor.execute("SELECT rsCurrent FROM rsmergearch where rsHigh=%d" % rsNumber)
- row = cursor.fetchone()
- if (row==None):
- presentSnpName=snpName
- else:
- rsPresent = row[0]
- presentSnpName = "rs" + str(rsPresent)
- # query dbSNP to find chr, bp
- cursor.execute("SELECT chrom, chromStart, chromEnd FROM snp130 where name='%s'"
- % presentSnpName)
- row = cursor.fetchone()
- if row==None:
- return None
- else:
- chrom=row[0]
- bp=row[1]
- return SNP(presentSnpName,chrom,bp)
-
- def dbLookupGenes(cursor):
- '''returns list of all genes in knownCanonical'''
- queryString="SELECT protein, chrom, chromStart, chromEnd, clusterId " \
- "from knownCanonical ORDER BY chrom, chromStart";
- cursor.execute(queryString)
- rows = cursor.fetchall()
- genes=[]
- for row in rows:
- (name,chrom,start,end,uniqueId)=row
- genes.append(Gene(name,chrom,int(start),int(end),int(uniqueId)))
- return genes
- def findGeneForSNP(chromIntervalTrees, snp):
- def argmin(lst):
- return lst.index(min(lst))
- def nearestInterval(point, intervals):
- midpoints = [(i.start+i.stop)/2 for i in intervals]
- idx_of_nearest=argmin([abs(point-mp) for mp in midpoints])
- return intervals[idx_of_nearest]
- if not snp:
- return None
- tree=chromIntervalTrees[snp.chrom]
- intervals=tree.find(snp.bp,snp.bp)
- if intervals==[]:
- return None
- else:
- bestInterval=nearestInterval(snp.bp,intervals)
- return bestInterval.gene
- parser = OptionParser(usage="Usage: %prog [options] inputfile")
- parser.add_option("-f", "--fudgebp", dest="fudgeBP",
- type="int", default=20000, metavar="BASEPAIRS",
- help="# of BP outside of genes that program will search")
- def usage():
- parser.print_help()
- if __name__=="__main__":
- # read cmd line
- (options, args) = parser.parse_args()
- if len(args) != 1:
- usage()
- sys.exit(1)
- print "Connecting to database."
- conn = MySQLdb.connect(**localdb)
- cursor = conn.cursor()
- snpMap={}
- # 1+2
- print "Looking up position information for SNPs."
- snpMap = {}
- inputFile=open(args[0])
- for line in inputFile:
- m = snpPattern.search(line)
- if m:
- name = m.group(1)
- foundSNP=dbLookupSNP(cursor,name)
- if not foundSNP:
- print "Couldn't find SNP %s"%name
- snpMap[name]=foundSNP
- if len(snpMap)%10000==0:
- print "%d SNPs so far"%len(snpMap)
-
- # 3+4
- print "Loading gene table."
- genesList = dbLookupGenes(cursor)
- # get unique chromosome names
- chroms = set([gene.chrom for gene in genesList])
- chromIntervalTrees={}
- # for each chromosome, build intervals and intervaltree
- for currentChrom in chroms:
- chromGenes = sorted([gene for gene in genesList if gene.chrom==currentChrom])
- # Using knownCanonical was *supposed* to solve the problem of
- # overlapping transcripts, by choosing one among every
- # overlapping set. It turns out that there are still
- # overlapping and even nested 'genes'. We can't have this, so
- # here we identify every pair of overlapping genes, and choose
- # the bigger one.
- rawIntervals=[]
- for gene in chromGenes:
- i=Interval(gene.txStart,gene.txEnd)
- i.gene=gene
- rawIntervals.append(i)
- nonOverlappingIntervals=set()
- for i in rawIntervals:
- J = [j for j in nonOverlappingIntervals if i.overlaps(j)]
- assert len(J)<=2
- J.append(i)
- biggest=max(J, key=lambda j: j.size())
- new_interval=Interval(min(j.start for j in J), max(j.stop for j in J))
- new_interval.gene=biggest.gene
- nonOverlappingIntervals-=set(J)
- nonOverlappingIntervals.add(new_interval)
- chromGenes=sorted([i.gene for i in nonOverlappingIntervals])
- # Now build the interval tree that we will actually use
- chromIntervals=[]
- for gene in chromGenes:
- i = Interval(gene.txStart-options.fudgeBP, gene.txEnd+options.fudgeBP)
- i.gene=gene
- chromIntervals.append(i)
- chromIntervalTree=IntervalTree(chromIntervals)
- chromIntervalTrees[currentChrom] = chromIntervalTree
- # write cross-reference file
- print "Writing output files."
- xrefFile = open("GenesXref.txt", "w")
- xrefFile.writelines("%d\t%s\t%s\t%d\n" % (gene.uniqueId, gene.name, gene.shortChrom, gene.midpoint)
- for gene in genesList)
- xrefFile.close()
- # write main output file
- outputFile = open("SNP_def.genes", "w")
- outputFile.writelines(["2\n", str(len(snpMap))+"\n"])
- inputFile = open(args[0])
- for line in inputFile:
- fmt="%16s, %8s, %10d, %8s\n"
- m = snpPattern.search(line)
- if m:
- name = m.group(1)
- snp = snpMap[name]
- gene = findGeneForSNP(chromIntervalTrees, snp)
- if not snp:
- outputFile.write(fmt % (name,"M",99999,"0"))
- elif not gene:
- outputFile.write(fmt % (snp.name, snp.shortChrom, snp.bp, "0"))
- else:
- outputFile.write(fmt % (snp.name, snp.shortChrom, snp.bp, str(gene.uniqueId)))
- else:
- if "," in line:
- fields = line.split(",")
- snp = fields[1]
- outputFile.write(fmt % (snp, "M", 99999, "0"))