PageRenderTime 45ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 1ms

/geneinfo.py

https://bitbucket.org/dalexander/geneinfo
Python | 207 lines | 163 code | 24 blank | 20 comment | 36 complexity | 9d20749cb63e05cee91983cbcc4e8a26 MD5 | raw file
  1. #!/bin/env python
  2. # use python 2.6 or higher!
  3. import os, sys, MySQLdb, re, string, time
  4. from optparse import OptionParser
  5. from interval_tree import *
  6. localdb = dict(user="genome", host="localhost", db="hg19")
  7. snpPattern = re.compile("(rs\d+)")
  8. class SNP:
  9. def __init__(self,name,chrom,bp):
  10. self.name=name
  11. self.chrom=chrom
  12. self.bp=bp
  13. @property
  14. def shortChrom(self):
  15. assert self.chrom[:3]=="chr"
  16. return self.chrom[3:]
  17. def __repr__(self):
  18. return "%s: %s %d", (self.name,self.chrom,self.bp)
  19. class Gene:
  20. def __init__(self,name,chrom,txStart,txEnd,uniqueId):
  21. self.name=name
  22. self.chrom=chrom
  23. self.txStart=txStart
  24. self.txEnd=txEnd
  25. self.uniqueId = uniqueId
  26. self.midpoint = (txStart+txEnd)/2
  27. @property
  28. def shortChrom(self):
  29. assert self.chrom[:3]=="chr"
  30. return self.chrom[3:]
  31. def __lt__(self,other):
  32. return (self.chrom,self.txStart)<(other.chrom,other.txStart)
  33. def __repr__(self):
  34. return "%d (%s): %s (%d-%d)" % (self.uniqueId, self.name, self.chrom, self.txStart, self.txEnd)
  35. def getRsNumber(snpName):
  36. assert snpPattern.match(snpName)
  37. return int(snpName[2:])
  38. def dbLookupSNP(cursor,snpName):
  39. '''returns SNP(presentSnpName, presentChr, presentBp)'''
  40. # find new name, if this name is deprecated
  41. presentSnpName=None
  42. rsNumber=getRsNumber(snpName)
  43. cursor.execute("SELECT rsCurrent FROM rsmergearch where rsHigh=%d" % rsNumber)
  44. row = cursor.fetchone()
  45. if (row==None):
  46. presentSnpName=snpName
  47. else:
  48. rsPresent = row[0]
  49. presentSnpName = "rs" + str(rsPresent)
  50. # query dbSNP to find chr, bp
  51. cursor.execute("SELECT chrom, chromStart, chromEnd FROM snp130 where name='%s'"
  52. % presentSnpName)
  53. row = cursor.fetchone()
  54. if row==None:
  55. return None
  56. else:
  57. chrom=row[0]
  58. bp=row[1]
  59. return SNP(presentSnpName,chrom,bp)
  60. def dbLookupGenes(cursor):
  61. '''returns list of all genes in knownCanonical'''
  62. queryString="SELECT protein, chrom, chromStart, chromEnd, clusterId " \
  63. "from knownCanonical ORDER BY chrom, chromStart";
  64. cursor.execute(queryString)
  65. rows = cursor.fetchall()
  66. genes=[]
  67. for row in rows:
  68. (name,chrom,start,end,uniqueId)=row
  69. genes.append(Gene(name,chrom,int(start),int(end),int(uniqueId)))
  70. return genes
  71. def findGeneForSNP(chromIntervalTrees, snp):
  72. def argmin(lst):
  73. return lst.index(min(lst))
  74. def nearestInterval(point, intervals):
  75. midpoints = [(i.start+i.stop)/2 for i in intervals]
  76. idx_of_nearest=argmin([abs(point-mp) for mp in midpoints])
  77. return intervals[idx_of_nearest]
  78. if not snp:
  79. return None
  80. tree=chromIntervalTrees[snp.chrom]
  81. intervals=tree.find(snp.bp,snp.bp)
  82. if intervals==[]:
  83. return None
  84. else:
  85. bestInterval=nearestInterval(snp.bp,intervals)
  86. return bestInterval.gene
  87. parser = OptionParser(usage="Usage: %prog [options] inputfile")
  88. parser.add_option("-f", "--fudgebp", dest="fudgeBP",
  89. type="int", default=20000, metavar="BASEPAIRS",
  90. help="# of BP outside of genes that program will search")
  91. def usage():
  92. parser.print_help()
  93. if __name__=="__main__":
  94. # read cmd line
  95. (options, args) = parser.parse_args()
  96. if len(args) != 1:
  97. usage()
  98. sys.exit(1)
  99. print "Connecting to database."
  100. conn = MySQLdb.connect(**localdb)
  101. cursor = conn.cursor()
  102. snpMap={}
  103. # 1+2
  104. print "Looking up position information for SNPs."
  105. snpMap = {}
  106. inputFile=open(args[0])
  107. for line in inputFile:
  108. m = snpPattern.search(line)
  109. if m:
  110. name = m.group(1)
  111. foundSNP=dbLookupSNP(cursor,name)
  112. if not foundSNP:
  113. print "Couldn't find SNP %s"%name
  114. snpMap[name]=foundSNP
  115. if len(snpMap)%10000==0:
  116. print "%d SNPs so far"%len(snpMap)
  117. # 3+4
  118. print "Loading gene table."
  119. genesList = dbLookupGenes(cursor)
  120. # get unique chromosome names
  121. chroms = set([gene.chrom for gene in genesList])
  122. chromIntervalTrees={}
  123. # for each chromosome, build intervals and intervaltree
  124. for currentChrom in chroms:
  125. chromGenes = sorted([gene for gene in genesList if gene.chrom==currentChrom])
  126. # Using knownCanonical was *supposed* to solve the problem of
  127. # overlapping transcripts, by choosing one among every
  128. # overlapping set. It turns out that there are still
  129. # overlapping and even nested 'genes'. We can't have this, so
  130. # here we identify every pair of overlapping genes, and choose
  131. # the bigger one.
  132. rawIntervals=[]
  133. for gene in chromGenes:
  134. i=Interval(gene.txStart,gene.txEnd)
  135. i.gene=gene
  136. rawIntervals.append(i)
  137. nonOverlappingIntervals=set()
  138. for i in rawIntervals:
  139. J = [j for j in nonOverlappingIntervals if i.overlaps(j)]
  140. assert len(J)<=2
  141. J.append(i)
  142. biggest=max(J, key=lambda j: j.size())
  143. new_interval=Interval(min(j.start for j in J), max(j.stop for j in J))
  144. new_interval.gene=biggest.gene
  145. nonOverlappingIntervals-=set(J)
  146. nonOverlappingIntervals.add(new_interval)
  147. chromGenes=sorted([i.gene for i in nonOverlappingIntervals])
  148. # Now build the interval tree that we will actually use
  149. chromIntervals=[]
  150. for gene in chromGenes:
  151. i = Interval(gene.txStart-options.fudgeBP, gene.txEnd+options.fudgeBP)
  152. i.gene=gene
  153. chromIntervals.append(i)
  154. chromIntervalTree=IntervalTree(chromIntervals)
  155. chromIntervalTrees[currentChrom] = chromIntervalTree
  156. # write cross-reference file
  157. print "Writing output files."
  158. xrefFile = open("GenesXref.txt", "w")
  159. xrefFile.writelines("%d\t%s\t%s\t%d\n" % (gene.uniqueId, gene.name, gene.shortChrom, gene.midpoint)
  160. for gene in genesList)
  161. xrefFile.close()
  162. # write main output file
  163. outputFile = open("SNP_def.genes", "w")
  164. outputFile.writelines(["2\n", str(len(snpMap))+"\n"])
  165. inputFile = open(args[0])
  166. for line in inputFile:
  167. fmt="%16s, %8s, %10d, %8s\n"
  168. m = snpPattern.search(line)
  169. if m:
  170. name = m.group(1)
  171. snp = snpMap[name]
  172. gene = findGeneForSNP(chromIntervalTrees, snp)
  173. if not snp:
  174. outputFile.write(fmt % (name,"M",99999,"0"))
  175. elif not gene:
  176. outputFile.write(fmt % (snp.name, snp.shortChrom, snp.bp, "0"))
  177. else:
  178. outputFile.write(fmt % (snp.name, snp.shortChrom, snp.bp, str(gene.uniqueId)))
  179. else:
  180. if "," in line:
  181. fields = line.split(",")
  182. snp = fields[1]
  183. outputFile.write(fmt % (snp, "M", 99999, "0"))