/tools/rgenetics/rgGRR.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 1089 lines · 869 code · 64 blank · 156 comment · 120 complexity · f2b858baca27314e3adaa7f42540fa9a MD5 · raw file

  1. """
  2. # july 2009: Need to see outliers so need to draw them last?
  3. # could use clustering on the zscores to guess real relationships for unrelateds
  4. # but definitely need to draw last
  5. # added MAX_SHOW_ROWS to limit the length of the main report page
  6. # Changes for Galaxy integration
  7. # added more robust knuth method for one pass mean and sd
  8. # no difference really - let's use scipy.mean() and scipy.std() instead...
  9. # fixed labels and changed to .xls for outlier reports so can open in excel
  10. # interesting - with a few hundred subjects, 5k gives good resolution
  11. # and 100k gives better but not by much
  12. # TODO remove non autosomal markers
  13. # TODO it would be best if label had the zmean and zsd as these are what matter for
  14. # outliers rather than the group mean/sd
  15. # mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots
  16. # to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log
  17. # so the result should be an HTML file
  18. # rgIBS.py
  19. # use a random subset of markers for a quick ibs
  20. # to identify sample dups and closely related subjects
  21. # try snpMatrix and plink and see which one works best for us?
  22. # abecasis grr plots mean*sd for every subject to show clusters
  23. # mods june 23 rml to avoid non-autosomal markers
  24. # we seem to be distinguishing parent-child by gender - 2 clouds!
  25. snpMatrix from David Clayton has:
  26. ibs.stats function to calculate the identity-by-state stats of a group of samples
  27. Description
  28. Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics
  29. about the relatedness of every pair of samples within.
  30. Usage
  31. ibs.stats(x)
  32. 8 ibs.stats
  33. Arguments
  34. x a snp.matrix-class or a X.snp.matrix-class object containing N samples
  35. Details
  36. No-calls are excluded from consideration here.
  37. Value
  38. A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated
  39. by a comma, and the columns are:
  40. Count count of identical calls, exclusing no-calls
  41. Fraction fraction of identical calls comparied to actual calls being made in both samples
  42. Warning
  43. In some applications, it may be preferable to subset a (random) selection of SNPs first - the
  44. calculation
  45. time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the
  46. calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for
  47. simple applications such as checking for duplicate or related samples.
  48. Note
  49. This is mostly written to find mislabelled and/or duplicate samples.
  50. Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most
  51. purpose it is undesirable to use these SNPs for IBS purposes.
  52. TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to
  53. subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility
  54. """
  55. import sys,os,time,random,string,copy,optparse
  56. try:
  57. set
  58. except NameError:
  59. from Sets import Set as set
  60. from rgutils import timenow,plinke
  61. import plinkbinJZ
  62. opts = None
  63. verbose = False
  64. showPolygons = False
  65. class NullDevice:
  66. def write(self, s):
  67. pass
  68. tempstderr = sys.stderr # save
  69. #sys.stderr = NullDevice()
  70. # need to avoid blather about deprecation and other strange stuff from scipy
  71. # the current galaxy job runner assumes that
  72. # the job is in error if anything appears on sys.stderr
  73. # grrrrr. James wants to keep it that way instead of using the
  74. # status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy)
  75. import numpy
  76. import scipy
  77. from scipy import weave
  78. sys.stderr=tempstderr
  79. PROGNAME = os.path.split(sys.argv[0])[-1]
  80. X_AXIS_LABEL = 'Mean Alleles Shared'
  81. Y_AXIS_LABEL = 'SD Alleles Shared'
  82. LEGEND_ALIGN = 'topleft'
  83. LEGEND_TITLE = 'Relationship'
  84. DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size
  85. DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size
  86. ### Some colors for R/rpy
  87. R_BLACK = 1
  88. R_RED = 2
  89. R_GREEN = 3
  90. R_BLUE = 4
  91. R_CYAN = 5
  92. R_PURPLE = 6
  93. R_YELLOW = 7
  94. R_GRAY = 8
  95. ### ... and some point-styles
  96. ###
  97. PLOT_HEIGHT = 600
  98. PLOT_WIDTH = 1150
  99. #SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson')
  100. #SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray')
  101. SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray')
  102. # dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
  103. #('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray')
  104. OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped']
  105. OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list)
  106. TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n'
  107. ### Relationship codes, text, and lookups/mappings
  108. N_RELATIONSHIP_TYPES = 7
  109. REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES)
  110. REL_LOOKUP = {
  111. REL_DUPE: ('dupe', R_BLUE, 1),
  112. REL_PARENTCHILD: ('parentchild', R_YELLOW, 1),
  113. REL_SIBS: ('sibpairs', R_RED, 1),
  114. REL_HALFSIBS: ('halfsibs', R_GREEN, 1),
  115. REL_RELATED: ('parents', R_PURPLE, 1),
  116. REL_UNRELATED: ('unrelated', R_CYAN, 1),
  117. REL_UNKNOWN: ('unknown', R_GRAY, 1),
  118. }
  119. OUTLIER_STDEVS = {
  120. REL_DUPE: 2,
  121. REL_PARENTCHILD: 2,
  122. REL_SIBS: 2,
  123. REL_HALFSIBS: 2,
  124. REL_RELATED: 2,
  125. REL_UNRELATED: 3,
  126. REL_UNKNOWN: 2,
  127. }
  128. # note now Z can be passed in
  129. REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)]
  130. REL_COLORS = SVG_COLORS
  131. REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)]
  132. DEFAULT_MAX_SAMPLE_SIZE = 10000
  133. REF_COUNT_HOM1 = 3
  134. REF_COUNT_HET = 2
  135. REF_COUNT_HOM2 = 1
  136. MISSING = 0
  137. MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain
  138. MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0
  139. MARKER_PAIRS_PER_SECOND_FAST = 70000000.0
  140. galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
  141. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  142. <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  143. <head>
  144. <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  145. <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
  146. <title></title>
  147. <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
  148. </head>
  149. <body>
  150. <div class="document">
  151. """
  152. SVG_HEADER = '''<?xml version="1.0" standalone="no"?>
  153. <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.2//EN" "http://www.w3.org/Graphics/SVG/1.2/DTD/svg12.dtd">
  154. <svg width="1280" height="800"
  155. xmlns="http://www.w3.org/2000/svg" version="1.2"
  156. xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 1280 800" onload="init()">
  157. <script type="text/ecmascript" xlink:href="/static/scripts/checkbox_and_radiobutton.js"/>
  158. <script type="text/ecmascript" xlink:href="/static/scripts/helper_functions.js"/>
  159. <script type="text/ecmascript" xlink:href="/static/scripts/timer.js"/>
  160. <script type="text/ecmascript">
  161. <![CDATA[
  162. var checkBoxes = new Array();
  163. var radioGroupBandwidth;
  164. var colours = ['%s','%s','%s','%s','%s','%s','%s'];
  165. function init() {
  166. var style = {"font-family":"Arial,Helvetica", "fill":"black", "font-size":12};
  167. var dist = 12;
  168. var yOffset = 4;
  169. //A checkBox for each relationship type dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
  170. checkBoxes["dupe"] = new checkBox("dupe","checkboxes",20,40,"cbRect","cbCross",true,"Duplicate",style,dist,yOffset,undefined,hideShowLayer);
  171. checkBoxes["parentchild"] = new checkBox("parentchild","checkboxes",20,60,"cbRect","cbCross",true,"Parent-Child",style,dist,yOffset,undefined,hideShowLayer);
  172. checkBoxes["sibpairs"] = new checkBox("sibpairs","checkboxes",20,80,"cbRect","cbCross",true,"Sib-pairs",style,dist,yOffset,undefined,hideShowLayer);
  173. checkBoxes["halfsibs"] = new checkBox("halfsibs","checkboxes",20,100,"cbRect","cbCross",true,"Half-sibs",style,dist,yOffset,undefined,hideShowLayer);
  174. checkBoxes["parents"] = new checkBox("parents","checkboxes",20,120,"cbRect","cbCross",true,"Parents",style,dist,yOffset,undefined,hideShowLayer);
  175. checkBoxes["unrelated"] = new checkBox("unrelated","checkboxes",20,140,"cbRect","cbCross",true,"Unrelated",style,dist,yOffset,undefined,hideShowLayer);
  176. checkBoxes["unknown"] = new checkBox("unknown","checkboxes",20,160,"cbRect","cbCross",true,"Unknown",style,dist,yOffset,undefined,hideShowLayer);
  177. }
  178. function hideShowLayer(id, status, label) {
  179. var vis = "hidden";
  180. if (status) {
  181. vis = "visible";
  182. }
  183. document.getElementById(id).setAttributeNS(null, 'visibility', vis);
  184. }
  185. function showBTT(evt, rel, mm, dm, md, dd, n, mg, dg, lg, hg) {
  186. var x = parseInt(evt.pageX)-250;
  187. var y = parseInt(evt.pageY)-110;
  188. switch(rel) {
  189. case 0:
  190. fill = colours[rel];
  191. relt = "dupe";
  192. break;
  193. case 1:
  194. fill = colours[rel];
  195. relt = "parentchild";
  196. break;
  197. case 2:
  198. fill = colours[rel];
  199. relt = "sibpairs";
  200. break;
  201. case 3:
  202. fill = colours[rel];
  203. relt = "halfsibs";
  204. break;
  205. case 4:
  206. fill = colours[rel];
  207. relt = "parents";
  208. break;
  209. case 5:
  210. fill = colours[rel];
  211. relt = "unrelated";
  212. break;
  213. case 6:
  214. fill = colours[rel];
  215. relt = "unknown";
  216. break;
  217. default:
  218. fill = "cyan";
  219. relt = "ERROR_CODE: "+rel;
  220. }
  221. document.getElementById("btRel").textContent = "GROUP: "+relt;
  222. document.getElementById("btMean").textContent = "mean="+mm+" +/- "+dm;
  223. document.getElementById("btSdev").textContent = "sdev="+dm+" +/- "+dd;
  224. document.getElementById("btPair").textContent = "npairs="+n;
  225. document.getElementById("btGeno").textContent = "ngenos="+mg+" +/- "+dg+" (min="+lg+", max="+hg+")";
  226. document.getElementById("btHead").setAttribute('fill', fill);
  227. var tt = document.getElementById("btTip");
  228. tt.setAttribute("transform", "translate("+x+","+y+")");
  229. tt.setAttribute('visibility', 'visible');
  230. }
  231. function showOTT(evt, rel, s1, s2, mean, sdev, ngeno, rmean, rsdev) {
  232. var x = parseInt(evt.pageX)-150;
  233. var y = parseInt(evt.pageY)-180;
  234. switch(rel) {
  235. case 0:
  236. fill = colours[rel];
  237. relt = "dupe";
  238. break;
  239. case 1:
  240. fill = colours[rel];
  241. relt = "parentchild";
  242. break;
  243. case 2:
  244. fill = colours[rel];
  245. relt = "sibpairs";
  246. break;
  247. case 3:
  248. fill = colours[rel];
  249. relt = "halfsibs";
  250. break;
  251. case 4:
  252. fill = colours[rel];
  253. relt = "parents";
  254. break;
  255. case 5:
  256. fill = colours[rel];
  257. relt = "unrelated";
  258. break;
  259. case 6:
  260. fill = colours[rel];
  261. relt = "unknown";
  262. break;
  263. default:
  264. fill = "cyan";
  265. relt = "ERROR_CODE: "+rel;
  266. }
  267. document.getElementById("otRel").textContent = "PAIR: "+relt;
  268. document.getElementById("otS1").textContent = "s1="+s1;
  269. document.getElementById("otS2").textContent = "s2="+s2;
  270. document.getElementById("otMean").textContent = "mean="+mean;
  271. document.getElementById("otSdev").textContent = "sdev="+sdev;
  272. document.getElementById("otGeno").textContent = "ngenos="+ngeno;
  273. document.getElementById("otRmean").textContent = "relmean="+rmean;
  274. document.getElementById("otRsdev").textContent = "relsdev="+rsdev;
  275. document.getElementById("otHead").setAttribute('fill', fill);
  276. var tt = document.getElementById("otTip");
  277. tt.setAttribute("transform", "translate("+x+","+y+")");
  278. tt.setAttribute('visibility', 'visible');
  279. }
  280. function hideBTT(evt) {
  281. document.getElementById("btTip").setAttributeNS(null, 'visibility', 'hidden');
  282. }
  283. function hideOTT(evt) {
  284. document.getElementById("otTip").setAttributeNS(null, 'visibility', 'hidden');
  285. }
  286. ]]>
  287. </script>
  288. <defs>
  289. <!-- symbols for check boxes -->
  290. <symbol id="cbRect" overflow="visible">
  291. <rect x="-5" y="-5" width="10" height="10" fill="white" stroke="dimgray" stroke-width="1" cursor="pointer"/>
  292. </symbol>
  293. <symbol id="cbCross" overflow="visible">
  294. <g pointer-events="none" stroke="black" stroke-width="1">
  295. <line x1="-3" y1="-3" x2="3" y2="3"/>
  296. <line x1="3" y1="-3" x2="-3" y2="3"/>
  297. </g>
  298. </symbol>
  299. </defs>
  300. <desc>Developer Works Dynamic Scatter Graph Scaling Example</desc>
  301. <!-- Now Draw the main X and Y axis -->
  302. <g style="stroke-width:1.0; stroke:black; shape-rendering:crispEdges">
  303. <!-- X Axis top and bottom -->
  304. <path d="M 100 100 L 1250 100 Z"/>
  305. <path d="M 100 700 L 1250 700 Z"/>
  306. <!-- Y Axis left and right -->
  307. <path d="M 100 100 L 100 700 Z"/>
  308. <path d="M 1250 100 L 1250 700 Z"/>
  309. </g>
  310. <g transform="translate(100,100)">
  311. <!-- Grid Lines -->
  312. <g style="fill:none; stroke:#dddddd; stroke-width:1; stroke-dasharray:2,2; text-anchor:end; shape-rendering:crispEdges">
  313. <!-- Vertical grid lines -->
  314. <line x1="125" y1="0" x2="115" y2="600" />
  315. <line x1="230" y1="0" x2="230" y2="600" />
  316. <line x1="345" y1="0" x2="345" y2="600" />
  317. <line x1="460" y1="0" x2="460" y2="600" />
  318. <line x1="575" y1="0" x2="575" y2="600" style="stroke-dasharray:none;" />
  319. <line x1="690" y1="0" x2="690" y2="600" />
  320. <line x1="805" y1="0" x2="805" y2="600" />
  321. <line x1="920" y1="0" x2="920" y2="600" />
  322. <line x1="1035" y1="0" x2="1035" y2="600" />
  323. <!-- Horizontal grid lines -->
  324. <line x1="0" y1="60" x2="1150" y2="60" />
  325. <line x1="0" y1="120" x2="1150" y2="120" />
  326. <line x1="0" y1="180" x2="1150" y2="180" />
  327. <line x1="0" y1="240" x2="1150" y2="240" />
  328. <line x1="0" y1="300" x2="1150" y2="300" style="stroke-dasharray:none;" />
  329. <line x1="0" y1="360" x2="1150" y2="360" />
  330. <line x1="0" y1="420" x2="1150" y2="420" />
  331. <line x1="0" y1="480" x2="1150" y2="480" />
  332. <line x1="0" y1="540" x2="1150" y2="540" />
  333. </g>
  334. <!-- Legend -->
  335. <g style="fill:black; stroke:none" font-size="12" font-family="Arial" transform="translate(25,25)">
  336. <rect width="160" height="270" style="fill:none; stroke:black; shape-rendering:crispEdges" />
  337. <text x="5" y="20" style="fill:black; stroke:none;" font-size="13" font-weight="bold">Given Pair Relationship</text>
  338. <rect x="120" y="35" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
  339. <rect x="120" y="55" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
  340. <rect x="120" y="75" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
  341. <rect x="120" y="95" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
  342. <rect x="120" y="115" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
  343. <rect x="120" y="135" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
  344. <rect x="120" y="155" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
  345. <text x="15" y="195" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore gt 15</text>
  346. <circle cx="125" cy="192" r="6" style="stroke:red; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
  347. <text x="15" y="215" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore 4 to 15</text>
  348. <circle cx="125" cy="212" r="3" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
  349. <text x="15" y="235" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore lt 4</text>
  350. <circle cx="125" cy="232" r="2" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
  351. <g id="checkboxes">
  352. </g>
  353. </g>
  354. <g style='fill:black; stroke:none' font-size="17" font-family="Arial">
  355. <!-- X Axis Labels -->
  356. <text x="480" y="660">Mean Alleles Shared</text>
  357. <text x="0" y="630" >1.0</text>
  358. <text x="277" y="630" >1.25</text>
  359. <text x="564" y="630" >1.5</text>
  360. <text x="842" y="630" >1.75</text>
  361. <text x="1140" y="630" >2.0</text>
  362. </g>
  363. <g transform="rotate(270)" style="fill:black; stroke:none" font-size="17" font-family="Arial">
  364. <!-- Y Axis Labels -->
  365. <text x="-350" y="-40">SD Alleles Shared</text>
  366. <text x="-20" y="-10" >1.0</text>
  367. <text x="-165" y="-10" >0.75</text>
  368. <text x="-310" y="-10" >0.5</text>
  369. <text x="-455" y="-10" >0.25</text>
  370. <text x="-600" y="-10" >0.0</text>
  371. </g>
  372. <!-- Plot Title -->
  373. <g style="fill:black; stroke:none" font-size="18" font-family="Arial">
  374. <text x="425" y="-30">%s</text>
  375. </g>
  376. <!-- One group/layer of points for each relationship type -->
  377. '''
  378. SVG_FOOTER = '''
  379. <!-- End of Data -->
  380. </g>
  381. <g id="btTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
  382. <rect width="250" height="110" style="fill:silver" rx="2" ry="2"/>
  383. <rect id="btHead" width="250" height="20" rx="2" ry="2" />
  384. <text id="btRel" y="14" x="85">unrelated</text>
  385. <text id="btMean" y="40" x="4">mean=1.5 +/- 0.04</text>
  386. <text id="btSdev" y="60" x="4">sdev=0.7 +/- 0.03</text>
  387. <text id="btPair" y="80" x="4">npairs=1152</text>
  388. <text id="btGeno" y="100" x="4">ngenos=4783 +/- 24 (min=1000, max=5000)</text>
  389. </g>
  390. <g id="otTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
  391. <rect width="150" height="180" style="fill:silver" rx="2" ry="2"/>
  392. <rect id="otHead" width="150" height="20" rx="2" ry="2" />
  393. <text id="otRel" y="14" x="40">sibpairs</text>
  394. <text id="otS1" y="40" x="4">s1=fid1,iid1</text>
  395. <text id="otS2" y="60" x="4">s2=fid2,iid2</text>
  396. <text id="otMean" y="80" x="4">mean=1.82</text>
  397. <text id="otSdev" y="100" x="4">sdev=0.7</text>
  398. <text id="otGeno" y="120" x="4">ngeno=4487</text>
  399. <text id="otRmean" y="140" x="4">relmean=1.85</text>
  400. <text id="otRsdev" y="160" x="4">relsdev=0.65</text>
  401. </g>
  402. </svg>
  403. '''
  404. DEFAULT_MAX_SAMPLE_SIZE = 5000
  405. REF_COUNT_HOM1 = 3
  406. REF_COUNT_HET = 2
  407. REF_COUNT_HOM2 = 1
  408. MISSING = 0
  409. MARKER_PAIRS_PER_SECOND_SLOW = 15000000
  410. MARKER_PAIRS_PER_SECOND_FAST = 70000000
  411. POLYGONS = {
  412. REL_UNRELATED: ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)),
  413. REL_HALFSIBS: ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)),
  414. REL_SIBS: ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)),
  415. REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)),
  416. REL_DUPE: ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)),
  417. }
  418. def distance(point1, point2):
  419. """ Calculate the distance between two points
  420. """
  421. (x1,y1) = [float(d) for d in point1]
  422. (x2,y2) = [float(d) for d in point2]
  423. dx = abs(x1 - x2)
  424. dy = abs(y1 - y2)
  425. return math.sqrt(dx**2 + dy**2)
  426. def point_inside_polygon(x, y, poly):
  427. """ Determine if a point (x,y) is inside a given polygon or not
  428. poly is a list of (x,y) pairs.
  429. Taken from: http://www.ariel.com.au/a/python-point-int-poly.html
  430. """
  431. n = len(poly)
  432. inside = False
  433. p1x,p1y = poly[0]
  434. for i in range(n+1):
  435. p2x,p2y = poly[i % n]
  436. if y > min(p1y,p2y):
  437. if y <= max(p1y,p2y):
  438. if x <= max(p1x,p2x):
  439. if p1y != p2y:
  440. xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x
  441. if p1x == p2x or x <= xinters:
  442. inside = not inside
  443. p1x,p1y = p2x,p2y
  444. return inside
  445. def readMap(pedfile):
  446. """
  447. """
  448. mapfile = pedfile.replace('.ped', '.map')
  449. marker_list = []
  450. if os.path.exists(mapfile):
  451. print 'readMap: %s' % (mapfile)
  452. fh = file(mapfile, 'r')
  453. for line in fh:
  454. marker_list.append(line.strip().split())
  455. fh.close()
  456. print 'readMap: %s markers' % (len(marker_list))
  457. return marker_list
  458. def calcMeanSD(useme):
  459. """
  460. A numerically stable algorithm is given below. It also computes the mean.
  461. This algorithm is due to Knuth,[1] who cites Welford.[2]
  462. n = 0
  463. mean = 0
  464. M2 = 0
  465. foreach x in data:
  466. n = n + 1
  467. delta = x - mean
  468. mean = mean + delta/n
  469. M2 = M2 + delta*(x - mean) // This expression uses the new value of mean
  470. end for
  471. variance_n = M2/n
  472. variance = M2/(n - 1)
  473. """
  474. mean = 0.0
  475. M2 = 0.0
  476. sd = 0.0
  477. n = len(useme)
  478. if n > 1:
  479. for i,x in enumerate(useme):
  480. delta = x - mean
  481. mean = mean + delta/(i+1) # knuth uses n+=1 at start
  482. M2 = M2 + delta*(x - mean) # This expression uses the new value of mean
  483. variance = M2/(n-1) # assume is sample so lose 1 DOF
  484. sd = pow(variance,0.5)
  485. return mean,sd
  486. def doIBSpy(ped=None,basename='',outdir=None,logf=None,
  487. nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0):
  488. #def doIBS(pedName, title, nrsSamples=None, pdftoo=False):
  489. """ started with snpmatrix but GRR uses actual IBS counts and sd's
  490. """
  491. repOut = [] # text strings to add to the html display
  492. refallele = {}
  493. tblf = '%s_table.xls' % (title)
  494. tbl = file(os.path.join(outdir,tblf), 'w')
  495. tbl.write(TABLE_HEADER)
  496. svgf = '%s.svg' % (title)
  497. svg = file(os.path.join(outdir,svgf), 'w')
  498. nMarkers = len(ped._markers)
  499. if nMarkers < 5:
  500. print sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME)
  501. sys.exit(1)
  502. nSubjects = len(ped._subjects)
  503. nrsSamples = min(nMarkers, nrsSamples)
  504. if opts and opts.use_mito:
  505. markers = range(nMarkers)
  506. nrsSamples = min(len(markers), nrsSamples)
  507. sampleIndexes = sorted(random.sample(markers, nrsSamples))
  508. else:
  509. autosomals = ped.autosomal_indices()
  510. nrsSamples = min(len(autosomals), nrsSamples)
  511. sampleIndexes = sorted(random.sample(autosomals, nrsSamples))
  512. print ''
  513. print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers)
  514. npairs = (nSubjects*(nSubjects-1))/2 # total rows in table
  515. newfiles=[svgf,tblf]
  516. explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs]
  517. # these go with the output file links in the html file
  518. s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples)
  519. logf.write(s)
  520. minUsegenos = nrsSamples/2 # must have half?
  521. nGenotypes = nSubjects*nrsSamples
  522. stime = time.time()
  523. emptyRows = set()
  524. genos = numpy.zeros((nSubjects, nrsSamples), dtype=int)
  525. for s in xrange(nSubjects):
  526. nValid = 0
  527. #getGenotypesByIndices(self, s, mlist, format)
  528. genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref')
  529. nValid = sum([1 for g in genos[s] if g])
  530. if not nValid:
  531. emptyRows.add(s)
  532. sub = ped.getSubject(s)
  533. print 'All missing for row %d (%s)' % (s, sub)
  534. logf.write('All missing for row %d (%s)\n' % (s, sub))
  535. rtime = time.time() - stime
  536. if verbose:
  537. print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime)
  538. ### Now the expensive part. For each pair of subjects, we get the mean number
  539. ### and standard deviation of shared alleles over all of the markers where both
  540. ### subjects have a known genotype. Identical subjects should have mean shared
  541. ### alleles very close to 2.0 with a standard deviation very close to 0.0.
  542. tot = nSubjects*(nSubjects-1)/2
  543. nprog = tot/10
  544. nMarkerpairs = tot * nrsSamples
  545. estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW
  546. estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST
  547. pairs = []
  548. pair_data = {}
  549. means = [] ## Mean IBS for each pair
  550. ngenoL = [] ## Count of comparable genotypes for each pair
  551. sdevs = [] ## Standard dev for each pair
  552. rels = [] ## A relationship code for each pair
  553. zmeans = [0.0 for x in xrange(tot)] ## zmean score for each pair for the relgroup
  554. zstds = [0.0 for x in xrange(tot)] ## zstd score for each pair for the relgrp
  555. skip = set()
  556. ndone = 0 ## How many have been done so far
  557. logf.write('Calculating %d pairs...\n' % (tot))
  558. logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow))
  559. t1sum = 0
  560. t2sum = 0
  561. t3sum = 0
  562. now = time.time()
  563. scache = {}
  564. _founder_cache = {}
  565. C_CODE = """
  566. #include "math.h"
  567. int i;
  568. int sumibs = 0;
  569. int ssqibs = 0;
  570. int ngeno = 0;
  571. float mean = 0;
  572. float M2 = 0;
  573. float delta = 0;
  574. float sdev=0;
  575. float variance=0;
  576. for (i=0; i<nrsSamples; i++) {
  577. int a1 = g1[i];
  578. int a2 = g2[i];
  579. if (a1 != 0 && a2 != 0) {
  580. ngeno += 1;
  581. int shared = 2-abs(a1-a2);
  582. delta = shared - mean;
  583. mean = mean + delta/ngeno;
  584. M2 += delta*(shared-mean);
  585. // yes that second time, the updated mean is used see calcmeansd above;
  586. //printf("%d %d %d %d %d %d\\n", i, a1, a2, ngeno, shared, squared);
  587. }
  588. }
  589. if (ngeno > 1) {
  590. variance = M2/(ngeno-1);
  591. sdev = sqrt(variance);
  592. //printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev);
  593. }
  594. //printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev);
  595. result[0] = ngeno;
  596. result[1] = mean;
  597. result[2] = sdev;
  598. return_val = ngeno;
  599. """
  600. started = time.time()
  601. for s1 in xrange(nSubjects):
  602. if s1 in emptyRows:
  603. continue
  604. (fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1))
  605. isFounder1 = _founder_cache.setdefault(s1, (did1==mid1))
  606. g1 = genos[s1]
  607. for s2 in xrange(s1+1, nSubjects):
  608. if s2 in emptyRows:
  609. continue
  610. t1s = time.time()
  611. (fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2))
  612. g2 = genos[s2]
  613. isFounder2 = _founder_cache.setdefault(s2, (did2==mid2))
  614. # Determine the relationship for this pair
  615. relcode = REL_UNKNOWN
  616. if (fid2 == fid1):
  617. if iid1 == iid2:
  618. relcode = REL_DUPE
  619. elif (did2 == did1) and (mid2 == mid1) and did1 != mid1:
  620. relcode = REL_SIBS
  621. elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1):
  622. relcode = REL_PARENTCHILD
  623. elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)):
  624. relcode = REL_HALFSIBS
  625. else:
  626. # People in the same family should be marked as some other
  627. # form of related. In general, these people will have a
  628. # pretty random spread of similarity. This distinction is
  629. # probably not very useful most of the time
  630. relcode = REL_RELATED
  631. else:
  632. ### Different families
  633. relcode = REL_UNRELATED
  634. t1e = time.time()
  635. t1sum += t1e-t1s
  636. ### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count
  637. ### the number of contributing genotypes. These values are not actually
  638. ### calculated here, but instead are looked up in a table for speed.
  639. ### FIXME: This is still too slow ...
  640. result = [0.0, 0.0, 0.0]
  641. ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result'])
  642. if ngeno >= minUsegenos:
  643. _, mean, sdev = result
  644. means.append(mean)
  645. sdevs.append(sdev)
  646. ngenoL.append(ngeno)
  647. pairs.append((s1, s2))
  648. rels.append(relcode)
  649. else:
  650. skip.add(ndone) # signal no comparable genotypes for this pair
  651. ndone += 1
  652. t2e = time.time()
  653. t2sum += t2e-t1e
  654. t3e = time.time()
  655. t3sum += t3e-t2e
  656. logme = [ 'T1: %s' % (t1sum), 'T2: %s' % (t2sum), 'T3: %s' % (t3sum),'TOT: %s' % (t3e-now),
  657. '%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip),
  658. float(len(skip))/float(tot)*100)]
  659. logf.write('%s\n' % '\t'.join(logme))
  660. ### Calculate mean and standard deviation of scores on a per relationship
  661. ### type basis, allowing us to flag outliers for each particular relationship
  662. ### type
  663. relstats = {}
  664. relCounts = {}
  665. outlierFiles = {}
  666. for relCode, relInfo in REL_LOOKUP.items():
  667. relName, relColor, relStyle = relInfo
  668. useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode]
  669. relCounts[relCode] = len(useme)
  670. mm = scipy.mean(useme)
  671. ms = scipy.std(useme)
  672. useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode]
  673. sm = scipy.mean(useme)
  674. ss = scipy.std(useme)
  675. relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)}
  676. s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \
  677. (relName,relCounts[relCode], mm, ms, sm, ss)
  678. logf.write(s)
  679. ### now fake z scores for each subject like abecasis recommends max(|zmu|,|zsd|)
  680. ### within each group, for each pair, z=(groupmean-pairmean)/groupsd
  681. available = len(means)
  682. logf.write('%d pairs are available of %d\n' % (available, tot))
  683. ### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n'
  684. ### logf.write(s)
  685. pairnum = 0
  686. offset = 0
  687. nOutliers = 0
  688. cexs = []
  689. outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)])
  690. zsdmax = 0
  691. for s1 in range(nSubjects):
  692. if s1 in emptyRows:
  693. continue
  694. (fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1]
  695. for s2 in range(s1+1, nSubjects):
  696. if s2 in emptyRows:
  697. continue
  698. if pairnum not in skip:
  699. ### Get group stats for this relationship
  700. (fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2]
  701. try:
  702. r = rels[offset]
  703. except IndexError:
  704. logf.write('###OOPS offset %d available %d pairnum %d len(rels) %d', offset, available, pairnum, len(rels))
  705. notfound = ('?',('?','0','0'))
  706. relInfo = REL_LOOKUP.get(r,notfound)
  707. relName, relColor, relStyle = relInfo
  708. rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared
  709. rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared
  710. try:
  711. zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units
  712. except:
  713. zsd = 1
  714. if abs(zsd) > zsdmax:
  715. zsdmax = zsd # keep for sort scaling
  716. try:
  717. zmean = (means[offset] - rmm)/rmd # distance from group mean
  718. except:
  719. zmean = 1
  720. zmeans[offset] = zmean
  721. zstds[offset] = zsd
  722. pid=(s1,s2)
  723. zrad = max(zsd,zmean)
  724. if zrad < 4:
  725. zrad = 2
  726. elif 4 < zrad < 15:
  727. zrad = 3 # to 9
  728. else: # > 15 6=24+
  729. zrad=zrad/4
  730. zrad = min(zrad,6) # scale limit
  731. zrad = max(2,max(zsd,zmean)) # as > 2, z grows
  732. pair_data[pid] = (zmean,zsd,r,zrad)
  733. if max(zsd,zmean) > Zcutoff: # is potentially interesting
  734. mean = means[offset]
  735. sdev = sdevs[offset]
  736. outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2))
  737. nOutliers += 1
  738. tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \
  739. (fid1, iid1, fid2, iid2, mean, sdev, zmean,zsd, ngeno, relName, did1,mid1,did2,mid2))
  740. offset += 1
  741. pairnum += 1
  742. logf.write( 'Outliers: %s\n' % (nOutliers))
  743. ### Write outlier files for each relationship type
  744. repOut.append('<h2>Outliers in tab delimited files linked above are also listed below</h2>')
  745. lzsd = round(numpy.log10(zsdmax)) + 1
  746. scalefactor = 10**lzsd
  747. for relCode, relInfo in REL_LOOKUP.items():
  748. relName, _, _ = relInfo
  749. outliers = outlierRecords[relCode]
  750. if not outliers:
  751. continue
  752. outliers = [(scalefactor*int(abs(x[3]))+ int(abs(x[2])),x) for x in outliers] # decorate
  753. outliers.sort()
  754. outliers.reverse() # largest deviation first
  755. outliers = [x[1] for x in outliers] # undecorate
  756. nrows = len(outliers)
  757. truncated = 0
  758. if nrows > MAX_SHOW_ROWS:
  759. s = '<h3>%s outlying pairs (top %d of %d) from %s</h3><table border="0" cellpadding="3">' % \
  760. (relName,MAX_SHOW_ROWS,nrows,title)
  761. truncated = nrows - MAX_SHOW_ROWS
  762. else:
  763. s = '<h3>%s outlying pairs (n=%d) from %s</h3><table border="0" cellpadding="3">' % (relName,nrows,title)
  764. repOut.append(s)
  765. fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName)
  766. fhpath = os.path.join(outdir,fhname)
  767. fh = open(fhpath, 'w')
  768. newfiles.append(fhname)
  769. explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName,title,len(outliers),Zcutoff))
  770. fh.write(OUTLIERS_HEADER)
  771. s = ''.join(['<th>%s</th>' % x for x in OUTLIERS_HEADER_list])
  772. repOut.append('<tr align="center">%s</tr>' % s)
  773. for n,rec in enumerate(outliers):
  774. #(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec
  775. s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec)
  776. fh.write('%s%s\n' % (s,relName))
  777. # (mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd, did1,mid1,did2,mid2))
  778. s = '''<td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td>
  779. <td>%s</td><td>%s</td><td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>''' % tuple(rec)
  780. s = '%s<td>%s</td>' % (s,relName)
  781. if n < MAX_SHOW_ROWS:
  782. repOut.append('<tr align="center">%s</tr>' % s)
  783. if truncated > 0:
  784. repOut.append('<H2>WARNING: %d rows truncated - see outlier file for all %d rows</H2>' % (truncated,
  785. nrows))
  786. fh.close()
  787. repOut.append('</table><p>')
  788. ### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
  789. ### if requested
  790. logf.write('Plotting ...')
  791. pointColors = [REL_COLORS[rel] for rel in rels]
  792. pointStyles = [REL_POINTS[rel] for rel in rels]
  793. mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
  794. svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4],
  795. SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1],
  796. SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4],
  797. SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle))
  798. #rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
  799. #rpy.r.par(mai=(1,1,1,0.5))
  800. #rpy.r('par(xaxs="i",yaxs="i")')
  801. #rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
  802. #rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
  803. #rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
  804. #rpy.r.dev_off()
  805. ### We will now go through each relationship type to partition plot points
  806. ### into "bulk" and "outlier" groups. Bulk points will represent common
  807. ### mean/sdev pairs and will cover the majority of the points in the plot --
  808. ### they will use generic tooltip informtion about all of the pairs
  809. ### represented by that point. "Outlier" points will be uncommon pairs,
  810. ### with very specific information in their tooltips. It would be nice to
  811. ### keep hte total number of plotted points in the SVG representation to
  812. ### ~10000 (certainly less than 100000?)
  813. pointMap = {}
  814. orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0),x) for x in REL_LOOKUP.keys()]))]
  815. # do we really want this? I want out of zone points last and big
  816. for relCode in orderedRels:
  817. svgColor = SVG_COLORS[relCode]
  818. relName, relColor, relStyle = REL_LOOKUP[relCode]
  819. svg.write('<g id="%s" style="stroke:%s; fill:%s; fill-opacity:1.0; stroke-width:1;" cursor="pointer">\n' % (relName, svgColor, svgColor))
  820. pMap = pointMap.setdefault(relCode, {})
  821. nPoints = 0
  822. rpairs=[]
  823. rgenos=[]
  824. rmeans=[]
  825. rsdevs=[]
  826. rz = []
  827. for x,rel in enumerate(rels): # all pairs
  828. if rel == relCode:
  829. s1,s2 = pairs[x]
  830. pid=(s1,s2)
  831. zmean,zsd,r,zrad = pair_data[pid][:4]
  832. rpairs.append(pairs[x])
  833. rgenos.append(ngenoL[x])
  834. rmeans.append(means[x])
  835. rsdevs.append(sdevs[x])
  836. rz.append(zrad)
  837. ### Now add the svg point group for this relationship to the svg file
  838. for x in range(len(rmeans)):
  839. svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2
  840. svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1
  841. s1, s2 = rpairs[x]
  842. (fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1]
  843. (fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2]
  844. ngenos = rgenos[x]
  845. nPoints += 1
  846. point = pMap.setdefault((svgX, svgY), [])
  847. point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x]))
  848. for (svgX, svgY) in pMap:
  849. points = pMap[(svgX, svgY)]
  850. svgX = int(svgX)
  851. svgY = int(svgY)
  852. if len(points) > 1:
  853. mmean,dmean = calcMeanSD([p[0] for p in points])
  854. msdev,dsdev = calcMeanSD([p[1] for p in points])
  855. mgeno,dgeno = calcMeanSD([p[-1] for p in points])
  856. mingeno = min([p[-1] for p in points])
  857. maxgeno = max([p[-1] for p in points])
  858. svg.write("""<circle cx="%d" cy="%d" r="2"
  859. onmouseover="showBTT(evt, %d, %1.2f, %1.2f, %1.2f, %1.2f, %d, %d, %d, %d, %d)"
  860. onmouseout="hideBTT(evt)" />\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno))
  861. else:
  862. mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12]
  863. rmean = float(relstats[relCode]['mean'][0])
  864. rsdev = float(relstats[relCode]['sd'][0])
  865. if zrad < 4:
  866. zrad = 2
  867. elif 4 < zrad < 9:
  868. zrad = 3 # to 9
  869. else: # > 9 5=15+
  870. zrad=zrad/3
  871. zrad = min(zrad,5) # scale limit
  872. if zrad <= 3:
  873. svg.write('<circle cx="%d" cy="%d" r="%s" onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" onmouseout="hideOTT(evt)" />\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
  874. else: # highlight pairs a long way from expectation by outlining circle in red
  875. svg.write("""<circle cx="%d" cy="%d" r="%s" style="stroke:red; fill:%s; fill-opacity:1.0; stroke-width:1;"
  876. onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)"
  877. onmouseout="hideOTT(evt)" />\n""" % \
  878. (svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
  879. svg.write('</g>\n')
  880. ### Create a pdf as well if indicated on the command line
  881. ### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf!
  882. ## if pdftoo:
  883. ## pdfname = '%s.pdf' % (title)
  884. ## rpy.r.pdf(pdfname, 6, 6)
  885. ## rpy.r.par(mai=(1,1,1,0.5))
  886. ## rpy.r('par(xaxs="i",yaxs="i")')
  887. ## rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
  888. ## rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
  889. ## rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
  890. ## rpy.r.dev_off()
  891. ### Draw polygons
  892. if showPolygons:
  893. svg.write('<g id="polygons" cursor="pointer">\n')
  894. for rel, poly in POLYGONS.items():
  895. points = ' '.join(['%s,%s' % ((p[0]-1.0)*float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]*PLOT_HEIGHT)) for p in poly])
  896. svg.write('<polygon points="%s" fill="transparent" style="stroke:%s; stroke-width:1"/>\n' % (points, SVG_COLORS[rel]))
  897. svg.write('</g>\n')
  898. svg.write(SVG_FOOTER)
  899. svg.close()
  900. return newfiles,explanations,repOut
  901. def doIBS(n=100):
  902. """parse parameters from galaxy
  903. expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n'
  904. <command interpreter="python">
  905. rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
  906. '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
  907. </command>
  908. """
  909. u="""<command interpreter="python">
  910. rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
  911. '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
  912. </command>
  913. """
  914. if len(sys.argv) < 7:
  915. print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please'
  916. print >> sys.stdout, u
  917. sys.exit(1)
  918. ts = '%s%s' % (string.punctuation,string.whitespace)
  919. ptran = string.maketrans(ts,'_'*len(ts))
  920. inpath = sys.argv[1]
  921. ldinpath = os.path.split(inpath)[0]
  922. basename = sys.argv[2]
  923. outhtml = sys.argv[3]
  924. newfilepath = sys.argv[4]
  925. title = sys.argv[5].translate(ptran)
  926. logfname = 'Log_%s.txt' % title
  927. logpath = os.path.join(newfilepath,logfname) # log was a child - make part of html extra_files_path zoo
  928. n = int(sys.argv[6])
  929. try:
  930. Zcutoff = float(sys.argv[7])
  931. except:
  932. Zcutoff = 2.0
  933. try:
  934. os.makedirs(newfilepath)
  935. except:
  936. pass
  937. logf = file(logpath,'w')
  938. efp,ibase_name = os.path.split(inpath) # need to use these for outputs in files_path
  939. ped = plinkbinJZ.BPed(inpath)
  940. ped.parse(quick=True)
  941. if ped == None:
  942. print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (ldreduced,basename)
  943. sys.exit(1)
  944. newfiles,explanations,repOut = doIBSpy(ped=ped,basename=basename,outdir=newfilepath,
  945. logf=logf,nrsSamples=n,title=title,pdftoo=0,Zcutoff=Zcutoff)
  946. logf.close()
  947. logfs = file(logpath,'r').readlines()
  948. lf = file(outhtml,'w')
  949. lf.write(galhtmlprefix % PROGNAME)
  950. # this is a mess. todo clean up - should each datatype have it's own directory? Yes
  951. # probably. Then titles are universal - but userId libraries are separate.
  952. s = '<div>Output from %s run at %s<br>\n' % (PROGNAME,timenow())
  953. lf.write('<h4>%s</h4>\n' % s)
  954. fixed = ["'%s'" % x for x in sys.argv] # add quotes just in case
  955. s = 'If you need to rerun this analysis, the command line was\n<pre>%s</pre>\n</div>' % (' '.join(fixed))
  956. lf.write(s)
  957. # various ways of displaying svg - experiments related to missing svg mimetype on test (!)
  958. #s = """<object data="%s" type="image/svg+xml" width="%d" height="%d">
  959. # <embed src="%s" type="image/svg+xml" width="%d" height="%d" />
  960. # </object>""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
  961. s = """ <embed src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
  962. #s = """ <iframe src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
  963. lf.write(s)
  964. lf.write('<div><h4>Click the links below to save output files and plots</h4><br><ol>\n')
  965. for i in range(len(newfiles)):
  966. if i == 0:
  967. lf.write('<li><a href="%s" type="image/svg+xml" >%s</a></li>\n' % (newfiles[i],explanations[i]))
  968. else:
  969. lf.write('<li><a href="%s">%s</a></li>\n' % (newfiles[i],explanations[i]))
  970. flist = os.listdir(newfilepath)
  971. for fname in flist:
  972. if not fname in newfiles:
  973. lf.write('<li><a href="%s">%s</a></li>\n' % (fname,fname))
  974. lf.write('</ol></div>')
  975. lf.write('<div>%s</div>' % ('\n'.join(repOut))) # repOut is a list of tables
  976. lf.write('<div><hr><h3>Log from this job (also stored in %s)</h3><pre>%s</pre><hr></div>' % (logfname,''.join(logfs)))
  977. lf.write('</body></html>\n')
  978. lf.close()
  979. logf.close()
  980. if __name__ == '__main__':
  981. doIBS()