PageRenderTime 89ms CodeModel.GetById 16ms app.highlight 60ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/rgenetics/rgGRR.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 1089 lines | 974 code | 30 blank | 85 comment | 19 complexity | f2b858baca27314e3adaa7f42540fa9a MD5 | raw file
   1"""
   2# july 2009: Need to see outliers so need to draw them last?
   3# could use clustering on the zscores to guess real relationships for unrelateds
   4# but definitely need to draw last
   5# added MAX_SHOW_ROWS to limit the length of the main report page
   6# Changes for Galaxy integration
   7# added more robust knuth method for one pass mean and sd
   8# no difference really - let's use scipy.mean() and scipy.std() instead...
   9# fixed labels and changed to .xls for outlier reports so can open in excel
  10# interesting - with a few hundred subjects, 5k gives good resolution
  11# and 100k gives better but not by much
  12# TODO remove non autosomal markers
  13# TODO it would be best if label had the zmean and zsd as these are what matter for
  14# outliers rather than the group mean/sd
  15# mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots
  16# to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log
  17# so the result should be an HTML file
  18
  19# rgIBS.py
  20# use a random subset of markers for a quick ibs
  21# to identify sample dups and closely related subjects
  22# try snpMatrix and plink and see which one works best for us?
  23# abecasis grr plots mean*sd for every subject to show clusters
  24# mods june 23 rml to avoid non-autosomal markers
  25# we seem to be distinguishing parent-child by gender - 2 clouds!
  26
  27
  28snpMatrix from David Clayton has:
  29ibs.stats function to calculate the identity-by-state stats of a group of samples
  30Description
  31Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics
  32about the relatedness of every pair of samples within.
  33
  34Usage
  35ibs.stats(x)
  368 ibs.stats
  37Arguments
  38x a snp.matrix-class or a X.snp.matrix-class object containing N samples
  39Details
  40No-calls are excluded from consideration here.
  41Value
  42A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated
  43by a comma, and the columns are:
  44Count count of identical calls, exclusing no-calls
  45Fraction fraction of identical calls comparied to actual calls being made in both samples
  46Warning
  47In some applications, it may be preferable to subset a (random) selection of SNPs first - the
  48calculation
  49time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the
  50calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for
  51simple applications such as checking for duplicate or related samples.
  52Note
  53This is mostly written to find mislabelled and/or duplicate samples.
  54Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most
  55purpose it is undesirable to use these SNPs for IBS purposes.
  56TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to
  57subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility
  58"""
  59import sys,os,time,random,string,copy,optparse
  60
  61try:
  62  set
  63except NameError:
  64  from Sets import Set as set
  65
  66from rgutils import timenow,plinke
  67
  68import plinkbinJZ
  69
  70
  71opts = None
  72verbose = False
  73
  74showPolygons = False
  75
  76class NullDevice:
  77  def write(self, s):
  78    pass
  79
  80tempstderr = sys.stderr # save
  81#sys.stderr = NullDevice()
  82# need to avoid blather about deprecation and other strange stuff from scipy
  83# the current galaxy job runner assumes that
  84# the job is in error if anything appears on sys.stderr
  85# grrrrr. James wants to keep it that way instead of using the
  86# status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy)
  87import numpy
  88import scipy
  89from scipy import weave
  90
  91
  92sys.stderr=tempstderr
  93
  94
  95PROGNAME = os.path.split(sys.argv[0])[-1]
  96X_AXIS_LABEL = 'Mean Alleles Shared'
  97Y_AXIS_LABEL = 'SD Alleles Shared'
  98LEGEND_ALIGN = 'topleft'
  99LEGEND_TITLE = 'Relationship'
 100DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size
 101DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size
 102
 103### Some colors for R/rpy
 104R_BLACK  = 1
 105R_RED    = 2
 106R_GREEN  = 3
 107R_BLUE   = 4
 108R_CYAN   = 5
 109R_PURPLE = 6
 110R_YELLOW = 7
 111R_GRAY   = 8
 112
 113### ... and some point-styles
 114
 115###
 116PLOT_HEIGHT = 600
 117PLOT_WIDTH = 1150
 118
 119
 120#SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson')
 121#SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray')
 122SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray')
 123# dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
 124#('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray')
 125
 126OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped']
 127OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list)
 128TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n'
 129
 130
 131### Relationship codes, text, and lookups/mappings
 132N_RELATIONSHIP_TYPES = 7
 133REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES)
 134REL_LOOKUP = {
 135    REL_DUPE:        ('dupe',        R_BLUE,   1),
 136    REL_PARENTCHILD: ('parentchild', R_YELLOW, 1),
 137    REL_SIBS:        ('sibpairs',    R_RED,    1),
 138    REL_HALFSIBS:    ('halfsibs',    R_GREEN,  1),
 139    REL_RELATED:     ('parents',     R_PURPLE, 1),
 140    REL_UNRELATED:   ('unrelated',   R_CYAN,   1),
 141    REL_UNKNOWN:     ('unknown',     R_GRAY,   1),
 142    }
 143OUTLIER_STDEVS = {
 144    REL_DUPE:        2,
 145    REL_PARENTCHILD: 2,
 146    REL_SIBS:        2,
 147    REL_HALFSIBS:    2,
 148    REL_RELATED:     2,
 149    REL_UNRELATED:   3,
 150    REL_UNKNOWN:     2,
 151    }
 152# note now Z can be passed in
 153
 154REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)]
 155REL_COLORS = SVG_COLORS
 156REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)]
 157
 158DEFAULT_MAX_SAMPLE_SIZE = 10000
 159
 160REF_COUNT_HOM1 = 3
 161REF_COUNT_HET  = 2
 162REF_COUNT_HOM2 = 1
 163MISSING        = 0
 164MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain
 165MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0
 166MARKER_PAIRS_PER_SECOND_FAST = 70000000.0
 167
 168
 169galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
 170<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 171<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 172<head>
 173<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 174<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
 175<title></title>
 176<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
 177</head>
 178<body>
 179<div class="document">
 180"""
 181
 182
 183SVG_HEADER = '''<?xml version="1.0" standalone="no"?>
 184<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.2//EN" "http://www.w3.org/Graphics/SVG/1.2/DTD/svg12.dtd">
 185
 186<svg width="1280" height="800"
 187     xmlns="http://www.w3.org/2000/svg" version="1.2"
 188     xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 1280 800" onload="init()">
 189
 190  <script type="text/ecmascript" xlink:href="/static/scripts/checkbox_and_radiobutton.js"/>
 191  <script type="text/ecmascript" xlink:href="/static/scripts/helper_functions.js"/>
 192  <script type="text/ecmascript" xlink:href="/static/scripts/timer.js"/>
 193  <script type="text/ecmascript">
 194    <![CDATA[
 195      var checkBoxes = new Array();
 196      var radioGroupBandwidth;
 197      var colours = ['%s','%s','%s','%s','%s','%s','%s'];
 198      function init() {
 199          var style = {"font-family":"Arial,Helvetica", "fill":"black", "font-size":12};
 200          var dist = 12;
 201          var yOffset = 4;
 202
 203          //A checkBox for each relationship type dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
 204          checkBoxes["dupe"] = new checkBox("dupe","checkboxes",20,40,"cbRect","cbCross",true,"Duplicate",style,dist,yOffset,undefined,hideShowLayer);
 205          checkBoxes["parentchild"] = new checkBox("parentchild","checkboxes",20,60,"cbRect","cbCross",true,"Parent-Child",style,dist,yOffset,undefined,hideShowLayer);
 206          checkBoxes["sibpairs"] = new checkBox("sibpairs","checkboxes",20,80,"cbRect","cbCross",true,"Sib-pairs",style,dist,yOffset,undefined,hideShowLayer);
 207          checkBoxes["halfsibs"] = new checkBox("halfsibs","checkboxes",20,100,"cbRect","cbCross",true,"Half-sibs",style,dist,yOffset,undefined,hideShowLayer);
 208          checkBoxes["parents"] = new checkBox("parents","checkboxes",20,120,"cbRect","cbCross",true,"Parents",style,dist,yOffset,undefined,hideShowLayer);
 209          checkBoxes["unrelated"] = new checkBox("unrelated","checkboxes",20,140,"cbRect","cbCross",true,"Unrelated",style,dist,yOffset,undefined,hideShowLayer);
 210          checkBoxes["unknown"] = new checkBox("unknown","checkboxes",20,160,"cbRect","cbCross",true,"Unknown",style,dist,yOffset,undefined,hideShowLayer);
 211
 212      }
 213
 214      function hideShowLayer(id, status, label) {
 215          var vis = "hidden";
 216          if (status) {
 217              vis = "visible";
 218          }
 219          document.getElementById(id).setAttributeNS(null, 'visibility', vis);
 220      }
 221
 222      function showBTT(evt, rel, mm, dm, md, dd, n, mg, dg, lg, hg) {
 223    var x = parseInt(evt.pageX)-250;
 224    var y = parseInt(evt.pageY)-110;
 225        switch(rel) {
 226        case 0:
 227        fill = colours[rel];
 228        relt = "dupe";
 229        break;
 230        case 1:
 231        fill = colours[rel];
 232        relt = "parentchild";
 233        break;
 234        case 2:
 235        fill = colours[rel];
 236        relt = "sibpairs";
 237        break;
 238        case 3:
 239        fill = colours[rel];
 240        relt = "halfsibs";
 241        break;
 242        case 4:
 243        fill = colours[rel];
 244        relt = "parents";
 245        break;
 246        case 5:
 247        fill = colours[rel];
 248        relt = "unrelated";
 249        break;
 250        case 6:
 251        fill = colours[rel];
 252        relt = "unknown";
 253        break;
 254        default:
 255        fill = "cyan";
 256        relt = "ERROR_CODE: "+rel;
 257    }
 258
 259    document.getElementById("btRel").textContent = "GROUP: "+relt;
 260    document.getElementById("btMean").textContent = "mean="+mm+" +/- "+dm;
 261        document.getElementById("btSdev").textContent = "sdev="+dm+" +/- "+dd;
 262        document.getElementById("btPair").textContent = "npairs="+n;
 263        document.getElementById("btGeno").textContent = "ngenos="+mg+" +/- "+dg+" (min="+lg+", max="+hg+")";
 264        document.getElementById("btHead").setAttribute('fill', fill);
 265
 266        var tt = document.getElementById("btTip");
 267    tt.setAttribute("transform", "translate("+x+","+y+")");
 268    tt.setAttribute('visibility', 'visible');
 269      }
 270
 271      function showOTT(evt, rel, s1, s2, mean, sdev, ngeno, rmean, rsdev) {
 272    var x = parseInt(evt.pageX)-150;
 273    var y = parseInt(evt.pageY)-180;
 274
 275        switch(rel) {
 276        case 0:
 277        fill = colours[rel];
 278        relt = "dupe";
 279        break;
 280        case 1:
 281        fill = colours[rel];
 282        relt = "parentchild";
 283        break;
 284        case 2:
 285        fill = colours[rel];
 286        relt = "sibpairs";
 287        break;
 288        case 3:
 289        fill = colours[rel];
 290        relt = "halfsibs";
 291        break;
 292        case 4:
 293        fill = colours[rel];
 294        relt = "parents";
 295        break;
 296        case 5:
 297        fill = colours[rel];
 298        relt = "unrelated";
 299        break;
 300        case 6:
 301        fill = colours[rel];
 302        relt = "unknown";
 303        break;
 304        default:
 305        fill = "cyan";
 306        relt = "ERROR_CODE: "+rel;
 307    }
 308
 309    document.getElementById("otRel").textContent = "PAIR: "+relt;
 310    document.getElementById("otS1").textContent = "s1="+s1;
 311    document.getElementById("otS2").textContent = "s2="+s2;
 312    document.getElementById("otMean").textContent = "mean="+mean;
 313        document.getElementById("otSdev").textContent = "sdev="+sdev;
 314        document.getElementById("otGeno").textContent = "ngenos="+ngeno;
 315        document.getElementById("otRmean").textContent = "relmean="+rmean;
 316        document.getElementById("otRsdev").textContent = "relsdev="+rsdev;
 317    document.getElementById("otHead").setAttribute('fill', fill);
 318
 319        var tt = document.getElementById("otTip");
 320    tt.setAttribute("transform", "translate("+x+","+y+")");
 321    tt.setAttribute('visibility', 'visible');
 322      }
 323
 324      function hideBTT(evt) {
 325        document.getElementById("btTip").setAttributeNS(null, 'visibility', 'hidden');
 326      }
 327
 328      function hideOTT(evt) {
 329        document.getElementById("otTip").setAttributeNS(null, 'visibility', 'hidden');
 330      }
 331
 332     ]]>
 333  </script>
 334  <defs>
 335    <!-- symbols for check boxes -->
 336    <symbol id="cbRect" overflow="visible">
 337        <rect x="-5" y="-5" width="10" height="10" fill="white" stroke="dimgray" stroke-width="1" cursor="pointer"/>
 338    </symbol>
 339    <symbol id="cbCross" overflow="visible">
 340        <g pointer-events="none" stroke="black" stroke-width="1">
 341            <line x1="-3" y1="-3" x2="3" y2="3"/>
 342            <line x1="3" y1="-3" x2="-3" y2="3"/>
 343        </g>
 344    </symbol>
 345  </defs>
 346
 347<desc>Developer Works Dynamic Scatter Graph Scaling Example</desc>
 348
 349<!-- Now Draw the main X and Y axis -->
 350<g style="stroke-width:1.0; stroke:black; shape-rendering:crispEdges">
 351   <!-- X Axis top and bottom -->
 352   <path d="M 100 100 L 1250 100 Z"/>
 353   <path d="M 100 700 L 1250 700 Z"/>
 354
 355   <!-- Y Axis left and right -->
 356   <path d="M 100  100 L 100  700 Z"/>
 357   <path d="M 1250 100 L 1250 700 Z"/>
 358</g>
 359
 360<g transform="translate(100,100)">
 361
 362  <!-- Grid Lines -->
 363  <g style="fill:none; stroke:#dddddd; stroke-width:1; stroke-dasharray:2,2; text-anchor:end; shape-rendering:crispEdges">
 364
 365    <!-- Vertical grid lines -->
 366    <line x1="125" y1="0" x2="115" y2="600" />
 367    <line x1="230" y1="0" x2="230" y2="600" />
 368    <line x1="345" y1="0" x2="345" y2="600" />
 369    <line x1="460" y1="0" x2="460" y2="600" />
 370    <line x1="575" y1="0" x2="575" y2="600" style="stroke-dasharray:none;" />
 371    <line x1="690" y1="0" x2="690" y2="600"   />
 372    <line x1="805" y1="0" x2="805" y2="600"   />
 373    <line x1="920" y1="0" x2="920" y2="600"   />
 374    <line x1="1035" y1="0" x2="1035" y2="600" />
 375
 376    <!-- Horizontal grid lines -->
 377    <line x1="0" y1="60" x2="1150" y2="60"   />
 378    <line x1="0" y1="120" x2="1150" y2="120" />
 379    <line x1="0" y1="180" x2="1150" y2="180" />
 380    <line x1="0" y1="240" x2="1150" y2="240" />
 381    <line x1="0" y1="300" x2="1150" y2="300" style="stroke-dasharray:none;" />
 382    <line x1="0" y1="360" x2="1150" y2="360" />
 383    <line x1="0" y1="420" x2="1150" y2="420" />
 384    <line x1="0" y1="480" x2="1150" y2="480" />
 385    <line x1="0" y1="540" x2="1150" y2="540" />
 386  </g>
 387
 388  <!-- Legend -->
 389  <g style="fill:black; stroke:none" font-size="12" font-family="Arial" transform="translate(25,25)">
 390    <rect width="160" height="270" style="fill:none; stroke:black; shape-rendering:crispEdges" />
 391    <text x="5" y="20" style="fill:black; stroke:none;" font-size="13" font-weight="bold">Given Pair Relationship</text>
 392    <rect x="120" y="35" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
 393    <rect x="120" y="55" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
 394    <rect x="120" y="75" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
 395    <rect x="120" y="95" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
 396    <rect x="120" y="115" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
 397    <rect x="120" y="135" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
 398    <rect x="120" y="155" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
 399    <text x="15"  y="195" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore gt 15</text>
 400    <circle cx="125" cy="192" r="6" style="stroke:red; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
 401    <text x="15" y="215" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore 4 to 15</text>
 402    <circle cx="125" cy="212" r="3" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
 403    <text x="15" y="235" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore lt 4</text>
 404    <circle cx="125" cy="232" r="2" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
 405    <g id="checkboxes">
 406    </g>
 407  </g>
 408
 409
 410   <g style='fill:black; stroke:none' font-size="17" font-family="Arial">
 411    <!-- X Axis Labels -->
 412    <text x="480" y="660">Mean Alleles Shared</text>
 413    <text x="0"    y="630" >1.0</text>
 414    <text x="277"  y="630" >1.25</text>
 415    <text x="564"  y="630" >1.5</text>
 416    <text x="842" y="630" >1.75</text>
 417    <text x="1140" y="630" >2.0</text>
 418  </g>
 419
 420  <g transform="rotate(270)" style="fill:black; stroke:none" font-size="17" font-family="Arial">
 421    <!-- Y Axis Labels -->
 422    <text x="-350" y="-40">SD Alleles Shared</text>
 423    <text x="-20" y="-10" >1.0</text>
 424    <text x="-165" y="-10" >0.75</text>
 425    <text x="-310" y="-10" >0.5</text>
 426    <text x="-455" y="-10" >0.25</text>
 427    <text x="-600" y="-10" >0.0</text>
 428  </g>
 429
 430<!-- Plot Title -->
 431<g style="fill:black; stroke:none" font-size="18" font-family="Arial">
 432    <text x="425" y="-30">%s</text>
 433</g>
 434
 435<!-- One group/layer of points for each relationship type -->
 436'''
 437
 438SVG_FOOTER = '''
 439<!-- End of Data -->
 440</g>
 441<g id="btTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
 442  <rect width="250" height="110" style="fill:silver" rx="2" ry="2"/>
 443  <rect id="btHead" width="250" height="20" rx="2" ry="2" />
 444  <text id="btRel" y="14" x="85">unrelated</text>
 445  <text id="btMean" y="40" x="4">mean=1.5 +/- 0.04</text>
 446  <text id="btSdev" y="60" x="4">sdev=0.7 +/- 0.03</text>
 447  <text id="btPair" y="80" x="4">npairs=1152</text>
 448  <text id="btGeno" y="100" x="4">ngenos=4783 +/- 24 (min=1000, max=5000)</text>
 449</g>
 450
 451<g id="otTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
 452  <rect width="150" height="180" style="fill:silver" rx="2" ry="2"/>
 453  <rect id="otHead" width="150" height="20" rx="2" ry="2" />
 454  <text id="otRel" y="14" x="40">sibpairs</text>
 455  <text id="otS1" y="40" x="4">s1=fid1,iid1</text>
 456  <text id="otS2" y="60" x="4">s2=fid2,iid2</text>
 457  <text id="otMean" y="80" x="4">mean=1.82</text>
 458  <text id="otSdev" y="100" x="4">sdev=0.7</text>
 459  <text id="otGeno" y="120" x="4">ngeno=4487</text>
 460  <text id="otRmean" y="140" x="4">relmean=1.85</text>
 461  <text id="otRsdev" y="160" x="4">relsdev=0.65</text>
 462</g>
 463</svg>
 464'''
 465
 466
 467DEFAULT_MAX_SAMPLE_SIZE = 5000
 468
 469REF_COUNT_HOM1 = 3
 470REF_COUNT_HET  = 2
 471REF_COUNT_HOM2 = 1
 472MISSING        = 0
 473
 474MARKER_PAIRS_PER_SECOND_SLOW = 15000000
 475MARKER_PAIRS_PER_SECOND_FAST = 70000000
 476
 477POLYGONS = {
 478    REL_UNRELATED:   ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)),
 479    REL_HALFSIBS:    ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)),
 480    REL_SIBS:        ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)),
 481    REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)),
 482    REL_DUPE:        ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)),
 483    }
 484
 485def distance(point1, point2):
 486    """ Calculate the distance between two points
 487    """
 488    (x1,y1) = [float(d) for d in point1]
 489    (x2,y2) = [float(d) for d in point2]
 490    dx = abs(x1 - x2)
 491    dy = abs(y1 - y2)
 492    return math.sqrt(dx**2 + dy**2)
 493
 494def point_inside_polygon(x, y, poly):
 495    """ Determine if a point (x,y) is inside a given polygon or not
 496        poly is a list of (x,y) pairs.
 497
 498        Taken from: http://www.ariel.com.au/a/python-point-int-poly.html
 499    """
 500
 501    n = len(poly)
 502    inside = False
 503
 504    p1x,p1y = poly[0]
 505    for i in range(n+1):
 506        p2x,p2y = poly[i % n]
 507        if y > min(p1y,p2y):
 508            if y <= max(p1y,p2y):
 509                if x <= max(p1x,p2x):
 510                    if p1y != p2y:
 511                        xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x
 512                    if p1x == p2x or x <= xinters:
 513                        inside = not inside
 514        p1x,p1y = p2x,p2y
 515    return inside
 516
 517def readMap(pedfile):
 518    """
 519    """
 520    mapfile = pedfile.replace('.ped', '.map')
 521    marker_list = []
 522    if os.path.exists(mapfile):
 523        print 'readMap: %s' % (mapfile)
 524        fh = file(mapfile, 'r')
 525    for line in fh:
 526        marker_list.append(line.strip().split())
 527    fh.close()
 528    print 'readMap: %s markers' % (len(marker_list))
 529    return marker_list
 530
 531def calcMeanSD(useme):
 532    """
 533    A numerically stable algorithm is given below. It also computes the mean.
 534    This algorithm is due to Knuth,[1] who cites Welford.[2]
 535    n = 0
 536    mean = 0
 537    M2 = 0
 538
 539    foreach x in data:
 540      n = n + 1
 541      delta = x - mean
 542      mean = mean + delta/n
 543      M2 = M2 + delta*(x - mean)      // This expression uses the new value of mean
 544    end for
 545
 546    variance_n = M2/n
 547    variance = M2/(n - 1)
 548    """
 549    mean = 0.0
 550    M2 = 0.0
 551    sd = 0.0
 552    n = len(useme)
 553    if n > 1:
 554        for i,x in enumerate(useme):
 555            delta = x - mean
 556            mean = mean + delta/(i+1) # knuth uses n+=1 at start
 557            M2 = M2 + delta*(x - mean)      # This expression uses the new value of mean
 558        variance = M2/(n-1) # assume is sample so lose 1 DOF
 559        sd = pow(variance,0.5)
 560    return mean,sd
 561
 562
 563def doIBSpy(ped=None,basename='',outdir=None,logf=None,
 564            nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0):
 565    #def doIBS(pedName, title, nrsSamples=None, pdftoo=False):
 566    """ started with snpmatrix but GRR uses actual IBS counts and sd's
 567    """
 568    repOut = [] # text strings to add to the html display
 569    refallele = {}
 570    tblf = '%s_table.xls' % (title)
 571    tbl = file(os.path.join(outdir,tblf), 'w')
 572    tbl.write(TABLE_HEADER)
 573    svgf = '%s.svg' % (title)
 574    svg = file(os.path.join(outdir,svgf), 'w')
 575
 576    nMarkers = len(ped._markers)
 577    if nMarkers < 5:
 578        print sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME)
 579        sys.exit(1)
 580    nSubjects = len(ped._subjects)
 581    nrsSamples = min(nMarkers, nrsSamples)
 582    if opts and opts.use_mito:
 583        markers = range(nMarkers)
 584        nrsSamples = min(len(markers), nrsSamples)
 585        sampleIndexes = sorted(random.sample(markers, nrsSamples))
 586    else:
 587        autosomals = ped.autosomal_indices()
 588        nrsSamples = min(len(autosomals), nrsSamples)
 589        sampleIndexes = sorted(random.sample(autosomals, nrsSamples))
 590
 591    print ''
 592    print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers)
 593    npairs = (nSubjects*(nSubjects-1))/2 # total rows in table
 594    newfiles=[svgf,tblf]
 595    explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs]
 596    # these go with the output file links in the html file
 597    s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples)
 598    logf.write(s)
 599    minUsegenos = nrsSamples/2 # must have half?
 600    nGenotypes = nSubjects*nrsSamples
 601    stime = time.time()
 602    emptyRows = set()
 603    genos = numpy.zeros((nSubjects, nrsSamples), dtype=int)
 604    for s in xrange(nSubjects):
 605        nValid = 0
 606        #getGenotypesByIndices(self, s, mlist, format)
 607        genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref')
 608        nValid = sum([1 for g in genos[s] if g])
 609        if not nValid:
 610            emptyRows.add(s)
 611            sub = ped.getSubject(s)
 612            print 'All missing for row %d (%s)' % (s, sub)
 613            logf.write('All missing for row %d (%s)\n' % (s, sub))
 614    rtime = time.time() - stime
 615    if verbose:
 616        print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime)
 617
 618
 619    ### Now the expensive part.  For each pair of subjects, we get the mean number
 620    ### and standard deviation of shared alleles over all of the markers where both
 621    ### subjects have a known genotype.  Identical subjects should have mean shared
 622    ### alleles very close to 2.0 with a standard deviation very close to 0.0.
 623    tot = nSubjects*(nSubjects-1)/2
 624    nprog = tot/10
 625    nMarkerpairs = tot * nrsSamples
 626    estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW
 627    estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST
 628
 629    pairs = []
 630    pair_data = {}
 631    means = []    ## Mean IBS for each pair
 632    ngenoL = []   ## Count of comparable genotypes for each pair
 633    sdevs = []    ## Standard dev for each pair
 634    rels  = []    ## A relationship code for each pair
 635    zmeans  = [0.0 for x in xrange(tot)]    ## zmean score for each pair for the relgroup
 636    zstds  = [0.0 for x in xrange(tot)]   ## zstd score for each pair for the relgrp
 637    skip = set()
 638    ndone = 0     ## How many have been done so far
 639
 640    logf.write('Calculating %d pairs...\n' % (tot))
 641    logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow))
 642
 643    t1sum = 0
 644    t2sum = 0
 645    t3sum = 0
 646    now = time.time()
 647    scache = {}
 648    _founder_cache = {}
 649    C_CODE = """
 650    #include "math.h"
 651    int i;
 652    int sumibs = 0;
 653    int ssqibs = 0;
 654    int ngeno  = 0;
 655    float mean = 0;
 656    float M2 = 0;
 657    float delta = 0;
 658    float sdev=0;
 659    float variance=0;
 660    for (i=0; i<nrsSamples; i++) {
 661        int a1 = g1[i];
 662        int a2 = g2[i];
 663        if (a1 != 0 && a2 != 0) {
 664            ngeno += 1;
 665            int shared = 2-abs(a1-a2);
 666            delta = shared - mean;
 667            mean = mean + delta/ngeno;
 668            M2 += delta*(shared-mean);
 669            // yes that second time, the updated mean is used see calcmeansd above;
 670            //printf("%d %d %d %d %d %d\\n", i, a1, a2, ngeno, shared, squared);
 671            }
 672    }
 673    if (ngeno > 1) {
 674        variance = M2/(ngeno-1);
 675        sdev = sqrt(variance);
 676        //printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev);
 677    }
 678    //printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev);
 679    result[0] = ngeno;
 680    result[1] = mean;
 681    result[2] = sdev;
 682    return_val = ngeno;
 683    """
 684    started = time.time()
 685    for s1 in xrange(nSubjects):
 686        if s1 in emptyRows:
 687            continue
 688        (fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1))
 689
 690        isFounder1 = _founder_cache.setdefault(s1, (did1==mid1))
 691        g1 = genos[s1]
 692
 693        for s2 in xrange(s1+1, nSubjects):
 694            if s2 in emptyRows:
 695                continue
 696            t1s = time.time()
 697
 698            (fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2))
 699
 700            g2 = genos[s2]
 701            isFounder2 = _founder_cache.setdefault(s2, (did2==mid2))
 702
 703            # Determine the relationship for this pair
 704            relcode = REL_UNKNOWN
 705            if (fid2 == fid1):
 706                if iid1 == iid2:
 707                    relcode = REL_DUPE
 708                elif (did2 == did1) and (mid2 == mid1) and did1 != mid1:
 709                    relcode = REL_SIBS
 710                elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1):
 711                    relcode = REL_PARENTCHILD
 712                elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)):
 713                    relcode = REL_HALFSIBS
 714                else:
 715                    # People in the same family should be marked as some other
 716                    # form of related.  In general, these people will have a
 717                    # pretty random spread of similarity. This distinction is
 718                    # probably not very useful most of the time
 719                    relcode = REL_RELATED
 720            else:
 721                ### Different families
 722                relcode = REL_UNRELATED
 723
 724            t1e = time.time()
 725            t1sum += t1e-t1s
 726
 727
 728            ### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count
 729            ### the number of contributing genotypes.  These values are not actually
 730            ### calculated here, but instead are looked up in a table for speed.
 731            ### FIXME: This is still too slow ...
 732            result = [0.0, 0.0, 0.0]
 733            ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result'])
 734            if ngeno >= minUsegenos:
 735                _, mean, sdev = result
 736                means.append(mean)
 737                sdevs.append(sdev)
 738                ngenoL.append(ngeno)
 739                pairs.append((s1, s2))
 740                rels.append(relcode)
 741            else:
 742                skip.add(ndone) # signal no comparable genotypes for this pair
 743            ndone += 1
 744            t2e = time.time()
 745            t2sum += t2e-t1e
 746            t3e = time.time()
 747            t3sum += t3e-t2e
 748
 749    logme = [ 'T1:  %s' % (t1sum), 'T2:  %s' % (t2sum), 'T3:  %s' % (t3sum),'TOT: %s' % (t3e-now),
 750             '%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip),
 751                                                            float(len(skip))/float(tot)*100)]
 752    logf.write('%s\n' % '\t'.join(logme))
 753    ### Calculate mean and standard deviation of scores on a per relationship
 754    ### type basis, allowing us to flag outliers for each particular relationship
 755    ### type
 756    relstats = {}
 757    relCounts = {}
 758    outlierFiles = {}
 759    for relCode, relInfo in REL_LOOKUP.items():
 760        relName, relColor, relStyle = relInfo
 761        useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode]
 762        relCounts[relCode] = len(useme)
 763        mm = scipy.mean(useme)
 764        ms = scipy.std(useme)
 765        useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode]
 766        sm = scipy.mean(useme)
 767        ss = scipy.std(useme)
 768        relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)}
 769        s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \
 770          (relName,relCounts[relCode], mm, ms, sm, ss)
 771        logf.write(s)
 772
 773    ### now fake z scores for each subject like abecasis recommends max(|zmu|,|zsd|)
 774    ### within each group, for each pair, z=(groupmean-pairmean)/groupsd
 775    available = len(means)
 776    logf.write('%d pairs are available of %d\n' % (available, tot))
 777    ### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n'
 778    ### logf.write(s)
 779    pairnum   = 0
 780    offset    = 0
 781    nOutliers = 0
 782    cexs      = []
 783    outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)])
 784    zsdmax = 0
 785    for s1 in range(nSubjects):
 786        if s1 in emptyRows:
 787            continue
 788        (fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1]
 789        for s2 in range(s1+1, nSubjects):
 790            if s2 in emptyRows:
 791                continue
 792            if pairnum not in skip:
 793                ### Get group stats for this relationship
 794                (fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2]
 795                try:
 796                    r = rels[offset]
 797                except IndexError:
 798                    logf.write('###OOPS offset %d available %d  pairnum %d  len(rels) %d', offset, available, pairnum, len(rels))
 799                notfound = ('?',('?','0','0'))
 800                relInfo = REL_LOOKUP.get(r,notfound)
 801                relName, relColor, relStyle = relInfo
 802                rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared
 803                rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared
 804
 805                try:
 806                    zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units
 807                except:
 808                    zsd = 1
 809                if abs(zsd) > zsdmax:
 810                    zsdmax = zsd # keep for sort scaling
 811                try:
 812                    zmean = (means[offset] - rmm)/rmd # distance from group mean
 813                except:
 814                    zmean = 1
 815                zmeans[offset] = zmean
 816                zstds[offset] = zsd
 817                pid=(s1,s2)
 818                zrad = max(zsd,zmean)
 819                if zrad < 4:
 820                    zrad = 2
 821                elif 4 < zrad < 15:
 822                    zrad = 3 # to 9
 823                else: # > 15 6=24+
 824                    zrad=zrad/4
 825                    zrad = min(zrad,6) # scale limit
 826                zrad = max(2,max(zsd,zmean)) # as > 2, z grows
 827                pair_data[pid] = (zmean,zsd,r,zrad)
 828                if max(zsd,zmean) > Zcutoff: # is potentially interesting
 829                    mean = means[offset]
 830                    sdev = sdevs[offset]
 831                    outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2))
 832                    nOutliers += 1
 833                tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \
 834                          (fid1, iid1, fid2, iid2, mean, sdev, zmean,zsd, ngeno, relName, did1,mid1,did2,mid2))
 835                offset += 1
 836            pairnum += 1
 837    logf.write( 'Outliers: %s\n' % (nOutliers))
 838
 839    ### Write outlier files for each relationship type
 840    repOut.append('<h2>Outliers in tab delimited files linked above are also listed below</h2>')
 841    lzsd = round(numpy.log10(zsdmax)) + 1
 842    scalefactor = 10**lzsd
 843    for relCode, relInfo in REL_LOOKUP.items():
 844        relName, _, _ = relInfo
 845        outliers = outlierRecords[relCode]
 846        if not outliers:
 847            continue
 848        outliers = [(scalefactor*int(abs(x[3]))+ int(abs(x[2])),x) for x in outliers] # decorate
 849        outliers.sort()
 850        outliers.reverse() # largest deviation first
 851        outliers = [x[1] for x in outliers] # undecorate
 852        nrows = len(outliers)
 853        truncated = 0
 854        if nrows > MAX_SHOW_ROWS:
 855            s = '<h3>%s outlying pairs (top %d of %d) from %s</h3><table border="0" cellpadding="3">' % \
 856               (relName,MAX_SHOW_ROWS,nrows,title)
 857            truncated = nrows - MAX_SHOW_ROWS
 858        else:
 859            s = '<h3>%s outlying pairs (n=%d) from %s</h3><table border="0" cellpadding="3">' % (relName,nrows,title)
 860        repOut.append(s)
 861        fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName)
 862        fhpath = os.path.join(outdir,fhname)
 863        fh = open(fhpath, 'w')
 864        newfiles.append(fhname)
 865        explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName,title,len(outliers),Zcutoff))
 866        fh.write(OUTLIERS_HEADER)
 867        s = ''.join(['<th>%s</th>' % x for x in OUTLIERS_HEADER_list])
 868        repOut.append('<tr align="center">%s</tr>' % s)
 869        for n,rec in enumerate(outliers):
 870            #(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec
 871            s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec)
 872            fh.write('%s%s\n' % (s,relName))
 873            # (mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd, did1,mid1,did2,mid2))
 874            s = '''<td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td>
 875            <td>%s</td><td>%s</td><td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>''' % tuple(rec)
 876            s = '%s<td>%s</td>' % (s,relName)
 877            if n < MAX_SHOW_ROWS:
 878                repOut.append('<tr align="center">%s</tr>' % s)
 879        if truncated > 0:
 880            repOut.append('<H2>WARNING: %d rows truncated - see outlier file for all %d rows</H2>' % (truncated,
 881                                                                                            nrows))
 882        fh.close()
 883        repOut.append('</table><p>')
 884
 885    ### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
 886    ### if requested
 887    logf.write('Plotting ...')
 888    pointColors = [REL_COLORS[rel] for rel in rels]
 889    pointStyles = [REL_POINTS[rel] for rel in rels]
 890
 891    mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
 892    svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4],
 893        SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1],
 894        SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4],
 895        SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle))
 896    #rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
 897    #rpy.r.par(mai=(1,1,1,0.5))
 898    #rpy.r('par(xaxs="i",yaxs="i")')
 899    #rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
 900    #rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
 901    #rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
 902    #rpy.r.dev_off()
 903
 904    ### We will now go through each relationship type to partition plot points
 905    ### into "bulk" and "outlier" groups.  Bulk points will represent common
 906    ### mean/sdev pairs and will cover the majority of the points in the plot --
 907    ### they will use generic tooltip informtion about all of the pairs
 908    ### represented by that point.  "Outlier" points will be uncommon pairs,
 909    ### with very specific information in their tooltips.  It would be nice to
 910    ### keep hte total number of plotted points in the SVG representation to
 911    ### ~10000 (certainly less than 100000?)
 912    pointMap = {}
 913    orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0),x) for x in REL_LOOKUP.keys()]))]
 914    # do we really want this? I want out of zone points last and big
 915    for relCode in orderedRels:
 916        svgColor = SVG_COLORS[relCode]
 917        relName, relColor, relStyle = REL_LOOKUP[relCode]
 918        svg.write('<g id="%s" style="stroke:%s; fill:%s; fill-opacity:1.0; stroke-width:1;" cursor="pointer">\n' % (relName, svgColor, svgColor))
 919        pMap = pointMap.setdefault(relCode, {})
 920        nPoints = 0
 921        rpairs=[]
 922        rgenos=[]
 923        rmeans=[]
 924        rsdevs=[]
 925        rz = []
 926        for x,rel in enumerate(rels): # all pairs
 927            if rel == relCode:
 928                s1,s2 = pairs[x]
 929                pid=(s1,s2)
 930                zmean,zsd,r,zrad = pair_data[pid][:4]
 931                rpairs.append(pairs[x])
 932                rgenos.append(ngenoL[x])
 933                rmeans.append(means[x])
 934                rsdevs.append(sdevs[x])
 935                rz.append(zrad)
 936        ### Now add the svg point group for this relationship to the svg file
 937        for x in range(len(rmeans)):
 938            svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2
 939            svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1
 940            s1, s2 = rpairs[x]
 941            (fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1]
 942            (fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2]
 943            ngenos = rgenos[x]
 944            nPoints += 1
 945            point = pMap.setdefault((svgX, svgY), [])
 946            point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x]))
 947        for (svgX, svgY) in pMap:
 948            points = pMap[(svgX, svgY)]
 949            svgX = int(svgX)
 950            svgY = int(svgY)
 951            if len(points) > 1:
 952                mmean,dmean = calcMeanSD([p[0] for p in points])
 953                msdev,dsdev = calcMeanSD([p[1] for p in points])
 954                mgeno,dgeno = calcMeanSD([p[-1] for p in points])
 955                mingeno = min([p[-1] for p in points])
 956                maxgeno = max([p[-1] for p in points])
 957                svg.write("""<circle cx="%d" cy="%d" r="2"
 958                onmouseover="showBTT(evt, %d, %1.2f, %1.2f, %1.2f, %1.2f, %d, %d, %d, %d, %d)"
 959                onmouseout="hideBTT(evt)" />\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno))
 960            else:
 961                mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12]
 962                rmean = float(relstats[relCode]['mean'][0])
 963                rsdev = float(relstats[relCode]['sd'][0])
 964                if zrad < 4:
 965                    zrad = 2
 966                elif 4 < zrad < 9:
 967                    zrad = 3 # to 9
 968                else: # > 9 5=15+
 969                    zrad=zrad/3
 970                    zrad = min(zrad,5) # scale limit
 971                if zrad <= 3:
 972                    svg.write('<circle cx="%d" cy="%d" r="%s" onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" onmouseout="hideOTT(evt)" />\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
 973                else: # highlight pairs a long way from expectation by outlining circle in red
 974                    svg.write("""<circle cx="%d" cy="%d" r="%s" style="stroke:red; fill:%s; fill-opacity:1.0; stroke-width:1;"
 975                    onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)"
 976                    onmouseout="hideOTT(evt)" />\n""" % \
 977                    (svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
 978        svg.write('</g>\n')
 979
 980    ### Create a pdf as well if indicated on the command line
 981    ### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf!
 982##    if pdftoo:
 983##        pdfname = '%s.pdf' % (title)
 984##        rpy.r.pdf(pdfname, 6, 6)
 985##        rpy.r.par(mai=(1,1,1,0.5))
 986##        rpy.r('par(xaxs="i",yaxs="i")')
 987##        rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
 988##        rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
 989##        rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
 990##        rpy.r.dev_off()
 991
 992    ### Draw polygons
 993    if showPolygons:
 994        svg.write('<g id="polygons" cursor="pointer">\n')
 995        for rel, poly in POLYGONS.items():
 996            points = ' '.join(['%s,%s' % ((p[0]-1.0)*float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]*PLOT_HEIGHT)) for p in poly])
 997            svg.write('<polygon points="%s" fill="transparent" style="stroke:%s; stroke-width:1"/>\n' % (points, SVG_COLORS[rel]))
 998        svg.write('</g>\n')
 999
1000
1001    svg.write(SVG_FOOTER)
1002    svg.close()
1003    return newfiles,explanations,repOut
1004
1005def doIBS(n=100):
1006    """parse parameters from galaxy
1007    expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n'
1008    <command interpreter="python">
1009         rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
1010        '$out_file1' '$out_file1.files_path' "$title1"  '$n' '$Z' 
1011    </command>
1012
1013    """
1014    u="""<command interpreter="python">
1015         rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
1016        '$out_file1' '$out_file1.files_path' "$title1"  '$n' '$Z'
1017         </command>
1018      """
1019
1020
1021    if len(sys.argv) < 7:
1022        print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please'
1023        print >> sys.stdout, u
1024        sys.exit(1)
1025    ts = '%s%s' % (string.punctuation,string.whitespace)
1026    ptran =  string.maketrans(ts,'_'*len(ts))
1027    inpath = sys.argv[1]
1028    ldinpath = os.path.split(inpath)[0]
1029    basename = sys.argv[2]
1030    outhtml = sys.argv[3]
1031    newfilepath = sys.argv[4]
1032    title = sys.argv[5].translate(ptran)
1033    logfname = 'Log_%s.txt' % title
1034    logpath = os.path.join(newfilepath,logfname) # log was a child - make part of html extra_files_path zoo
1035    n = int(sys.argv[6])
1036    try:
1037        Zcutoff = float(sys.argv[7])
1038    except:
1039        Zcutoff = 2.0
1040    try:
1041        os.makedirs(newfilepath)
1042    except:
1043        pass
1044    logf = file(logpath,'w')
1045    efp,ibase_name = os.path.split(inpath) # need to use these for outputs in files_path
1046    ped = plinkbinJZ.BPed(inpath)
1047    ped.parse(quick=True)	
1048    if ped == None:
1049        print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (ldreduced,basename)
1050        sys.exit(1)
1051    newfiles,explanations,repOut = doIBSpy(ped=ped,basename=basename,outdir=newfilepath,
1052                                    logf=logf,nrsSamples=n,title=title,pdftoo=0,Zcutoff=Zcutoff)
1053    logf.close()
1054    logfs = file(logpath,'r').readlines()
1055    lf = file(outhtml,'w')
1056    lf.write(galhtmlprefix % PROGNAME)
1057    # this is a mess. todo clean up - should each datatype have it's own directory? Yes
1058    # probably. Then titles are universal - but userId libraries are separate.
1059    s = '<div>Output from %s run at %s<br>\n' % (PROGNAME,timenow())
1060    lf.write('<h4>%s</h4>\n' % s)
1061    fixed = ["'%s'" % x for x in sys.argv] # add quotes just in case
1062    s = 'If you need to rerun this analysis, the command line was\n<pre>%s</pre>\n</div>' % (' '.join(fixed))
1063    lf.write(s)
1064    # various ways of displaying svg - experiments related to missing svg mimetype on test (!)
1065    #s = """<object data="%s" type="image/svg+xml"  width="%d" height="%d">
1066    #       <embed src="%s" type="image/svg+xml" width="%d" height="%d" />
1067    #       </object>""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
1068    s = """ <embed src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
1069    #s = """ <iframe src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
1070    lf.write(s)
1071    lf.write('<div><h4>Click the links below to save output files and plots</h4><br><ol>\n')
1072    for i in range(len(newfiles)):
1073       if i == 0:
1074            lf.write('<li><a href="%s" type="image/svg+xml" >%s</a></li>\n' % (newfiles[i],explanations[i]))
1075       else:
1076             lf.write('<li><a href="%s">%s</a></li>\n' % (newfiles[i],explanations[i]))
1077    flist = os.listdir(newfilepath)
1078    for fname in flist:
1079        if not fname in newfiles:
1080             lf.write('<li><a href="%s">%s</a></li>\n' % (fname,fname))
1081    lf.write('</ol></div>')
1082    lf.write('<div>%s</div>' % ('\n'.join(repOut))) # repOut is a list of tables
1083    lf.write('<div><hr><h3>Log from this job (also stored in %s)</h3><pre>%s</pre><hr></div>' % (logfname,''.join(logfs)))
1084    lf.write('</body></html>\n')
1085    lf.close()
1086    logf.close()
1087
1088if __name__ == '__main__':
1089    doIBS()