/lib/galaxy/tools/util/hyphy_util.py
Python | 1163 lines | 1159 code | 2 blank | 2 comment | 0 complexity | 34f2c6b348f53d5e0c7314ec2fa2d535 MD5 | raw file
1#Dan Blankenberg
2#Contains file contents and helper methods for HYPHY configurations
3import tempfile, os
4
5def get_filled_temp_filename(contents):
6 fh = tempfile.NamedTemporaryFile('w')
7 filename = fh.name
8 fh.close()
9 fh = open(filename, 'w')
10 fh.write(contents)
11 fh.close()
12 return filename
13
14NJ_tree_shared_ibf = """
15COUNT_GAPS_IN_FREQUENCIES = 0;
16methodIndex = 1;
17
18/*-----------------------------------------------------------------------------------------------------------------------------------------*/
19
20function InferTreeTopology(verbFlag)
21{
22 distanceMatrix = {ds.species,ds.species};
23
24 MESSAGE_LOGGING = 0;
25 ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"chooseDistanceFormula.def");
26 InitializeDistances (0);
27
28 for (i = 0; i<ds.species; i=i+1)
29 {
30 for (j = i+1; j<ds.species; j = j+1)
31 {
32 distanceMatrix[i][j] = ComputeDistanceFormula (i,j);
33 }
34 }
35
36 MESSAGE_LOGGING = 1;
37 cladesMade = 1;
38
39
40 if (ds.species == 2)
41 {
42 d1 = distanceMatrix[0][1]/2;
43 treeNodes = {{0,1,d1__},
44 {1,1,d1__},
45 {2,0,0}};
46
47 cladesInfo = {{2,0}};
48 }
49 else
50 {
51 if (ds.species == 3)
52 {
53 /* generate least squares estimates here */
54
55 d1 = (distanceMatrix[0][1]+distanceMatrix[0][2]-distanceMatrix[1][2])/2;
56 d2 = (distanceMatrix[0][1]-distanceMatrix[0][2]+distanceMatrix[1][2])/2;
57 d3 = (distanceMatrix[1][2]+distanceMatrix[0][2]-distanceMatrix[0][1])/2;
58
59 treeNodes = {{0,1,d1__},
60 {1,1,d2__},
61 {2,1,d3__}
62 {3,0,0}};
63
64 cladesInfo = {{3,0}};
65 }
66 else
67 {
68 njm = (distanceMatrix > methodIndex)>=ds.species;
69
70 treeNodes = {2*(ds.species+1),3};
71 cladesInfo = {ds.species-1,2};
72
73 for (i=Rows(treeNodes)-1; i>=0; i=i-1)
74 {
75 treeNodes[i][0] = njm[i][0];
76 treeNodes[i][1] = njm[i][1];
77 treeNodes[i][2] = njm[i][2];
78 }
79
80 for (i=Rows(cladesInfo)-1; i>=0; i=i-1)
81 {
82 cladesInfo[i][0] = njm[i][3];
83 cladesInfo[i][1] = njm[i][4];
84 }
85
86 njm = 0;
87 }
88 }
89 return 1.0;
90}
91
92/*-----------------------------------------------------------------------------------------------------------------------------------------*/
93
94function TreeMatrix2TreeString (doLengths)
95{
96 treeString = "";
97 p = 0;
98 k = 0;
99 m = treeNodes[0][1];
100 n = treeNodes[0][0];
101 treeString*(Rows(treeNodes)*25);
102
103 while (m)
104 {
105 if (m>p)
106 {
107 if (p)
108 {
109 treeString*",";
110 }
111 for (j=p;j<m;j=j+1)
112 {
113 treeString*"(";
114 }
115 }
116 else
117 {
118 if (m<p)
119 {
120 for (j=m;j<p;j=j+1)
121 {
122 treeString*")";
123 }
124 }
125 else
126 {
127 treeString*",";
128 }
129 }
130 if (n<ds.species)
131 {
132 GetString (nodeName, ds, n);
133 if (doLengths != 1)
134 {
135 treeString*nodeName;
136 }
137 else
138 {
139 treeString*taxonNameMap[nodeName];
140 }
141 }
142 if (doLengths>.5)
143 {
144 nodeName = ":"+treeNodes[k][2];
145 treeString*nodeName;
146 }
147 k=k+1;
148 p=m;
149 n=treeNodes[k][0];
150 m=treeNodes[k][1];
151 }
152
153 for (j=m;j<p;j=j+1)
154 {
155 treeString*")";
156 }
157
158 treeString*0;
159 return treeString;
160}
161"""
162
163def get_NJ_tree (filename):
164 return """
165DISTANCE_PROMPTS = 1;
166ExecuteAFile ("%s");
167
168DataSet ds = ReadDataFile (PROMPT_FOR_FILE);
169DataSetFilter filteredData = CreateFilter (ds,1);
170
171/* do sequence to branch map */
172
173taxonNameMap = {};
174
175for (k=0; k<ds.species; k=k+1)
176{
177 GetString (thisName, ds,k);
178 shortName = (thisName^{{"\\\\..+",""}})&&1;
179 taxonNameMap[shortName] = thisName;
180 SetParameter (ds,k,shortName);
181}
182
183DataSetFilter filteredData = CreateFilter (ds,1);
184InferTreeTopology (0);
185treeString = TreeMatrix2TreeString (1);
186
187fprintf (PROMPT_FOR_FILE, CLEAR_FILE, treeString);
188fscanf (stdin, "String", ps_file);
189
190if (Abs(ps_file))
191{
192 treeString = TreeMatrix2TreeString (2);
193 UseModel (USE_NO_MODEL);
194 Tree givenTree = treeString;
195 baseHeight = TipCount (givenTree)*28;
196 TREE_OUTPUT_OPTIONS = {};
197 TREE_OUTPUT_OPTIONS["__FONT_SIZE__"] = 14;
198 baseWidth = 0;
199 treeAVL = givenTree^0;
200 drawLetter = "/drawletter {"+TREE_OUTPUT_OPTIONS["__FONT_SIZE__"]$4+" -"+TREE_OUTPUT_OPTIONS["__FONT_SIZE__"]$2+ " show} def\\n";
201 for (k3 = 1; k3 < Abs(treeAVL); k3=k3+1)
202 {
203 nodeName = (treeAVL[k3])["Name"];
204 if(Abs((treeAVL[k3])["Children"]) == 0)
205 {
206 mySpecs = {};
207 mySpecs ["TREE_OUTPUT_BRANCH_LABEL"] = "(" + taxonNameMap[nodeName] + ") drawLetter";
208 baseWidth = Max (baseWidth, (treeAVL[k3])["Depth"]);
209 }
210 }
211 baseWidth = 40*baseWidth;
212
213 fprintf (ps_file, CLEAR_FILE, drawLetter, PSTreeString (givenTree, "STRING_SUPPLIED_LENGTHS",{{baseWidth,baseHeight}}));
214}
215""" % (filename)
216
217def get_NJ_treeMF (filename):
218 return """
219ExecuteAFile ("%s");
220
221VERBOSITY_LEVEL = -1;
222fscanf (PROMPT_FOR_FILE, "Lines", inLines);
223
224_linesIn = Columns (inLines);
225isomorphicTreesBySequenceCount = {};
226
227/*---------------------------------------------------------*/
228
229_currentGene = 1;
230_currentState = 0;
231geneSeqs = "";
232geneSeqs * 128;
233
234fprintf (PROMPT_FOR_FILE, CLEAR_FILE, KEEP_OPEN);
235treeOutFile = LAST_FILE_PATH;
236
237fscanf (stdin,"String", ps_file);
238if (Abs(ps_file))
239{
240 fprintf (ps_file, CLEAR_FILE, KEEP_OPEN);
241}
242
243for (l=0; l<_linesIn; l=l+1)
244{
245 if (Abs(inLines[l]) == 0)
246 {
247 if (_currentState == 1)
248 {
249 geneSeqs * 0;
250 DataSet ds = ReadFromString (geneSeqs);
251 _processAGene (_currentGene,treeOutFile,ps_file);
252 geneSeqs * 128;
253 _currentGene = _currentGene + 1;
254 }
255 }
256 else
257 {
258 if (_currentState == 0)
259 {
260 _currentState = 1;
261 }
262 geneSeqs * inLines[l];
263 geneSeqs * "\\n";
264 }
265}
266
267
268if (_currentState == 1)
269{
270 geneSeqs * 0;
271 if (Abs(geneSeqs))
272 {
273 DataSet ds = ReadFromString (geneSeqs);
274 _processAGene (_currentGene,treeOutFile,ps_file);
275 }
276}
277
278fprintf (treeOutFile,CLOSE_FILE);
279if (Abs(ps_file))
280{
281 fprintf (ps_file,CLOSE_FILE);
282}
283/*---------------------------------------------------------*/
284
285function _processAGene (_geneID, nwk_file, ps_file)
286{
287 if (ds.species == 1)
288 {
289 fprintf (nwk_file, _geneID-1, "\\tNone \\tNone\\n");
290 return 0;
291
292 }
293
294 DataSetFilter filteredData = CreateFilter (ds,1);
295
296 /* do sequence to branch map */
297
298 taxonNameMap = {};
299
300 for (k=0; k<ds.species; k=k+1)
301 {
302 GetString (thisName, ds,k);
303 shortName = (thisName^{{"\\\\..+",""}});
304 taxonNameMap[shortName] = thisName;
305 SetParameter (ds,k,shortName);
306 }
307
308 DataSetFilter filteredData = CreateFilter (ds,1);
309 DISTANCE_PROMPTS = (_geneID==1);
310
311 InferTreeTopology (0);
312 baseTree = TreeMatrix2TreeString (0);
313 UseModel (USE_NO_MODEL);
314
315 Tree baseTop = baseTree;
316
317 /* standardize this top */
318
319 for (k=0; k<Abs(isomorphicTreesBySequenceCount[filteredData.species]); k=k+1)
320 {
321 testString = (isomorphicTreesBySequenceCount[filteredData.species])[k];
322 Tree testTree = testString;
323 if (testTree == baseTop)
324 {
325 baseTree = testString;
326 break;
327 }
328 }
329 if (k==Abs(isomorphicTreesBySequenceCount[filteredData.species]))
330 {
331 if (k==0)
332 {
333 isomorphicTreesBySequenceCount[filteredData.species] = {};
334 }
335 (isomorphicTreesBySequenceCount[filteredData.species])[k] = baseTree;
336 }
337
338 fprintf (nwk_file, _geneID-1, "\\t", baseTree, "\\t", TreeMatrix2TreeString (1), "\\n");
339 if (Abs(ps_file))
340 {
341 treeString = TreeMatrix2TreeString (2);
342 UseModel (USE_NO_MODEL);
343 Tree givenTree = treeString;
344 baseHeight = TipCount (givenTree)*28;
345 TREE_OUTPUT_OPTIONS = {};
346 TREE_OUTPUT_OPTIONS["__FONT_SIZE__"] = 14;
347 baseWidth = 0;
348 treeAVL = givenTree^0;
349 drawLetter = "/drawletter {"+TREE_OUTPUT_OPTIONS["__FONT_SIZE__"]$4+" -"+TREE_OUTPUT_OPTIONS["__FONT_SIZE__"]$2+ " show} def\\n";
350 for (k3 = 1; k3 < Abs(treeAVL); k3=k3+1)
351 {
352 nodeName = (treeAVL[k3])["Name"];
353 if(Abs((treeAVL[k3])["Children"]) == 0)
354 {
355 mySpecs = {};
356 mySpecs ["TREE_OUTPUT_BRANCH_LABEL"] = "(" + taxonNameMap[nodeName] + ") drawLetter";
357 baseWidth = Max (baseWidth, (treeAVL[k3])["Depth"]);
358 }
359 }
360 baseWidth = 40*baseWidth;
361
362 fprintf (stdout, _geneID, ":", givenTree,"\\n");
363 fprintf (ps_file, PSTreeString (givenTree, "STRING_SUPPLIED_LENGTHS",{{baseWidth,baseHeight}}));
364 }
365 return 0;
366}
367""" % (filename)
368
369BranchLengthsMF = """
370VERBOSITY_LEVEL = -1;
371
372fscanf (PROMPT_FOR_FILE, "Lines", inLines);
373
374
375
376_linesIn = Columns (inLines);
377
378
379
380/*---------------------------------------------------------*/
381
382
383
384_currentGene = 1;
385
386_currentState = 0;
387
388geneSeqs = "";
389
390geneSeqs * 128;
391
392
393
394for (l=0; l<_linesIn; l=l+1)
395
396{
397
398 if (Abs(inLines[l]) == 0)
399
400 {
401
402 if (_currentState == 1)
403
404 {
405
406 geneSeqs * 0;
407
408 DataSet ds = ReadFromString (geneSeqs);
409
410 _processAGene (_currentGene);
411
412 geneSeqs * 128;
413
414 _currentGene = _currentGene + 1;
415
416 }
417
418 }
419
420 else
421
422 {
423
424 if (_currentState == 0)
425
426 {
427
428 _currentState = 1;
429
430 }
431
432 geneSeqs * inLines[l];
433
434 geneSeqs * "\\n";
435
436 }
437
438}
439
440
441
442if (_currentState == 1)
443
444{
445
446 geneSeqs * 0;
447
448 if (Abs(geneSeqs))
449
450 {
451
452 DataSet ds = ReadFromString (geneSeqs);
453
454 _processAGene (_currentGene);
455
456 }
457
458}
459
460
461
462fprintf (resultFile,CLOSE_FILE);
463
464
465
466/*---------------------------------------------------------*/
467
468
469
470function _processAGene (_geneID)
471
472{
473
474 DataSetFilter filteredData = CreateFilter (ds,1);
475
476 if (_currentGene == 1)
477
478 {
479
480 SelectTemplateModel (filteredData);
481
482
483
484 SetDialogPrompt ("Tree file");
485
486 fscanf (PROMPT_FOR_FILE, "Tree", givenTree);
487
488 fscanf (stdin, "String", resultFile);
489
490
491
492 /* do sequence to branch map */
493
494
495
496 validNames = {};
497
498 taxonNameMap = {};
499
500
501
502 for (k=0; k<TipCount(givenTree); k=k+1)
503
504 {
505
506 validNames[TipName(givenTree,k)&&1] = 1;
507
508 }
509
510
511
512 for (k=0; k<BranchCount(givenTree); k=k+1)
513
514 {
515
516 thisName = BranchName(givenTree,k);
517
518 taxonNameMap[thisName&&1] = thisName;
519
520 }
521
522
523
524 storeValidNames = validNames;
525
526 fprintf (resultFile,CLEAR_FILE,KEEP_OPEN,"Block\\tBranch\\tLength\\tLowerBound\\tUpperBound\\n");
527
528 }
529
530 else
531
532 {
533
534 HarvestFrequencies (vectorOfFrequencies, filteredData, 1,1,1);
535
536 validNames = storeValidNames;
537
538 }
539
540
541
542 for (k=0; k<ds.species; k=k+1)
543
544 {
545
546 GetString (thisName, ds,k);
547
548 shortName = (thisName^{{"\\\\..+",""}})&&1;
549
550 if (validNames[shortName])
551
552 {
553
554 taxonNameMap[shortName] = thisName;
555
556 validNames - (shortName);
557
558 SetParameter (ds,k,shortName);
559
560 }
561
562 else
563
564 {
565
566 fprintf (resultFile,"ERROR:", thisName, " could not be matched to any of the leaves in tree ", givenTree,"\\n");
567
568 return 0;
569
570 }
571
572 }
573
574
575
576 /* */
577
578
579
580 LikelihoodFunction lf = (filteredData,givenTree);
581
582 Optimize (res,lf);
583
584
585
586 timer = Time(0)-timer;
587
588
589
590 branchNames = BranchName (givenTree,-1);
591
592 branchLengths = BranchLength (givenTree,-1);
593
594
595
596
597
598 for (k=0; k<Columns(branchNames)-1; k=k+1)
599
600 {
601
602 COVARIANCE_PARAMETER = "givenTree."+branchNames[k]+".t";
603
604 COVARIANCE_PRECISION = 0.95;
605
606 CovarianceMatrix (cmx,lf);
607
608 if (k==0)
609
610 {
611
612 /* compute a scaling factor */
613
614 ExecuteCommands ("givenTree."+branchNames[0]+".t=1");
615
616 scaleFactor = BranchLength (givenTree,0);
617
618 ExecuteCommands ("givenTree."+branchNames[0]+".t="+cmx[0][1]);
619
620 }
621
622 fprintf (resultFile,_geneID,"\\t",taxonNameMap[branchNames[k]&&1],"\\t",branchLengths[k],"\\t",scaleFactor*cmx[0][0],"\\t",scaleFactor*cmx[0][2],"\\n");
623
624 }
625
626
627
628 ttl = (branchLengths*(Transpose(branchLengths["1"])))[0];
629
630 global treeScaler = 1;
631
632 ReplicateConstraint ("this1.?.t:=treeScaler*this2.?.t__",givenTree,givenTree);
633
634 COVARIANCE_PARAMETER = "treeScaler";
635
636 COVARIANCE_PRECISION = 0.95;
637
638 CovarianceMatrix (cmx,lf);
639
640 fprintf (resultFile,_geneID,"\\tTotal Tree\\t",ttl,"\\t",ttl*cmx[0][0],"\\t",ttl*cmx[0][2],"\\n");
641
642 ClearConstraints (givenTree);
643
644 return 0;
645
646}
647"""
648
649BranchLengths = """
650DataSet ds = ReadDataFile (PROMPT_FOR_FILE);
651DataSetFilter filteredData = CreateFilter (ds,1);
652
653SelectTemplateModel (filteredData);
654
655SetDialogPrompt ("Tree file");
656fscanf (PROMPT_FOR_FILE, "Tree", givenTree);
657fscanf (stdin, "String", resultFile);
658
659/* do sequence to branch map */
660
661validNames = {};
662taxonNameMap = {};
663
664for (k=0; k<TipCount(givenTree); k=k+1)
665{
666 validNames[TipName(givenTree,k)&&1] = 1;
667}
668
669for (k=0; k<BranchCount(givenTree); k=k+1)
670{
671 thisName = BranchName(givenTree,k);
672 taxonNameMap[thisName&&1] = thisName;
673}
674
675for (k=0; k<ds.species; k=k+1)
676{
677 GetString (thisName, ds,k);
678 shortName = (thisName^{{"\\\\..+",""}})&&1;
679 if (validNames[shortName])
680 {
681 taxonNameMap[shortName] = thisName;
682 validNames - (shortName);
683 SetParameter (ds,k,shortName);
684 }
685 else
686 {
687 fprintf (resultFile,CLEAR_FILE,"ERROR:", thisName, " could not be matched to any of the leaves in tree ", givenTree);
688 return 0;
689 }
690}
691
692/* */
693
694LikelihoodFunction lf = (filteredData,givenTree);
695
696Optimize (res,lf);
697
698timer = Time(0)-timer;
699
700branchNames = BranchName (givenTree,-1);
701branchLengths = BranchLength (givenTree,-1);
702
703fprintf (resultFile,CLEAR_FILE,KEEP_OPEN,"Branch\\tLength\\tLowerBound\\tUpperBound\\n");
704
705for (k=0; k<Columns(branchNames)-1; k=k+1)
706{
707 COVARIANCE_PARAMETER = "givenTree."+branchNames[k]+".t";
708 COVARIANCE_PRECISION = 0.95;
709 CovarianceMatrix (cmx,lf);
710 if (k==0)
711 {
712 /* compute a scaling factor */
713 ExecuteCommands ("givenTree."+branchNames[0]+".t=1");
714 scaleFactor = BranchLength (givenTree,0);
715 ExecuteCommands ("givenTree."+branchNames[0]+".t="+cmx[0][1]);
716 }
717 fprintf (resultFile,taxonNameMap[branchNames[k]&&1],"\\t",branchLengths[k],"\\t",scaleFactor*cmx[0][0],"\\t",scaleFactor*cmx[0][2],"\\n");
718}
719
720ttl = (branchLengths*(Transpose(branchLengths["1"])))[0];
721global treeScaler = 1;
722ReplicateConstraint ("this1.?.t:=treeScaler*this2.?.t__",givenTree,givenTree);
723COVARIANCE_PARAMETER = "treeScaler";
724COVARIANCE_PRECISION = 0.95;
725CovarianceMatrix (cmx,lf);
726ClearConstraints (givenTree);
727fprintf (resultFile,"Total Tree\\t",ttl,"\\t",ttl*cmx[0][0],"\\t",ttl*cmx[0][2],"\\n");
728fprintf (resultFile,CLOSE_FILE);
729"""
730
731SimpleLocalFitter = """
732VERBOSITY_LEVEL = -1;
733COUNT_GAPS_IN_FREQUENCIES = 0;
734
735/*---------------------------------------------------------*/
736
737function returnResultHeaders (dummy)
738{
739 _analysisHeaders = {};
740 _analysisHeaders[0] = "BLOCK";
741 _analysisHeaders[1] = "BP";
742 _analysisHeaders[2] = "S_sites";
743 _analysisHeaders[3] = "NS_sites";
744 _analysisHeaders[4] = "Stop_codons";
745 _analysisHeaders[5] = "LogL";
746 _analysisHeaders[6] = "AC";
747 _analysisHeaders[7] = "AT";
748 _analysisHeaders[8] = "CG";
749 _analysisHeaders[9] = "CT";
750 _analysisHeaders[10] = "GT";
751 _analysisHeaders[11] = "Tree";
752
753 for (_biterator = 0; _biterator < treeBranchCount; _biterator = _biterator + 1)
754 {
755 branchName = treeBranchNames[_biterator];
756
757 _analysisHeaders [Abs(_analysisHeaders)] = "length("+branchName+")";
758 _analysisHeaders [Abs(_analysisHeaders)] = "dS("+branchName+")";
759 _analysisHeaders [Abs(_analysisHeaders)] = "dN("+branchName+")";
760 _analysisHeaders [Abs(_analysisHeaders)] = "omega("+branchName+")";
761 }
762
763 return _analysisHeaders;
764}
765
766/*---------------------------------------------------------*/
767
768function runAGeneFit (myID)
769{
770 DataSetFilter filteredData = CreateFilter (ds,3,"","",GeneticCodeExclusions);
771
772 if (_currentGene==1)
773 {
774 _MG94stdinOverload = {};
775 _MG94stdinOverload ["0"] = "Local";
776 _MG94stdinOverload ["1"] = modelSpecString;
777
778 ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"MG94custom.mdl",
779 _MG94stdinOverload);
780
781 Tree codonTree = treeString;
782 }
783 else
784 {
785 HarvestFrequencies (observedFreq,filteredData,3,1,1);
786 MULTIPLY_BY_FREQS = PopulateModelMatrix ("MG94custom", observedFreq);
787 vectorOfFrequencies = BuildCodonFrequencies (observedFreq);
788 Model MG94customModel = (MG94custom,vectorOfFrequencies,0);
789
790 Tree codonTree = treeString;
791 }
792
793 LikelihoodFunction lf = (filteredData,codonTree);
794
795 Optimize (res,lf);
796
797 _snsAVL = _computeSNSSites ("filteredData", _Genetic_Code, vectorOfFrequencies, 0);
798 _cL = ReturnVectorsOfCodonLengths (ComputeScalingStencils (0), "codonTree");
799
800
801 _returnMe = {};
802 _returnMe ["BLOCK"] = myID;
803 _returnMe ["LogL"] = res[1][0];
804 _returnMe ["BP"] = _snsAVL ["Sites"];
805 _returnMe ["S_sites"] = _snsAVL ["SSites"];
806 _returnMe ["NS_sites"] = _snsAVL ["NSSites"];
807 _returnMe ["AC"] = AC;
808 _returnMe ["AT"] = AT;
809 _returnMe ["CG"] = CG;
810 _returnMe ["CT"] = CT;
811 _returnMe ["GT"] = GT;
812 _returnMe ["Tree"] = Format(codonTree,0,1);
813
814 for (_biterator = 0; _biterator < treeBranchCount; _biterator = _biterator + 1)
815 {
816 branchName = treeBranchNames[_biterator];
817
818 _returnMe ["length("+branchName+")"] = (_cL["Total"])[_biterator];
819 _returnMe ["dS("+branchName+")"] = (_cL["Syn"])[_biterator]*(_returnMe ["BP"]/_returnMe ["S_sites"]);
820 _returnMe ["dN("+branchName+")"] = (_cL["NonSyn"])[_biterator]*(_returnMe ["BP"]/_returnMe ["NS_sites"]);
821
822 ExecuteCommands ("_lom = _standardizeRatio(codonTree."+treeBranchNames[_biterator]+".nonSynRate,codonTree."+treeBranchNames[_biterator]+".synRate);");
823 _returnMe ["omega("+branchName+")"] = _lom;
824 }
825
826 return _returnMe;
827}
828
829"""
830
831SimpleGlobalFitter = """
832VERBOSITY_LEVEL = -1;
833COUNT_GAPS_IN_FREQUENCIES = 0;
834
835/*---------------------------------------------------------*/
836
837function returnResultHeaders (dummy)
838{
839 _analysisHeaders = {};
840 _analysisHeaders[0] = "BLOCK";
841 _analysisHeaders[1] = "BP";
842 _analysisHeaders[2] = "S_sites";
843 _analysisHeaders[3] = "NS_sites";
844 _analysisHeaders[4] = "Stop_codons";
845 _analysisHeaders[5] = "LogL";
846 _analysisHeaders[6] = "omega";
847 _analysisHeaders[7] = "omega_range";
848 _analysisHeaders[8] = "AC";
849 _analysisHeaders[9] = "AT";
850 _analysisHeaders[10] = "CG";
851 _analysisHeaders[11] = "CT";
852 _analysisHeaders[12] = "GT";
853 _analysisHeaders[13] = "Tree";
854
855 return _analysisHeaders;
856}
857
858/*---------------------------------------------------------*/
859
860function runAGeneFit (myID)
861{
862 fprintf (stdout, "[SimpleGlobalFitter.bf on GENE ", myID, "]\\n");
863 taxonNameMap = {};
864
865 for (k=0; k<ds.species; k=k+1)
866 {
867 GetString (thisName, ds,k);
868 shortName = (thisName^{{"\\\\..+",""}})&&1;
869 taxonNameMap[shortName] = thisName;
870 SetParameter (ds,k,shortName);
871 }
872
873 DataSetFilter filteredData = CreateFilter (ds,1);
874 _nucSites = filteredData.sites;
875
876 if (Abs(treeString))
877 {
878 givenTreeString = treeString;
879 }
880 else
881 {
882 if (_currentGene==1)
883 {
884 ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Utility"+DIRECTORY_SEPARATOR+"NJ.bf");
885 }
886 givenTreeString = InferTreeTopology (0);
887 treeString = "";
888 }
889
890 DataSetFilter filteredData = CreateFilter (ds,3,"","",GeneticCodeExclusions);
891
892 if (_currentGene==1)
893 {
894 _MG94stdinOverload = {};
895 _MG94stdinOverload ["0"] = "Global";
896 _MG94stdinOverload ["1"] = modelSpecString;
897
898 ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"MG94custom.mdl",
899 _MG94stdinOverload);
900
901 Tree codonTree = givenTreeString;
902 }
903 else
904 {
905 HarvestFrequencies (observedFreq,filteredData,3,1,1);
906 MULTIPLY_BY_FREQS = PopulateModelMatrix ("MG94custom", observedFreq);
907 vectorOfFrequencies = BuildCodonFrequencies (observedFreq);
908 Model MG94customModel = (MG94custom,vectorOfFrequencies,0);
909
910 Tree codonTree = givenTreeString;
911 }
912
913 LikelihoodFunction lf = (filteredData,codonTree);
914
915 Optimize (res,lf);
916
917 _snsAVL = _computeSNSSites ("filteredData", _Genetic_Code, vectorOfFrequencies, 0);
918 _cL = ReturnVectorsOfCodonLengths (ComputeScalingStencils (0), "codonTree");
919
920
921 _returnMe = {};
922 _returnMe ["BLOCK"] = myID;
923 _returnMe ["LogL"] = res[1][0];
924 _returnMe ["BP"] = _snsAVL ["Sites"];
925 _returnMe ["S_sites"] = _snsAVL ["SSites"];
926 _returnMe ["NS_sites"] = _snsAVL ["NSSites"];
927 _returnMe ["Stop_codons"] = (_nucSites-filteredData.sites*3)$3;
928 _returnMe ["AC"] = AC;
929 _returnMe ["AT"] = AT;
930 _returnMe ["CG"] = CG;
931 _returnMe ["CT"] = CT;
932 _returnMe ["GT"] = GT;
933 _returnMe ["omega"] = R;
934 COVARIANCE_PARAMETER = "R";
935 COVARIANCE_PRECISION = 0.95;
936 CovarianceMatrix (cmx,lf);
937 _returnMe ["omega_range"] = ""+cmx[0]+"-"+cmx[2];
938 _returnMe ["Tree"] = Format(codonTree,0,1);
939
940
941 return _returnMe;
942}
943"""
944
945FastaReader = """
946fscanf (stdin, "String", _coreAnalysis);
947fscanf (stdin, "String", _outputDriver);
948
949ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"chooseGeneticCode.def");
950ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"dSdNTreeTools.ibf");
951ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Utility"+DIRECTORY_SEPARATOR+"CodonTools.bf");
952ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Utility"+DIRECTORY_SEPARATOR+"GrabBag.bf");
953
954SetDialogPrompt ("Tree file");
955fscanf (PROMPT_FOR_FILE, "Tree", givenTree);
956
957treeBranchNames = BranchName (givenTree,-1);
958treeBranchCount = Columns (treeBranchNames)-1;
959treeString = Format (givenTree,1,1);
960
961SetDialogPrompt ("Multiple gene FASTA file");
962fscanf (PROMPT_FOR_FILE, "Lines", inLines);
963fscanf (stdin, "String", modelSpecString);
964fscanf (stdin, "String", _outPath);
965
966ExecuteAFile (_outputDriver);
967ExecuteAFile (_coreAnalysis);
968
969/*---------------------------------------------------------*/
970
971_linesIn = Columns (inLines);
972_currentGene = 1;
973 _currentState = 0;
974/* 0 - waiting for a non-empty line */
975/* 1 - reading files */
976
977geneSeqs = "";
978geneSeqs * 0;
979
980_prepareFileOutput (_outPath);
981
982for (l=0; l<_linesIn; l=l+1)
983{
984 if (Abs(inLines[l]) == 0)
985 {
986 if (_currentState == 1)
987 {
988 geneSeqs * 0;
989 DataSet ds = ReadFromString (geneSeqs);
990 _processAGene (ds.species == treeBranchCount,_currentGene);
991 geneSeqs * 128;
992 _currentGene = _currentGene + 1;
993 }
994 }
995 else
996 {
997 if (_currentState == 0)
998 {
999 _currentState = 1;
1000 }
1001 geneSeqs * inLines[l];
1002 geneSeqs * "\\n";
1003 }
1004}
1005
1006if (_currentState == 1)
1007{
1008 geneSeqs * 0;
1009 DataSet ds = ReadFromString (geneSeqs);
1010 _processAGene (ds.species == treeBranchCount,_currentGene);
1011}
1012
1013_finishFileOutput (0);
1014"""
1015
1016TabWriter = """
1017/*---------------------------------------------------------*/
1018function _prepareFileOutput (_outPath)
1019{
1020 _outputFilePath = _outPath;
1021
1022 _returnHeaders = returnResultHeaders(0);
1023
1024 fprintf (_outputFilePath, CLEAR_FILE, KEEP_OPEN, _returnHeaders[0]);
1025 for (_biterator = 1; _biterator < Abs(_returnHeaders); _biterator = _biterator + 1)
1026 {
1027 fprintf (_outputFilePath,"\\t",_returnHeaders[_biterator]);
1028 }
1029
1030
1031
1032 fprintf (_outputFilePath,"\\n");
1033 return 0;
1034}
1035
1036/*---------------------------------------------------------*/
1037
1038function _processAGene (valid, _geneID)
1039{
1040 if (valid)
1041 {
1042 returnValue = runAGeneFit (_geneID);
1043 fprintf (_outputFilePath, returnValue[_returnHeaders[0]]);
1044 for (_biterator = 1; _biterator < Abs(_returnHeaders); _biterator = _biterator + 1)
1045 {
1046 fprintf (_outputFilePath,"\\t",returnValue[_returnHeaders[_biterator]]);
1047 }
1048 fprintf (_outputFilePath, "\\n");
1049 }
1050 /*
1051 else
1052 {
1053 fprintf (_outputFilePath,
1054 _geneID, ", Incorrect number of sequences\\n");
1055 }
1056 */
1057 _currentState = 0;
1058 return 0;
1059}
1060
1061/*---------------------------------------------------------*/
1062function _finishFileOutput (dummy)
1063{
1064 return 0;
1065}
1066"""
1067
1068def get_dnds_config_filename(Fitter_filename, TabWriter_filename, genetic_code, tree_filename, input_filename, nuc_model, output_filename, FastaReader_filename ):
1069 contents = """
1070_genomeScreenOptions = {};
1071
1072/* all paths are either absolute or relative
1073to the DATA READER */
1074
1075_genomeScreenOptions ["0"] = "%s";
1076 /* which analysis to run on each gene; */
1077_genomeScreenOptions ["1"] = "%s";
1078 /* what output to produce; */
1079_genomeScreenOptions ["2"] = "%s";
1080 /* genetic code */
1081_genomeScreenOptions ["3"] = "%s";
1082 /* tree file */
1083_genomeScreenOptions ["4"] = "%s";
1084 /* alignment file */
1085_genomeScreenOptions ["5"] = "%s";
1086 /* nucleotide bias string; can define any of the 203 models */
1087_genomeScreenOptions ["6"] = "%s";
1088 /* output csv file */
1089
1090ExecuteAFile ("%s", _genomeScreenOptions);
1091""" % (Fitter_filename, TabWriter_filename, genetic_code, tree_filename, input_filename, nuc_model, output_filename, FastaReader_filename )
1092 return get_filled_temp_filename(contents)
1093
1094
1095def get_branch_lengths_config_filename(input_filename, nuc_model, model_options, base_freq, tree_filename, output_filename, BranchLengths_filename):
1096 contents = """
1097_genomeScreenOptions = {};
1098
1099/* all paths are either absolute or relative
1100to the NucDataBranchLengths.bf */
1101
1102_genomeScreenOptions ["0"] = "%s";
1103 /* the file to analyze; */
1104_genomeScreenOptions ["1"] = "CUSTOM";
1105 /* use an arbitrary nucleotide model */
1106_genomeScreenOptions ["2"] = "%s";
1107 /* which model to use */
1108_genomeScreenOptions ["3"] = "%s";
1109 /* model options */
1110_genomeScreenOptions ["4"] = "Estimated";
1111 /* rate parameters */
1112_genomeScreenOptions ["5"] = "%s";
1113 /* base frequencies */
1114_genomeScreenOptions ["6"] = "%s";
1115 /* the tree to use; */
1116_genomeScreenOptions ["7"] = "%s";
1117 /* write .csv output to; */
1118
1119ExecuteAFile ("%s", _genomeScreenOptions);
1120""" % (input_filename, nuc_model, model_options, base_freq, tree_filename, output_filename, BranchLengths_filename)
1121 return get_filled_temp_filename(contents)
1122
1123
1124def get_nj_tree_config_filename(input_filename, distance_metric, output_filename1, output_filename2, NJ_tree_filename):
1125 contents = """
1126_genomeScreenOptions = {};
1127
1128/* all paths are either absolute or relative
1129to the BuildNJTree.bf */
1130
1131_genomeScreenOptions ["0"] = "%s";
1132 /* the file to analyze; */
1133_genomeScreenOptions ["1"] = "%s";
1134 /* pick which distance metric to use; TN93 is a good default */
1135_genomeScreenOptions ["2"] = "%s";
1136 /* write Newick tree output to; */
1137_genomeScreenOptions ["3"] = "%s";
1138 /* write a postscript tree file to this file; leave blank to not write a tree */
1139
1140ExecuteAFile ("%s", _genomeScreenOptions);
1141""" % (input_filename, distance_metric, output_filename1, output_filename2, NJ_tree_filename)
1142 return get_filled_temp_filename(contents)
1143
1144
1145def get_nj_treeMF_config_filename(input_filename, output_filename1, output_filename2, distance_metric, NJ_tree_filename):
1146 contents = """
1147_genomeScreenOptions = {};
1148
1149/* all paths are either absolute or relative
1150to the BuildNJTreeMF.bf */
1151
1152_genomeScreenOptions ["0"] = "%s";
1153 /* the multiple alignment file to analyze; */
1154_genomeScreenOptions ["1"] = "%s";
1155 /* write Newick tree output to; */
1156_genomeScreenOptions ["2"] = "%s";
1157 /* write a postscript tree file to this file; leave blank to not write a tree */
1158_genomeScreenOptions ["3"] = "%s";
1159 /* pick which distance metric to use; TN93 is a good default */
1160
1161ExecuteAFile ("%s", _genomeScreenOptions);
1162""" % (input_filename, output_filename1, output_filename2, distance_metric, NJ_tree_filename)
1163 return get_filled_temp_filename(contents)