/lib/galaxy/tools/util/hyphy_util.py

https://bitbucket.org/ialbert/galaxy-genetrack · Python · 1163 lines · 1144 code · 17 blank · 2 comment · 0 complexity · 34f2c6b348f53d5e0c7314ec2fa2d535 MD5 · raw file

  1. #Dan Blankenberg
  2. #Contains file contents and helper methods for HYPHY configurations
  3. import tempfile, os
  4. def get_filled_temp_filename(contents):
  5. fh = tempfile.NamedTemporaryFile('w')
  6. filename = fh.name
  7. fh.close()
  8. fh = open(filename, 'w')
  9. fh.write(contents)
  10. fh.close()
  11. return filename
  12. NJ_tree_shared_ibf = """
  13. COUNT_GAPS_IN_FREQUENCIES = 0;
  14. methodIndex = 1;
  15. /*-----------------------------------------------------------------------------------------------------------------------------------------*/
  16. function InferTreeTopology(verbFlag)
  17. {
  18. distanceMatrix = {ds.species,ds.species};
  19. MESSAGE_LOGGING = 0;
  20. ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"chooseDistanceFormula.def");
  21. InitializeDistances (0);
  22. for (i = 0; i<ds.species; i=i+1)
  23. {
  24. for (j = i+1; j<ds.species; j = j+1)
  25. {
  26. distanceMatrix[i][j] = ComputeDistanceFormula (i,j);
  27. }
  28. }
  29. MESSAGE_LOGGING = 1;
  30. cladesMade = 1;
  31. if (ds.species == 2)
  32. {
  33. d1 = distanceMatrix[0][1]/2;
  34. treeNodes = {{0,1,d1__},
  35. {1,1,d1__},
  36. {2,0,0}};
  37. cladesInfo = {{2,0}};
  38. }
  39. else
  40. {
  41. if (ds.species == 3)
  42. {
  43. /* generate least squares estimates here */
  44. d1 = (distanceMatrix[0][1]+distanceMatrix[0][2]-distanceMatrix[1][2])/2;
  45. d2 = (distanceMatrix[0][1]-distanceMatrix[0][2]+distanceMatrix[1][2])/2;
  46. d3 = (distanceMatrix[1][2]+distanceMatrix[0][2]-distanceMatrix[0][1])/2;
  47. treeNodes = {{0,1,d1__},
  48. {1,1,d2__},
  49. {2,1,d3__}
  50. {3,0,0}};
  51. cladesInfo = {{3,0}};
  52. }
  53. else
  54. {
  55. njm = (distanceMatrix > methodIndex)>=ds.species;
  56. treeNodes = {2*(ds.species+1),3};
  57. cladesInfo = {ds.species-1,2};
  58. for (i=Rows(treeNodes)-1; i>=0; i=i-1)
  59. {
  60. treeNodes[i][0] = njm[i][0];
  61. treeNodes[i][1] = njm[i][1];
  62. treeNodes[i][2] = njm[i][2];
  63. }
  64. for (i=Rows(cladesInfo)-1; i>=0; i=i-1)
  65. {
  66. cladesInfo[i][0] = njm[i][3];
  67. cladesInfo[i][1] = njm[i][4];
  68. }
  69. njm = 0;
  70. }
  71. }
  72. return 1.0;
  73. }
  74. /*-----------------------------------------------------------------------------------------------------------------------------------------*/
  75. function TreeMatrix2TreeString (doLengths)
  76. {
  77. treeString = "";
  78. p = 0;
  79. k = 0;
  80. m = treeNodes[0][1];
  81. n = treeNodes[0][0];
  82. treeString*(Rows(treeNodes)*25);
  83. while (m)
  84. {
  85. if (m>p)
  86. {
  87. if (p)
  88. {
  89. treeString*",";
  90. }
  91. for (j=p;j<m;j=j+1)
  92. {
  93. treeString*"(";
  94. }
  95. }
  96. else
  97. {
  98. if (m<p)
  99. {
  100. for (j=m;j<p;j=j+1)
  101. {
  102. treeString*")";
  103. }
  104. }
  105. else
  106. {
  107. treeString*",";
  108. }
  109. }
  110. if (n<ds.species)
  111. {
  112. GetString (nodeName, ds, n);
  113. if (doLengths != 1)
  114. {
  115. treeString*nodeName;
  116. }
  117. else
  118. {
  119. treeString*taxonNameMap[nodeName];
  120. }
  121. }
  122. if (doLengths>.5)
  123. {
  124. nodeName = ":"+treeNodes[k][2];
  125. treeString*nodeName;
  126. }
  127. k=k+1;
  128. p=m;
  129. n=treeNodes[k][0];
  130. m=treeNodes[k][1];
  131. }
  132. for (j=m;j<p;j=j+1)
  133. {
  134. treeString*")";
  135. }
  136. treeString*0;
  137. return treeString;
  138. }
  139. """
  140. def get_NJ_tree (filename):
  141. return """
  142. DISTANCE_PROMPTS = 1;
  143. ExecuteAFile ("%s");
  144. DataSet ds = ReadDataFile (PROMPT_FOR_FILE);
  145. DataSetFilter filteredData = CreateFilter (ds,1);
  146. /* do sequence to branch map */
  147. taxonNameMap = {};
  148. for (k=0; k<ds.species; k=k+1)
  149. {
  150. GetString (thisName, ds,k);
  151. shortName = (thisName^{{"\\\\..+",""}})&&1;
  152. taxonNameMap[shortName] = thisName;
  153. SetParameter (ds,k,shortName);
  154. }
  155. DataSetFilter filteredData = CreateFilter (ds,1);
  156. InferTreeTopology (0);
  157. treeString = TreeMatrix2TreeString (1);
  158. fprintf (PROMPT_FOR_FILE, CLEAR_FILE, treeString);
  159. fscanf (stdin, "String", ps_file);
  160. if (Abs(ps_file))
  161. {
  162. treeString = TreeMatrix2TreeString (2);
  163. UseModel (USE_NO_MODEL);
  164. Tree givenTree = treeString;
  165. baseHeight = TipCount (givenTree)*28;
  166. TREE_OUTPUT_OPTIONS = {};
  167. TREE_OUTPUT_OPTIONS["__FONT_SIZE__"] = 14;
  168. baseWidth = 0;
  169. treeAVL = givenTree^0;
  170. drawLetter = "/drawletter {"+TREE_OUTPUT_OPTIONS["__FONT_SIZE__"]$4+" -"+TREE_OUTPUT_OPTIONS["__FONT_SIZE__"]$2+ " show} def\\n";
  171. for (k3 = 1; k3 < Abs(treeAVL); k3=k3+1)
  172. {
  173. nodeName = (treeAVL[k3])["Name"];
  174. if(Abs((treeAVL[k3])["Children"]) == 0)
  175. {
  176. mySpecs = {};
  177. mySpecs ["TREE_OUTPUT_BRANCH_LABEL"] = "(" + taxonNameMap[nodeName] + ") drawLetter";
  178. baseWidth = Max (baseWidth, (treeAVL[k3])["Depth"]);
  179. }
  180. }
  181. baseWidth = 40*baseWidth;
  182. fprintf (ps_file, CLEAR_FILE, drawLetter, PSTreeString (givenTree, "STRING_SUPPLIED_LENGTHS",{{baseWidth,baseHeight}}));
  183. }
  184. """ % (filename)
  185. def get_NJ_treeMF (filename):
  186. return """
  187. ExecuteAFile ("%s");
  188. VERBOSITY_LEVEL = -1;
  189. fscanf (PROMPT_FOR_FILE, "Lines", inLines);
  190. _linesIn = Columns (inLines);
  191. isomorphicTreesBySequenceCount = {};
  192. /*---------------------------------------------------------*/
  193. _currentGene = 1;
  194. _currentState = 0;
  195. geneSeqs = "";
  196. geneSeqs * 128;
  197. fprintf (PROMPT_FOR_FILE, CLEAR_FILE, KEEP_OPEN);
  198. treeOutFile = LAST_FILE_PATH;
  199. fscanf (stdin,"String", ps_file);
  200. if (Abs(ps_file))
  201. {
  202. fprintf (ps_file, CLEAR_FILE, KEEP_OPEN);
  203. }
  204. for (l=0; l<_linesIn; l=l+1)
  205. {
  206. if (Abs(inLines[l]) == 0)
  207. {
  208. if (_currentState == 1)
  209. {
  210. geneSeqs * 0;
  211. DataSet ds = ReadFromString (geneSeqs);
  212. _processAGene (_currentGene,treeOutFile,ps_file);
  213. geneSeqs * 128;
  214. _currentGene = _currentGene + 1;
  215. }
  216. }
  217. else
  218. {
  219. if (_currentState == 0)
  220. {
  221. _currentState = 1;
  222. }
  223. geneSeqs * inLines[l];
  224. geneSeqs * "\\n";
  225. }
  226. }
  227. if (_currentState == 1)
  228. {
  229. geneSeqs * 0;
  230. if (Abs(geneSeqs))
  231. {
  232. DataSet ds = ReadFromString (geneSeqs);
  233. _processAGene (_currentGene,treeOutFile,ps_file);
  234. }
  235. }
  236. fprintf (treeOutFile,CLOSE_FILE);
  237. if (Abs(ps_file))
  238. {
  239. fprintf (ps_file,CLOSE_FILE);
  240. }
  241. /*---------------------------------------------------------*/
  242. function _processAGene (_geneID, nwk_file, ps_file)
  243. {
  244. if (ds.species == 1)
  245. {
  246. fprintf (nwk_file, _geneID-1, "\\tNone \\tNone\\n");
  247. return 0;
  248. }
  249. DataSetFilter filteredData = CreateFilter (ds,1);
  250. /* do sequence to branch map */
  251. taxonNameMap = {};
  252. for (k=0; k<ds.species; k=k+1)
  253. {
  254. GetString (thisName, ds,k);
  255. shortName = (thisName^{{"\\\\..+",""}});
  256. taxonNameMap[shortName] = thisName;
  257. SetParameter (ds,k,shortName);
  258. }
  259. DataSetFilter filteredData = CreateFilter (ds,1);
  260. DISTANCE_PROMPTS = (_geneID==1);
  261. InferTreeTopology (0);
  262. baseTree = TreeMatrix2TreeString (0);
  263. UseModel (USE_NO_MODEL);
  264. Tree baseTop = baseTree;
  265. /* standardize this top */
  266. for (k=0; k<Abs(isomorphicTreesBySequenceCount[filteredData.species]); k=k+1)
  267. {
  268. testString = (isomorphicTreesBySequenceCount[filteredData.species])[k];
  269. Tree testTree = testString;
  270. if (testTree == baseTop)
  271. {
  272. baseTree = testString;
  273. break;
  274. }
  275. }
  276. if (k==Abs(isomorphicTreesBySequenceCount[filteredData.species]))
  277. {
  278. if (k==0)
  279. {
  280. isomorphicTreesBySequenceCount[filteredData.species] = {};
  281. }
  282. (isomorphicTreesBySequenceCount[filteredData.species])[k] = baseTree;
  283. }
  284. fprintf (nwk_file, _geneID-1, "\\t", baseTree, "\\t", TreeMatrix2TreeString (1), "\\n");
  285. if (Abs(ps_file))
  286. {
  287. treeString = TreeMatrix2TreeString (2);
  288. UseModel (USE_NO_MODEL);
  289. Tree givenTree = treeString;
  290. baseHeight = TipCount (givenTree)*28;
  291. TREE_OUTPUT_OPTIONS = {};
  292. TREE_OUTPUT_OPTIONS["__FONT_SIZE__"] = 14;
  293. baseWidth = 0;
  294. treeAVL = givenTree^0;
  295. drawLetter = "/drawletter {"+TREE_OUTPUT_OPTIONS["__FONT_SIZE__"]$4+" -"+TREE_OUTPUT_OPTIONS["__FONT_SIZE__"]$2+ " show} def\\n";
  296. for (k3 = 1; k3 < Abs(treeAVL); k3=k3+1)
  297. {
  298. nodeName = (treeAVL[k3])["Name"];
  299. if(Abs((treeAVL[k3])["Children"]) == 0)
  300. {
  301. mySpecs = {};
  302. mySpecs ["TREE_OUTPUT_BRANCH_LABEL"] = "(" + taxonNameMap[nodeName] + ") drawLetter";
  303. baseWidth = Max (baseWidth, (treeAVL[k3])["Depth"]);
  304. }
  305. }
  306. baseWidth = 40*baseWidth;
  307. fprintf (stdout, _geneID, ":", givenTree,"\\n");
  308. fprintf (ps_file, PSTreeString (givenTree, "STRING_SUPPLIED_LENGTHS",{{baseWidth,baseHeight}}));
  309. }
  310. return 0;
  311. }
  312. """ % (filename)
  313. BranchLengthsMF = """
  314. VERBOSITY_LEVEL = -1;
  315. fscanf (PROMPT_FOR_FILE, "Lines", inLines);
  316. _linesIn = Columns (inLines);
  317. /*---------------------------------------------------------*/
  318. _currentGene = 1;
  319. _currentState = 0;
  320. geneSeqs = "";
  321. geneSeqs * 128;
  322. for (l=0; l<_linesIn; l=l+1)
  323. {
  324. if (Abs(inLines[l]) == 0)
  325. {
  326. if (_currentState == 1)
  327. {
  328. geneSeqs * 0;
  329. DataSet ds = ReadFromString (geneSeqs);
  330. _processAGene (_currentGene);
  331. geneSeqs * 128;
  332. _currentGene = _currentGene + 1;
  333. }
  334. }
  335. else
  336. {
  337. if (_currentState == 0)
  338. {
  339. _currentState = 1;
  340. }
  341. geneSeqs * inLines[l];
  342. geneSeqs * "\\n";
  343. }
  344. }
  345. if (_currentState == 1)
  346. {
  347. geneSeqs * 0;
  348. if (Abs(geneSeqs))
  349. {
  350. DataSet ds = ReadFromString (geneSeqs);
  351. _processAGene (_currentGene);
  352. }
  353. }
  354. fprintf (resultFile,CLOSE_FILE);
  355. /*---------------------------------------------------------*/
  356. function _processAGene (_geneID)
  357. {
  358. DataSetFilter filteredData = CreateFilter (ds,1);
  359. if (_currentGene == 1)
  360. {
  361. SelectTemplateModel (filteredData);
  362. SetDialogPrompt ("Tree file");
  363. fscanf (PROMPT_FOR_FILE, "Tree", givenTree);
  364. fscanf (stdin, "String", resultFile);
  365. /* do sequence to branch map */
  366. validNames = {};
  367. taxonNameMap = {};
  368. for (k=0; k<TipCount(givenTree); k=k+1)
  369. {
  370. validNames[TipName(givenTree,k)&&1] = 1;
  371. }
  372. for (k=0; k<BranchCount(givenTree); k=k+1)
  373. {
  374. thisName = BranchName(givenTree,k);
  375. taxonNameMap[thisName&&1] = thisName;
  376. }
  377. storeValidNames = validNames;
  378. fprintf (resultFile,CLEAR_FILE,KEEP_OPEN,"Block\\tBranch\\tLength\\tLowerBound\\tUpperBound\\n");
  379. }
  380. else
  381. {
  382. HarvestFrequencies (vectorOfFrequencies, filteredData, 1,1,1);
  383. validNames = storeValidNames;
  384. }
  385. for (k=0; k<ds.species; k=k+1)
  386. {
  387. GetString (thisName, ds,k);
  388. shortName = (thisName^{{"\\\\..+",""}})&&1;
  389. if (validNames[shortName])
  390. {
  391. taxonNameMap[shortName] = thisName;
  392. validNames - (shortName);
  393. SetParameter (ds,k,shortName);
  394. }
  395. else
  396. {
  397. fprintf (resultFile,"ERROR:", thisName, " could not be matched to any of the leaves in tree ", givenTree,"\\n");
  398. return 0;
  399. }
  400. }
  401. /* */
  402. LikelihoodFunction lf = (filteredData,givenTree);
  403. Optimize (res,lf);
  404. timer = Time(0)-timer;
  405. branchNames = BranchName (givenTree,-1);
  406. branchLengths = BranchLength (givenTree,-1);
  407. for (k=0; k<Columns(branchNames)-1; k=k+1)
  408. {
  409. COVARIANCE_PARAMETER = "givenTree."+branchNames[k]+".t";
  410. COVARIANCE_PRECISION = 0.95;
  411. CovarianceMatrix (cmx,lf);
  412. if (k==0)
  413. {
  414. /* compute a scaling factor */
  415. ExecuteCommands ("givenTree."+branchNames[0]+".t=1");
  416. scaleFactor = BranchLength (givenTree,0);
  417. ExecuteCommands ("givenTree."+branchNames[0]+".t="+cmx[0][1]);
  418. }
  419. fprintf (resultFile,_geneID,"\\t",taxonNameMap[branchNames[k]&&1],"\\t",branchLengths[k],"\\t",scaleFactor*cmx[0][0],"\\t",scaleFactor*cmx[0][2],"\\n");
  420. }
  421. ttl = (branchLengths*(Transpose(branchLengths["1"])))[0];
  422. global treeScaler = 1;
  423. ReplicateConstraint ("this1.?.t:=treeScaler*this2.?.t__",givenTree,givenTree);
  424. COVARIANCE_PARAMETER = "treeScaler";
  425. COVARIANCE_PRECISION = 0.95;
  426. CovarianceMatrix (cmx,lf);
  427. fprintf (resultFile,_geneID,"\\tTotal Tree\\t",ttl,"\\t",ttl*cmx[0][0],"\\t",ttl*cmx[0][2],"\\n");
  428. ClearConstraints (givenTree);
  429. return 0;
  430. }
  431. """
  432. BranchLengths = """
  433. DataSet ds = ReadDataFile (PROMPT_FOR_FILE);
  434. DataSetFilter filteredData = CreateFilter (ds,1);
  435. SelectTemplateModel (filteredData);
  436. SetDialogPrompt ("Tree file");
  437. fscanf (PROMPT_FOR_FILE, "Tree", givenTree);
  438. fscanf (stdin, "String", resultFile);
  439. /* do sequence to branch map */
  440. validNames = {};
  441. taxonNameMap = {};
  442. for (k=0; k<TipCount(givenTree); k=k+1)
  443. {
  444. validNames[TipName(givenTree,k)&&1] = 1;
  445. }
  446. for (k=0; k<BranchCount(givenTree); k=k+1)
  447. {
  448. thisName = BranchName(givenTree,k);
  449. taxonNameMap[thisName&&1] = thisName;
  450. }
  451. for (k=0; k<ds.species; k=k+1)
  452. {
  453. GetString (thisName, ds,k);
  454. shortName = (thisName^{{"\\\\..+",""}})&&1;
  455. if (validNames[shortName])
  456. {
  457. taxonNameMap[shortName] = thisName;
  458. validNames - (shortName);
  459. SetParameter (ds,k,shortName);
  460. }
  461. else
  462. {
  463. fprintf (resultFile,CLEAR_FILE,"ERROR:", thisName, " could not be matched to any of the leaves in tree ", givenTree);
  464. return 0;
  465. }
  466. }
  467. /* */
  468. LikelihoodFunction lf = (filteredData,givenTree);
  469. Optimize (res,lf);
  470. timer = Time(0)-timer;
  471. branchNames = BranchName (givenTree,-1);
  472. branchLengths = BranchLength (givenTree,-1);
  473. fprintf (resultFile,CLEAR_FILE,KEEP_OPEN,"Branch\\tLength\\tLowerBound\\tUpperBound\\n");
  474. for (k=0; k<Columns(branchNames)-1; k=k+1)
  475. {
  476. COVARIANCE_PARAMETER = "givenTree."+branchNames[k]+".t";
  477. COVARIANCE_PRECISION = 0.95;
  478. CovarianceMatrix (cmx,lf);
  479. if (k==0)
  480. {
  481. /* compute a scaling factor */
  482. ExecuteCommands ("givenTree."+branchNames[0]+".t=1");
  483. scaleFactor = BranchLength (givenTree,0);
  484. ExecuteCommands ("givenTree."+branchNames[0]+".t="+cmx[0][1]);
  485. }
  486. fprintf (resultFile,taxonNameMap[branchNames[k]&&1],"\\t",branchLengths[k],"\\t",scaleFactor*cmx[0][0],"\\t",scaleFactor*cmx[0][2],"\\n");
  487. }
  488. ttl = (branchLengths*(Transpose(branchLengths["1"])))[0];
  489. global treeScaler = 1;
  490. ReplicateConstraint ("this1.?.t:=treeScaler*this2.?.t__",givenTree,givenTree);
  491. COVARIANCE_PARAMETER = "treeScaler";
  492. COVARIANCE_PRECISION = 0.95;
  493. CovarianceMatrix (cmx,lf);
  494. ClearConstraints (givenTree);
  495. fprintf (resultFile,"Total Tree\\t",ttl,"\\t",ttl*cmx[0][0],"\\t",ttl*cmx[0][2],"\\n");
  496. fprintf (resultFile,CLOSE_FILE);
  497. """
  498. SimpleLocalFitter = """
  499. VERBOSITY_LEVEL = -1;
  500. COUNT_GAPS_IN_FREQUENCIES = 0;
  501. /*---------------------------------------------------------*/
  502. function returnResultHeaders (dummy)
  503. {
  504. _analysisHeaders = {};
  505. _analysisHeaders[0] = "BLOCK";
  506. _analysisHeaders[1] = "BP";
  507. _analysisHeaders[2] = "S_sites";
  508. _analysisHeaders[3] = "NS_sites";
  509. _analysisHeaders[4] = "Stop_codons";
  510. _analysisHeaders[5] = "LogL";
  511. _analysisHeaders[6] = "AC";
  512. _analysisHeaders[7] = "AT";
  513. _analysisHeaders[8] = "CG";
  514. _analysisHeaders[9] = "CT";
  515. _analysisHeaders[10] = "GT";
  516. _analysisHeaders[11] = "Tree";
  517. for (_biterator = 0; _biterator < treeBranchCount; _biterator = _biterator + 1)
  518. {
  519. branchName = treeBranchNames[_biterator];
  520. _analysisHeaders [Abs(_analysisHeaders)] = "length("+branchName+")";
  521. _analysisHeaders [Abs(_analysisHeaders)] = "dS("+branchName+")";
  522. _analysisHeaders [Abs(_analysisHeaders)] = "dN("+branchName+")";
  523. _analysisHeaders [Abs(_analysisHeaders)] = "omega("+branchName+")";
  524. }
  525. return _analysisHeaders;
  526. }
  527. /*---------------------------------------------------------*/
  528. function runAGeneFit (myID)
  529. {
  530. DataSetFilter filteredData = CreateFilter (ds,3,"","",GeneticCodeExclusions);
  531. if (_currentGene==1)
  532. {
  533. _MG94stdinOverload = {};
  534. _MG94stdinOverload ["0"] = "Local";
  535. _MG94stdinOverload ["1"] = modelSpecString;
  536. ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"MG94custom.mdl",
  537. _MG94stdinOverload);
  538. Tree codonTree = treeString;
  539. }
  540. else
  541. {
  542. HarvestFrequencies (observedFreq,filteredData,3,1,1);
  543. MULTIPLY_BY_FREQS = PopulateModelMatrix ("MG94custom", observedFreq);
  544. vectorOfFrequencies = BuildCodonFrequencies (observedFreq);
  545. Model MG94customModel = (MG94custom,vectorOfFrequencies,0);
  546. Tree codonTree = treeString;
  547. }
  548. LikelihoodFunction lf = (filteredData,codonTree);
  549. Optimize (res,lf);
  550. _snsAVL = _computeSNSSites ("filteredData", _Genetic_Code, vectorOfFrequencies, 0);
  551. _cL = ReturnVectorsOfCodonLengths (ComputeScalingStencils (0), "codonTree");
  552. _returnMe = {};
  553. _returnMe ["BLOCK"] = myID;
  554. _returnMe ["LogL"] = res[1][0];
  555. _returnMe ["BP"] = _snsAVL ["Sites"];
  556. _returnMe ["S_sites"] = _snsAVL ["SSites"];
  557. _returnMe ["NS_sites"] = _snsAVL ["NSSites"];
  558. _returnMe ["AC"] = AC;
  559. _returnMe ["AT"] = AT;
  560. _returnMe ["CG"] = CG;
  561. _returnMe ["CT"] = CT;
  562. _returnMe ["GT"] = GT;
  563. _returnMe ["Tree"] = Format(codonTree,0,1);
  564. for (_biterator = 0; _biterator < treeBranchCount; _biterator = _biterator + 1)
  565. {
  566. branchName = treeBranchNames[_biterator];
  567. _returnMe ["length("+branchName+")"] = (_cL["Total"])[_biterator];
  568. _returnMe ["dS("+branchName+")"] = (_cL["Syn"])[_biterator]*(_returnMe ["BP"]/_returnMe ["S_sites"]);
  569. _returnMe ["dN("+branchName+")"] = (_cL["NonSyn"])[_biterator]*(_returnMe ["BP"]/_returnMe ["NS_sites"]);
  570. ExecuteCommands ("_lom = _standardizeRatio(codonTree."+treeBranchNames[_biterator]+".nonSynRate,codonTree."+treeBranchNames[_biterator]+".synRate);");
  571. _returnMe ["omega("+branchName+")"] = _lom;
  572. }
  573. return _returnMe;
  574. }
  575. """
  576. SimpleGlobalFitter = """
  577. VERBOSITY_LEVEL = -1;
  578. COUNT_GAPS_IN_FREQUENCIES = 0;
  579. /*---------------------------------------------------------*/
  580. function returnResultHeaders (dummy)
  581. {
  582. _analysisHeaders = {};
  583. _analysisHeaders[0] = "BLOCK";
  584. _analysisHeaders[1] = "BP";
  585. _analysisHeaders[2] = "S_sites";
  586. _analysisHeaders[3] = "NS_sites";
  587. _analysisHeaders[4] = "Stop_codons";
  588. _analysisHeaders[5] = "LogL";
  589. _analysisHeaders[6] = "omega";
  590. _analysisHeaders[7] = "omega_range";
  591. _analysisHeaders[8] = "AC";
  592. _analysisHeaders[9] = "AT";
  593. _analysisHeaders[10] = "CG";
  594. _analysisHeaders[11] = "CT";
  595. _analysisHeaders[12] = "GT";
  596. _analysisHeaders[13] = "Tree";
  597. return _analysisHeaders;
  598. }
  599. /*---------------------------------------------------------*/
  600. function runAGeneFit (myID)
  601. {
  602. fprintf (stdout, "[SimpleGlobalFitter.bf on GENE ", myID, "]\\n");
  603. taxonNameMap = {};
  604. for (k=0; k<ds.species; k=k+1)
  605. {
  606. GetString (thisName, ds,k);
  607. shortName = (thisName^{{"\\\\..+",""}})&&1;
  608. taxonNameMap[shortName] = thisName;
  609. SetParameter (ds,k,shortName);
  610. }
  611. DataSetFilter filteredData = CreateFilter (ds,1);
  612. _nucSites = filteredData.sites;
  613. if (Abs(treeString))
  614. {
  615. givenTreeString = treeString;
  616. }
  617. else
  618. {
  619. if (_currentGene==1)
  620. {
  621. ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Utility"+DIRECTORY_SEPARATOR+"NJ.bf");
  622. }
  623. givenTreeString = InferTreeTopology (0);
  624. treeString = "";
  625. }
  626. DataSetFilter filteredData = CreateFilter (ds,3,"","",GeneticCodeExclusions);
  627. if (_currentGene==1)
  628. {
  629. _MG94stdinOverload = {};
  630. _MG94stdinOverload ["0"] = "Global";
  631. _MG94stdinOverload ["1"] = modelSpecString;
  632. ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"MG94custom.mdl",
  633. _MG94stdinOverload);
  634. Tree codonTree = givenTreeString;
  635. }
  636. else
  637. {
  638. HarvestFrequencies (observedFreq,filteredData,3,1,1);
  639. MULTIPLY_BY_FREQS = PopulateModelMatrix ("MG94custom", observedFreq);
  640. vectorOfFrequencies = BuildCodonFrequencies (observedFreq);
  641. Model MG94customModel = (MG94custom,vectorOfFrequencies,0);
  642. Tree codonTree = givenTreeString;
  643. }
  644. LikelihoodFunction lf = (filteredData,codonTree);
  645. Optimize (res,lf);
  646. _snsAVL = _computeSNSSites ("filteredData", _Genetic_Code, vectorOfFrequencies, 0);
  647. _cL = ReturnVectorsOfCodonLengths (ComputeScalingStencils (0), "codonTree");
  648. _returnMe = {};
  649. _returnMe ["BLOCK"] = myID;
  650. _returnMe ["LogL"] = res[1][0];
  651. _returnMe ["BP"] = _snsAVL ["Sites"];
  652. _returnMe ["S_sites"] = _snsAVL ["SSites"];
  653. _returnMe ["NS_sites"] = _snsAVL ["NSSites"];
  654. _returnMe ["Stop_codons"] = (_nucSites-filteredData.sites*3)$3;
  655. _returnMe ["AC"] = AC;
  656. _returnMe ["AT"] = AT;
  657. _returnMe ["CG"] = CG;
  658. _returnMe ["CT"] = CT;
  659. _returnMe ["GT"] = GT;
  660. _returnMe ["omega"] = R;
  661. COVARIANCE_PARAMETER = "R";
  662. COVARIANCE_PRECISION = 0.95;
  663. CovarianceMatrix (cmx,lf);
  664. _returnMe ["omega_range"] = ""+cmx[0]+"-"+cmx[2];
  665. _returnMe ["Tree"] = Format(codonTree,0,1);
  666. return _returnMe;
  667. }
  668. """
  669. FastaReader = """
  670. fscanf (stdin, "String", _coreAnalysis);
  671. fscanf (stdin, "String", _outputDriver);
  672. ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"chooseGeneticCode.def");
  673. ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"dSdNTreeTools.ibf");
  674. ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Utility"+DIRECTORY_SEPARATOR+"CodonTools.bf");
  675. ExecuteAFile (HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Utility"+DIRECTORY_SEPARATOR+"GrabBag.bf");
  676. SetDialogPrompt ("Tree file");
  677. fscanf (PROMPT_FOR_FILE, "Tree", givenTree);
  678. treeBranchNames = BranchName (givenTree,-1);
  679. treeBranchCount = Columns (treeBranchNames)-1;
  680. treeString = Format (givenTree,1,1);
  681. SetDialogPrompt ("Multiple gene FASTA file");
  682. fscanf (PROMPT_FOR_FILE, "Lines", inLines);
  683. fscanf (stdin, "String", modelSpecString);
  684. fscanf (stdin, "String", _outPath);
  685. ExecuteAFile (_outputDriver);
  686. ExecuteAFile (_coreAnalysis);
  687. /*---------------------------------------------------------*/
  688. _linesIn = Columns (inLines);
  689. _currentGene = 1;
  690. _currentState = 0;
  691. /* 0 - waiting for a non-empty line */
  692. /* 1 - reading files */
  693. geneSeqs = "";
  694. geneSeqs * 0;
  695. _prepareFileOutput (_outPath);
  696. for (l=0; l<_linesIn; l=l+1)
  697. {
  698. if (Abs(inLines[l]) == 0)
  699. {
  700. if (_currentState == 1)
  701. {
  702. geneSeqs * 0;
  703. DataSet ds = ReadFromString (geneSeqs);
  704. _processAGene (ds.species == treeBranchCount,_currentGene);
  705. geneSeqs * 128;
  706. _currentGene = _currentGene + 1;
  707. }
  708. }
  709. else
  710. {
  711. if (_currentState == 0)
  712. {
  713. _currentState = 1;
  714. }
  715. geneSeqs * inLines[l];
  716. geneSeqs * "\\n";
  717. }
  718. }
  719. if (_currentState == 1)
  720. {
  721. geneSeqs * 0;
  722. DataSet ds = ReadFromString (geneSeqs);
  723. _processAGene (ds.species == treeBranchCount,_currentGene);
  724. }
  725. _finishFileOutput (0);
  726. """
  727. TabWriter = """
  728. /*---------------------------------------------------------*/
  729. function _prepareFileOutput (_outPath)
  730. {
  731. _outputFilePath = _outPath;
  732. _returnHeaders = returnResultHeaders(0);
  733. fprintf (_outputFilePath, CLEAR_FILE, KEEP_OPEN, _returnHeaders[0]);
  734. for (_biterator = 1; _biterator < Abs(_returnHeaders); _biterator = _biterator + 1)
  735. {
  736. fprintf (_outputFilePath,"\\t",_returnHeaders[_biterator]);
  737. }
  738. fprintf (_outputFilePath,"\\n");
  739. return 0;
  740. }
  741. /*---------------------------------------------------------*/
  742. function _processAGene (valid, _geneID)
  743. {
  744. if (valid)
  745. {
  746. returnValue = runAGeneFit (_geneID);
  747. fprintf (_outputFilePath, returnValue[_returnHeaders[0]]);
  748. for (_biterator = 1; _biterator < Abs(_returnHeaders); _biterator = _biterator + 1)
  749. {
  750. fprintf (_outputFilePath,"\\t",returnValue[_returnHeaders[_biterator]]);
  751. }
  752. fprintf (_outputFilePath, "\\n");
  753. }
  754. /*
  755. else
  756. {
  757. fprintf (_outputFilePath,
  758. _geneID, ", Incorrect number of sequences\\n");
  759. }
  760. */
  761. _currentState = 0;
  762. return 0;
  763. }
  764. /*---------------------------------------------------------*/
  765. function _finishFileOutput (dummy)
  766. {
  767. return 0;
  768. }
  769. """
  770. def get_dnds_config_filename(Fitter_filename, TabWriter_filename, genetic_code, tree_filename, input_filename, nuc_model, output_filename, FastaReader_filename ):
  771. contents = """
  772. _genomeScreenOptions = {};
  773. /* all paths are either absolute or relative
  774. to the DATA READER */
  775. _genomeScreenOptions ["0"] = "%s";
  776. /* which analysis to run on each gene; */
  777. _genomeScreenOptions ["1"] = "%s";
  778. /* what output to produce; */
  779. _genomeScreenOptions ["2"] = "%s";
  780. /* genetic code */
  781. _genomeScreenOptions ["3"] = "%s";
  782. /* tree file */
  783. _genomeScreenOptions ["4"] = "%s";
  784. /* alignment file */
  785. _genomeScreenOptions ["5"] = "%s";
  786. /* nucleotide bias string; can define any of the 203 models */
  787. _genomeScreenOptions ["6"] = "%s";
  788. /* output csv file */
  789. ExecuteAFile ("%s", _genomeScreenOptions);
  790. """ % (Fitter_filename, TabWriter_filename, genetic_code, tree_filename, input_filename, nuc_model, output_filename, FastaReader_filename )
  791. return get_filled_temp_filename(contents)
  792. def get_branch_lengths_config_filename(input_filename, nuc_model, model_options, base_freq, tree_filename, output_filename, BranchLengths_filename):
  793. contents = """
  794. _genomeScreenOptions = {};
  795. /* all paths are either absolute or relative
  796. to the NucDataBranchLengths.bf */
  797. _genomeScreenOptions ["0"] = "%s";
  798. /* the file to analyze; */
  799. _genomeScreenOptions ["1"] = "CUSTOM";
  800. /* use an arbitrary nucleotide model */
  801. _genomeScreenOptions ["2"] = "%s";
  802. /* which model to use */
  803. _genomeScreenOptions ["3"] = "%s";
  804. /* model options */
  805. _genomeScreenOptions ["4"] = "Estimated";
  806. /* rate parameters */
  807. _genomeScreenOptions ["5"] = "%s";
  808. /* base frequencies */
  809. _genomeScreenOptions ["6"] = "%s";
  810. /* the tree to use; */
  811. _genomeScreenOptions ["7"] = "%s";
  812. /* write .csv output to; */
  813. ExecuteAFile ("%s", _genomeScreenOptions);
  814. """ % (input_filename, nuc_model, model_options, base_freq, tree_filename, output_filename, BranchLengths_filename)
  815. return get_filled_temp_filename(contents)
  816. def get_nj_tree_config_filename(input_filename, distance_metric, output_filename1, output_filename2, NJ_tree_filename):
  817. contents = """
  818. _genomeScreenOptions = {};
  819. /* all paths are either absolute or relative
  820. to the BuildNJTree.bf */
  821. _genomeScreenOptions ["0"] = "%s";
  822. /* the file to analyze; */
  823. _genomeScreenOptions ["1"] = "%s";
  824. /* pick which distance metric to use; TN93 is a good default */
  825. _genomeScreenOptions ["2"] = "%s";
  826. /* write Newick tree output to; */
  827. _genomeScreenOptions ["3"] = "%s";
  828. /* write a postscript tree file to this file; leave blank to not write a tree */
  829. ExecuteAFile ("%s", _genomeScreenOptions);
  830. """ % (input_filename, distance_metric, output_filename1, output_filename2, NJ_tree_filename)
  831. return get_filled_temp_filename(contents)
  832. def get_nj_treeMF_config_filename(input_filename, output_filename1, output_filename2, distance_metric, NJ_tree_filename):
  833. contents = """
  834. _genomeScreenOptions = {};
  835. /* all paths are either absolute or relative
  836. to the BuildNJTreeMF.bf */
  837. _genomeScreenOptions ["0"] = "%s";
  838. /* the multiple alignment file to analyze; */
  839. _genomeScreenOptions ["1"] = "%s";
  840. /* write Newick tree output to; */
  841. _genomeScreenOptions ["2"] = "%s";
  842. /* write a postscript tree file to this file; leave blank to not write a tree */
  843. _genomeScreenOptions ["3"] = "%s";
  844. /* pick which distance metric to use; TN93 is a good default */
  845. ExecuteAFile ("%s", _genomeScreenOptions);
  846. """ % (input_filename, output_filename1, output_filename2, distance_metric, NJ_tree_filename)
  847. return get_filled_temp_filename(contents)