PageRenderTime 62ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/code/application_core/Parameters.cpp

http://github.com/sebhtml/ray
C++ | 1791 lines | 1624 code | 120 blank | 47 comment | 35 complexity | d4c5437bd621df343c8a1b8fe701764f MD5 | raw file
Possible License(s): GPL-3.0
  1. /*
  2. Ray
  3. Copyright (C) 2010, 2011 Sébastien Boisvert
  4. http://DeNovoAssembler.SourceForge.Net/
  5. This program is free software: you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation, version 3 of the License.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You have received a copy of the GNU General Public License
  13. along with this program (gpl-3.0.txt).
  14. see <http://www.gnu.org/licenses/>
  15. */
  16. /*
  17. * TODO: add option -minimumScaffoldLength
  18. */
  19. #include <core/OperatingSystem.h>
  20. #include<application_core/common_functions.h>
  21. #include<application_core/Parameters.h>
  22. #include <plugin_Library/LibraryPeakFinder.h>
  23. #include<plugin_SequencesLoader/Read.h>
  24. #include<plugin_SequencesLoader/Loader.h>
  25. #include <memory/MyAllocator.h>
  26. #include<string>
  27. #include<sstream>
  28. #include<iostream>
  29. #include<vector>
  30. #include<cstdlib>
  31. #include<fstream>
  32. #include<assert.h>
  33. #include<math.h>
  34. using namespace std;
  35. void Parameters::getIndexes(int count,vector<int>*out){
  36. vector<int> numbers;
  37. for(int i=0;i<count;i++)
  38. numbers.push_back(i);
  39. srand(99);
  40. while(numbers.size()>0){
  41. int randomIndex=rand()%numbers.size();
  42. int index=numbers[randomIndex];
  43. out->push_back(index);
  44. vector<int> newNumbers;
  45. for(int i=0;i<(int)numbers.size();i++){
  46. if(randomIndex==i)
  47. continue;
  48. newNumbers.push_back(numbers[i]);
  49. }
  50. numbers=newNumbers;
  51. }
  52. }
  53. Parameters::Parameters(){
  54. m_providedPeakCoverage=false;
  55. m_providedRepeatCoverage=false;
  56. m_providedMinimumCoverage=false;
  57. m_prefix="RayOutput";
  58. m_initiated=false;
  59. m_showMemoryAllocations=false;
  60. m_directory="assembly";
  61. m_minimumContigLength=100;
  62. m_wordSize=21;
  63. m_colorSpaceMode=false;
  64. m_reducerIsActivated=false;
  65. m_amos=false;
  66. m_error=false;
  67. m_memoryFilePrefix=m_prefix;
  68. m_profiler=false;
  69. m_debugBubbles=false;
  70. m_debugSeeds=false;
  71. m_showMemoryUsage=true;
  72. m_showEndingContext=false;
  73. m_writeKmers=false;
  74. m_showExtensionChoice=false;
  75. m_showReadPlacement=false;
  76. /** use the new NovaEngine (TM) */
  77. m_options.insert("-use-NovaEngine");
  78. /** the default is to not use a default value */
  79. m_degree=2;
  80. m_checkpointDirectory="Checkpoints";
  81. m_hasCheckpointDirectory=false;
  82. }
  83. bool Parameters::showExtensionChoice(){
  84. return m_showExtensionChoice;
  85. }
  86. bool Parameters::showEndingContext(){
  87. return m_showEndingContext;
  88. }
  89. bool Parameters::debugBubbles(){
  90. return m_debugBubbles;
  91. }
  92. bool Parameters::runProfiler(){
  93. return m_profiler;
  94. }
  95. bool Parameters::debugSeeds(){
  96. return m_debugSeeds;
  97. }
  98. int Parameters::getWordSize(){
  99. return m_wordSize;
  100. }
  101. void Parameters::loadCommandsFromFile(char*file){
  102. ifstream f(file);
  103. while(!f.eof()){
  104. string token="";
  105. f>>token;
  106. if(token!=""){
  107. m_commands.push_back(token);
  108. }
  109. }
  110. f.close();
  111. }
  112. void Parameters::loadCommandsFromArguments(int argc,char**argv){
  113. for(int i=1;i<argc;i++){
  114. m_commands.push_back(argv[i]);
  115. }
  116. }
  117. /* parse commands */
  118. void Parameters::parseCommands(){
  119. m_initiated=true;
  120. set<string> commands;
  121. for(int i=0;i<(int)m_commands.size();i++)
  122. m_options.insert(m_commands[i]);
  123. m_showCommunicationEvents=false;
  124. if(hasOption("-show-communication-events"))
  125. m_showCommunicationEvents=true;
  126. /*
  127. if(hasOption("-test-network-only")){
  128. m_options.insert("-write-network-test-raw-data");
  129. }
  130. */
  131. if(hasOption("-show-read-placement")){
  132. m_showReadPlacement=true;
  133. }
  134. m_originalCommands=m_commands;
  135. /* shuffle randomly arguments */
  136. vector<vector<string> > opCodes;
  137. int i=0;
  138. while(i<(int)m_commands.size()){
  139. int j=i;
  140. while(j+1<(int)m_commands.size() && m_commands[j+1][0]!='-'){
  141. j++;
  142. }
  143. vector<string> opCode;
  144. opCode.push_back(m_commands[i]);
  145. for(int k=i+1;k<=j;k++){
  146. opCode.push_back(m_commands[k]);
  147. }
  148. opCodes.push_back(opCode);
  149. i=j+1;
  150. }
  151. vector<int> indexes;
  152. getIndexes(opCodes.size(),&indexes);
  153. vector<string> newCommands;
  154. for(int i=0;i<(int)indexes.size();i++){
  155. for(int j=0;j<(int)opCodes[indexes[i]].size();j++){
  156. newCommands.push_back(opCodes[indexes[i]][j]);
  157. }
  158. }
  159. #ifdef ASSERT
  160. assert(newCommands.size()==m_commands.size());
  161. #endif
  162. m_commands=newCommands;
  163. newCommands.clear();
  164. opCodes.clear();
  165. if(getRank() == MASTER_RANK){
  166. cout<<"Rank 0: Shuffled opcodes"<<endl;
  167. for(int i=0;i<(int)m_commands.size();i++){
  168. cout<<" "<<m_commands[i]<<" \\"<<endl;
  169. }
  170. cout<<endl;
  171. }
  172. set<string> singleReadsCommands;
  173. singleReadsCommands.insert("-s");
  174. singleReadsCommands.insert("LoadSingleEndReads");
  175. singleReadsCommands.insert("-LoadSingleEndReads");
  176. singleReadsCommands.insert("--LoadSingleEndReads");
  177. set<string> pairedReadsCommands;
  178. pairedReadsCommands.insert("-p");
  179. pairedReadsCommands.insert("LoadPairedEndReads");
  180. pairedReadsCommands.insert("-LoadPairedEndReads");
  181. pairedReadsCommands.insert("--LoadPairedEndReads");
  182. set<string> interleavedCommands;
  183. interleavedCommands.insert("-i");
  184. set<string> colorSpaceMode;
  185. colorSpaceMode.insert("-color-space");
  186. set<string> outputAmosCommands;
  187. outputAmosCommands.insert("-a");
  188. outputAmosCommands.insert("-amos");
  189. outputAmosCommands.insert("--amos");
  190. outputAmosCommands.insert("--output-amos");
  191. outputAmosCommands.insert("-OutputAmosFile");
  192. outputAmosCommands.insert("--OutputAmosFile");
  193. set<string> showMalloc;
  194. showMalloc.insert("-show-memory-allocations");
  195. set<string> outputFileCommands;
  196. outputFileCommands.insert("-o");
  197. outputFileCommands.insert("-OutputFile");
  198. outputFileCommands.insert("--OutputFile");
  199. set<string> memoryMappedFileCommands;
  200. memoryMappedFileCommands.insert("-MemoryPrefix");
  201. set<string> kmerSetting;
  202. kmerSetting.insert("-k");
  203. set<string> routingDegree;
  204. routingDegree.insert("-routing-graph-degree");
  205. set<string> connectionType;
  206. connectionType.insert("-connection-type");
  207. set<string> reduceMemoryUsage;
  208. reduceMemoryUsage.insert("-r");
  209. set<string> showMemory;
  210. showMemory.insert("-show-memory-usage");
  211. showMemory.insert("--show-memory-usage");
  212. set<string> debugBubbles;
  213. debugBubbles.insert("-debug-bubbles");
  214. debugBubbles.insert("--debug-bubbles");
  215. set<string> debugSeeds;
  216. debugSeeds.insert("-debug-seeds");
  217. debugSeeds.insert("--debug-seeds");
  218. set<string> runProfiler;
  219. runProfiler.insert("-run-profiler");
  220. runProfiler.insert("--run-profiler");
  221. set<string> showContext;
  222. showContext.insert("-show-ending-context");
  223. showContext.insert("--show-ending-context");
  224. set<string> showExtensionChoiceOption;
  225. showExtensionChoiceOption.insert("-show-extension-choice");
  226. set<string> writeKmers;
  227. writeKmers.insert("-write-kmers");
  228. set<string> setMinimumCoverage;
  229. set<string> setPeakCoverage;
  230. set<string> setRepeatCoverage;
  231. setMinimumCoverage.insert("-minimumCoverage");
  232. setPeakCoverage.insert("-peakCoverage");
  233. setRepeatCoverage.insert("-repeatCoverage");
  234. set<string> minimumContigLength;
  235. minimumContigLength.insert("-minimum-contig-length");
  236. set<string> searchOption;
  237. searchOption.insert("-search");
  238. set<string> phylogeny;
  239. phylogeny.insert("-with-taxonomy");
  240. set<string> coloringOneColor;
  241. coloringOneColor.insert("-one-color-per-file");
  242. set<string> ontology;
  243. ontology.insert("-gene-ontology");
  244. set<string> checkpoints;
  245. checkpoints.insert("-read-write-checkpoints");
  246. checkpoints.insert("-write-checkpoints");
  247. checkpoints.insert("-read-checkpoints");
  248. vector<set<string> > toAdd;
  249. toAdd.push_back(checkpoints);
  250. toAdd.push_back(coloringOneColor);
  251. toAdd.push_back(ontology);
  252. toAdd.push_back(phylogeny);
  253. toAdd.push_back(minimumContigLength);
  254. toAdd.push_back(showExtensionChoiceOption);
  255. toAdd.push_back(setRepeatCoverage);
  256. toAdd.push_back(setPeakCoverage);
  257. toAdd.push_back(setMinimumCoverage);
  258. toAdd.push_back(singleReadsCommands);
  259. toAdd.push_back(pairedReadsCommands);
  260. toAdd.push_back(outputAmosCommands);
  261. toAdd.push_back(outputFileCommands);
  262. toAdd.push_back(kmerSetting);
  263. toAdd.push_back(routingDegree);
  264. toAdd.push_back(interleavedCommands);
  265. toAdd.push_back(searchOption);
  266. toAdd.push_back(reduceMemoryUsage);
  267. toAdd.push_back(memoryMappedFileCommands);
  268. toAdd.push_back(connectionType);
  269. toAdd.push_back(showMemory);
  270. toAdd.push_back(debugBubbles);
  271. toAdd.push_back(debugSeeds);
  272. toAdd.push_back(runProfiler);
  273. toAdd.push_back(showContext);
  274. toAdd.push_back(showMalloc);
  275. toAdd.push_back(writeKmers);
  276. toAdd.push_back(colorSpaceMode);
  277. for(int i=0;i<(int)toAdd.size();i++){
  278. for(set<string>::iterator j=toAdd[i].begin();j!=toAdd[i].end();j++){
  279. commands.insert(*j);
  280. }
  281. }
  282. m_numberOfLibraries=0;
  283. bool providedMemoryPrefix=false;
  284. for(int i=0;i<(int)m_commands.size();i++){
  285. string token=m_commands[i];
  286. if(singleReadsCommands.count(token)>0){
  287. i++;
  288. int items=m_commands.size()-i;
  289. if(items<1){
  290. if(m_rank==MASTER_RANK){
  291. cout<<"Error: "<<token<<" needs 1 item, you provided only "<<items<<endl;
  292. }
  293. m_error=true;
  294. return;
  295. }
  296. token=m_commands[i];
  297. m_singleEndReadsFile.push_back(token);
  298. fileNameHook(token);
  299. if(m_rank==MASTER_RANK){
  300. cout<<endl;
  301. cout<<"-s (single sequences)"<<endl;
  302. cout<<" Sequences: "<<token<<endl;
  303. }
  304. }else if(memoryMappedFileCommands.count(token)>0){
  305. i++;
  306. int items=m_commands.size()-i;
  307. if(items<1){
  308. if(m_rank==MASTER_RANK){
  309. cout<<"Error: "<<token<<" needs 1 item, you provided "<<items<<endl;
  310. }
  311. m_error=true;
  312. return;
  313. }
  314. token=m_commands[i];
  315. m_memoryFilePrefix=token;
  316. providedMemoryPrefix=true;
  317. }else if(checkpoints.count(token)>0){
  318. i++;
  319. int items=m_commands.size()-i;
  320. if(items<1){
  321. if(m_rank==MASTER_RANK){
  322. cout<<"Error: "<<token<<" needs 1 item, you provided "<<items<<endl;
  323. }
  324. m_error=true;
  325. return;
  326. }
  327. token=m_commands[i];
  328. if(m_hasCheckpointDirectory){
  329. cout<<"Warning: can not set already set checkpoint directory."<<endl;
  330. continue;
  331. }
  332. m_checkpointDirectory=token;
  333. m_hasCheckpointDirectory=true;
  334. cout<<"Rank "<<m_rank<<" checkpoint directory: "<<m_checkpointDirectory;
  335. if(m_rank==MASTER_RANK){
  336. if(!fileExists(m_checkpointDirectory.c_str())){
  337. createDirectory(m_checkpointDirectory.c_str());
  338. }
  339. }
  340. }else if(outputFileCommands.count(token)>0){
  341. i++;
  342. int items=m_commands.size()-i;
  343. if(items<1){
  344. if(m_rank==MASTER_RANK){
  345. cout<<"Error: "<<token<<" needs 1 item, you provided "<<items<<endl;
  346. }
  347. m_error=true;
  348. return;
  349. }
  350. token=m_commands[i];
  351. m_prefix=token;
  352. if(!providedMemoryPrefix){
  353. m_memoryFilePrefix=m_prefix;
  354. }
  355. }else if(interleavedCommands.count(token)>0){
  356. // make sure there is at least 4 elements left.
  357. int items=0;
  358. int k=0;
  359. for(int j=i+1;j<(int)m_commands.size();j++){
  360. string cmd=m_commands[j];
  361. if(commands.count(cmd)==0 && cmd[0]!='-'){
  362. items++;
  363. }else{
  364. break;
  365. }
  366. k++;
  367. }
  368. if(items!=1 && items!=3){
  369. if(m_rank==MASTER_RANK){
  370. cout<<"Error: "<<token<<" needs 1 or 3 items, you provided "<<items<<endl;
  371. }
  372. m_error=true;
  373. return;
  374. }
  375. i++;
  376. token=m_commands[i];
  377. string interleavedFile=token;
  378. int interleavedFileIndex=m_singleEndReadsFile.size();
  379. m_interleavedFiles.insert(interleavedFileIndex);
  380. m_singleEndReadsFile.push_back(interleavedFile);
  381. fileNameHook(interleavedFile);
  382. int meanFragmentLength=0;
  383. int standardDeviation=0;
  384. #ifdef ASSERT
  385. assert(items==1 or items==3);
  386. #endif
  387. if(m_rank==MASTER_RANK){
  388. cout<<endl;
  389. cout<<"Paired library # "<<m_numberOfLibraries<<endl;
  390. cout<<" -i (paired-end interleaved sequences)"<<endl;
  391. cout<<" Sequences: "<<token<<endl;
  392. }
  393. if(items==3){
  394. i++;
  395. token=m_commands[i];
  396. meanFragmentLength=atoi(token.c_str());
  397. i++;
  398. token=m_commands[i];
  399. standardDeviation=atoi(token.c_str());
  400. if(m_rank==MASTER_RANK){
  401. cout<<" Average length: "<<meanFragmentLength<<endl;
  402. cout<<" Standard deviation: "<<standardDeviation<<endl;
  403. }
  404. int distance=meanFragmentLength+standardDeviation;
  405. if(distance>m_maximumDistance){
  406. m_maximumDistance=distance;
  407. }
  408. }else if(items==1){// automatic detection.
  409. map<int,int> t;
  410. m_automaticLibraries.insert(m_numberOfLibraries);
  411. if(m_rank==MASTER_RANK){
  412. cout<<" Average length: automatic detection"<<endl;
  413. cout<<" Standard deviation: automatic detection"<<endl;
  414. }
  415. }else{
  416. #ifdef ASSERT
  417. assert(false);
  418. #endif
  419. }
  420. m_fileLibrary[interleavedFileIndex]=m_numberOfLibraries;
  421. vector<int> files;
  422. files.push_back(interleavedFileIndex);
  423. m_libraryFiles.push_back(files);
  424. addLibraryData(m_numberOfLibraries,meanFragmentLength,standardDeviation);
  425. m_numberOfLibraries++;
  426. }else if(pairedReadsCommands.count(token)>0){
  427. // make sure there is at least 4 elements left.
  428. int items=0;
  429. int k=0;
  430. for(int j=i+1;j<(int)m_commands.size();j++){
  431. string cmd=m_commands[j];
  432. if(commands.count(cmd)==0 && cmd[0]!='-'){
  433. items++;
  434. }else{
  435. break;
  436. }
  437. k++;
  438. }
  439. if(items!=2 && items!=4){
  440. if(m_rank==MASTER_RANK){
  441. cout<<"Error: "<<token<<" needs 2 or 4 items, you provided "<<items<<endl;
  442. }
  443. m_error=true;
  444. return;
  445. }
  446. i++;
  447. token=m_commands[i];
  448. string left=token;
  449. // add left file
  450. int leftFile=m_singleEndReadsFile.size();
  451. m_leftFiles.insert(leftFile);
  452. m_singleEndReadsFile.push_back(left);
  453. i++;
  454. token=m_commands[i];
  455. // add right file
  456. string right=token;
  457. int rightFile=m_singleEndReadsFile.size();
  458. m_rightFiles.insert(rightFile);
  459. m_singleEndReadsFile.push_back(right);
  460. fileNameHook(left);
  461. fileNameHook(right);
  462. int meanFragmentLength=0;
  463. int standardDeviation=0;
  464. #ifdef ASSERT
  465. assert(items==4 or items==2);
  466. #endif
  467. if(m_rank==MASTER_RANK){
  468. cout<<endl;
  469. cout<<"Paired library # "<<m_numberOfLibraries<<endl;
  470. cout<<" -p (paired-end sequences)"<<endl;
  471. cout<<" Left sequences: "<<left<<endl;
  472. cout<<" Right sequences: "<<right<<endl;
  473. }
  474. if(items==4){
  475. i++;
  476. token=m_commands[i];
  477. meanFragmentLength=atoi(token.c_str());
  478. i++;
  479. token=m_commands[i];
  480. standardDeviation=atoi(token.c_str());
  481. if(m_rank==MASTER_RANK){
  482. cout<<" Average length: "<<meanFragmentLength<<endl;
  483. cout<<" Standard deviation: "<<standardDeviation<<endl;
  484. }
  485. }else if(items==2){// automatic detection.
  486. m_automaticLibraries.insert(m_numberOfLibraries);
  487. if(m_rank==MASTER_RANK){
  488. cout<<" Average length: automatic detection"<<endl;
  489. cout<<" Standard deviation: automatic detection"<<endl;
  490. }
  491. }
  492. m_fileLibrary[rightFile]=m_numberOfLibraries;
  493. m_fileLibrary[leftFile]=m_numberOfLibraries;
  494. vector<int> files;
  495. files.push_back(leftFile);
  496. files.push_back(rightFile);
  497. m_libraryFiles.push_back(files);
  498. addLibraryData(m_numberOfLibraries,meanFragmentLength,standardDeviation);
  499. m_numberOfLibraries++;
  500. }else if(outputAmosCommands.count(token)>0){
  501. m_amos=true;
  502. }else if(showExtensionChoiceOption.count(token)>0){
  503. m_showExtensionChoice=true;
  504. }else if(showMalloc.count(token)>0){
  505. m_showMemoryAllocations=true;
  506. }else if(reduceMemoryUsage.count(token)>0){
  507. int items=0;
  508. for(int j=i+1;j<(int)m_commands.size();j++){
  509. string cmd=m_commands[j];
  510. if(commands.count(cmd)==0){
  511. items++;
  512. }else{
  513. break;
  514. }
  515. }
  516. if(!(items==0||items==1)){
  517. if(m_rank==MASTER_RANK){
  518. cout<<"Error: "<<token<<" needs 0 or 1 item, you provided "<<items<<endl;
  519. }
  520. m_error=true;
  521. return;
  522. }
  523. m_reducerIsActivated=true;
  524. m_reducerPeriod=1000000;
  525. if(items==1){
  526. m_reducerPeriod=atoi(m_commands[i+1].c_str());
  527. }
  528. }else if(setRepeatCoverage.count(token)>0){
  529. i++;
  530. int items=m_commands.size()-i;
  531. if(items<1){
  532. if(m_rank==MASTER_RANK){
  533. cout<<"Error: "<<token<<" needs 1 item, you provided only "<<items<<endl;
  534. }
  535. m_error=true;
  536. return;
  537. }
  538. token=m_commands[i];
  539. m_repeatCoverage=atoi(token.c_str());
  540. m_providedRepeatCoverage=true;
  541. }else if(setMinimumCoverage.count(token)>0){
  542. i++;
  543. int items=m_commands.size()-i;
  544. if(items<1){
  545. if(m_rank==MASTER_RANK){
  546. cout<<"Error: "<<token<<" needs 1 item, you provided only "<<items<<endl;
  547. }
  548. m_error=true;
  549. return;
  550. }
  551. token=m_commands[i];
  552. m_minimumCoverage=atoi(token.c_str());
  553. m_providedMinimumCoverage=true;
  554. }else if(setPeakCoverage.count(token)>0){
  555. i++;
  556. int items=m_commands.size()-i;
  557. if(items<1){
  558. if(m_rank==MASTER_RANK){
  559. cout<<"Error: "<<token<<" needs 1 item, you provided only "<<items<<endl;
  560. }
  561. m_error=true;
  562. return;
  563. }
  564. token=m_commands[i];
  565. m_peakCoverage=atoi(token.c_str());
  566. m_providedPeakCoverage=true;
  567. }else if(connectionType.count(token)>0){
  568. i++;
  569. int items=m_commands.size()-i;
  570. if(items<1){
  571. if(m_rank==MASTER_RANK){
  572. cout<<"Error: "<<token<<" needs 1 item but you provided only "<<items<<endl;
  573. }
  574. m_error=true;
  575. return;
  576. }
  577. token=m_commands[i];
  578. m_connectionType=token;
  579. }else if(routingDegree.count(token)>0){
  580. i++;
  581. int items=m_commands.size()-i;
  582. if(items<1){
  583. if(m_rank==MASTER_RANK){
  584. cout<<"Error: "<<token<<" needs 1 item, you provided only "<<items<<endl;
  585. }
  586. m_error=true;
  587. return;
  588. }
  589. token=m_commands[i];
  590. m_degree=atoi(token.c_str());
  591. }else if(phylogeny.count(token)>0){
  592. cout<<"Enabling CorePlugin 'PhylogenyViewer'"<<endl;
  593. i++;
  594. int items=m_commands.size()-i;
  595. if(items<3){
  596. if(m_rank==MASTER_RANK){
  597. cout<<"Error: "<<token<<" needs 3 item, but you provided "<<items<<endl;
  598. }
  599. m_error=true;
  600. return;
  601. }
  602. m_genomeToTaxonFile=m_commands[i];
  603. i++;
  604. m_treeFile=m_commands[i];
  605. i++;
  606. m_taxonNameFile=m_commands[i];
  607. }else if(searchOption.count(token)>0){
  608. i++;
  609. int items=m_commands.size()-i;
  610. if(items<1){
  611. if(m_rank==MASTER_RANK){
  612. cout<<"Error: "<<token<<" needs 1 item, you provided only "<<items<<endl;
  613. }
  614. m_error=true;
  615. return;
  616. }
  617. token=m_commands[i];
  618. m_searchDirectories.push_back(token);
  619. }else if(minimumContigLength.count(token)>0){
  620. i++;
  621. int items=m_commands.size()-i;
  622. if(items<1){
  623. if(m_rank==MASTER_RANK){
  624. cout<<"Error: "<<token<<" needs 1 item, you provided only "<<items<<endl;
  625. }
  626. m_error=true;
  627. return;
  628. }
  629. token=m_commands[i];
  630. m_minimumContigLength=atoi(token.c_str());
  631. if(m_rank==MASTER_RANK){
  632. cout<<endl;
  633. cout<<"Rank "<<MASTER_RANK<<" the minimum contig length is "<<m_minimumContigLength<<endl;
  634. cout<<endl;
  635. }
  636. }else if(kmerSetting.count(token)>0){
  637. i++;
  638. int items=m_commands.size()-i;
  639. if(items<1){
  640. if(m_rank==MASTER_RANK){
  641. cout<<"Error: "<<token<<" needs 1 item, you provided only "<<items<<endl;
  642. }
  643. m_error=true;
  644. return;
  645. }
  646. token=m_commands[i];
  647. m_wordSize=atoi(token.c_str());
  648. if(m_wordSize<15){
  649. m_wordSize=15;
  650. }
  651. if(m_wordSize>MAXKMERLENGTH){
  652. if(m_rank==MASTER_RANK){
  653. cout<<endl;
  654. cout<<"Rank "<<MASTER_RANK<<": Warning, k > MAXKMERLENGTH"<<endl;
  655. cout<<"Rank "<<MASTER_RANK<<": Change MAXKMERLENGTH in the Makefile and recompile Ray."<<endl;
  656. }
  657. m_wordSize=MAXKMERLENGTH;
  658. }
  659. if(m_wordSize%2==0){
  660. m_wordSize--;
  661. }
  662. if(m_rank==MASTER_RANK){
  663. cout<<endl;
  664. cout<<"-k (to set the k-mer size)"<<endl;
  665. cout<<" Value: "<<m_wordSize<<endl;
  666. cout<<endl;
  667. }
  668. }else if(writeKmers.count(token)>0){
  669. m_writeKmers=true;
  670. if(m_rank==MASTER_RANK){
  671. cout<<endl;
  672. cout<<"Ray will write k-mers ("<<token<<")"<<endl;
  673. }
  674. }else if(runProfiler.count(token)>0){
  675. m_profiler=true;
  676. if(m_rank==MASTER_RANK){
  677. printf("Enabling profiler!\n");
  678. }
  679. }else if(debugBubbles.count(token)>0){
  680. m_debugBubbles=true;
  681. if(m_rank==MASTER_RANK){
  682. printf("Enabling bubble debug mode.\n");
  683. }
  684. }else if(colorSpaceMode.count(token)>0){
  685. m_colorSpaceMode=true;
  686. if(m_rank==MASTER_RANK){
  687. cout<<endl;
  688. cout<<"Enabling color-space mode"<<endl;
  689. cout<<"All reads should be in color space."<<endl;
  690. }
  691. }else if(debugSeeds.count(token)>0){
  692. m_debugSeeds=true;
  693. if(m_rank==MASTER_RANK){
  694. printf("Enabling seed debug mode.\n");
  695. }
  696. }else if(showMemory.count(token)>0){
  697. m_showMemoryUsage=true;
  698. if(m_rank==MASTER_RANK){
  699. printf("Enabling memory usage reporting.\n");
  700. }
  701. }else if(showContext.count(token)>0){
  702. m_showEndingContext=true;
  703. if(m_rank==MASTER_RANK){
  704. printf("Ray will show the ending context of extensions.\n");
  705. }
  706. }
  707. }
  708. int maximumNumberOfFiles=MAXIMUM_MESSAGE_SIZE_IN_BYTES/sizeof(uint32_t);
  709. assert((int)m_singleEndReadsFile.size()<=maximumNumberOfFiles);
  710. LargeCount result=1;
  711. for(int p=0;p<m_wordSize;p++){
  712. result*=4;
  713. }
  714. }
  715. void Parameters::writeCommandFile(){
  716. ostringstream commandFile;
  717. commandFile<<getPrefix()<<"RayCommand.txt";
  718. ofstream f(commandFile.str().c_str());
  719. f<<"mpiexec -n "<<getSize()<<" Ray \\"<<endl;
  720. for(int i=0;i<(int)m_originalCommands.size();i++){
  721. if(i!=(int)m_originalCommands.size()-1){
  722. f<<" "<<m_originalCommands[i]<<" \\"<<endl;
  723. }else{
  724. f<<" "<<m_originalCommands[i]<<endl;
  725. }
  726. }
  727. f.close();
  728. cout<<"Rank "<<MASTER_RANK<<" wrote "<<commandFile.str()<<endl;
  729. cout<<endl;
  730. cout<<"k-mer length: "<<m_wordSize<<endl;
  731. if(m_reducerIsActivated){
  732. cout<<"Memory Consumption Reducer is enabled, threshold="<<m_reducerPeriod<<endl;
  733. }
  734. cout<<endl;
  735. cout<<"Output files will be prefixed with "<<getPrefix()<<endl;
  736. cout<<endl;
  737. ostringstream rayRuntime;
  738. rayRuntime<<getPrefix()<<"RayVersion.txt";
  739. ofstream f2(rayRuntime.str().c_str());
  740. f2<<"Ray version: "<<RAY_VERSION<<endl;
  741. f2.close();
  742. }
  743. void Parameters::constructor(int argc,char**argv,int rank,int size){
  744. m_maximumDistance=0;
  745. m_totalNumberOfSequences=0;
  746. m_rank=rank;
  747. m_size=size;
  748. bool hasCommandFile=false;
  749. if(argc==2){
  750. ifstream f(argv[1]);
  751. hasCommandFile=f;
  752. f.close();
  753. }
  754. if(argc==2&&hasCommandFile){
  755. m_input=argv[1];
  756. loadCommandsFromFile(argv[1]);
  757. }else{
  758. loadCommandsFromArguments(argc,argv);
  759. }
  760. parseCommands();
  761. }
  762. int Parameters::getRank(){
  763. return m_rank;
  764. }
  765. bool Parameters::isInitiated(){
  766. return m_initiated;
  767. }
  768. vector<string> Parameters::getAllFiles(){
  769. vector<string> l;
  770. for(int i=0;i<(int)m_singleEndReadsFile.size();i++)
  771. l.push_back(m_singleEndReadsFile[i]);
  772. return l;
  773. }
  774. string Parameters::getFile(int file){
  775. return m_singleEndReadsFile[file];
  776. }
  777. string Parameters::getDirectory(){
  778. return m_directory;
  779. }
  780. string Parameters::getOutputFile(){
  781. return getPrefix()+"Contigs.fasta";
  782. }
  783. int Parameters::getMinimumContigLength(){
  784. return m_minimumContigLength;
  785. }
  786. bool Parameters::isLeftFile(int i){
  787. return m_leftFiles.count(i)>0;
  788. }
  789. bool Parameters::isRightFile(int i){
  790. return m_rightFiles.count(i)>0;
  791. }
  792. int Parameters::getLibraryAverageLength(int i,int j){
  793. return m_libraryAverageLength[i][j];
  794. }
  795. int Parameters::getLibraryStandardDeviation(int i,int j){
  796. return m_libraryDeviation[i][j];
  797. }
  798. int Parameters::getLibraryMaxAverageLength(int i){
  799. if(m_libraryAverageLength[i].size()==1)
  800. return m_libraryAverageLength[i][0];
  801. int max=0;
  802. for(int j=0;j<(int)m_libraryAverageLength[i].size();j++)
  803. if(m_libraryAverageLength[i][j]>max)
  804. max=m_libraryAverageLength[i][j];
  805. return max;
  806. }
  807. int Parameters::getLibraryMaxStandardDeviation(int i){
  808. if(m_libraryDeviation[i].size()==1)
  809. return m_libraryDeviation[i][0];
  810. int max=0;
  811. for(int j=0;j<(int)m_libraryDeviation[i].size();j++)
  812. if(m_libraryDeviation[i][j]>max)
  813. max=m_libraryDeviation[i][j];
  814. return max;
  815. }
  816. bool Parameters::getColorSpaceMode(){
  817. return m_colorSpaceMode;
  818. }
  819. bool Parameters::useAmos(){
  820. return m_amos;
  821. }
  822. string Parameters::getInputFile(){
  823. return m_input;
  824. }
  825. string Parameters::getParametersFile(){
  826. return "Ray-Parameters.txt";
  827. }
  828. string Parameters::getPrefix(){
  829. ostringstream directory;
  830. directory<<m_prefix<<"/";
  831. return directory.str();
  832. }
  833. string Parameters::getCoverageDistributionFile(){
  834. return getPrefix()+"CoverageDistribution.txt";
  835. }
  836. string Parameters::getAmosFile(){
  837. return getPrefix()+"AMOS.afg";
  838. }
  839. vector<string> Parameters::getCommands(){
  840. return m_commands;
  841. }
  842. bool Parameters::getError(){
  843. return m_error;
  844. }
  845. void Parameters::addDistance(int library,int distance,int count){
  846. m_observedDistances[library][distance]+=count;
  847. }
  848. string Parameters::getLibraryFile(int library){
  849. ostringstream s;
  850. s<<getPrefix();
  851. s<<""<<"Library"<<library<<".txt";
  852. return s.str();
  853. }
  854. #define WRITE_LIBRARY_OBSERVATIONS
  855. void Parameters::computeAverageDistances(){
  856. cout<<endl;
  857. for(map<int,map<int,int> >::iterator i=m_observedDistances.begin();
  858. i!=m_observedDistances.end();i++){
  859. int library=i->first;
  860. if(!isAutomatic(library))
  861. continue;
  862. vector<int> x;
  863. vector<int> y;
  864. string fileName=getLibraryFile(library);
  865. #ifdef WRITE_LIBRARY_OBSERVATIONS
  866. ofstream f(fileName.c_str());
  867. #endif
  868. for(map<int,int>::iterator j=m_observedDistances[library].begin();
  869. j!=m_observedDistances[library].end();j++){
  870. int d=j->first;
  871. int count=j->second;
  872. #ifdef WRITE_LIBRARY_OBSERVATIONS
  873. f<<d<<"\t"<<count<<endl;
  874. #endif
  875. x.push_back(d);
  876. y.push_back(count);
  877. }
  878. #ifdef WRITE_LIBRARY_OBSERVATIONS
  879. f.close();
  880. #endif
  881. vector<int> averages;
  882. vector<int> deviations;
  883. LibraryPeakFinder finder;
  884. finder.findPeaks(&x,&y,&averages,&deviations);
  885. for(int i=0;i<(int)averages.size();i++)
  886. addLibraryData(library,averages[i],deviations[i]);
  887. }
  888. cout<<endl;
  889. cout<<endl;
  890. ostringstream fileName;
  891. fileName<<getPrefix();
  892. fileName<<"LibraryStatistics.txt";
  893. ofstream f2(fileName.str().c_str());
  894. f2<<"NumberOfPairedLibraries: "<<m_numberOfLibraries<<endl;
  895. f2<<endl;
  896. for(int i=0;i<(int)m_numberOfLibraries;i++){
  897. int library=i;
  898. string type="Manual";
  899. if(m_automaticLibraries.count(library)>0){
  900. type="Automatic";
  901. }
  902. f2<<"LibraryNumber: "<<library<<endl;
  903. string format="Interleaved,Paired";
  904. vector<int> files=m_libraryFiles[i];
  905. if(files.size()==2){
  906. format="TwoFiles,Paired";
  907. }
  908. f2<<" InputFormat: "<<format<<endl;
  909. f2<<" DetectionType: "<<type<<endl;
  910. f2<<" File: "<<m_singleEndReadsFile[files[0]]<<endl;
  911. f2<<" NumberOfSequences: "<<m_numberOfSequencesInFile[files[0]]<<endl;
  912. if(files.size()>1){
  913. f2<<" File: "<<m_singleEndReadsFile[files[1]]<<endl;
  914. f2<<" NumberOfSequences: "<<m_numberOfSequencesInFile[files[1]]<<endl;
  915. }
  916. f2<<" Distribution: "<<getLibraryFile(library)<<endl;
  917. for(int j=0;j<getLibraryPeaks(library);j++){
  918. int average=getLibraryAverageLength(library,j);
  919. int standardDeviation=getLibraryStandardDeviation(library,j);
  920. cout<<"Library # "<<library<<" ("<<type<<") -> average length: "<<average<<" and standard deviation: "<<standardDeviation<<endl;
  921. f2<<" Peak "<<j<<endl;
  922. f2<<" AverageOuterDistance: "<<average<<endl;
  923. f2<<" StandardDeviation: "<<standardDeviation<<endl;
  924. if(standardDeviation*2>average){
  925. f2<<" DetectionFailure: Yes"<<endl;
  926. }
  927. }
  928. f2<<endl;
  929. }
  930. f2.close();
  931. }
  932. void Parameters::addLibraryData(int library,int average,int deviation){
  933. if(average==0)
  934. return;
  935. m_libraryAverageLength[library].push_back(average);
  936. m_libraryDeviation[library].push_back(deviation);
  937. int distance=average+4*deviation;
  938. if(distance>m_maximumDistance){
  939. m_maximumDistance=distance;
  940. }
  941. }
  942. void Parameters::setNumberOfSequences(int file,LargeCount n){
  943. if(m_numberOfSequencesInFile.count(file)==0){
  944. m_numberOfSequencesInFile[file]=n;
  945. m_totalNumberOfSequences+=n;
  946. }
  947. }
  948. int Parameters::getNumberOfLibraries(){
  949. return m_numberOfLibraries;
  950. }
  951. LargeCount Parameters::getNumberOfSequences(int file){
  952. #ifdef ASSERT
  953. if(file>=(int)m_numberOfSequencesInFile.size())
  954. cout<<"Error File= "<<file<<" Files: "<<m_numberOfSequencesInFile.size()<<endl;
  955. assert(file<(int)m_numberOfSequencesInFile.size());
  956. #endif
  957. return m_numberOfSequencesInFile[file];
  958. }
  959. int Parameters::getNumberOfFiles(){
  960. return m_singleEndReadsFile.size();
  961. }
  962. bool Parameters::isAutomatic(int library){
  963. return m_automaticLibraries.count(library)>0;
  964. }
  965. int Parameters::getLibrary(int file){
  966. return m_fileLibrary[file];
  967. }
  968. bool Parameters::isInterleavedFile(int i){
  969. return m_interleavedFiles.count(i)>0;
  970. }
  971. bool Parameters::showMemoryUsage(){
  972. return m_showMemoryUsage;
  973. }
  974. string Parameters::getReceivedMessagesFile(){
  975. string outputForMessages=getPrefix()+"ReceivedMessages.txt";
  976. return outputForMessages;
  977. }
  978. void Parameters::printFinalMessage(){
  979. cout<<"Rank "<<MASTER_RANK<<" wrote library statistics"<<endl;
  980. }
  981. CoverageDepth Parameters::getMaximumAllowedCoverage(){
  982. CoverageDepth a=0;
  983. a--;
  984. return a;
  985. }
  986. void Parameters::setPeakCoverage(CoverageDepth a){
  987. if(!m_providedPeakCoverage){
  988. m_peakCoverage=a;
  989. }
  990. }
  991. void Parameters::setRepeatCoverage(CoverageDepth a){
  992. if(!m_providedRepeatCoverage){
  993. m_repeatCoverage=a;
  994. }
  995. }
  996. CoverageDepth Parameters::getPeakCoverage(){
  997. return m_peakCoverage;
  998. }
  999. CoverageDepth Parameters::getRepeatCoverage(){
  1000. return m_repeatCoverage;
  1001. }
  1002. int Parameters::getSize(){
  1003. return m_size;
  1004. }
  1005. void Parameters::setSize(int a){
  1006. m_size=a;
  1007. }
  1008. bool Parameters::runReducer(){
  1009. return m_reducerIsActivated;
  1010. }
  1011. void Parameters::showOption(string option,string description){
  1012. string spacesBeforeOption=" ";
  1013. cout<<spacesBeforeOption<<option<<endl;
  1014. showOptionDescription(description);
  1015. }
  1016. void Parameters::showOptionDescription(string description){
  1017. string spacesBeforeDescription=" ";
  1018. cout<<spacesBeforeDescription<<description<<endl;
  1019. }
  1020. /* obviously shows usage */
  1021. void Parameters::showUsage(){
  1022. string basicSpaces=" ";
  1023. cout<<"NAME"<<endl<<basicSpaces<<"Ray - assemble genomes in parallel using the message-passing interface"<<endl<<endl;
  1024. cout<<"SYNOPSIS"<<endl;
  1025. cout<<basicSpaces<<"mpiexec -np NUMBER_OF_RANKS Ray -k KMERLENGTH -p l1_1.fastq l1_2.fastq -p l2_1.fastq l2_2.fastq -o test"<<endl;
  1026. cout<<endl;
  1027. cout<<"DESCRIPTION:"<<endl;
  1028. cout<<endl;
  1029. cout<<" The Ray genome assembler is built on top of the RayPlatform, a generic plugin-based"<<endl;
  1030. cout<<" distributed and parallel compute engine that uses the message-passing interface"<<endl;
  1031. cout<<" for passing messages."<<endl;
  1032. cout<<endl;
  1033. cout<<" Ray targets several applications:"<<endl;
  1034. cout<<endl;
  1035. cout<<" - de novo genome assembly"<<endl;
  1036. cout<<" - de novo meta-genome assembly"<<endl;
  1037. cout<<" - de novo transcriptome assembly (works, but not tested a lot)"<<endl;
  1038. cout<<" - quantification of contig abundances"<<endl;
  1039. cout<<" - quantification of microbiome consortia members"<<endl;
  1040. cout<<" - quantification of transcript expression"<<endl;
  1041. cout<<" - taxonomy profiling of samples"<<endl;
  1042. cout<<" - gene ontology profiling of samples"<<endl;
  1043. cout<<endl;
  1044. showOption("-help","Displays this help page.");
  1045. cout<<endl;
  1046. showOption("-version","Displays Ray version and compilation options.");
  1047. cout<<endl;
  1048. cout<<" K-mer length"<<endl;
  1049. cout<<endl;
  1050. showOption("-k kmerLength","Selects the length of k-mers. The default value is 21. ");
  1051. showOptionDescription("It must be odd because reverse-complement vertices are stored together.");
  1052. showOptionDescription("The maximum length is defined at compilation by MAXKMERLENGTH");
  1053. showOptionDescription("Larger k-mers utilise more memory.");
  1054. cout<<endl;
  1055. cout<<" Inputs"<<endl;
  1056. cout<<endl;
  1057. showOption("-p leftSequenceFile rightSequenceFile [averageOuterDistance standardDeviation]","Provides two files containing paired-end reads.");
  1058. showOptionDescription("averageOuterDistance and standardDeviation are automatically computed if not provided.");
  1059. cout<<endl;
  1060. showOption("-i interleavedSequenceFile [averageOuterDistance standardDeviation]","Provides one file containing interleaved paired-end reads.");
  1061. showOptionDescription("averageOuterDistance and standardDeviation are automatically computed if not provided.");
  1062. cout<<endl;
  1063. showOption("-s sequenceFile","Provides a file containing single-end reads.");
  1064. cout<<endl;
  1065. cout<<" Biological abundances"<<endl;
  1066. cout<<endl;
  1067. showOption("-search searchDirectory","Provides a directory containing fasta files to be searched in the de Bruijn graph.");
  1068. showOptionDescription("Biological abundances will be written to RayOutput/BiologicalAbundances");
  1069. showOptionDescription("See Documentation/BiologicalAbundances.txt");
  1070. cout<<endl;
  1071. showOption("-one-color-per-file", "Sets one color per file instead of one per sequence.");
  1072. showOptionDescription("By default, each sequence in each file has a different color.");
  1073. showOptionDescription("For files with large numbers of sequences, using one single color per file may be more efficient.");
  1074. cout<<endl;
  1075. cout<<" Taxonomic profiling with colored de Bruijn graphs"<<endl;
  1076. cout<<endl;
  1077. showOption("-with-taxonomy Genome-to-Taxon.tsv TreeOfLife-Edges.tsv Taxon-Names.tsv","Provides a taxonomy.");
  1078. showOptionDescription("Computes and writes detailed taxonomic profiles.");
  1079. showOptionDescription("See Documentation/Taxonomy.txt for details.");
  1080. cout<<endl;
  1081. showOption("-gene-ontology OntologyTerms.txt Annotations.txt","Provides an ontology and annotations.");
  1082. showOptionDescription("OntologyTerms.txt is fetched from http://geneontology.org");
  1083. showOptionDescription("Annotations.txt is a 2-column file (EMBL_CDS handle & gene ontology identifier)");
  1084. showOptionDescription("See Documentation/GeneOntology.txt");
  1085. cout<<" Outputs"<<endl;
  1086. cout<<endl;
  1087. showOption("-o outputDirectory","Specifies the directory for outputted files. Default is RayOutput");
  1088. cout<<endl;
  1089. cout<<" Other outputs"<<endl;
  1090. cout<<endl;
  1091. showOption("-amos","Writes the AMOS file called RayOutput/AMOS.afg");
  1092. showOptionDescription("An AMOS file contains read positions on contigs.");
  1093. showOptionDescription("Can be opened with software with graphical user interface.");
  1094. cout<<endl;
  1095. showOption("-write-kmers","Writes k-mer graph to RayOutput/kmers.txt");
  1096. showOptionDescription("The resulting file is not utilised by Ray.");
  1097. showOptionDescription("The resulting file is very large.");
  1098. cout<<endl;
  1099. showOption("-write-read-markers","Writes read markers to disk.");
  1100. cout<<endl;
  1101. showOption("-write-seeds","Writes seed DNA sequences to RayOutput/Rank<rank>.RaySeeds.fasta");
  1102. cout<<endl;
  1103. showOption("-write-extensions","Writes extension DNA sequences to RayOutput/Rank<rank>.RayExtensions.fasta");
  1104. cout<<endl;
  1105. showOption("-write-contig-paths","Writes contig paths with coverage values");
  1106. showOptionDescription("to RayOutput/Rank<rank>.RayContigPaths.txt");
  1107. cout<<endl;
  1108. showOption("-write-marker-summary","Writes marker statistics.");
  1109. cout<<endl;
  1110. cout<<" Memory usage"<<endl;
  1111. cout<<endl;
  1112. showOption("-show-memory-usage","Shows memory usage. Data is fetched from /proc on GNU/Linux");
  1113. showOptionDescription("Needs __linux__");
  1114. cout<<endl;
  1115. showOption("-show-memory-allocations","Shows memory allocation events");
  1116. cout<<endl;
  1117. cout<<" Algorithm verbosity"<<endl;
  1118. cout<<endl;
  1119. showOption("-show-extension-choice","Shows the choice made (with other choices) during the extension.");
  1120. cout<<endl;
  1121. showOption("-show-ending-context","Shows the ending context of each extension.");
  1122. showOptionDescription("Shows the children of the vertex where extension was too difficult.");
  1123. cout<<endl;
  1124. showOption("-show-distance-summary","Shows summary of outer distances used for an extension path.");
  1125. cout<<endl;
  1126. showOption("-show-consensus","Shows the consensus when a choice is done.");
  1127. cout<<endl;
  1128. cout<<" Assembly options (defaults work well)"<<endl;
  1129. cout<<endl;
  1130. showOption("-disable-recycling","Disables read recycling during the assembly");
  1131. showOptionDescription("reads will be set free in 3 cases:");
  1132. showOptionDescription("1. the distance did not match for a pair");
  1133. showOptionDescription("2. the read has not met its mate");
  1134. showOptionDescription("3. the library population indicates a wrong placement");
  1135. showOptionDescription("see Constrained traversal of repeats with paired sequences.");
  1136. showOptionDescription("Sébastien Boisvert, Élénie Godzaridis, François Laviolette & Jacques Corbeil.");
  1137. showOptionDescription("First Annual RECOMB Satellite Workshop on Massively Parallel Sequencing, March 26-27 2011, Vancouver, BC, Canada.");
  1138. cout<<endl;
  1139. showOption("-minimum-contig-length","Changes the minimum contig length, default is 100");
  1140. cout<<endl;
  1141. showOption("-color-space","Runs in color-space");
  1142. showOptionDescription("Needs csfasta files. Activated automatically if csfasta files are provided.");
  1143. cout<<endl;
  1144. /*
  1145. showOption("-minimumCoverage minimumCoverage","Sets manually the minimum coverage.");
  1146. showOptionDescription("If not provided, it is computed by Ray automatically.");
  1147. cout<<endl;
  1148. showOption("-peakCoverage peakCoverage","Sets manually the peak coverage.");
  1149. showOptionDescription("If not provided, it is computed by Ray automatically.");
  1150. cout<<endl;
  1151. showOption("-repeatCoverage repeatCoverage","Sets manually the repeat coverage.");
  1152. showOptionDescription("If not provided, it is computed by Ray automatically.");
  1153. cout<<endl;
  1154. */
  1155. cout<<" Checkpointing"<<endl;
  1156. cout<<endl;
  1157. showOption("-write-checkpoints checkpointDirectory","Write checkpoint files");
  1158. cout<<endl;
  1159. showOption("-read-checkpoints checkpointDirectory","Read checkpoint files");
  1160. cout<<endl;
  1161. showOption("-read-write-checkpoints checkpointDirectory","Read and write checkpoint files");
  1162. cout<<endl;
  1163. cout<<" Message routing for large number of cores"<<endl;
  1164. cout<<endl;
  1165. showOption("-route-messages","Enables the Ray message router. Disabled by default.");
  1166. showOptionDescription("Messages will be routed accordingly so that any rank can communicate directly with only a few others.");
  1167. showOptionDescription("Without -route-messages, any rank can communicate directly with any other rank.");
  1168. showOptionDescription("Files generated: Routing/Connections.txt, Routing/Routes.txt and Routing/RelayEvents.txt");
  1169. showOptionDescription("and Routing/Summary.txt");
  1170. cout<<endl;
  1171. showOption("-connection-type type","Sets the connection type for routes.");
  1172. showOptionDescription("Accepted values are debruijn, group, random, kautz and complete. Default is debruijn.");
  1173. showOptionDescription("With the type debruijn, the number of ranks must be a power of something.");
  1174. showOptionDescription("Examples: 256 = 16*16, 512=8*8*8, 49=7*7, and so on.");
  1175. showOptionDescription("Otherwise, don't use debruijn routing but use another one");
  1176. showOptionDescription("With the type kautz, the number of ranks n must be n=(k+1)*k^(d-1) for some k and d");
  1177. cout<<endl;
  1178. showOption("-routing-graph-degree degree","Specifies the outgoing degree for the routing graph.");
  1179. cout<<endl;
  1180. cout<<" Hardware testing"<<endl;
  1181. cout<<endl;
  1182. showOption("-test-network-only","Tests the network and returns.");
  1183. cout<<endl;
  1184. showOption("-write-network-test-raw-data","Writes one additional file per rank detailing the network test.");
  1185. cout<<endl;
  1186. cout<<" Debugging"<<endl;
  1187. cout<<endl;
  1188. showOption("-run-profiler","Runs the profiler as the code runs. By default, only show granularity warnings.");
  1189. showOptionDescription("Running the profiler increases running times.");
  1190. cout<<endl;
  1191. showOption("-with-profiler-details","Shows number of messages sent and received in each methods during in each time slices (epochs). Needs -run-profiler.");
  1192. cout<<endl;
  1193. showOption("-show-communication-events","Shows all messages sent and received.");
  1194. cout<<endl;
  1195. showOption("-show-read-placement","Shows read placement in the graph during the extension.");
  1196. cout<<endl;
  1197. showOption("-debug-bubbles","Debugs bubble code.");
  1198. showOptionDescription("Bubbles can be due to heterozygous sites or sequencing errors or other (unknown) events");
  1199. cout<<endl;
  1200. showOption("-debug-seeds","Debugs seed code.");
  1201. showOptionDescription("Seeds are paths in the graph that are likely unique.");
  1202. cout<<endl;
  1203. showOption("-debug-fusions","Debugs fusion code.");
  1204. cout<<endl;
  1205. showOption("-debug-scaffolder","Debug the scaffolder.");
  1206. cout<<endl;
  1207. cout<<endl;
  1208. cout<<"FILES"<<endl;
  1209. cout<<endl;
  1210. cout<<" Input files"<<endl;
  1211. cout<<endl;
  1212. cout<<" Note: file format is determined with file extension."<<endl;
  1213. cout<<endl;
  1214. cout<<" .fasta"<<endl;
  1215. cout<<" .fasta.gz (needs HAVE_LIBZ=y at compilation)"<<endl;
  1216. cout<<" .fasta.bz2 (needs HAVE_LIBBZ2=y at compilation)"<<endl;
  1217. cout<<" .fastq"<<endl;
  1218. cout<<" .fastq.gz (needs HAVE_LIBZ=y at compilation)"<<endl;
  1219. cout<<" .fastq.bz2 (needs HAVE_LIBBZ2=y at compilation)"<<endl;
  1220. cout<<" .sff (paired reads must be extracted manually)"<<endl;
  1221. cout<<" .csfasta (color-space reads)"<<endl;
  1222. cout<<endl;
  1223. cout<<" Outputted files"<<endl;
  1224. cout<<endl;
  1225. cout<<" Scaffolds"<<endl;
  1226. cout<<endl;
  1227. cout<<" RayOutput/Scaffolds.fasta"<<endl;
  1228. cout<<" The scaffold sequences in FASTA format"<<endl;
  1229. cout<<" RayOutput/ScaffoldComponents.txt"<<endl;
  1230. cout<<" The components of each scaffold"<<endl;
  1231. cout<<" RayOutput/ScaffoldLengths.txt"<<endl;
  1232. cout<<" The length of each scaffold"<<endl;
  1233. cout<<" RayOutput/ScaffoldLinks.txt"<<endl;
  1234. cout<<" Scaffold links"<<endl;
  1235. cout<<endl;
  1236. cout<<" Contigs"<<endl;
  1237. cout<<endl;
  1238. cout<<" RayOutput/Contigs.fasta"<<endl;
  1239. cout<<" Contiguous sequences in FASTA format"<<endl;
  1240. cout<<" RayOutput/ContigLengths.txt"<<endl;
  1241. cout<<" The lengths of contiguous sequences"<<endl;
  1242. cout<<endl;
  1243. cout<<" Summary"<<endl;
  1244. cout<<endl;
  1245. cout<<" RayOutput/OutputNumbers.txt"<<endl;
  1246. cout<<" Overall numbers for the assembly"<<endl;
  1247. cout<<endl;
  1248. cout<<" de Bruijn graph"<<endl;
  1249. cout<<endl;
  1250. cout<<" RayOutput/CoverageDistribution.txt"<<endl;
  1251. cout<<" The distribution of coverage values"<<endl;
  1252. cout<<" RayOutput/CoverageDistributionAnalysis.txt"<<endl;
  1253. cout<<" Analysis of the coverage distribution"<<endl;
  1254. cout<<" RayOutput/degreeDistribution.txt"<<endl;
  1255. cout<<" Distribution of ingoing and outgoing degrees"<<endl;
  1256. cout<<" RayOutput/kmers.txt"<<endl;
  1257. cout<<" k-mer graph, required option: -write-kmers"<<endl;
  1258. cout<<" The resulting file is not utilised by Ray."<<endl;
  1259. cout<<" The resulting file is very large."<<endl;
  1260. cout<<endl;
  1261. cout<<" Assembly steps"<<endl;
  1262. cout<<endl;
  1263. cout<<" RayOutput/SeedLengthDistribution.txt"<<endl;
  1264. cout<<" Distribution of seed length"<<endl;
  1265. cout<<" RayOutput/Rank<rank>.OptimalReadMarkers.txt"<<endl;
  1266. cout<<" Read markers."<<endl;
  1267. cout<<" RayOutput/Rank<rank>.RaySeeds.fasta"<<endl;
  1268. cout<<" Seed DNA sequences, required option: -write-seeds"<<endl;
  1269. cout<<" RayOutput/Rank<rank>.RayExtensions.fasta"<<endl;
  1270. cout<<" Extension DNA sequences, required option: -write-extensions"<<endl;
  1271. cout<<" RayOutput/Rank<rank>.RayContigPaths.txt"<<endl;
  1272. cout<<" Contig paths with coverage values, required option: -write-contig-paths"<<endl;
  1273. cout<<endl;
  1274. cout<<" Paired reads"<<endl;
  1275. cout<<endl;
  1276. cout<<" RayOutput/LibraryStatistics.txt"<<endl;
  1277. cout<<" Estimation of outer distances for paired reads"<<endl;
  1278. cout<<" RayOutput/Library<LibraryNumber>.txt"<<endl;
  1279. cout<<" Frequencies for observed outer distances (insert size + read lengths)"<<endl;
  1280. cout<<endl;
  1281. cout<<" Partition"<<endl;
  1282. cout<<endl;
  1283. cout<<" RayOutput/NumberOfSequences.txt"<<endl;
  1284. cout<<" Number of reads in each file"<<endl;
  1285. cout<<" RayOutput/SequencePartition.txt"<<endl;
  1286. cout<<" Sequence partition"<<endl;
  1287. cout<<endl;
  1288. cout<<" Ray software"<<endl;
  1289. cout<<endl;
  1290. cout<<" RayOutput/RayVersion.txt"<<endl;
  1291. cout<<" The version of Ray"<<endl;
  1292. cout<<" RayOutput/RayCommand.txt"<<endl;
  1293. cout<<" The exact same command provided "<<endl;
  1294. cout<<endl;
  1295. cout<<" AMOS"<<endl;
  1296. cout<<endl;
  1297. cout<<" RayOutput/AMOS.afg"<<endl;
  1298. cout<<" Assembly representation in AMOS format, required option: -amos"<<endl;
  1299. cout<<endl;
  1300. cout<<" Communication"<<endl;
  1301. cout<<endl;
  1302. cout<<" RayOutput/MessagePassingInterface.txt"<<endl;
  1303. cout<<" Number of messages sent"<<endl;
  1304. cout<<" RayOutput/NetworkTest.txt"<<endl;
  1305. cout<<" Latencies in microseconds"<<endl;
  1306. cout<<" RayOutput/Rank<rank>NetworkTestData.txt"<<endl;
  1307. cout<<" Network test raw data"<<endl;
  1308. cout<<endl;
  1309. cout<<"DOCUMENTATION"<<endl;
  1310. cout<<endl;
  1311. cout<<basicSpaces<<"This help page (always up-to-date)"<<endl;
  1312. cout<<basicSpaces<<"Manual (Portable Document Format): InstructionManual.pdf"<<endl;
  1313. cout<<basicSpaces<<"Mailing list archives: http://sourceforge.net/mailarchive/forum.php?forum_name=denovoassembler-users"<<endl;
  1314. cout<<endl;
  1315. cout<<"AUTHOR"<<endl;
  1316. cout<<basicSpaces<<"Written by Sébastien Boisvert."<<endl;
  1317. cout<<endl;
  1318. cout<<"REPORTING BUGS"<<endl;
  1319. cout<<basicSpaces<<"Report bugs to denovoassembler-users@lists.sourceforge.net"<<endl;
  1320. cout<<basicSpaces<<"Home page: <http://denovoassembler.sourceforge.net/>"<<endl;
  1321. cout<<endl;
  1322. cout<<"COPYRIGHT"<<endl;
  1323. cout<<basicSpaces<<"This program is free software: you can redistribute it and/or modify"<<endl;
  1324. cout<<basicSpaces<<"it under the terms of the GNU General Public License as published by"<<endl;
  1325. cout<<basicSpaces<<"the Free Software Foundation, version 3 of the License."<<endl;
  1326. cout<<endl;
  1327. cout<<basicSpaces<<"This program is distributed in the hope that it will be useful,"<<endl;
  1328. cout<<basicSpaces<<"but WITHOUT ANY WARRANTY; without even the implied warranty of"<<endl;
  1329. cout<<basicSpaces<<"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<endl;
  1330. cout<<basicSpaces<<"GNU General Public License for more details."<<endl;
  1331. cout<<endl;
  1332. cout<<basicSpaces<<"You have received a copy of the GNU General Public License"<<endl;
  1333. cout<<basicSpaces<<"along with this program (see LICENSE)."<<endl;
  1334. cout<<endl;
  1335. cout<<"Ray "<<RAY_VERSION<<endl;
  1336. }
  1337. /* get the prefix for mmap'ed files
  1338. * currently not used. */
  1339. string Parameters::getMemoryPrefix(){
  1340. return m_memoryFilePrefix;
  1341. }
  1342. int Parameters::getReducerValue(){
  1343. return m_reducerPeriod;
  1344. }
  1345. Rank Parameters::getRankFromGlobalId(ReadHandle a){
  1346. uint64_t elementsPerRank=m_totalNumberOfSequences/m_size;
  1347. Rank rank=a/elementsPerRank;
  1348. if(rank >= m_size){
  1349. rank=m_size-1;
  1350. }
  1351. #ifdef ASSERT
  1352. assert(rank>=0);
  1353. if(rank>=m_size){
  1354. cout<<"GlobalIdentifier="<<a<<" Total="<<m_totalNumberOfSequences<<" Size="<<m_size<<" Rank="<<rank<<endl;
  1355. }
  1356. assert(rank<m_size);
  1357. #endif
  1358. return rank;
  1359. }
  1360. int Parameters::getIdFromGlobalId(ReadHandle a){
  1361. int bin=getRankFromGlobalId(a);
  1362. LargeCount x=m_totalNumberOfSequences/m_size;
  1363. return a-bin*x;
  1364. }
  1365. int Parameters::getMaximumDistance(){
  1366. return m_maximumDistance;
  1367. }
  1368. uint64_t Parameters::getGlobalIdFromRankAndLocalId(Rank rank,int id){
  1369. uint64_t x=m_totalNumberOfSequences/m_size;
  1370. return rank*x+id;
  1371. }
  1372. CoverageDepth Parameters::getMinimumCoverage(){
  1373. return m_minimumCoverage;
  1374. }
  1375. void Parameters::setMinimumCoverage(CoverageDepth a){
  1376. if(!m_providedMinimumCoverage)
  1377. m_minimumCoverage=a;
  1378. }
  1379. Kmer Parameters::_complementVertex(Kmer*a){
  1380. return a->complementVertex(m_wordSize,m_colorSpaceMode);
  1381. }
  1382. bool Parameters::hasPairedReads(){
  1383. return m_numberOfLibraries!=0;
  1384. }
  1385. int Parameters::_vertexRank(Kmer*a){
  1386. return a->vertexRank(m_size,m_wordSize,m_colorSpaceMode);
  1387. }
  1388. string Parameters::getScaffoldFile(){
  1389. ostringstream a;
  1390. a<<getPrefix()<<"Scaffolds.fasta";
  1391. return a.str();
  1392. }
  1393. int Parameters::getColumns(){
  1394. return 60;
  1395. }
  1396. int Parameters::getLargeContigThreshold(){
  1397. return 500;
  1398. }
  1399. bool Parameters::showMemoryAllocations(){
  1400. return m_showMemoryAllocations;
  1401. }
  1402. bool Parameters::writeKmers(){
  1403. return m_writeKmers;
  1404. }
  1405. void Parameters::fileNameHook(string fileName){
  1406. if(fileName.find(".csfasta")!=string::npos){
  1407. if(!m_colorSpaceMode&&m_rank==MASTER_RANK){
  1408. cout<<endl;
  1409. cout<<"Enabling color-space mode"<<endl;
  1410. cout<<"All reads should be in color space."<<endl;
  1411. }
  1412. m_colorSpaceMode=true;
  1413. }
  1414. }
  1415. CoverageDepth Parameters::getMinimumCoverageToStore(){
  1416. return 2;
  1417. }
  1418. int Parameters::getLibraryPeaks(int library){
  1419. return m_libraryAverageLength[library].size();
  1420. }
  1421. bool Parameters::hasOption(string a){
  1422. return m_options.count(a)>0;
  1423. }
  1424. bool Parameters::hasFile(const char*file){
  1425. ifstream f(file);
  1426. bool fileIsOk=f.good();
  1427. f.close();
  1428. return fileIsOk;
  1429. }
  1430. string Parameters::getCheckpointFile(const char*checkpointName){
  1431. ostringstream a;
  1432. a<<m_checkpointDirectory<<"/";
  1433. a<<"Rank"<<getRank()<<".Checkpoint."<<checkpointName<<".ray";
  1434. return a.str();
  1435. }
  1436. bool Parameters::hasCheckpoint(const char*checkpointName){
  1437. //cout<<"hasCheckpoint? "<<checkpointName<<endl;
  1438. if(!readCheckpoints())
  1439. return false;
  1440. return hasFile(getCheckpointFile(checkpointName).c_str());
  1441. }
  1442. bool Parameters::writeCheckpoints(){
  1443. if(hasOption("-write-checkpoints"))
  1444. return true;
  1445. if(hasOption("-read-write-checkpoints"))
  1446. return true;
  1447. return false;
  1448. }
  1449. bool Parameters::readCheckpoints(){
  1450. if(hasOption("-read-checkpoints"))
  1451. return true;
  1452. if(hasOption("-read-write-checkpoints"))
  1453. return true;
  1454. return false;
  1455. }
  1456. bool Parameters::showCommunicationEvents(){
  1457. return m_showCommunicationEvents;
  1458. }
  1459. bool Parameters::showReadPlacement(){
  1460. return m_showReadPlacement;
  1461. }
  1462. string Parameters::getConnectionType(){
  1463. return m_connectionType;
  1464. }
  1465. int Parameters::getRoutingDegree(){
  1466. return m_degree;
  1467. }
  1468. vector<string>*Parameters::getSearchDirectories(){
  1469. return &m_searchDirectories;
  1470. }
  1471. string Parameters::getGenomeToTaxonFile(){
  1472. return m_genomeToTaxonFile;
  1473. }
  1474. string Parameters::getTreeFile(){
  1475. return m_treeFile;
  1476. }
  1477. string Parameters::getTaxonNameFile(){
  1478. return m_taxonNameFile;
  1479. }
  1480. string Parameters::getSampleName(){
  1481. string sample=getPrefix();
  1482. return getBaseName(sample);
  1483. }
  1484. string Parameters::getBaseName(string directory){
  1485. int last=directory.length()-1;
  1486. while(last>0 && isDirectorySeparator(directory[last])){
  1487. last--;
  1488. }
  1489. int first=last;
  1490. while(first>0 && !isDirectorySeparator(directory[first])){
  1491. first--;
  1492. }
  1493. if(isDirectorySeparator(directory[first])){
  1494. first++;
  1495. }
  1496. int length=last-first+1;
  1497. return directory.substr(first,length);
  1498. }
  1499. bool Parameters::isDirectorySeparator(char a){
  1500. /* POSIX or Redmond separator */
  1501. return a=='/' || a=='\\';
  1502. }