PageRenderTime 30ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/NRPeditor/clusterFinder.cpp

https://bitbucket.org/antismash/antismash1
C++ | 3156 lines | 2778 code | 187 blank | 191 comment | 925 complexity | 7188e981c84876c41029f52412a10742 MD5 | raw file
  1. #include <iostream>
  2. #include <string>
  3. #include <fstream>
  4. #include <stdlib.h>
  5. #include <sstream>
  6. using namespace std;
  7. #include "constants.h"
  8. #include "UnitClass2.h"
  9. #include "QueueClass.h"
  10. #include "ReactionClass.h"
  11. #include "spawnl.h"
  12. // function declarations
  13. // overloading operator for outputing nodes
  14. ostream& operator <<(ostream& outputStream, const UnitClass2& domain );
  15. ostream& operator <<(ostream& outputStream, const UnitClass2& domain )
  16. {
  17. if( domain.getStartSite() == 0 )
  18. {
  19. outputStream << domain.getUnit() << " "
  20. << domain.getSmile() << endl;
  21. }
  22. else
  23. {
  24. outputStream << " Formula: "
  25. << domain.getFormula()
  26. << " start site: "
  27. << domain.getStartSite() << endl
  28. << " " << domain.getSmile() << endl;
  29. }
  30. return outputStream;
  31. }
  32. void clearPositions( string &pos235, string &pos236, string &pos239,
  33. string &pos278, string &pos299, string &pos301, string &pos322,
  34. string &pos330, string &pos331, string &pos58, string &pos59,
  35. string &pos60, string &pos61, string &pos62, string &pos70,
  36. string &pos72, string &pos74, string &pos75, string &pos77,
  37. string &pos93, string &pos189, string &pos191, string &pos198,
  38. string &pos199, string &pos200 );
  39. void outputSpecificitiesNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  40. int Dab, int Dhb, int Gln, int Glu, int Ile,
  41. int Leu, int Orn, int Phe, int Phg, int Pip,
  42. int Pro, int Ser, int Thr, int Tyr, int His,
  43. int Lys, int Gly, int Val);
  44. void outputSpecificitiesPKS( int mal, int mmal, int emal, int omal);
  45. string maxSpecificityNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  46. int Dab, int Dhb, int Gln, int Glu, int Ile,
  47. int Leu, int Orn, int Phe, int Phg, int Pip,
  48. int Pro, int Ser, int Thr, int Tyr, int His,
  49. int Lys, int Gly, int Val);
  50. string maxSpecificityPKS( int mal, int mmal, int emal, int omal);
  51. int getIntVal(string strConvert);
  52. float getFloatVal(string strConvert);
  53. void translateSeq( string &inSeq, string &outputSeq, int &frame);
  54. void predictDomains( string aminoAcidSeq,
  55. string &predictedSubstrates,
  56. QueueClass< UnitClass2 > &aaNameList,
  57. const bool &allFramesRead,
  58. const int &frame, const string &nucSeq,
  59. string &allClusters, int&NRPScount, int &PKScount,
  60. int &mixedNRPSPKScount, int &transATcount, int &mepCount,
  61. int &mvaCount);
  62. // function bodies
  63. void outputSpecificitiesPKS( int mal, int mmal, int emal, int omal)
  64. {
  65. cout << "PKS specificity scores: " << endl;
  66. cout << "mal: " << mal << " ";
  67. cout << "mmal: " << mmal << " ";
  68. cout << "emal: " << emal << " ";
  69. cout << "omal: " << omal << " ";
  70. cout << endl;
  71. }
  72. void outputSpecificitiesNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  73. int Dab, int Dhb, int Gln, int Glu, int Ile,
  74. int Leu, int Orn, int Phe, int Phg, int Pip,
  75. int Pro, int Ser, int Thr, int Tyr, int His,
  76. int Lys, int Gly, int Val)
  77. {
  78. cout << "NRPS specificity scores: " << endl;
  79. cout << "Aad: " << Aad << " ";
  80. cout << "Ala: " << Ala << " ";
  81. cout << "Asn: " << Asn << " ";
  82. cout << "Asp: " << Asp << " ";
  83. cout << "Cys: " << Cys << " ";
  84. cout << "Dab: " << Dab << " ";
  85. cout << "Dhb: " << Dhb << " ";
  86. cout << "Gln: " << Gln << " ";
  87. cout << "Glu: " << Glu << " ";
  88. cout << "Gly: " << Gly << " ";
  89. cout << "His: " << His << endl;
  90. cout << "Ile: " << Ile << " ";
  91. cout << "Leu: " << Leu << " ";
  92. cout << "Lys: " << Lys << " ";
  93. cout << "Orn: " << Orn << " ";
  94. cout << "Phe: " << Phe << " ";
  95. cout << "Phg: " << Phg << " ";
  96. cout << "Pip: " << Pip << " ";
  97. cout << "Pro: " << Pro << " ";
  98. cout << "Ser: " << Ser << " ";
  99. cout << "Thr: " << Thr << " ";
  100. cout << "Tyr: " << Tyr << endl;
  101. cout << "Val: " << Val << " ";
  102. cout << endl;
  103. }
  104. string maxSpecificityPKS( int mal, int mmal, int emal, int omal)
  105. {
  106. int max;
  107. int distinguishingFactor = 5;
  108. string CA;
  109. max = mal;
  110. CA = "mal";
  111. if(mmal > mal)
  112. {
  113. max = mmal;
  114. CA = "mmal";
  115. }
  116. if(emal > max)
  117. {
  118. max = emal;
  119. CA = "emal";
  120. }
  121. if(omal > max)
  122. {
  123. max = omal;
  124. CA = "omal";
  125. }
  126. if(max < distinguishingFactor)
  127. {
  128. CA = "pk";
  129. }
  130. return CA;
  131. }
  132. string maxSpecificityNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  133. int Dab, int Dhb, int Gln, int Glu, int Ile,
  134. int Leu, int Orn, int Phe, int Phg, int Pip,
  135. int Pro, int Ser, int Thr, int Tyr, int His,
  136. int Lys, int Gly, int Val)
  137. {
  138. int max;
  139. int distinguishingFactor = 3;
  140. string AA;
  141. max = Aad;
  142. AA = "Aad";
  143. if(Ala >= Aad)
  144. {
  145. max = Ala;
  146. AA = "ala";
  147. }
  148. if(Asn >= max)
  149. {
  150. max = Asn;
  151. AA = "asn";
  152. }
  153. if(Asp >= max)
  154. {
  155. max = Asp;
  156. AA = "asp";
  157. }
  158. if(Cys >= max)
  159. {
  160. max = Cys;
  161. AA = "cys";
  162. }
  163. if(Dab >= max)
  164. {
  165. max = Dab;
  166. AA = "dab";
  167. }
  168. if(Dhb >= max)
  169. {
  170. max = Dhb;
  171. AA = "dhb";
  172. }
  173. if(Gln >= max)
  174. {
  175. max = Gln;
  176. AA = "gln";
  177. }
  178. if(Glu >= max)
  179. {
  180. max = Glu;
  181. AA = "glu";
  182. }
  183. if(Ile >= max)
  184. {
  185. max = Ile;
  186. AA = "ile";
  187. }
  188. if(Leu >= max)
  189. {
  190. max = Leu;
  191. AA = "leu";
  192. }
  193. if(Orn >= max)
  194. {
  195. max = Orn;
  196. AA = "orn";
  197. }
  198. if(Phe >= max)
  199. {
  200. max = Phe;
  201. AA = "phe";
  202. }
  203. if(Phg >= max)
  204. {
  205. max = Phg;
  206. AA = "phg";
  207. }
  208. if(Pip >= max)
  209. {
  210. max = Pip;
  211. AA = "pip";
  212. }
  213. if(Pro >= max)
  214. {
  215. max = Pro;
  216. AA = "pro";
  217. }
  218. if(Ser >= max)
  219. {
  220. max = Ser;
  221. AA = "ser";
  222. }
  223. if(Thr >= max)
  224. {
  225. max = Thr;
  226. AA = "thr";
  227. }
  228. if(Tyr >= max)
  229. {
  230. max = Tyr;
  231. AA = "tyr";
  232. }
  233. if(His >= max)
  234. {
  235. max = His;
  236. AA = "his";
  237. }
  238. if(Lys >= max)
  239. {
  240. max = Lys;
  241. AA = "lys";
  242. }
  243. if(Gly >= max)
  244. {
  245. max = Gly;
  246. AA = "gly";
  247. }
  248. if(Val >= max)
  249. {
  250. max = Val;
  251. AA = "val";
  252. }
  253. if( max < distinguishingFactor)
  254. {
  255. AA = "nrp";
  256. }
  257. return AA;
  258. }
  259. int getIntVal(string strConvert)
  260. {
  261. int intReturn;
  262. // NOTE: You should probably do some checks to ensure that
  263. // this string contains only numbers. If the string is not
  264. // a valid integer, zero will be returned.
  265. intReturn = atoi(strConvert.c_str());
  266. return(intReturn);
  267. }
  268. float getFloatVal(string strConvert)
  269. {
  270. float floatReturn;
  271. // NOTE: You should probably do some checks to ensure that
  272. // this string contains only numbers. If the string is not
  273. // a valid integer, zero will be returned.
  274. floatReturn = atof(strConvert.c_str());
  275. return(floatReturn);
  276. }
  277. void clearPositions( string &pos235, string &pos236, string &pos239,
  278. string &pos278, string &pos299, string &pos301, string &pos322,
  279. string &pos330, string &pos331, string &pos58, string &pos59,
  280. string &pos60, string &pos61, string &pos62, string &pos70,
  281. string &pos72, string &pos74, string &pos75, string &pos77,
  282. string &pos93, string &pos189, string &pos191, string &pos198,
  283. string &pos199, string &pos200 )
  284. {
  285. pos235 = "";
  286. pos236 = "";
  287. pos239 = "";
  288. pos278 = "";
  289. pos299 = "";
  290. pos301 = "";
  291. pos322 = "";
  292. pos330 = "";
  293. pos331 = "";
  294. pos58 = "";
  295. pos59 = "";
  296. pos60 = "";
  297. pos61 = "";
  298. pos62 = "";
  299. pos70 = "";
  300. pos72 = "";
  301. pos74 = "";
  302. pos75 = "";
  303. pos77 = "";
  304. pos93 = "";
  305. pos189 = "";
  306. pos191 = "";
  307. pos198 = "";
  308. pos199 = "";
  309. pos200 = "";
  310. }
  311. // finds and returns the reverse complement sequence
  312. string revComp( const string &inSeq, const string &nucleotideSeq);
  313. string revComp( const string &inSeq, const string &nucleotideSeq)
  314. {
  315. string inputSeq;
  316. string outputSeq;
  317. string segment;
  318. string outputFilename;
  319. //string outFile = "rev-" + inSeq;
  320. int seqMin = 50; // minimum length of nucleotide line
  321. ifstream In;
  322. ofstream Out;
  323. int lineCharsCount = 0;
  324. int lineCharsMax = 70;
  325. In.open(inSeq.c_str());
  326. outputFilename = "genomes/rev-"
  327. + nucleotideSeq;
  328. In >> segment;
  329. // bypass initial FASTA details
  330. while(segment.length() < seqMin && !In.fail())
  331. {
  332. In >> segment;
  333. }
  334. // stores nucleotide
  335. inputSeq.append(segment);
  336. while(!In.fail())
  337. {
  338. In >> segment;
  339. inputSeq.append(segment);
  340. }
  341. In.close();
  342. for(int i = inputSeq.length(); i > 0; i--)
  343. {
  344. if(inputSeq[i] == 'A')
  345. {
  346. outputSeq.append("T");
  347. }
  348. if(inputSeq[i] == 'T')
  349. {
  350. outputSeq.append("A");
  351. }
  352. if(inputSeq[i] == 'C')
  353. {
  354. outputSeq.append("G");
  355. }
  356. if(inputSeq[i] == 'G')
  357. {
  358. outputSeq.append("C");
  359. }
  360. lineCharsCount++;
  361. if(lineCharsCount == lineCharsMax)
  362. {
  363. outputSeq.append("\n");
  364. lineCharsCount = 0;
  365. }
  366. }
  367. Out.open(outputFilename.c_str());
  368. Out << ">revComp-" << nucleotideSeq << "\n";
  369. Out << outputSeq;
  370. Out.clear();
  371. Out.close();
  372. return outputSeq;
  373. }
  374. // translates DNA to amino acid through common code
  375. void translateSeq( string &inSeq, string &outputSeq, int &frame)
  376. {
  377. int stopPrev = 0;
  378. int stopPres = 0;
  379. int stopLimit = 50;
  380. int start = 0;
  381. int choice;
  382. int stopCount = 0;
  383. int seqMin = 50;
  384. ifstream In;
  385. ofstream Out;
  386. string segment;
  387. outputSeq.clear();
  388. In.open(inSeq.c_str());
  389. string inputSeq;
  390. int charCount = 0;
  391. int charCountMax = 70;
  392. In >> segment;
  393. //cout << "segment: " << segment << endl;
  394. while(segment.length() < seqMin && !In.fail())
  395. {
  396. In >> segment;
  397. }
  398. inputSeq.append(segment);
  399. while(!In.fail())
  400. {
  401. In >> segment;
  402. inputSeq.append(segment);
  403. }
  404. //cout << "FASTA of DNA stored" << endl;
  405. In.close();
  406. //cout << "Frame: " << frame << endl;
  407. for(int i = frame; i < inputSeq.length()-3; i=i+3)
  408. {
  409. charCount++;
  410. if(charCount == charCountMax)
  411. {
  412. outputSeq.append("\n");
  413. charCount = 0;
  414. }
  415. if(inputSeq[i] == 't' ||
  416. inputSeq[i] == 'T')
  417. {
  418. if(inputSeq[i+1] == 't' ||
  419. inputSeq[i+1] == 'T')
  420. {
  421. if(inputSeq[i+2] == 't' ||
  422. inputSeq[i+2] == 'T' ||
  423. inputSeq[i+2] == 'c' ||
  424. inputSeq[i+2] == 'C')
  425. {
  426. outputSeq.append("F");
  427. }
  428. else
  429. {
  430. outputSeq.append("L");
  431. }
  432. }
  433. else if(inputSeq[i+1] == 'c' ||
  434. inputSeq[i+1] == 'C')
  435. {
  436. outputSeq.append("S");
  437. }
  438. else if(inputSeq[i+1] == 'a' ||
  439. inputSeq[i+1] == 'A')
  440. {
  441. if(inputSeq[i+2] == 't' ||
  442. inputSeq[i+2] == 'T' ||
  443. inputSeq[i+2] == 'c' ||
  444. inputSeq[i+2] == 'C')
  445. {
  446. outputSeq.append("Y");
  447. }
  448. else
  449. {
  450. outputSeq.append("-");
  451. stopCount++; // stop codon
  452. }
  453. }
  454. else if(inputSeq[i+1] == 'g' ||
  455. inputSeq[i+1] == 'G')
  456. {
  457. if(inputSeq[i+2] == 't' ||
  458. inputSeq[i+2] == 'T' ||
  459. inputSeq[i+2] == 'c' ||
  460. inputSeq[i+2] == 'C')
  461. {
  462. outputSeq.append("C");
  463. }
  464. else if(inputSeq[i+2] == 'g' ||
  465. inputSeq[i+2] == 'G')
  466. {
  467. outputSeq.append("W");
  468. }
  469. else
  470. {
  471. outputSeq.append("-");
  472. stopCount++; // stop codon
  473. }
  474. }
  475. } // sequence begins with 't' ends
  476. else if(inputSeq[i] == 'c' ||
  477. inputSeq[i] == 'C')
  478. {
  479. if(inputSeq[i+1] == 't' ||
  480. inputSeq[i+1] == 'T')
  481. {
  482. outputSeq.append("L");
  483. }
  484. else if(inputSeq[i+1] == 'c' ||
  485. inputSeq[i+1] == 'C')
  486. {
  487. outputSeq.append("P");
  488. }
  489. else if(inputSeq[i+1] == 'a' ||
  490. inputSeq[i+1] == 'A')
  491. {
  492. if(inputSeq[i+2] == 't' ||
  493. inputSeq[i+2] == 'T' ||
  494. inputSeq[i+2] == 'c' ||
  495. inputSeq[i+2] == 'C')
  496. {
  497. outputSeq.append("H");
  498. }
  499. else
  500. {
  501. outputSeq.append("Q");
  502. }
  503. }
  504. else if(inputSeq[i+1] == 'g' ||
  505. inputSeq[i+1] == 'G')
  506. {
  507. outputSeq.append("R");
  508. }
  509. } // sequence begins with 'c' ends
  510. else if(inputSeq[i] == 'a' ||
  511. inputSeq[i] == 'A')
  512. {
  513. if(inputSeq[i+1] == 't' ||
  514. inputSeq[i+1] == 'T')
  515. {
  516. if(inputSeq[i+2] != 'g' &&
  517. inputSeq[i+2] != 'G')
  518. {
  519. outputSeq.append("I");
  520. }
  521. else
  522. {
  523. outputSeq.append("M");
  524. }
  525. }
  526. else if(inputSeq[i+1] == 'c' ||
  527. inputSeq[i+1] == 'C')
  528. {
  529. outputSeq.append("T");
  530. }
  531. else if(inputSeq[i+1] == 'a' ||
  532. inputSeq[i+1] == 'A')
  533. {
  534. if(inputSeq[i+2] == 't' ||
  535. inputSeq[i+2] == 'T' ||
  536. inputSeq[i+2] == 'c' ||
  537. inputSeq[i+2] == 'C')
  538. {
  539. outputSeq.append("N");
  540. }
  541. else
  542. {
  543. outputSeq.append("K");
  544. }
  545. }
  546. else if(inputSeq[i+1] == 'g' ||
  547. inputSeq[i+1] == 'G')
  548. {
  549. if(inputSeq[i+2] == 't' ||
  550. inputSeq[i+2] == 'T' ||
  551. inputSeq[i+2] == 'c' ||
  552. inputSeq[i+2] == 'C')
  553. {
  554. outputSeq.append("S");
  555. }
  556. else
  557. {
  558. outputSeq.append("R");
  559. }
  560. }
  561. } // sequence begins with 'a' ends
  562. else if(inputSeq[i] == 'g' ||
  563. inputSeq[i] == 'G')
  564. {
  565. if(inputSeq[i+1] == 't' ||
  566. inputSeq[i+1] == 'T')
  567. {
  568. outputSeq.append("V");
  569. }
  570. else if(inputSeq[i+1] == 'c' ||
  571. inputSeq[i+1] == 'C')
  572. {
  573. outputSeq.append("A");
  574. }
  575. else if(inputSeq[i+1] == 'a' ||
  576. inputSeq[i+1] == 'A')
  577. {
  578. if(inputSeq[i+2] == 't' ||
  579. inputSeq[i+2] == 'T' ||
  580. inputSeq[i+2] == 'c' ||
  581. inputSeq[i+2] == 'C')
  582. {
  583. outputSeq.append("D");
  584. }
  585. else
  586. {
  587. outputSeq.append("E");
  588. }
  589. }
  590. else if(inputSeq[i+1] == 'g' ||
  591. inputSeq[i+1] == 'G')
  592. {
  593. outputSeq.append("G");
  594. }
  595. } // codon begins with 'g' ends
  596. else
  597. {
  598. outputSeq.append("-");
  599. stopCount++;
  600. }
  601. } // for loop ends
  602. Out.open("aminoAcidSeq.txt");
  603. Out.clear();
  604. Out << "Protein from one DNA frame" << "\n";
  605. Out << outputSeq;
  606. Out.close();
  607. } // function ends
  608. void predictDomains(string aminoAcidSeq,
  609. string &predictedSubstrates,
  610. QueueClass< UnitClass2 > &aaNameList,
  611. const bool &allFramesRead,
  612. const int &frame, const string &nucSeq,
  613. string &allClusters, int &NRPScount, int &PKScount,
  614. int &mixedNRPSPKScount, int &transATcount, int &mepCount,
  615. int &mvaCount )
  616. {
  617. QueueClass <UnitClass2> sortedListOfDomains;
  618. QueueClass <UnitClass2> sortedListOfMods;
  619. QueueClass <UnitClass2> sortedListOfATKS;
  620. UnitClass2 *domain;
  621. ifstream In;
  622. ifstream InMods;
  623. ifstream tempIn;
  624. ifstream aaSeqIn;
  625. ofstream Out;
  626. ofstream tempOut;
  627. string segment;
  628. float score;
  629. string scoreString;
  630. float modsScore;
  631. float modsScoreMax = 0.1;
  632. float scoreMax = 0.1; // minimum for domain synthetase matching
  633. float idenScore;
  634. float idenScoreMin = 120; // minimum for mods alignment
  635. float length;
  636. string SMILE;
  637. string formula;
  638. string mod;
  639. string substrate;
  640. float siteToAnalyze;
  641. int siteCount;
  642. bool siteFound;
  643. int whileCount = 0;
  644. int queryOrSbjctLength = 6;
  645. string aaSeq;
  646. char *arg[21];
  647. arg[0] = "./blastall"; // program to execute
  648. arg[1] = "blastp"; // protein blast
  649. arg[2] = "synthetases.fasta"; // control A-domain
  650. arg[3] = "mods.fasta"; // output file
  651. arg[4] = "100.0";
  652. arg[6] = "aaSeq.txt";
  653. arg[7] = "blast2.out";
  654. int AdomainStart = 5;
  655. int AdomainEnd = 5;
  656. // A-domains to analyze
  657. bool substrateKnown;
  658. string query;
  659. string sbjct;
  660. string predictedSubstrate;
  661. string proposedSubstrate;
  662. bool foundSubstrate;
  663. bool stringsCompare;
  664. int hits = 0;
  665. int misses = 0;
  666. int maxChars = 10000;
  667. // NRPS key positions
  668. int p235 = 11;
  669. int p236 = 12;
  670. int p239 = 15;
  671. int p278 = 10;
  672. int p299 = 1;
  673. int p301 = 3;
  674. int p322 = 1;
  675. int p330 = 9;
  676. int p331 = 10;
  677. string pos235;
  678. string pos236;
  679. string pos239;
  680. string pos278;
  681. string pos299;
  682. string pos301;
  683. string pos322;
  684. string pos330;
  685. string pos331;
  686. int Aad = 0;
  687. int Ala = 0;
  688. int Asn = 0;
  689. int Asp = 0;
  690. int Cys = 0;
  691. int Dab = 0;
  692. int Dhb = 0;
  693. int Gln = 0;
  694. int Glu = 0;
  695. int Ile = 0;
  696. int Leu = 0;
  697. int Orn = 0;
  698. int Phe = 0;
  699. int Phg = 0;
  700. int Pip = 0;
  701. int Pro = 0;
  702. int Ser = 0;
  703. int Thr = 0;
  704. int Tyr = 0;
  705. int Val = 0;
  706. int His = 0;
  707. int Lys = 0;
  708. int Gly = 0;
  709. int Gly1 = 0;
  710. int Gly2 = 0;
  711. int Gly3 = 0;
  712. int Leu1 = 0;
  713. int Leu2 = 0;
  714. int Leu3 = 0;
  715. int Leu4 = 0;
  716. int Tyr1 = 0;
  717. int Tyr2 = 0;
  718. int Tyr3 = 0;
  719. int Val1 = 0;
  720. int Val2 = 0;
  721. int Val3 = 0;
  722. int mal = 0;
  723. int mmal = 0;
  724. int emal = 0;
  725. int omal = 0;
  726. char FDxAA;
  727. char FD4xAA;
  728. char IT3xAA;
  729. char x1xG299;
  730. char x1xG301;
  731. char x7xxY322;
  732. char x7xxY330;
  733. char x7xxY331;
  734. // PKS key positions
  735. int p58 = 0;
  736. int p59 = 1;
  737. int p60 = 2;
  738. int p61 = 3;
  739. int p62 = 4;
  740. int p70 = 0;
  741. int p72 = 2;
  742. int p74 = 4;
  743. int p75 = 5;
  744. int p77 = 7;
  745. int p93 = 3;
  746. int p189 = 1;
  747. int p191 = 3;
  748. int p198 = 0;
  749. int p199 = 1;
  750. int p200 = 2;
  751. string pos58;
  752. string pos59;
  753. string pos60;
  754. string pos61;
  755. string pos62;
  756. string pos70;
  757. string pos72;
  758. string pos74;
  759. string pos75;
  760. string pos77;
  761. string pos93;
  762. string pos189;
  763. string pos191;
  764. string pos198;
  765. string pos199;
  766. string pos200;
  767. string sigSeq;
  768. float position;
  769. bool NRPS = 0;
  770. bool PKS = 0;
  771. string residue;
  772. string fname;
  773. bool success;
  774. string smileName;
  775. string inString;
  776. UnitClass2 *aaName;
  777. float currentDomainPos;
  778. float nextDomainPos;
  779. float thirdDomainPos;
  780. int PKSmodRange = 7000;
  781. bool KR = 0;
  782. bool DH = 0;
  783. bool ER = 0;
  784. string endSiteString;
  785. string prevSegment;
  786. int endSite;
  787. int startSite;
  788. int extraRange = 1000;
  789. int lastPKSkey = 3;
  790. string scoredSubstrate;
  791. string proteinID;
  792. string proteinFrame;
  793. string aminoAcids;
  794. int nucStartSite;
  795. int nucEndSite;
  796. cout << "Analyzing data..." << endl;
  797. //get every NRPS/PKS domain until no further domains are detected;
  798. // does blast alignment of cluster
  799. spawnl( 1, "./blastall", "./blastall",
  800. "-g", "T", "-p", arg[1], "-d", arg[2], "-i", aminoAcidSeq.c_str(),
  801. "-F", "F", "-C", "0", "-o", "blast.out", NULL);
  802. //cout << "called blast" << endl;
  803. spawnl( 1, "./blastall", "./blastall",
  804. "-g", "T", "-p", arg[1], "-d", arg[3], "-i", aminoAcidSeq.c_str(),
  805. "-F", "F", "-C", "0", "-o", "modsBlast.out", NULL);
  806. //cout << "called modblast" << endl;
  807. //cout << "frame: " << frame << endl;
  808. if(frame == 0)
  809. {
  810. system("cp blast.out frame0.out");
  811. //cout << "copied to frame0.out" << endl;
  812. system("cp modsBlast.out modsFrame0.out");
  813. //cout << "copied to modsFrame0.out" << endl;
  814. system("cp aminoAcidSeq.txt aminoAcidSeq0.txt");
  815. //cout << "aminoAcidSeq0 copied" << endl;
  816. }
  817. if(frame == 1)
  818. {
  819. system("cp blast.out frame1.out");
  820. //cout << "copied to frame1.out" << endl;
  821. system("cp modsBlast.out modsFrame1.out");
  822. //cout << "copied to modsFrame1.out" << endl;
  823. system("cp aminoAcidSeq.txt aminoAcidSeq1.txt");
  824. //cout << "aminoAcidSeq1 copied" << endl;
  825. }
  826. if(frame == 2)
  827. {
  828. system("cp blast.out frame2.out");
  829. //cout << "copied to frame2.out" << endl;
  830. system("cp modsBlast.out modsFrame2.out");
  831. //cout << "copied to modsFrame2.out" << endl;
  832. system("cat frame0.out frame1.out frame2.out > blast.out");
  833. //cout << "concatenated all domain frames" << endl;
  834. system("cat modsFrame0.out modsFrame1.out modsFrame2.out > modsBlast.out");
  835. //cout << "concatenated all mod frames" << endl;
  836. system("rm frame0.out frame1.out frame2.out modsFrame1.out modsFrame2.out");
  837. //cout << "last aminoAcidSeq" << endl;
  838. }
  839. //cout << "after frames copies" << endl;
  840. if(allFramesRead)
  841. {
  842. //cout << "ALL FRAMES READ" << endl;
  843. score = 0;
  844. // open NRPS/PKS alignment file
  845. In.open("blast.out");
  846. In >> segment;
  847. while( !In.fail())
  848. {
  849. //cout << "read blast.out" << endl;
  850. if(segment[0] == '*')
  851. {
  852. In >> segment;
  853. substrate = segment;
  854. }
  855. if(segment[0] == 'E' && segment[1] == 'x'
  856. && segment[2] == 'p')
  857. {
  858. In >> segment; // "="
  859. In >> segment; // expectation
  860. for(int i = 0; i < segment.length(); i++)
  861. {
  862. if(segment[i] == 'e')
  863. {
  864. i = segment.length();
  865. }
  866. else
  867. {
  868. scoreString.append(1, segment[i]);
  869. }
  870. }
  871. score = getFloatVal(scoreString);
  872. //cout << "scoreString: " << scoreString << endl;
  873. scoreString.clear();
  874. for(int i = 0; i < segment.length(); i++)
  875. {
  876. if(segment[i] == 'e')
  877. {
  878. for(int j = i+1; j < segment.length(); j++)
  879. {
  880. scoreString.append(1, segment[j]);
  881. }
  882. }
  883. }
  884. score = score * pow(10.0,(static_cast<double>(getFloatVal(scoreString))));
  885. //cout << "segment: " << segment << endl;
  886. //cout << "score: " << score << endl;
  887. //cout << "scoreString float: " << getFloatVal(scoreString) << endl;
  888. scoreString.clear();
  889. }
  890. if(segment[0] == 'I' && segment[1] == 'd'
  891. && segment[2] == 'e')
  892. {
  893. In >> segment; // "="
  894. In >> segment; // identities
  895. // collect denominator of identities value
  896. for(int i = 0; i < segment.length(); i++)
  897. {
  898. if(segment[i] == '/')
  899. {
  900. for(int j = i+1; j < segment.length(); j++)
  901. {
  902. scoreString.append(1, segment[j]);
  903. i = j;
  904. }
  905. }
  906. }
  907. idenScore = getIntVal(scoreString);
  908. scoreString.clear();
  909. //cout << "score: " << " idenScore: " << idenScore << endl;
  910. if(score < scoreMax && idenScore > idenScoreMin)
  911. {
  912. //cout << "score: " << score << " idenScore: " << idenScore << endl;
  913. while(segment[0] != 'Q' && segment[1] != 'u')
  914. {
  915. In >> segment;
  916. }
  917. In >> segment; // start site of domain on protein
  918. startSite = getIntVal(segment);
  919. //cout << "start site: " << startSite << endl;
  920. In >> segment;
  921. // get protein frame and find where adenylation domain begins
  922. proteinID = segment;
  923. for(int i = 0; i < proteinID.length(); i++)
  924. {
  925. while(proteinID[i] == '-')
  926. {
  927. proteinID.erase(i, 1);
  928. }
  929. }
  930. //cout << "PROTEINID: " << proteinID << endl;
  931. proteinFrame.clear();
  932. aaSeqIn.open("aminoAcidSeq.txt");
  933. aaSeqIn >> aminoAcids;
  934. while(!aaSeqIn.fail())
  935. {
  936. if(aminoAcids.length() > 1)
  937. {
  938. proteinFrame.append(aminoAcids);
  939. }
  940. aaSeqIn >> aminoAcids;
  941. }
  942. aaSeqIn.clear();
  943. aaSeqIn.close();
  944. //cout << "proteinFrame.length(): " << proteinFrame.length() << endl;
  945. nucStartSite = proteinFrame.find(proteinID, startSite - 100);
  946. if(nucStartSite < 0)
  947. {
  948. proteinFrame.clear();
  949. aaSeqIn.open("aminoAcidSeq0.txt");
  950. aaSeqIn >> aminoAcids;
  951. while(!aaSeqIn.fail())
  952. {
  953. if(aminoAcids.length() > 1)
  954. {
  955. proteinFrame.append(aminoAcids);
  956. }
  957. aaSeqIn >> aminoAcids;
  958. }
  959. aaSeqIn.clear();
  960. aaSeqIn.close();
  961. nucStartSite = proteinFrame.find(proteinID, startSite - 100);
  962. //cout << "NOT IN LAST AA FRAME" << endl;
  963. }
  964. if(nucStartSite < 0)
  965. {
  966. proteinFrame.clear();
  967. aaSeqIn.open("aminoAcidSeq1.txt");
  968. aaSeqIn >> aminoAcids;
  969. while(!aaSeqIn.fail())
  970. {
  971. if(aminoAcids.length() > 1)
  972. {
  973. proteinFrame.append(aminoAcids);
  974. }
  975. aaSeqIn >> aminoAcids;
  976. }
  977. aaSeqIn.clear();
  978. aaSeqIn.close();
  979. nucStartSite = proteinFrame.find(proteinID, startSite - 100);
  980. // cout << "NOT IN FIRST AA FRAME" << endl;
  981. }
  982. if(nucStartSite < 0)
  983. {
  984. //cout << "NOT IN SECOND AA FRAME" << endl;
  985. }
  986. nucStartSite = nucStartSite * 3; // translate amino acid position
  987. // into nucleotide position
  988. //cout << "NUCSTARTSITE: " << nucStartSite << endl;
  989. while((segment[0] != 'S' || segment[4] != 'e') && segment[0] != '>'
  990. && !In.fail())
  991. {
  992. prevSegment = segment;
  993. In >> segment;
  994. }
  995. endSiteString = prevSegment;
  996. endSite = getIntVal(prevSegment) + startSite + extraRange;
  997. domain = new UnitClass2;
  998. domain->setStartSite(startSite);
  999. domain->setFormula(substrate);
  1000. domain->setEndSite(endSite);
  1001. domain->setNucStartSite(nucStartSite);
  1002. sortedListOfDomains.insertValue(*domain);
  1003. delete domain;
  1004. }
  1005. score = 1;
  1006. idenScore = 0;
  1007. }
  1008. In >> segment;
  1009. }
  1010. In.close();
  1011. if(sortedListOfDomains.getNumElems() > 0)
  1012. {
  1013. //sortedListOfDomains.printForward();
  1014. }
  1015. else
  1016. {
  1017. cout << "No NRPS-A or PKS-AT domains detected" << endl;
  1018. }
  1019. // detect modifications in cluster and their locations
  1020. InMods.open("modsBlast.out");
  1021. InMods >> segment;
  1022. //cout << "InMods: " << InMods << " segment: " << segment << endl;
  1023. while( !InMods.fail())
  1024. {
  1025. //cout << "in InMods loop" << endl;
  1026. if(segment[0] == '*')
  1027. {
  1028. InMods >> segment;
  1029. mod = segment;
  1030. }
  1031. if(segment[0] == 'E' && segment[1] == 'x'
  1032. && segment[2] == 'p')
  1033. {
  1034. InMods >> segment; // "="
  1035. InMods >> segment; // expectation
  1036. //cout << "segment(expectation): " << segment << endl;
  1037. for(int i = 0; i < segment.length(); i++)
  1038. {
  1039. if(segment[i] == 'e')
  1040. {
  1041. i = segment.length();
  1042. }
  1043. else
  1044. {
  1045. scoreString.append(1, segment[i]);
  1046. }
  1047. }
  1048. modsScore = getFloatVal(scoreString);
  1049. scoreString.clear();
  1050. for(int i = 0; i < segment.length(); i++)
  1051. {
  1052. if(segment[i] == 'e')
  1053. {
  1054. for(int j = i+1; j < segment.length(); j++)
  1055. {
  1056. scoreString.append(1, segment[j]);
  1057. i = j;
  1058. }
  1059. }
  1060. }
  1061. }
  1062. modsScore = modsScore * pow(10.0,(static_cast<double>(getFloatVal(scoreString))));
  1063. scoreString.clear();
  1064. if(segment[0] == 'I' && segment[1] == 'd'
  1065. && segment[2] == 'e')
  1066. {
  1067. InMods >> segment; // "="
  1068. InMods >> segment; // identities
  1069. // collect denominator of identities value
  1070. for(int i = 0; i < segment.length(); i++)
  1071. {
  1072. if(segment[i] == '/')
  1073. {
  1074. for(int j = i+1; j < segment.length(); j++)
  1075. {
  1076. scoreString.append(1, segment[j]);
  1077. i = j;
  1078. }
  1079. }
  1080. }
  1081. idenScore = getIntVal(scoreString);
  1082. scoreString.clear();
  1083. //cout << "/identities: " << score << endl;
  1084. if(modsScore < modsScoreMax and (idenScore > idenScoreMin
  1085. || mod[0] == 'T' && mod[1] == 'E'))
  1086. {
  1087. //cout << "modsScore: " << modsScore << " idenScore: " << idenScore << endl;
  1088. while( segment[0] != 'Q')
  1089. {
  1090. InMods >> segment;
  1091. }
  1092. InMods >> segment; // start site of domain on protein
  1093. startSite = getIntVal(segment);
  1094. InMods >> segment; // start site of domain on protein
  1095. // get protein frame and find where adenylation domains begins
  1096. proteinID = segment;
  1097. for(int i = 0; i < proteinID.length(); i++)
  1098. {
  1099. while(proteinID[i] == '-')
  1100. {
  1101. proteinID.erase(i, 1);
  1102. }
  1103. }
  1104. // cout << "PROTEINID: " << proteinID << endl;
  1105. proteinFrame.clear();
  1106. aaSeqIn.open("aminoAcidSeq.txt");
  1107. aaSeqIn >> aminoAcids;
  1108. while(!aaSeqIn.fail())
  1109. {
  1110. if(aminoAcids.length() > 1)
  1111. {
  1112. proteinFrame.append(aminoAcids);
  1113. }
  1114. aaSeqIn >> aminoAcids;
  1115. }
  1116. aaSeqIn.clear();
  1117. aaSeqIn.close();
  1118. //cout << "proteinFrame.length(): " << proteinFrame.length() << endl;
  1119. nucStartSite = proteinFrame.find(proteinID, startSite);
  1120. if(nucStartSite < 0)
  1121. {
  1122. proteinFrame.clear();
  1123. aaSeqIn.open("aminoAcidSeq0.txt");
  1124. aaSeqIn >> aminoAcids;
  1125. while(!aaSeqIn.fail())
  1126. {
  1127. if(aminoAcids.length() > 1)
  1128. {
  1129. proteinFrame.append(aminoAcids);
  1130. }
  1131. aaSeqIn >> aminoAcids;
  1132. }
  1133. aaSeqIn.clear();
  1134. aaSeqIn.close();
  1135. nucStartSite = proteinFrame.find(proteinID, startSite);
  1136. // cout << "NOT IN LAST AA FRAME" << endl;
  1137. }
  1138. if(nucStartSite < 0)
  1139. {
  1140. proteinFrame.clear();
  1141. aaSeqIn.open("aminoAcidSeq1.txt");
  1142. aaSeqIn >> aminoAcids;
  1143. while(!aaSeqIn.fail())
  1144. {
  1145. if(aminoAcids.length() > 1)
  1146. {
  1147. proteinFrame.append(aminoAcids);
  1148. }
  1149. aaSeqIn >> aminoAcids;
  1150. }
  1151. aaSeqIn.clear();
  1152. aaSeqIn.close();
  1153. nucStartSite = proteinFrame.find(proteinID, startSite);
  1154. // cout << "NOT IN FIRST AA FRAME" << endl;
  1155. }
  1156. if(nucStartSite < 0)
  1157. {
  1158. // cout << "NOT IN SECOND AA FRAME" << endl;
  1159. }
  1160. nucStartSite = nucStartSite * 3; // translate amino acid position
  1161. // into nucleotide position
  1162. domain = new UnitClass2;
  1163. domain->setStartSite(startSite);
  1164. domain->setNucStartSite(nucStartSite);
  1165. domain->setFormula(mod);
  1166. sortedListOfMods.insertValue(*domain);
  1167. delete domain;
  1168. }
  1169. score = 1;
  1170. idenScore = 0;
  1171. }
  1172. InMods >> segment;
  1173. }
  1174. InMods.clear();
  1175. InMods.close();
  1176. if(sortedListOfMods.getNumElems() > 0)
  1177. {
  1178. //sortedListOfMods.printForward();
  1179. }
  1180. else
  1181. {
  1182. cout << "No auxiliary domains detected" << endl;
  1183. }
  1184. siteCount = 0;
  1185. // go through each domain and collect and align each
  1186. while(siteCount < sortedListOfDomains.getNumElems() && whileCount < 1000000)
  1187. {
  1188. In.close();
  1189. In.open("blast.out");
  1190. In.clear();
  1191. In >> segment;
  1192. siteFound = 0;
  1193. while( !In.fail() && siteFound == 0 && whileCount < 1000000)
  1194. {
  1195. //cout << "inner while" << endl;
  1196. siteToAnalyze = sortedListOfDomains.getElemAtIndex(siteCount).getStartSite();
  1197. //cout << "siteToAnalyze: " << siteToAnalyze << endl;
  1198. if(segment[0] == 'Q' && segment[1] == 'u'
  1199. && segment[queryOrSbjctLength - 1] == ':')
  1200. {
  1201. In >> segment;
  1202. if(getIntVal(segment) == siteToAnalyze)
  1203. {
  1204. // analyze this segment only and add to aaNameList
  1205. // store any modifications between this domain and the next
  1206. // cout << "segment " << segment << " found " << endl;
  1207. //cout << segment << " ";
  1208. cout << sortedListOfDomains.getElemAtIndex(siteCount).getNucStartSite() << " ";
  1209. cout << sortedListOfDomains.getElemAtIndex(siteCount).getNucStartSite() + 10000
  1210. << " ";
  1211. position = getIntVal(segment);
  1212. siteFound = 1;
  1213. siteCount++;
  1214. In >> segment;
  1215. aaSeq.append(segment);
  1216. In >> segment;
  1217. while((segment[0] != 'S' || segment[1] != 'c') && !In.fail())
  1218. {
  1219. if(segment[0] == 'Q' && segment[1] == 'u'
  1220. && segment[queryOrSbjctLength - 1] == ':')
  1221. {
  1222. In >> segment;
  1223. In >> segment;
  1224. for(int i = 0; i < segment.length(); i++)
  1225. {
  1226. if((static_cast< unsigned int >(segment[i]) < 65
  1227. || static_cast< unsigned int >(segment[i]) > 90)
  1228. && (static_cast< unsigned int >(segment[i]) < 97
  1229. || static_cast< unsigned int >(segment[i]) > 122))
  1230. {
  1231. segment.erase(i,1);
  1232. i--;
  1233. }
  1234. }
  1235. aaSeq.append(segment);
  1236. }
  1237. In >> segment;
  1238. }
  1239. // recognize amino acid sequence here
  1240. //cout << "aaSeq: " << aaSeq << endl;
  1241. Out.open("aaSeq.txt");
  1242. Out.clear();
  1243. Out << "> " << "aaSeq" << '\n' << aaSeq;
  1244. Out.close();
  1245. aaSeq.clear();
  1246. // does blast alignment of individual domain to recognize
  1247. // conserved motifs and substrate specificity
  1248. spawnl( 1, "./blastall", "blastall",
  1249. "-g", "T", "-p", arg[1], "-d", arg[2], "-i", "aaSeq.txt",
  1250. "-F", "F", "-C", "0", "-o", "blast2.out", NULL);
  1251. substrateKnown = 0;
  1252. In.close();
  1253. In.open("blast2.out");
  1254. In.clear();
  1255. segment.clear();
  1256. In >> segment;
  1257. while(!In.fail())
  1258. {
  1259. //cout << !In.fail() << " ";
  1260. //cout << segment << endl;
  1261. if(segment == "NRPS" && PKS == 0 && NRPS == 0)
  1262. {
  1263. NRPS = 1;
  1264. //cout << "NRPS detected" << endl;
  1265. }
  1266. if(segment == "PKS" && NRPS == 0 && PKS == 0)
  1267. {
  1268. PKS = 1;
  1269. //cout << "PKS detected" << endl;
  1270. }
  1271. if(segment.length() == queryOrSbjctLength)
  1272. {
  1273. if(segment[0] == 'Q' && segment[1] == 'u'
  1274. && segment[queryOrSbjctLength - 1] == ':')
  1275. {
  1276. In >> segment;
  1277. In >> segment;
  1278. query.append(segment);
  1279. //cout << "Query: " << segment << endl;
  1280. }
  1281. if(segment[0] == 'S' && segment[1] == 'b')
  1282. {
  1283. In >> segment;
  1284. In >> segment;
  1285. sbjct.append(segment);
  1286. //cout << "Sbjct: " << segment << endl;
  1287. }
  1288. }
  1289. In >> segment;
  1290. }
  1291. In.clear();
  1292. In.close();
  1293. segment.clear();
  1294. // search for first FDX motif containing residues 235,236,239
  1295. if(NRPS)
  1296. {
  1297. if(sbjct.find("SRVLQFASFSFD") > 0
  1298. && sbjct.find("SRVLQFASFSFD") < maxChars )
  1299. {
  1300. pos235 = query[sbjct.find("SRVLQFASFSFD")+p235];
  1301. pos236 = query[sbjct.find("SRVLQFASFSFD")+p236];
  1302. pos239 = query[sbjct.find("SRVLQFASFSFD")+p239];
  1303. sigSeq.append(pos235);
  1304. sigSeq.append(pos236);
  1305. sigSeq.append(pos239);
  1306. }
  1307. // search for IT motif containing the residue 278
  1308. if(sbjct.find("RLKNDGITHVTLPP") > 0
  1309. && sbjct.find("RLKNDGITHVTLPP") < maxChars )
  1310. {
  1311. pos278 = query[sbjct.find("RLKNDGITHVTLPP")+p278];
  1312. sigSeq.append(pos278);
  1313. }
  1314. // search for GE motif containing the residues 299,301
  1315. if(sbjct.find("IVVQGEACP") > 0
  1316. && sbjct.find("IVVAGEACP") < maxChars )
  1317. {
  1318. pos299 = query[sbjct.find("IVVAGEACP")+p299];
  1319. pos301 = query[sbjct.find("IVVAGEACP")+p301];
  1320. sigSeq.append(pos299);
  1321. sigSeq.append(pos301);
  1322. }
  1323. // search for YGPTE motif containing the residues 322,330,331
  1324. if(sbjct.find("NAYGPTE") > 0
  1325. && sbjct.find("NAYGPTE") < maxChars )
  1326. {
  1327. pos322 = query[sbjct.find("NAYGPTE")+p322];
  1328. pos330 = query[sbjct.find("NAYGPTE")+p330];
  1329. pos331 = query[sbjct.find("NAYGPTE")+p331];
  1330. sigSeq.append(pos322);
  1331. sigSeq.append(pos330);
  1332. sigSeq.append(pos331);
  1333. }
  1334. FDxAA = pos236[0];
  1335. FD4xAA = pos239[0];
  1336. IT3xAA = pos278[0];
  1337. x1xG299 = pos299[0];
  1338. x1xG301 = pos301[0];
  1339. x7xxY322 = pos322[0];
  1340. x7xxY330 = pos330[0];
  1341. x7xxY331 = pos331[0];
  1342. }
  1343. if(PKS)
  1344. {
  1345. if(sbjct.find("DVPLV") > 0
  1346. && sbjct.find("DVPLV") < maxChars )
  1347. {
  1348. pos58 = query[sbjct.find("DVPLV")+p58];
  1349. pos59 = query[sbjct.find("DVPLV")+p59];
  1350. pos60 = query[sbjct.find("DVPLV")+p60];
  1351. pos61 = query[sbjct.find("DVPLV")+p61];
  1352. pos62 = query[sbjct.find("DVPLV")+p62];
  1353. sigSeq.append(pos58);
  1354. sigSeq.append(pos59);
  1355. sigSeq.append(pos60);
  1356. sigSeq.append(pos61);
  1357. sigSeq.append(pos62);
  1358. }
  1359. if(sbjct.find("QVAL") > 0
  1360. && sbjct.find("QVAL") < maxChars )
  1361. {
  1362. pos70 = query[sbjct.find("QVAL")+p70];
  1363. pos72 = query[sbjct.find("QVAL")+p72];
  1364. pos74 = query[sbjct.find("QVAL")+p74];
  1365. pos75 = query[sbjct.find("QVAL")+p75];
  1366. pos77 = query[sbjct.find("QVAL")+p77];
  1367. sigSeq.append(pos70);
  1368. sigSeq.append(pos72);
  1369. sigSeq.append(pos74);
  1370. sigSeq.append(pos75);
  1371. sigSeq.append(pos77);
  1372. }
  1373. if(sbjct.find("GQSMGE") > 0
  1374. && sbjct.find("GQSMGE") < maxChars )
  1375. {
  1376. pos93 = query[sbjct.find("GQSMGE")+p93];
  1377. sigSeq.append(pos93);
  1378. }
  1379. if(sbjct.find("ITVRRI") > 0
  1380. && sbjct.find("ITVRRI") < maxChars )
  1381. {
  1382. pos189 = query[sbjct.find("ITVRRI")+p189];
  1383. sigSeq.append(pos189);
  1384. pos191 = query[sbjct.find("ITVRRI")+p191];
  1385. sigSeq.append(pos191);
  1386. }
  1387. if(sbjct.find("YASH") > 0
  1388. && sbjct.find("YASH") < maxChars )
  1389. {
  1390. pos198 = query[sbjct.find("YASH")+p198];
  1391. pos199 = query[sbjct.find("YASH")+p199];
  1392. pos200 = query[sbjct.find("YASH")+p200];
  1393. sigSeq.append(pos198);
  1394. sigSeq.append(pos199);
  1395. sigSeq.append(pos200);
  1396. }
  1397. if(pos58[0] == 'E' || pos58[0] == 'D' || pos58[0] == 'Q'
  1398. || pos58[0] == 'S' || pos58[0] == 'K')
  1399. {
  1400. mal++;
  1401. }
  1402. if(pos58[0] == 'R' || pos58[0] == 'G')
  1403. {
  1404. mmal++;
  1405. emal++;
  1406. omal++;
  1407. }
  1408. if(pos59[0] == 'T' || pos59[0] == 'S')
  1409. {
  1410. mal++;
  1411. }
  1412. if(pos59[0] == 'V' || pos59[0] == 'A')
  1413. {
  1414. mmal++;
  1415. emal++;
  1416. omal++;
  1417. }
  1418. if(pos60[0] == 'G' || pos60[0] == 'P' || pos60[0] == 'A'
  1419. || pos60[0] == 'L' || pos60[0] == 'R')
  1420. {
  1421. mal++;
  1422. }
  1423. if(pos60[0] == 'D' || pos60[0] == 'E')
  1424. {
  1425. mmal++;
  1426. emal++;
  1427. omal++;
  1428. }
  1429. if(pos61[0] == 'Y' || pos61[0] == 'F')
  1430. {
  1431. mal++;
  1432. }
  1433. if(pos61[0] == 'V')
  1434. {
  1435. mmal++;
  1436. emal++;
  1437. omal++;
  1438. }
  1439. if(pos62[0] == 'A' || pos62[0] == 'T')
  1440. {
  1441. mal++;
  1442. }
  1443. if(pos62[0] == 'V')
  1444. {
  1445. mmal++;
  1446. emal++;
  1447. omal++;
  1448. }
  1449. if(pos70[0] == 'Q')
  1450. {
  1451. mal++;
  1452. }
  1453. if(pos70[0] == 'M')
  1454. {
  1455. mmal++;
  1456. emal++;
  1457. omal++;
  1458. }
  1459. if(pos72[0] == 'A')
  1460. {
  1461. mal++;
  1462. }
  1463. if(pos72[0] == 'S')
  1464. {
  1465. mmal++;
  1466. }
  1467. if(pos74[0] == 'F')
  1468. {
  1469. mal++;
  1470. }
  1471. if(pos74[0] == 'A')
  1472. {
  1473. mmal++;
  1474. emal++;
  1475. omal++;
  1476. }
  1477. if(pos75[0] == 'G')
  1478. {
  1479. mal++;
  1480. }
  1481. if(pos75[0] == 'A')
  1482. {
  1483. mmal++;
  1484. emal++;
  1485. omal++;
  1486. }
  1487. if(pos77[0] == 'L')
  1488. {
  1489. mal++;
  1490. }
  1491. if(pos77[0] == 'W')
  1492. {
  1493. mmal++;
  1494. emal++;
  1495. omal++;
  1496. }
  1497. if(pos93[0] == 'L' || pos93[0] == 'V' || pos93[0] == 'I')
  1498. {
  1499. mal++;
  1500. }
  1501. if(pos93[0] == 'Q')
  1502. {
  1503. mmal++;
  1504. emal++;
  1505. omal++;
  1506. }
  1507. if(pos189[0] == 'M' || pos189[0] == 'L' || pos189[0] == 'Y')
  1508. {
  1509. omal++;
  1510. }
  1511. else
  1512. {
  1513. mal++;
  1514. mmal++;
  1515. emal++;
  1516. }
  1517. if(pos191[0] == 'W')
  1518. {
  1519. omal++;
  1520. }
  1521. else
  1522. {
  1523. mal++;
  1524. mmal++;
  1525. emal++;
  1526. }
  1527. if(pos198[0] == 'H')
  1528. {
  1529. mal = mal + lastPKSkey;
  1530. }
  1531. if(pos198[0] == 'Y')
  1532. {
  1533. mmal = mmal + lastPKSkey;
  1534. }
  1535. if(pos198[0] == 'T' || pos198[0] == 'C')
  1536. {
  1537. emal = emal + lastPKSkey;
  1538. }
  1539. if(pos199[0] == 'P')
  1540. {
  1541. emal++;
  1542. }
  1543. if(pos200[0] == 'F')
  1544. {
  1545. mal = mal + lastPKSkey;
  1546. }
  1547. if(pos200[0] == 'S')
  1548. {
  1549. mmal = mmal + lastPKSkey;
  1550. }
  1551. if(pos200[0] == 'G')
  1552. {
  1553. emal = emal + lastPKSkey;
  1554. }
  1555. if(pos200[0] == 'T')
  1556. {
  1557. emal++;
  1558. }
  1559. //outputSpecificitiesPKS( mal, mmal, emal, omal);
  1560. scoredSubstrate = maxSpecificityPKS(mal, mmal, emal, omal);
  1561. //cout << "scoredSubstrate: " << scoredSubstrate << endl;
  1562. mal = 0;
  1563. mmal = 0;
  1564. emal = 0;
  1565. omal = 0;
  1566. clearPositions( pos235, pos236, pos239,
  1567. pos278, pos299, pos301, pos322, pos330,
  1568. pos331, pos58, pos59, pos60, pos61, pos62,
  1569. pos70, pos72, pos74, pos75, pos77, pos93,
  1570. pos189, pos191, pos198, pos199, pos200 );
  1571. }
  1572. //cout << "signature sequence: " << sigSeq << " ";
  1573. cout << sigSeq << " ";
  1574. Out.open("sigSeq.txt");
  1575. Out.clear();
  1576. Out << "> " << "SigSeq" << '\n'
  1577. << sigSeq;
  1578. Out.close();
  1579. // align with NRPS/PKS signatures
  1580. if(PKS)
  1581. {
  1582. spawnl(1, "./blastall", "blastall", "-p", "blastp", "-d",
  1583. "PKSsignatures.fasta", "-i", "sigSeq.txt",
  1584. "-e", arg[4], "-F", "F", "-C", "0", "-e", "10.0",
  1585. "-o", "blast3.out", NULL);
  1586. }
  1587. if(NRPS)
  1588. {
  1589. spawnl(1, "./blastall", "blastall", "-p", "blastp", "-d",
  1590. "NRPSsignatures.fasta", "-i", "sigSeq.txt",
  1591. "-e", arg[4], "-F", "F", "-C", "0", "-e", "10.0",
  1592. "-o", "blast3.out", NULL);
  1593. if(FDxAA == 'P' || FDxAA == 'p')
  1594. {
  1595. Aad++;
  1596. }
  1597. if(FDxAA == 'L' || FDxAA == 'l')
  1598. {
  1599. Ala++;
  1600. Asn++;
  1601. Asp++;
  1602. Cys++;
  1603. Dab++;
  1604. Dhb++;
  1605. Gly3++;
  1606. }
  1607. if(FDxAA == 'H' || FDxAA == 'h')
  1608. {
  1609. Cys++;
  1610. }
  1611. if(FDxAA == 'A' || FDxAA == 'a')
  1612. {
  1613. Gln++;
  1614. Glu++;
  1615. Ile++;
  1616. Leu1++;
  1617. Leu2++;
  1618. Leu4++;
  1619. Phe++;
  1620. Tyr2++;
  1621. Tyr3++;
  1622. Val1++;
  1623. Val3++;
  1624. Lys++;
  1625. }
  1626. if(FDxAA == 'G' || FDxAA == 'g')
  1627. {
  1628. Ile++;
  1629. Leu3++;
  1630. Tyr1++;
  1631. }
  1632. if(FDxAA == 'M' || FDxAA == 'm')
  1633. {
  1634. Orn++;
  1635. }
  1636. if(FDxAA == 'V' || FDxAA == 'v')
  1637. {
  1638. Orn++;
  1639. Pro++;
  1640. Ser++;
  1641. }
  1642. if(FDxAA == 'I' || FDxAA == 'i')
  1643. {
  1644. Phg++;
  1645. Gly1++;
  1646. Gly2++;
  1647. }
  1648. if(FDxAA == 'F' || FDxAA == 'f')
  1649. {
  1650. Phe++;
  1651. Thr++;
  1652. Val2++;
  1653. }
  1654. if(FDxAA == 'S' || FDxAA == 's')
  1655. {
  1656. His++;
  1657. }
  1658. if(FD4xAA == 'R' || FD4xAA == 'r')
  1659. {
  1660. Aad++;
  1661. }
  1662. if(FD4xAA == 'L' || FD4xAA == 'l')
  1663. {
  1664. Ala++;
  1665. Tyr2++;
  1666. Gly1++;
  1667. }
  1668. if(FD4xAA == 'T' || FD4xAA == 't')
  1669. {
  1670. Asn++;
  1671. Asp++;
  1672. Tyr1++;
  1673. }
  1674. if(FD4xAA == 'E' || FD4xAA == 'e')
  1675. {
  1676. Cys++;
  1677. Dab++;
  1678. Orn++;
  1679. Val2++;
  1680. His++;
  1681. Lys++;
  1682. }
  1683. if(FD4xAA == 'Y' || FD4xAA == 'y')
  1684. {
  1685. Cys++;
  1686. }
  1687. if(FD4xAA == 'P' || FD4xAA == 'p')
  1688. {
  1689. Dhb++;
  1690. }
  1691. if(FD4xAA == 'Q' || FD4xAA == 'q')
  1692. {
  1693. Gln++;
  1694. Pip++;
  1695. Pro++;
  1696. Gly2++;
  1697. }
  1698. if(FD4xAA == 'W' || FD4xAA == 'w')
  1699. {
  1700. Glu++;
  1701. Leu1++;
  1702. Leu2++;
  1703. Phe++;
  1704. Ser++;
  1705. Thr++;
  1706. Val3++;
  1707. Gln++;
  1708. }
  1709. if(FD4xAA == 'K' || FD4xAA == 'k')
  1710. {
  1711. Glu++;
  1712. }
  1713. if(FD4xAA == 'F' || FD4xAA == 'f')
  1714. {
  1715. Ile++;
  1716. Leu4++;
  1717. Gly3++;
  1718. Val1++;
  1719. }
  1720. if(FD4xAA == 'A' || FD4xAA == 'a')
  1721. {
  1722. Leu3++;
  1723. }
  1724. if(FD4xAA == 'G' || FD4xAA == 'g')
  1725. {
  1726. Orn++;
  1727. }
  1728. if(FD4xAA == 'S' || FD4xAA == 's')
  1729. {
  1730. Tyr3++;
  1731. }
  1732. // score IT3xAA
  1733. if(IT3xAA == 'N' || IT3xAA == 'n')
  1734. {
  1735. Aad++;
  1736. Cys++;
  1737. Orn++;
  1738. Thr++;
  1739. Gly3++;
  1740. }
  1741. if(IT3xAA == 'F' || IT3xAA == 'f')
  1742. {
  1743. Ala++;
  1744. Ile++;
  1745. Leu++;
  1746. }
  1747. if(IT3xAA == 'K' || IT3xAA == 'k')
  1748. {
  1749. Asn++;
  1750. Asp++;
  1751. }
  1752. if(IT3xAA == 'S' || IT3xAA == 's')
  1753. {
  1754. Cys++;
  1755. Val++;
  1756. Lys++;
  1757. }
  1758. if(IT3xAA == 'H' || IT3xAA == 'h')
  1759. {
  1760. Dab++;
  1761. Glu++;
  1762. Ser++;
  1763. }
  1764. if(IT3xAA == 'A' || IT3xAA == 'a')
  1765. {
  1766. Dhb++;
  1767. His++;
  1768. }
  1769. if(IT3xAA == 'D' || IT3xAA == 'd')
  1770. {
  1771. Gln++;
  1772. Glu++;
  1773. }
  1774. if(IT3xAA == 'L' || IT3xAA == 'l')
  1775. {
  1776. Leu++;
  1777. Phg++;
  1778. Pip++;
  1779. Pro++;
  1780. }
  1781. if(IT3xAA == 'Y' || IT3xAA == 'y')
  1782. {
  1783. Leu++;
  1784. }
  1785. if(IT3xAA == 'M' || IT3xAA == 'm')
  1786. {
  1787. Leu++;
  1788. Val++;
  1789. Gly2++;
  1790. }
  1791. if(IT3xAA == 'E' || IT3xAA == 'e')
  1792. {
  1793. Orn++;
  1794. }
  1795. if(IT3xAA == 'T' || IT3xAA == 't')
  1796. {
  1797. Phe++;
  1798. Tyr++;
  1799. }
  1800. if(IT3xAA == 'I' || IT3xAA == 'i')
  1801. {
  1802. Tyr++;
  1803. }
  1804. if(IT3xAA == 'V' || IT3xAA == 'v')
  1805. {
  1806. Tyr++;
  1807. }
  1808. if(IT3xAA == 'W' || IT3xAA == 'w')
  1809. {
  1810. Val++;
  1811. }
  1812. if(IT3xAA == 'Q' || IT3xAA == 'q')
  1813. {
  1814. Gly1++;
  1815. Gln++;
  1816. }
  1817. if(x1xG299 == 'I' || x1xG299 == 'i')
  1818. {
  1819. Aad++;
  1820. Orn++;
  1821. Phe++;
  1822. Pro++;
  1823. Thr++;
  1824. Val1++;
  1825. }
  1826. if(x1xG299 == 'G' || x1xG299 == 'g')
  1827. {
  1828. Ala++;
  1829. }
  1830. if(x1xG299 == 'L' || x1xG299 == 'l')
  1831. {
  1832. Asn++;
  1833. Cys++;
  1834. Glu++;
  1835. Ile++;
  1836. Leu1++;
  1837. Leu4++;
  1838. Orn++;
  1839. Phg++;
  1840. Pip++;
  1841. Ser++;
  1842. Gly1++;
  1843. }
  1844. if(x1xG299 == 'V' || x1xG299 == 'v')
  1845. {
  1846. Asp++;
  1847. Phe++; // added from Challis
  1848. Tyr3++;
  1849. }
  1850. if(x1xG299 == 'D' || x1xG299 == 'd')
  1851. {
  1852. Cys++;
  1853. }
  1854. if(x1xG299 == 'N' || x1xG299 == 'n')
  1855. {
  1856. Dab++;
  1857. Gly3++;
  1858. }
  1859. if(x1xG299 == 'Q' || x1xG299 == 'q')
  1860. {
  1861. Dhb++;
  1862. }
  1863. if(x1xG299 == 'F' || x1xG299 == 'f')
  1864. {
  1865. Glu++;
  1866. Val3++;
  1867. Gly2++;
  1868. Gln++;
  1869. }
  1870. if(x1xG299 == 'Y' || x1xG299 == 'y')
  1871. {
  1872. Ile++;
  1873. Leu2++;
  1874. }
  1875. if(x1xG299 == 'T' || x1xG299 == 't')
  1876. {
  1877. Leu3++;
  1878. Tyr1++;
  1879. Tyr2++;
  1880. Val2++;
  1881. His++;
  1882. }
  1883. // detect 301 signature
  1884. if(x1xG301 == 'V' || x1xG301 == 'v')
  1885. {
  1886. Aad++;
  1887. Cys++;
  1888. Gly2++;
  1889. }
  1890. if(x1xG301 == 'I' || x1xG301 == 'i')
  1891. {
  1892. Ala++;
  1893. Cys++;
  1894. Lys++;
  1895. }
  1896. if(x1xG301 == 'G' || x1xG301 == 'g')
  1897. {
  1898. Asn++;
  1899. Asp++;
  1900. Dhb++;
  1901. Gln++;
  1902. Glu++;
  1903. Ile++;
  1904. Leu1++;
  1905. Leu2++;
  1906. Leu3++;
  1907. Leu4++;
  1908. Orn++;
  1909. Phg++;
  1910. Phe++; // extra
  1911. Pip++;
  1912. Thr++;
  1913. Tyr2++;
  1914. Val1++;
  1915. Lys++;
  1916. Gly1++;
  1917. }
  1918. if(x1xG301 == 'S' || x1xG301 == 's')
  1919. {
  1920. Cys++;
  1921. Ser++;
  1922. }
  1923. if(x1xG301 == 'T' || x1xG301 == 't')
  1924. {
  1925. Dab++;
  1926. }
  1927. if(x1xG301 == 'A' || x1xG301 == 'a')
  1928. {
  1929. Phe++;
  1930. Pro++;
  1931. Tyr1++;
  1932. Tyr3++;
  1933. Val2++;
  1934. Val3++;
  1935. His++;
  1936. Gly3++;
  1937. }
  1938. // analyze 322 residue
  1939. if(x7xxY322 == 'E' || x7xxY322 == 'e')
  1940. {
  1941. Aad++;
  1942. Asn++;
  1943. Leu3++;
  1944. Tyr1++;
  1945. His++;
  1946. }
  1947. if(x7xxY322 == 'A' || x7xxY322 == 'a')
  1948. {
  1949. Ala++;
  1950. Leu2++;
  1951. Phe++;
  1952. Tyr2++;
  1953. Tyr3++;
  1954. Val2++;
  1955. Val3++;
  1956. Gly2++;
  1957. }
  1958. if(x7xxY322 == 'H' || x7xxY322 == 'h')
  1959. {
  1960. Asp++;
  1961. Pro++;
  1962. }
  1963. if(x7xxY322 == 'G' || x7xxY322 == 'g')
  1964. {
  1965. Cys++;
  1966. Glu++;
  1967. Val1++;
  1968. Phe++; // extra
  1969. }
  1970. if(x7xxY322 == 'L' || x7xxY322 == 'l')
  1971. {
  1972. Cys++;
  1973. Orn++;
  1974. Phg++;
  1975. Ser++;
  1976. Gly1++;
  1977. Gly3++;
  1978. Gln++;
  1979. }
  1980. if(x7xxY322 == 'T' || x7xxY322 == 't')
  1981. {
  1982. Dab++;
  1983. }
  1984. if(x7xxY322 == 'V' || x7xxY322 == 'v')
  1985. {
  1986. Dhb++;
  1987. Gln++;
  1988. Glu++;
  1989. Ile++;
  1990. Pip++;
  1991. }
  1992. if(x7xxY322 == 'I' || x7xxY322 == 'i')
  1993. {
  1994. Ile++;
  1995. Gln++;
  1996. }
  1997. if(x7xxY322 == 'N' || x7xxY322 == 'n')
  1998. {
  1999. Leu1++;
  2000. }
  2001. if(x7xxY322 == 'M' || x7xxY322 == 'm')
  2002. {
  2003. Leu4++;
  2004. Thr++;
  2005. Gly2++;
  2006. }
  2007. if(x7xxY322 == 'S' || x7xxY322 == 's')
  2008. {
  2009. Orn++;
  2010. Lys++;
  2011. }
  2012. // analyze 330 residue
  2013. if(x7xxY330 == 'F' || x7xxY330 == 'f')
  2014. {
  2015. Aad++;
  2016. }
  2017. if(x7xxY330 == 'V' || x7xxY330 == 'v')
  2018. {
  2019. Ala++;
  2020. Asn++;
  2021. Dab++;
  2022. Dhb++;
  2023. Gln++;
  2024. Glu++;
  2025. Ile++;
  2026. Leu1++;
  2027. Leu2++;
  2028. Leu3++;
  2029. Leu4++;
  2030. Phe++;
  2031. Pro++;
  2032. Thr++;
  2033. Tyr1++;
  2034. Tyr2++;
  2035. Tyr3++;
  2036. Val2++;
  2037. Val3++;
  2038. Lys++;
  2039. }
  2040. if(x7xxY330 == 'I' || x7xxY330 == 'i')
  2041. {
  2042. Asp++;
  2043. Cys++;
  2044. Orn++;
  2045. Ser++;
  2046. Gly1++;
  2047. }
  2048. if(x7xxY330 == 'T' || x7xxY330 == 't')
  2049. {
  2050. Ile++;
  2051. Val1++;
  2052. Gly3++;
  2053. }
  2054. if(x7xxY330 == 'L' || x7xxY330 == 'l')
  2055. {
  2056. Phg++;
  2057. }
  2058. if(x7xxY330 == 'A' || x7xxY330 == 'a')
  2059. {
  2060. Pip++;
  2061. }
  2062. if(x7xxY330 == 'M' || x7xxY330 == 'm')
  2063. {
  2064. Leu2++;
  2065. Gly2++;
  2066. }
  2067. // analyze 331 residue
  2068. if(x7xxY331 == 'V' || x7xxY331 == 'v')
  2069. {
  2070. Aad++;
  2071. Leu1++;
  2072. Leu3++;
  2073. Pip++;
  2074. Pro++;
  2075. Tyr2++;
  2076. His++;
  2077. }
  2078. if(x7xxY331 == 'L' || x7xxY331 == 'l')
  2079. {
  2080. Ala++;
  2081. Val3++;
  2082. }
  2083. if(x7xxY331 == 'G' || x7xxY331 == 'g')
  2084. {
  2085. Asn++;
  2086. Asp++;
  2087. }
  2088. if(x7xxY331 == 'T' || x7xxY331 == 't')
  2089. {
  2090. Cys++;
  2091. }
  2092. if(x7xxY331 == 'W' || x7xxY331 == 'w')
  2093. {
  2094. Cys++;
  2095. }
  2096. if(x7xxY331 == 'S' || x7xxY331 == 's')
  2097. {
  2098. Dab++;
  2099. }
  2100. if(x7xxY331 == 'N' || x7xxY331 == 'n')
  2101. {
  2102. Dhb++;
  2103. }
  2104. if(x7xxY331 == 'D' || x7xxY331 == 'd')
  2105. {
  2106. Gln++;
  2107. Glu++;
  2108. Orn++;
  2109. Ser++;
  2110. }
  2111. if(x7xxY331 == 'Y' || x7xxY331 == 'y')
  2112. {
  2113. Ile++;
  2114. Val2++;
  2115. }
  2116. if(x7xxY331 == 'F' || x7xxY331 == 'f')
  2117. {
  2118. Ile++;
  2119. Leu4++;
  2120. Val1++;
  2121. }
  2122. if(x7xxY331 == 'M' || x7xxY331 == 'm')
  2123. {
  2124. Leu1++;
  2125. }
  2126. if(x7xxY331 == 'C' || x7xxY331 == 'c')
  2127. {
  2128. Phe++;
  2129. Phg++;
  2130. Tyr3++;
  2131. }
  2132. if(x7xxY331 == 'H' || x7xxY331 == 'h')
  2133. {
  2134. Thr++;
  2135. }
  2136. if(x7xxY331 == 'A' || x7xxY331 == 'a')
  2137. {
  2138. Tyr1++;
  2139. }
  2140. // determine most specific Gly
  2141. Gly = Gly1;
  2142. if(Gly2 > Gly)
  2143. {
  2144. Gly = Gly2;
  2145. }
  2146. if(Gly3 > Gly)
  2147. {
  2148. Gly = Gly3;
  2149. }
  2150. // determine most specific Tyr
  2151. Tyr = Tyr1;
  2152. if(Tyr2 > Tyr)
  2153. {
  2154. Tyr = Tyr2;
  2155. }
  2156. if(Tyr3 > Tyr)
  2157. {
  2158. Tyr = Tyr3;
  2159. }
  2160. // determine most specific Leu
  2161. Leu = Leu1;
  2162. if(Leu2 > Leu)
  2163. {
  2164. Leu = Leu2;
  2165. }
  2166. if(Leu3 > Leu)
  2167. {
  2168. Leu = Leu3;
  2169. }
  2170. // determine most specific Val
  2171. Val = Val1;
  2172. if(Val2 > Val)
  2173. {
  2174. Val = Val2;
  2175. }
  2176. if(Val3 > Val)
  2177. {
  2178. Val = Val3;
  2179. }
  2180. /* outputSpecificitiesNRPS( Aad, Ala, Asn, Asp, Cys, Dab, Dhb, Gln, Glu,
  2181. Ile, Leu, Orn, Phe, Phg, Pip, Pro, Ser, Thr,
  2182. Tyr, His, Lys, Gly, Val);*/
  2183. scoredSubstrate = maxSpecificityNRPS(Aad, Ala, Asn, Asp, Cys, Dab,
  2184. Dhb, Gln, Glu, Ile, Leu, Orn,
  2185. Phe, Phg, Pip, Pro, Ser, Thr,
  2186. Tyr, His, Lys, Gly, Val);
  2187. //cout << endl;
  2188. //cout << "scoredSubstrate: " << scoredSubstrate << endl;
  2189. clearPositions( pos235, pos236, pos239,
  2190. pos278, pos299, pos301, pos322, pos330,
  2191. pos331, pos58, pos59, pos60, pos61, pos62,
  2192. pos70, pos72, pos74, pos75, pos77, pos93,
  2193. pos189, pos191, pos198, pos199, pos200 );
  2194. Aad = 0;
  2195. Ala = 0;
  2196. Asn = 0;
  2197. Asp = 0;
  2198. Cys = 0;
  2199. Dab = 0;
  2200. Dhb = 0;
  2201. Gln = 0;
  2202. Glu = 0;
  2203. Ile = 0;
  2204. Leu = 0;
  2205. Orn = 0;
  2206. Phe = 0;
  2207. Phg = 0;
  2208. Pip = 0;
  2209. Pro = 0;
  2210. Ser = 0;
  2211. Thr = 0;
  2212. Tyr = 0;
  2213. Val = 0;
  2214. His = 0;
  2215. Lys = 0;
  2216. Gly = 0;
  2217. Gly1 = 0;
  2218. Gly2 = 0;
  2219. Gly3 = 0;
  2220. Leu1 = 0;
  2221. Leu2 = 0;
  2222. Leu3 = 0;
  2223. Leu4 = 0;
  2224. Tyr1 = 0;
  2225. Tyr2 = 0;
  2226. Tyr3 = 0;
  2227. Val1 = 0;
  2228. Val2 = 0;
  2229. Val3 = 0;
  2230. }
  2231. foundSubstrate = 0;
  2232. In.open("blast3.out");
  2233. In >> segment;
  2234. while( !In.fail())
  2235. {
  2236. if(segment == "*" && foundSubstrate == 0)
  2237. {
  2238. In >> segment;
  2239. predictedSubstrate = segment;
  2240. //cout << "predictedSubstrate: " << predictedSubstrate << endl;
  2241. foundSubstrate = 1;
  2242. }
  2243. In >> segment;
  2244. }
  2245. if(predictedSubstrate.length() == 0 && NRPS)
  2246. {
  2247. predictedSubstrate = scoredSubstrate; // unknown NRPS
  2248. }
  2249. if(predictedSubstrate.length() == 0 && PKS)
  2250. {
  2251. predictedSubstrate = scoredSubstrate; // unknown PKS
  2252. }
  2253. //reset
  2254. sbjct.clear();
  2255. query.clear();
  2256. sigSeq.clear();
  2257. In.close();
  2258. //cout << "predicted substrate: " << predictedSubstrate << endl;
  2259. cout << predictedSubstrate << endl;
  2260. // string to keep track of substrates
  2261. predictedSubstrates.append(predictedSubstrate);
  2262. predictedSubstrates.append(" ");
  2263. predictedSubstrate.clear();
  2264. NRPS = 0;
  2265. PKS = 0;
  2266. mod.clear();
  2267. }
  2268. }
  2269. In >> segment;
  2270. whileCount++;
  2271. }
  2272. whileCount++;
  2273. }
  2274. // } previous allframesread end?
  2275. //cout << "clusterwork" << endl;
  2276. int prevStart;
  2277. int clusterStart;
  2278. int clusterEnd;
  2279. int separationMax = 15000;
  2280. int aStore = 0;
  2281. int minLineLength = 60;
  2282. int lineLength = 1;
  2283. int startBuffer = 10000;
  2284. int endBuffer = 10000;
  2285. int domainWidth = 5000;
  2286. string clusterOut;
  2287. string aaGenome;
  2288. string str;
  2289. string str1;
  2290. string str2;
  2291. string strFile;
  2292. //string inputFilename = "genomes/"
  2293. string inputFilename = "genomes/"
  2294. + nucSeq;
  2295. int extraSpaceCount = 1;
  2296. stringstream outStr1;
  2297. stringstream outStr2;
  2298. ifstream InGenome;
  2299. ofstream OutCluster;
  2300. ofstream clustersSummary;
  2301. //InGenome.open(nucSeq.c_str());
  2302. InGenome.open(inputFilename.c_str());
  2303. InGenome.clear();
  2304. InGenome >> segment;
  2305. int minUnits = 3; // minimum domain of activating domains in a molecule
  2306. aaGenome = "";
  2307. aaGenome.clear();
  2308. int tempStart;
  2309. int tempRange;
  2310. float AATcount = 0;
  2311. float KScount = 0;
  2312. float minTransATratio = 1.5; // ratio of KS/AT
  2313. bool NRPSfound = 0;
  2314. bool PKSfound = 0;
  2315. // skip first lines
  2316. while(segment.length() < minLineLength && !In.fail())
  2317. {
  2318. InGenome >> segment;
  2319. }
  2320. while(!InGenome.fail())
  2321. {
  2322. aaGenome.append(segment);
  2323. aaGenome.append("\n");
  2324. InGenome >> segment;
  2325. if(segment.length() > minLineLength && lineLength == 1)
  2326. {
  2327. lineLength = segment.length();
  2328. }
  2329. }
  2330. if(lineLength < minLineLength)
  2331. {
  2332. lineLength = 70; // length of FASTA line
  2333. }
  2334. //cout << "genome size: " << aaGenome.length() << endl;
  2335. // searches and outputs clusters within a genome into files
  2336. for(int i = 0; i < sortedListOfDomains.getNumElems(); i++)
  2337. {
  2338. if( i == 0) // the first clusterStart
  2339. {
  2340. clusterStart = sortedListOfDomains.getElemAtIndex(0).getNucStartSite();
  2341. prevStart = clusterStart;
  2342. //cout << "0. i: " << i << endl;
  2343. }
  2344. for(int a = i+1; a < sortedListOfDomains.getNumElems(); a++)
  2345. {
  2346. aStore = a;
  2347. // if next domain is within a range of the previous domain, designate
  2348. // it to be within the cluster
  2349. if(sortedListOfDomains.getElemAtIndex(a).getNucStartSite() -
  2350. prevStart < separationMax)
  2351. {
  2352. prevStart = sortedListOfDomains.getElemAtIndex(a).getNucStartSite();
  2353. }
  2354. // if next start site is significantly apart, designate it ias the
  2355. // end of the cluster
  2356. if( sortedListOfDomains.getElemAtIndex(a).getNucStartSite() -
  2357. prevStart > separationMax )
  2358. {
  2359. clusterStart = sortedListOfDomains.getElemAtIndex(i).getNucStartSite();
  2360. clusterEnd = sortedListOfDomains.getElemAtIndex(a-1).getNucStartSite() + domainWidth;
  2361. cout << ".clusterStart: " << clusterStart
  2362. << " .clusterEnd: " << clusterEnd << " " << a - i << endl;
  2363. //cout << "1. a: " << a << " i: " << i << endl;
  2364. if(a - i >= minUnits)
  2365. {
  2366. tempStart = clusterStart + clusterStart/lineLength - startBuffer;
  2367. tempRange = clusterEnd - clusterStart + (clusterEnd - clusterStart)/lineLength + endBuffer;
  2368. if(tempStart < 0)
  2369. {
  2370. tempStart = 1;
  2371. }
  2372. if(tempRange > (aaGenome.length() - tempStart))
  2373. {
  2374. tempRange = aaGenome.length() - tempStart - 1;
  2375. }
  2376. clusterOut = aaGenome.substr(tempStart, tempRange);
  2377. for(int b = i; b < a; b++)
  2378. {
  2379. // cout << "!!"
  2380. // << sortedListOfDomains.getElemAtIndex(b).getFormula();
  2381. if(sortedListOfDomains.getElemAtIndex(b).getFormula()=="NRPS")
  2382. {
  2383. NRPSfound = 1;
  2384. }
  2385. if(sortedListOfDomains.getElemAtIndex(b).getFormula() == "PKS")
  2386. {
  2387. PKSfound = 1;
  2388. }
  2389. }
  2390. if(NRPSfound && PKSfound)
  2391. {
  2392. mixedNRPSPKScount++;
  2393. }
  2394. else if(NRPSfound)
  2395. {
  2396. NRPScount++;
  2397. }
  2398. else if (PKSfound)
  2399. {
  2400. PKScount++;
  2401. }
  2402. NRPSfound = 0;
  2403. PKSfound = 0;
  2404. outStr1.clear();
  2405. outStr2.clear();
  2406. outStr1.str("");
  2407. outStr2.str("");
  2408. str1.clear();
  2409. str2.clear();
  2410. str.clear();
  2411. outStr1 << clusterStart;
  2412. str1 = outStr1.str();
  2413. outStr2 << clusterEnd;
  2414. str2 = outStr2.str();
  2415. strFile = "clusters/"
  2416. + nucSeq + "-" + str1 + "-" + str2 + ".txt";
  2417. str = ">" + str1 + "-" + str2;
  2418. OutCluster.open(strFile.c_str());
  2419. OutCluster << str << "\n";
  2420. OutCluster << clusterOut;
  2421. allClusters.append(strFile);
  2422. allClusters.append("\n");
  2423. OutCluster.close();
  2424. OutCluster.clear();
  2425. }
  2426. //as long as domain analyzed is within the total number of domains
  2427. if(a < sortedListOfDomains.getNumElems())
  2428. {
  2429. clusterStart = sortedListOfDomains.getElemAtIndex(a).getNucStartSite();
  2430. prevStart = clusterStart;
  2431. }
  2432. //cout << endl;
  2433. //cout << "i: " << i << " a: " << a << endl;
  2434. i = a - 1;
  2435. aStore = a;
  2436. a = sortedListOfDomains.getNumElems();
  2437. // if next domain is last domain, iterate i since loop cannot iterate i
  2438. if(aStore == sortedListOfDomains.getNumElems()-1)
  2439. {
  2440. i++;
  2441. }
  2442. }
  2443. if(aStore == sortedListOfDomains.getNumElems()-1) // if domain is last domain in list
  2444. {
  2445. clusterEnd = sortedListOfDomains.getElemAtIndex(aStore).getNucStartSite() + domainWidth;
  2446. cout << "clusterStart: " << clusterStart
  2447. << " clusterEnd: " << clusterEnd << " " << aStore + 1 - i << endl;
  2448. //cout << "2. a: " << aStore + 1 << " i: " << i << endl;
  2449. if(a - i >= minUnits)
  2450. {
  2451. for(int b = i; b < aStore; b++)
  2452. {
  2453. //cout << "!!"
  2454. // << sortedListOfDomains.getElemAtIndex(b).getFormula();
  2455. if(sortedListOfDomains.getElemAtIndex(b).getFormula() == "NRPS")
  2456. {
  2457. NRPSfound = 1;
  2458. }
  2459. if(sortedListOfDomains.getElemAtIndex(b).getFormula() == "PKS")
  2460. {
  2461. PKSfound = 1;
  2462. }
  2463. }
  2464. if(NRPSfound && PKSfound)
  2465. {
  2466. mixedNRPSPKScount++;
  2467. }
  2468. else if(NRPSfound)
  2469. {
  2470. NRPScount++;
  2471. }
  2472. else if(PKSfound)
  2473. {
  2474. PKScount++;
  2475. }
  2476. NRPSfound = 0;
  2477. PKSfound = 0;
  2478. tempStart = clusterStart + clusterStart/lineLength - startBuffer;
  2479. tempRange = clusterEnd - clusterStart + (clusterEnd - clusterStart)/lineLength + endBuffer;
  2480. if(tempStart < 0)
  2481. {
  2482. tempStart = 1;
  2483. }
  2484. if(tempRange > (aaGenome.length() - tempStart))
  2485. {
  2486. tempRange = aaGenome.length() - tempStart -1;
  2487. }
  2488. clusterOut = aaGenome.substr(tempStart, tempRange);
  2489. outStr1.clear();
  2490. outStr2.clear();
  2491. outStr1.str("");
  2492. outStr2.str("");
  2493. str1.clear();
  2494. str2.clear();
  2495. str.clear();
  2496. outStr1 << clusterStart;
  2497. str1 = outStr1.str();
  2498. outStr2 << clusterEnd;
  2499. str2 = outStr2.str();
  2500. strFile = "clusters/" + nucSeq + "-" + str1 + "-" + str2 + ".txt";
  2501. str = ">" + str1 + "-" + str2;
  2502. OutCluster.open(strFile.c_str());
  2503. OutCluster << str << "\n";
  2504. OutCluster << clusterOut;
  2505. allClusters.append(strFile);
  2506. allClusters.append("\n ");
  2507. OutCluster.close();
  2508. OutCluster.clear();
  2509. }
  2510. //cout << "2. i: " << i << " a: " << a << endl;
  2511. i = a;
  2512. aStore = a;
  2513. a = sortedListOfDomains.getNumElems();
  2514. }
  2515. }
  2516. } // cluster search complete
  2517. InGenome.close();
  2518. for(int i = 0; i < sortedListOfMods.getNumElems(); i++)
  2519. {
  2520. if(sortedListOfMods.getElemAtIndex(i).getFormula() == "IspH")
  2521. {
  2522. cout << sortedListOfMods.getElemAtIndex(i).getFormula()
  2523. << " " << sortedListOfMods.getElemAtIndex(i).getStartSite()
  2524. << endl;
  2525. cout << "NON-MEVALONATE PATHWAY TERPENOID GENE IDENTIFIED." << endl;
  2526. mepCount++;
  2527. }
  2528. if(sortedListOfMods.getElemAtIndex(i).getFormula() == "Mvd1")
  2529. {
  2530. cout << sortedListOfMods.getElemAtIndex(i).getFormula()
  2531. << " " << sortedListOfMods.getElemAtIndex(i).getStartSite()
  2532. << endl;
  2533. cout << "MEVALONATE PATHWAY GENE IDENTIFIED (TERPENOID)." << endl;
  2534. mvaCount++;
  2535. }
  2536. // Type of mevalonate pathway
  2537. if(sortedListOfMods.getElemAtIndex(i).getFormula() == "IPP1" ||
  2538. sortedListOfMods.getElemAtIndex(i).getFormula() == "IPP2")
  2539. {
  2540. // cout << sortedListOfMods.getElemAtIndex(i).getFormula()
  2541. // << " " << sortedListOfMods.getElemAtIndex(i).getStartSite()
  2542. // << endl;
  2543. }
  2544. }
  2545. // collect AT or A and KS from previous lists into one new list
  2546. for(int a = 0; a < sortedListOfDomains.getNumElems(); a++)
  2547. {
  2548. domain = new UnitClass2;
  2549. domain->setStartSite(sortedListOfDomains.getElemAtIndex(a).getStartSite());
  2550. domain->setFormula(sortedListOfDomains.getElemAtIndex(a).getFormula());
  2551. domain->setEndSite(sortedListOfDomains.getElemAtIndex(a).getEndSite());
  2552. domain->setNucStartSite(sortedListOfDomains.getElemAtIndex(a).getNucStartSite());
  2553. sortedListOfATKS.insertValue(*domain);
  2554. delete domain;
  2555. }
  2556. for(int a = 0; a < sortedListOfMods.getNumElems(); a++)
  2557. {
  2558. if(sortedListOfMods.getElemAtIndex(a).getFormula() == "KS")
  2559. {
  2560. domain = new UnitClass2;
  2561. domain->setStartSite(sortedListOfMods.getElemAtIndex(a).getStartSite());
  2562. domain->setFormula(sortedListOfMods.getElemAtIndex(a).getFormula());
  2563. domain->setEndSite(sortedListOfMods.getElemAtIndex(a).getEndSite());
  2564. domain->setNucStartSite(sortedListOfMods.getElemAtIndex(a).getNucStartSite());
  2565. sortedListOfATKS.insertValue(*domain);
  2566. delete domain;
  2567. }
  2568. }
  2569. // check list
  2570. /*for(int a = 0; a < sortedListOfATKS.getNumElems(); a++)
  2571. {
  2572. cout << sortedListOfATKS.getElemAtIndex(a).getFormula() << " "
  2573. << sortedListOfATKS.getElemAtIndex(a).getNucStartSite() << endl;
  2574. }*/
  2575. // Searching for Trans AT clusters
  2576. // searches and outputs clusters within a genome into files
  2577. for(int i = 0; i < sortedListOfATKS.getNumElems(); i++)
  2578. {
  2579. if( i == 0) // the first clusterStart
  2580. {
  2581. clusterStart = sortedListOfATKS.getElemAtIndex(0).getNucStartSite();
  2582. prevStart = clusterStart;
  2583. if(sortedListOfATKS.getElemAtIndex(i).getFormula() == "NRPS" ||
  2584. sortedListOfATKS.getElemAtIndex(i).getFormula() == "PKS")
  2585. {
  2586. AATcount++;
  2587. }
  2588. if(sortedListOfATKS.getElemAtIndex(i).getFormula() == "KS")
  2589. {
  2590. KScount++;
  2591. }
  2592. }
  2593. for(int a = i+1; a < sortedListOfATKS.getNumElems(); a++)
  2594. {
  2595. aStore = a;
  2596. // if next domain is within range of the previous domain, designate
  2597. // it to be within the cluster
  2598. if(sortedListOfATKS.getElemAtIndex(a).getFormula() == "NRPS" ||
  2599. sortedListOfATKS.getElemAtIndex(a).getFormula() == "PKS")
  2600. {
  2601. AATcount++;
  2602. }
  2603. if(sortedListOfATKS.getElemAtIndex(a).getFormula() == "KS")
  2604. {
  2605. KScount++;
  2606. }
  2607. if(sortedListOfATKS.getElemAtIndex(a).getNucStartSite() -
  2608. prevStart < separationMax)
  2609. {
  2610. prevStart = sortedListOfATKS.getElemAtIndex(a).getNucStartSite();
  2611. }
  2612. // if next start site is significantly apart, designate it as the
  2613. // end of the cluster
  2614. if(sortedListOfATKS.getElemAtIndex(a).getNucStartSite() -
  2615. prevStart > separationMax)
  2616. {
  2617. clusterStart = sortedListOfATKS.getElemAtIndex(i).getNucStartSite();
  2618. clusterEnd = sortedListOfATKS.getElemAtIndex(a-1).getNucStartSite() +
  2619. domainWidth;
  2620. //cout << ".clusterStart: " << clusterStart
  2621. // <<" .clusterEnd: " << clusterEnd << " " << a - i << endl;
  2622. if(a - i >= minUnits)
  2623. {
  2624. //cout << "KScount: " << KScount << " AATcount: " << AATcount << endl;
  2625. cout << "clusterStart: " << clusterStart << " KS/AAT ratio: "
  2626. << KScount/AATcount << endl;
  2627. if(KScount/AATcount > minTransATratio)
  2628. {
  2629. cout << "TRANS AT PKS SYSTEM IDENTIFIED." << endl;
  2630. transATcount++;
  2631. clusterOut = aaGenome.substr(clusterStart + clusterStart/lineLength
  2632. - startBuffer, clusterEnd - clusterStart + (clusterEnd -
  2633. clusterStart)/lineLength + endBuffer);
  2634. outStr1.clear();
  2635. outStr2.clear();
  2636. outStr1.str("");
  2637. outStr2.str("");
  2638. str1.clear();
  2639. str2.clear();
  2640. outStr1 << clusterStart;
  2641. str1 = outStr1.str();
  2642. outStr2 << clusterEnd;
  2643. str2 = outStr2.str();
  2644. strFile = "clusters/transAT/" + nucSeq + "-" + str1 + "-" + str2
  2645. + ".txt";
  2646. str = ">" + str1 + "-" + str2;
  2647. OutCluster.open(strFile.c_str());
  2648. OutCluster << str << "\n";
  2649. OutCluster << clusterOut;
  2650. allClusters.append(strFile);
  2651. allClusters.append("\n");
  2652. OutCluster.close();
  2653. OutCluster.clear();
  2654. } // end of if statement 'if(KScout/AATcount > minTransATratio...'
  2655. }
  2656. // as long as domain analyzed is within total number of domains
  2657. if(a < sortedListOfATKS.getNumElems())
  2658. {
  2659. clusterStart = sortedListOfATKS.getElemAtIndex(a).getNucStartSite();
  2660. prevStart = clusterStart;
  2661. }
  2662. i = a - 1;
  2663. aStore = a;
  2664. a = sortedListOfATKS.getNumElems();
  2665. AATcount = 0;
  2666. KScount = 0;
  2667. }
  2668. if(aStore == sortedListOfATKS.getNumElems()-1)
  2669. {
  2670. clusterEnd = sortedListOfATKS.getElemAtIndex(aStore).getNucStartSite()
  2671. + domainWidth;
  2672. //cout << "clusterStart: " << clusterStart
  2673. // << " clusterEnd: " << clusterEnd << " " << a - i - 1 << endl;
  2674. if(a - i - 1 >= minUnits)
  2675. {
  2676. if(KScount/AATcount > minTransATratio)
  2677. {
  2678. clusterOut = aaGenome.substr(clusterStart + clusterStart/lineLength
  2679. - startBuffer, clusterEnd - clusterStart + (clusterEnd-clusterStart)/ lineLength + endBuffer);
  2680. outStr1.clear();
  2681. outStr2.clear();
  2682. outStr1.str("");
  2683. outStr2.str("");
  2684. str1.clear();
  2685. str2.clear();
  2686. outStr1 << clusterStart;
  2687. str1 = outStr1.str();
  2688. outStr2 << clusterEnd;
  2689. str2 = outStr2.str();
  2690. strFile = "clusters/transAT/" + nucSeq + "-" + str1 + "-" + str2 +
  2691. ".txt";
  2692. str = ">" + str1 + "-" + str2;
  2693. OutCluster.open(strFile.c_str());
  2694. OutCluster << str << "\n";
  2695. OutCluster << clusterOut;
  2696. allClusters.append(strFile);
  2697. allClusters.append("\n");
  2698. OutCluster.close();
  2699. OutCluster.clear();
  2700. }
  2701. }
  2702. i = a;
  2703. aStore = a;
  2704. a = sortedListOfATKS.getNumElems();
  2705. }
  2706. }
  2707. } // cluster search complete
  2708. aaGenome.clear();
  2709. sortedListOfDomains.clear();
  2710. sortedListOfMods.clear();
  2711. sortedListOfATKS.clear();
  2712. std::stringstream ss;
  2713. std::string NRPScountStr;
  2714. ss << NRPScount;
  2715. ss >> NRPScountStr;
  2716. clustersSummary.open("SMILES/summary.html");
  2717. clustersSummary << " <html><head><title></title></head> ";
  2718. clustersSummary << " <body bgcolor = '#F4FFE4'> <font face='tahoma' color='#254117'>";
  2719. clustersSummary << "<b>" << NRPScountStr << "</b>"
  2720. << " modular NRPSs" << "</br>";
  2721. clustersSummary << "<b>" << PKScount << "</b>" << " modular PKSs" << "</br>";
  2722. clustersSummary << "<b>" << mixedNRPSPKScount << "</b>"
  2723. << " mixed modular NRPS/PKSs" << "</br>" << "</br>";
  2724. clustersSummary << "<b>" << transATcount << "</b>" << " trans AT PKSs"
  2725. << "</br>" << "</br>";
  2726. clustersSummary << "<b>" << mvaCount << "</b>"
  2727. << " mevalonate terpenoid mva genes" << "</br>";
  2728. clustersSummary << "<b>" << mepCount << "</b>"
  2729. << " non-mevalonate terpenoid mep genes" << "</br>";
  2730. clustersSummary.close();
  2731. clustersSummary << "</font></body></html>";
  2732. } // allframes read end
  2733. }
  2734. int main( int argc , char* argv[])
  2735. {
  2736. string nucleotideSeq;
  2737. string predictedSubstrates;
  2738. string outputSeq;
  2739. string revCompSeq;
  2740. string tempSeq;
  2741. string inputFilename;
  2742. ifstream In;
  2743. ofstream allClustersFile;
  2744. QueueClass< UnitClass2 > aaNameList;
  2745. bool allFramesRead = 0;
  2746. string allClusters;
  2747. string clusters;
  2748. string aReaction;
  2749. string aReaction2;
  2750. string aReaction3;
  2751. float minMass;
  2752. float maxMass;
  2753. string outputFilename;
  2754. string fileName;
  2755. int NRPScount = 0;
  2756. int PKScount = 0;
  2757. int mixedNRPSPKScount = 0;
  2758. int transATcount = 0;
  2759. int mepCount = 0;
  2760. int mvaCount = 0;
  2761. int charCount = 0;
  2762. ofstream runTime; // time it takes for program to run genome
  2763. int charTime; // time estimated time needed to analyze genome
  2764. int charTimeRatio = 3000000; // basepairs analyzed per minute
  2765. string character;
  2766. string displayUnknown = "DDU";
  2767. cout << "argc = " << argc << endl;
  2768. for(int i = 0; i < argc; i++)
  2769. {
  2770. cout << "argv[" << i << "] = " << argv[i] << endl;
  2771. }
  2772. if(argc == 1)
  2773. {
  2774. cout << "no file name supplied" << endl;
  2775. return 0;
  2776. }
  2777. if(argc == 2) // if a filename is typed but no limits on mass
  2778. {
  2779. minMass = 1; // default
  2780. maxMass = 5000; // default
  2781. }
  2782. if(argc == 3) // if a min value is typed by not max
  2783. {
  2784. minMass = atof(argv[2]);
  2785. if(minMass < 0 || minMass > 5000)
  2786. {
  2787. minMass = 1;
  2788. }
  2789. maxMass = 5000; // arbitrary max mass
  2790. }
  2791. if(argc > 3) // if both min and max values are typed
  2792. {
  2793. minMass = atof(argv[2]);
  2794. if(minMass < 0 || minMass > 5000)
  2795. {
  2796. minMass = 1;
  2797. }
  2798. maxMass = atof(argv[3]);
  2799. if(maxMass < 0 || maxMass > 5000)
  2800. {
  2801. maxMass = 5000;
  2802. }
  2803. }
  2804. if(argc > 4) // if further reactions are done, store them
  2805. {
  2806. aReaction = argv[4];
  2807. if(aReaction != "CY")
  2808. {
  2809. aReaction = "AA";
  2810. }
  2811. }
  2812. if(argc > 5)
  2813. {
  2814. aReaction2 = argv[5];
  2815. if(aReaction2 != "GL")
  2816. {
  2817. aReaction2 = "AA";
  2818. }
  2819. }
  2820. if(argc > 6) // if displaying unknown reactions
  2821. {
  2822. displayUnknown = argv[6];
  2823. if(displayUnknown != "DU")
  2824. {
  2825. displayUnknown = "DDU";
  2826. }
  2827. else
  2828. {
  2829. //cout << "displayUnknown!!!!!!!!!!!!!!!!!!" << endl;
  2830. }
  2831. }
  2832. if(argc > 7) // dimerize or not
  2833. {
  2834. aReaction3 = argv[7];
  2835. cout << "aReactioN: " << aReaction3 << endl;
  2836. if(aReaction3 == "ND")
  2837. {
  2838. aReaction3 = "NODIM";
  2839. }
  2840. else
  2841. {
  2842. aReaction3 == "DIM";
  2843. }
  2844. }
  2845. nucleotideSeq = argv[1];
  2846. inputFilename = "genomes/" + nucleotideSeq;
  2847. /*
  2848. cout << "inputFile: " << inputFilename << endl;
  2849. In.open(inputFilename.c_str());
  2850. if(In.fail())
  2851. {
  2852. cout << "Invalid file." << endl;
  2853. return 0;
  2854. }
  2855. else
  2856. {
  2857. In >> character;
  2858. while(!In.fail())
  2859. {
  2860. In >> character;
  2861. charCount = charCount + character.length();
  2862. }
  2863. charTime = charCount / charTimeRatio;
  2864. runTime.open("SMILES/runTime");
  2865. runTime << charTime;
  2866. runTime.close();
  2867. }
  2868. In.close();
  2869. */
  2870. //revCompSeq = revComp(nucleotideSeq);
  2871. revCompSeq = revComp(inputFilename, nucleotideSeq);
  2872. tempSeq = "rev-" + nucleotideSeq;
  2873. cout << "Unchanged sequence: " << endl;
  2874. In.open(inputFilename.c_str());
  2875. {
  2876. if(In.fail())
  2877. {
  2878. cout << "Error: no file found." << endl;
  2879. return 0;
  2880. }
  2881. }
  2882. In.close();
  2883. In.clear();
  2884. for(int i = 0; i < 3; i++)
  2885. {
  2886. if(i == 2)
  2887. {
  2888. allFramesRead = 1;
  2889. }
  2890. translateSeq(inputFilename, outputSeq, i);
  2891. predictDomains("aminoAcidSeq.txt",
  2892. predictedSubstrates, aaNameList,
  2893. allFramesRead, i, nucleotideSeq,
  2894. allClusters, NRPScount, PKScount,
  2895. mixedNRPSPKScount, transATcount,
  2896. mepCount, mvaCount);
  2897. }
  2898. allFramesRead = 0;
  2899. if(predictedSubstrates.length() > 0)
  2900. {
  2901. cout << predictedSubstrates << endl;
  2902. }
  2903. else
  2904. {
  2905. cout << "Unchanged sequence yielded no hits" << endl;
  2906. }
  2907. cout << endl;
  2908. predictedSubstrates.clear();
  2909. aaNameList.clear();
  2910. cout << "Reverse complement sequence: " << endl;
  2911. //inputFilename = "genomes/rev-"
  2912. inputFilename = "genomes/rev-"
  2913. + nucleotideSeq;
  2914. for(int i = 0; i < 3; i++)
  2915. {
  2916. if(i == 2)
  2917. {
  2918. allFramesRead = 1;
  2919. }
  2920. translateSeq(inputFilename, outputSeq, i);
  2921. predictDomains("aminoAcidSeq.txt",
  2922. predictedSubstrates, aaNameList,
  2923. allFramesRead, i, tempSeq,
  2924. allClusters, NRPScount, PKScount,
  2925. mixedNRPSPKScount, transATcount,
  2926. mepCount, mvaCount);
  2927. }
  2928. allFramesRead = 0;
  2929. if(predictedSubstrates.length() > 0)
  2930. {
  2931. cout << predictedSubstrates << endl;
  2932. }
  2933. else
  2934. {
  2935. cout << "Reverse complement sequence yielded no hits" << endl;
  2936. }
  2937. allClustersFile.open("clusters.txt");
  2938. allClustersFile << allClusters;
  2939. allClustersFile.close();
  2940. allClusters.clear();
  2941. In.clear();
  2942. In.open("clusters.txt");
  2943. cout << "after opened clusters.txt" << endl;
  2944. In >> outputFilename;
  2945. cout << "outputFilename: " << outputFilename << endl;
  2946. //string minNum = "1";
  2947. //string maxNum = "5000";
  2948. char *out1;
  2949. char *out2;
  2950. out1 = (char *)malloc(100);
  2951. out2 = (char *)malloc(100);
  2952. sprintf(out1, "%f\n", minMass);
  2953. sprintf(out2, "%f\n", maxMass);
  2954. printf(out1, "%f\n", minMass);
  2955. printf(out2, "%f\n", maxMass);
  2956. while(!In.fail())
  2957. {
  2958. fileName = outputFilename.substr(9);
  2959. spawnl(1, "./main.exe", "main.exe", fileName.c_str(),
  2960. out1, out2, aReaction.c_str(), aReaction2.c_str(),
  2961. displayUnknown.c_str(), aReaction3.c_str(),
  2962. fileName.c_str(), NULL);
  2963. In >> outputFilename;
  2964. }
  2965. In.close();
  2966. return 0;
  2967. }