PageRenderTime 67ms CodeModel.GetById 30ms RepoModel.GetById 1ms app.codeStats 0ms

/NRPeditor/clusterFinder.cpp

https://bitbucket.org/antismash/antismash1
C++ | 3156 lines | 2778 code | 187 blank | 191 comment | 925 complexity | 7188e981c84876c41029f52412a10742 MD5 | raw file

Large files files are truncated, but you can click here to view the full file

  1. #include <iostream>
  2. #include <string>
  3. #include <fstream>
  4. #include <stdlib.h>
  5. #include <sstream>
  6. using namespace std;
  7. #include "constants.h"
  8. #include "UnitClass2.h"
  9. #include "QueueClass.h"
  10. #include "ReactionClass.h"
  11. #include "spawnl.h"
  12. // function declarations
  13. // overloading operator for outputing nodes
  14. ostream& operator <<(ostream& outputStream, const UnitClass2& domain );
  15. ostream& operator <<(ostream& outputStream, const UnitClass2& domain )
  16. {
  17. if( domain.getStartSite() == 0 )
  18. {
  19. outputStream << domain.getUnit() << " "
  20. << domain.getSmile() << endl;
  21. }
  22. else
  23. {
  24. outputStream << " Formula: "
  25. << domain.getFormula()
  26. << " start site: "
  27. << domain.getStartSite() << endl
  28. << " " << domain.getSmile() << endl;
  29. }
  30. return outputStream;
  31. }
  32. void clearPositions( string &pos235, string &pos236, string &pos239,
  33. string &pos278, string &pos299, string &pos301, string &pos322,
  34. string &pos330, string &pos331, string &pos58, string &pos59,
  35. string &pos60, string &pos61, string &pos62, string &pos70,
  36. string &pos72, string &pos74, string &pos75, string &pos77,
  37. string &pos93, string &pos189, string &pos191, string &pos198,
  38. string &pos199, string &pos200 );
  39. void outputSpecificitiesNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  40. int Dab, int Dhb, int Gln, int Glu, int Ile,
  41. int Leu, int Orn, int Phe, int Phg, int Pip,
  42. int Pro, int Ser, int Thr, int Tyr, int His,
  43. int Lys, int Gly, int Val);
  44. void outputSpecificitiesPKS( int mal, int mmal, int emal, int omal);
  45. string maxSpecificityNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  46. int Dab, int Dhb, int Gln, int Glu, int Ile,
  47. int Leu, int Orn, int Phe, int Phg, int Pip,
  48. int Pro, int Ser, int Thr, int Tyr, int His,
  49. int Lys, int Gly, int Val);
  50. string maxSpecificityPKS( int mal, int mmal, int emal, int omal);
  51. int getIntVal(string strConvert);
  52. float getFloatVal(string strConvert);
  53. void translateSeq( string &inSeq, string &outputSeq, int &frame);
  54. void predictDomains( string aminoAcidSeq,
  55. string &predictedSubstrates,
  56. QueueClass< UnitClass2 > &aaNameList,
  57. const bool &allFramesRead,
  58. const int &frame, const string &nucSeq,
  59. string &allClusters, int&NRPScount, int &PKScount,
  60. int &mixedNRPSPKScount, int &transATcount, int &mepCount,
  61. int &mvaCount);
  62. // function bodies
  63. void outputSpecificitiesPKS( int mal, int mmal, int emal, int omal)
  64. {
  65. cout << "PKS specificity scores: " << endl;
  66. cout << "mal: " << mal << " ";
  67. cout << "mmal: " << mmal << " ";
  68. cout << "emal: " << emal << " ";
  69. cout << "omal: " << omal << " ";
  70. cout << endl;
  71. }
  72. void outputSpecificitiesNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  73. int Dab, int Dhb, int Gln, int Glu, int Ile,
  74. int Leu, int Orn, int Phe, int Phg, int Pip,
  75. int Pro, int Ser, int Thr, int Tyr, int His,
  76. int Lys, int Gly, int Val)
  77. {
  78. cout << "NRPS specificity scores: " << endl;
  79. cout << "Aad: " << Aad << " ";
  80. cout << "Ala: " << Ala << " ";
  81. cout << "Asn: " << Asn << " ";
  82. cout << "Asp: " << Asp << " ";
  83. cout << "Cys: " << Cys << " ";
  84. cout << "Dab: " << Dab << " ";
  85. cout << "Dhb: " << Dhb << " ";
  86. cout << "Gln: " << Gln << " ";
  87. cout << "Glu: " << Glu << " ";
  88. cout << "Gly: " << Gly << " ";
  89. cout << "His: " << His << endl;
  90. cout << "Ile: " << Ile << " ";
  91. cout << "Leu: " << Leu << " ";
  92. cout << "Lys: " << Lys << " ";
  93. cout << "Orn: " << Orn << " ";
  94. cout << "Phe: " << Phe << " ";
  95. cout << "Phg: " << Phg << " ";
  96. cout << "Pip: " << Pip << " ";
  97. cout << "Pro: " << Pro << " ";
  98. cout << "Ser: " << Ser << " ";
  99. cout << "Thr: " << Thr << " ";
  100. cout << "Tyr: " << Tyr << endl;
  101. cout << "Val: " << Val << " ";
  102. cout << endl;
  103. }
  104. string maxSpecificityPKS( int mal, int mmal, int emal, int omal)
  105. {
  106. int max;
  107. int distinguishingFactor = 5;
  108. string CA;
  109. max = mal;
  110. CA = "mal";
  111. if(mmal > mal)
  112. {
  113. max = mmal;
  114. CA = "mmal";
  115. }
  116. if(emal > max)
  117. {
  118. max = emal;
  119. CA = "emal";
  120. }
  121. if(omal > max)
  122. {
  123. max = omal;
  124. CA = "omal";
  125. }
  126. if(max < distinguishingFactor)
  127. {
  128. CA = "pk";
  129. }
  130. return CA;
  131. }
  132. string maxSpecificityNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  133. int Dab, int Dhb, int Gln, int Glu, int Ile,
  134. int Leu, int Orn, int Phe, int Phg, int Pip,
  135. int Pro, int Ser, int Thr, int Tyr, int His,
  136. int Lys, int Gly, int Val)
  137. {
  138. int max;
  139. int distinguishingFactor = 3;
  140. string AA;
  141. max = Aad;
  142. AA = "Aad";
  143. if(Ala >= Aad)
  144. {
  145. max = Ala;
  146. AA = "ala";
  147. }
  148. if(Asn >= max)
  149. {
  150. max = Asn;
  151. AA = "asn";
  152. }
  153. if(Asp >= max)
  154. {
  155. max = Asp;
  156. AA = "asp";
  157. }
  158. if(Cys >= max)
  159. {
  160. max = Cys;
  161. AA = "cys";
  162. }
  163. if(Dab >= max)
  164. {
  165. max = Dab;
  166. AA = "dab";
  167. }
  168. if(Dhb >= max)
  169. {
  170. max = Dhb;
  171. AA = "dhb";
  172. }
  173. if(Gln >= max)
  174. {
  175. max = Gln;
  176. AA = "gln";
  177. }
  178. if(Glu >= max)
  179. {
  180. max = Glu;
  181. AA = "glu";
  182. }
  183. if(Ile >= max)
  184. {
  185. max = Ile;
  186. AA = "ile";
  187. }
  188. if(Leu >= max)
  189. {
  190. max = Leu;
  191. AA = "leu";
  192. }
  193. if(Orn >= max)
  194. {
  195. max = Orn;
  196. AA = "orn";
  197. }
  198. if(Phe >= max)
  199. {
  200. max = Phe;
  201. AA = "phe";
  202. }
  203. if(Phg >= max)
  204. {
  205. max = Phg;
  206. AA = "phg";
  207. }
  208. if(Pip >= max)
  209. {
  210. max = Pip;
  211. AA = "pip";
  212. }
  213. if(Pro >= max)
  214. {
  215. max = Pro;
  216. AA = "pro";
  217. }
  218. if(Ser >= max)
  219. {
  220. max = Ser;
  221. AA = "ser";
  222. }
  223. if(Thr >= max)
  224. {
  225. max = Thr;
  226. AA = "thr";
  227. }
  228. if(Tyr >= max)
  229. {
  230. max = Tyr;
  231. AA = "tyr";
  232. }
  233. if(His >= max)
  234. {
  235. max = His;
  236. AA = "his";
  237. }
  238. if(Lys >= max)
  239. {
  240. max = Lys;
  241. AA = "lys";
  242. }
  243. if(Gly >= max)
  244. {
  245. max = Gly;
  246. AA = "gly";
  247. }
  248. if(Val >= max)
  249. {
  250. max = Val;
  251. AA = "val";
  252. }
  253. if( max < distinguishingFactor)
  254. {
  255. AA = "nrp";
  256. }
  257. return AA;
  258. }
  259. int getIntVal(string strConvert)
  260. {
  261. int intReturn;
  262. // NOTE: You should probably do some checks to ensure that
  263. // this string contains only numbers. If the string is not
  264. // a valid integer, zero will be returned.
  265. intReturn = atoi(strConvert.c_str());
  266. return(intReturn);
  267. }
  268. float getFloatVal(string strConvert)
  269. {
  270. float floatReturn;
  271. // NOTE: You should probably do some checks to ensure that
  272. // this string contains only numbers. If the string is not
  273. // a valid integer, zero will be returned.
  274. floatReturn = atof(strConvert.c_str());
  275. return(floatReturn);
  276. }
  277. void clearPositions( string &pos235, string &pos236, string &pos239,
  278. string &pos278, string &pos299, string &pos301, string &pos322,
  279. string &pos330, string &pos331, string &pos58, string &pos59,
  280. string &pos60, string &pos61, string &pos62, string &pos70,
  281. string &pos72, string &pos74, string &pos75, string &pos77,
  282. string &pos93, string &pos189, string &pos191, string &pos198,
  283. string &pos199, string &pos200 )
  284. {
  285. pos235 = "";
  286. pos236 = "";
  287. pos239 = "";
  288. pos278 = "";
  289. pos299 = "";
  290. pos301 = "";
  291. pos322 = "";
  292. pos330 = "";
  293. pos331 = "";
  294. pos58 = "";
  295. pos59 = "";
  296. pos60 = "";
  297. pos61 = "";
  298. pos62 = "";
  299. pos70 = "";
  300. pos72 = "";
  301. pos74 = "";
  302. pos75 = "";
  303. pos77 = "";
  304. pos93 = "";
  305. pos189 = "";
  306. pos191 = "";
  307. pos198 = "";
  308. pos199 = "";
  309. pos200 = "";
  310. }
  311. // finds and returns the reverse complement sequence
  312. string revComp( const string &inSeq, const string &nucleotideSeq);
  313. string revComp( const string &inSeq, const string &nucleotideSeq)
  314. {
  315. string inputSeq;
  316. string outputSeq;
  317. string segment;
  318. string outputFilename;
  319. //string outFile = "rev-" + inSeq;
  320. int seqMin = 50; // minimum length of nucleotide line
  321. ifstream In;
  322. ofstream Out;
  323. int lineCharsCount = 0;
  324. int lineCharsMax = 70;
  325. In.open(inSeq.c_str());
  326. outputFilename = "genomes/rev-"
  327. + nucleotideSeq;
  328. In >> segment;
  329. // bypass initial FASTA details
  330. while(segment.length() < seqMin && !In.fail())
  331. {
  332. In >> segment;
  333. }
  334. // stores nucleotide
  335. inputSeq.append(segment);
  336. while(!In.fail())
  337. {
  338. In >> segment;
  339. inputSeq.append(segment);
  340. }
  341. In.close();
  342. for(int i = inputSeq.length(); i > 0; i--)
  343. {
  344. if(inputSeq[i] == 'A')
  345. {
  346. outputSeq.append("T");
  347. }
  348. if(inputSeq[i] == 'T')
  349. {
  350. outputSeq.append("A");
  351. }
  352. if(inputSeq[i] == 'C')
  353. {
  354. outputSeq.append("G");
  355. }
  356. if(inputSeq[i] == 'G')
  357. {
  358. outputSeq.append("C");
  359. }
  360. lineCharsCount++;
  361. if(lineCharsCount == lineCharsMax)
  362. {
  363. outputSeq.append("\n");
  364. lineCharsCount = 0;
  365. }
  366. }
  367. Out.open(outputFilename.c_str());
  368. Out << ">revComp-" << nucleotideSeq << "\n";
  369. Out << outputSeq;
  370. Out.clear();
  371. Out.close();
  372. return outputSeq;
  373. }
  374. // translates DNA to amino acid through common code
  375. void translateSeq( string &inSeq, string &outputSeq, int &frame)
  376. {
  377. int stopPrev = 0;
  378. int stopPres = 0;
  379. int stopLimit = 50;
  380. int start = 0;
  381. int choice;
  382. int stopCount = 0;
  383. int seqMin = 50;
  384. ifstream In;
  385. ofstream Out;
  386. string segment;
  387. outputSeq.clear();
  388. In.open(inSeq.c_str());
  389. string inputSeq;
  390. int charCount = 0;
  391. int charCountMax = 70;
  392. In >> segment;
  393. //cout << "segment: " << segment << endl;
  394. while(segment.length() < seqMin && !In.fail())
  395. {
  396. In >> segment;
  397. }
  398. inputSeq.append(segment);
  399. while(!In.fail())
  400. {
  401. In >> segment;
  402. inputSeq.append(segment);
  403. }
  404. //cout << "FASTA of DNA stored" << endl;
  405. In.close();
  406. //cout << "Frame: " << frame << endl;
  407. for(int i = frame; i < inputSeq.length()-3; i=i+3)
  408. {
  409. charCount++;
  410. if(charCount == charCountMax)
  411. {
  412. outputSeq.append("\n");
  413. charCount = 0;
  414. }
  415. if(inputSeq[i] == 't' ||
  416. inputSeq[i] == 'T')
  417. {
  418. if(inputSeq[i+1] == 't' ||
  419. inputSeq[i+1] == 'T')
  420. {
  421. if(inputSeq[i+2] == 't' ||
  422. inputSeq[i+2] == 'T' ||
  423. inputSeq[i+2] == 'c' ||
  424. inputSeq[i+2] == 'C')
  425. {
  426. outputSeq.append("F");
  427. }
  428. else
  429. {
  430. outputSeq.append("L");
  431. }
  432. }
  433. else if(inputSeq[i+1] == 'c' ||
  434. inputSeq[i+1] == 'C')
  435. {
  436. outputSeq.append("S");
  437. }
  438. else if(inputSeq[i+1] == 'a' ||
  439. inputSeq[i+1] == 'A')
  440. {
  441. if(inputSeq[i+2] == 't' ||
  442. inputSeq[i+2] == 'T' ||
  443. inputSeq[i+2] == 'c' ||
  444. inputSeq[i+2] == 'C')
  445. {
  446. outputSeq.append("Y");
  447. }
  448. else
  449. {
  450. outputSeq.append("-");
  451. stopCount++; // stop codon
  452. }
  453. }
  454. else if(inputSeq[i+1] == 'g' ||
  455. inputSeq[i+1] == 'G')
  456. {
  457. if(inputSeq[i+2] == 't' ||
  458. inputSeq[i+2] == 'T' ||
  459. inputSeq[i+2] == 'c' ||
  460. inputSeq[i+2] == 'C')
  461. {
  462. outputSeq.append("C");
  463. }
  464. else if(inputSeq[i+2] == 'g' ||
  465. inputSeq[i+2] == 'G')
  466. {
  467. outputSeq.append("W");
  468. }
  469. else
  470. {
  471. outputSeq.append("-");
  472. stopCount++; // stop codon
  473. }
  474. }
  475. } // sequence begins with 't' ends
  476. else if(inputSeq[i] == 'c' ||
  477. inputSeq[i] == 'C')
  478. {
  479. if(inputSeq[i+1] == 't' ||
  480. inputSeq[i+1] == 'T')
  481. {
  482. outputSeq.append("L");
  483. }
  484. else if(inputSeq[i+1] == 'c' ||
  485. inputSeq[i+1] == 'C')
  486. {
  487. outputSeq.append("P");
  488. }
  489. else if(inputSeq[i+1] == 'a' ||
  490. inputSeq[i+1] == 'A')
  491. {
  492. if(inputSeq[i+2] == 't' ||
  493. inputSeq[i+2] == 'T' ||
  494. inputSeq[i+2] == 'c' ||
  495. inputSeq[i+2] == 'C')
  496. {
  497. outputSeq.append("H");
  498. }
  499. else
  500. {
  501. outputSeq.append("Q");
  502. }
  503. }
  504. else if(inputSeq[i+1] == 'g' ||
  505. inputSeq[i+1] == 'G')
  506. {
  507. outputSeq.append("R");
  508. }
  509. } // sequence begins with 'c' ends
  510. else if(inputSeq[i] == 'a' ||
  511. inputSeq[i] == 'A')
  512. {
  513. if(inputSeq[i+1] == 't' ||
  514. inputSeq[i+1] == 'T')
  515. {
  516. if(inputSeq[i+2] != 'g' &&
  517. inputSeq[i+2] != 'G')
  518. {
  519. outputSeq.append("I");
  520. }
  521. else
  522. {
  523. outputSeq.append("M");
  524. }
  525. }
  526. else if(inputSeq[i+1] == 'c' ||
  527. inputSeq[i+1] == 'C')
  528. {
  529. outputSeq.append("T");
  530. }
  531. else if(inputSeq[i+1] == 'a' ||
  532. inputSeq[i+1] == 'A')
  533. {
  534. if(inputSeq[i+2] == 't' ||
  535. inputSeq[i+2] == 'T' ||
  536. inputSeq[i+2] == 'c' ||
  537. inputSeq[i+2] == 'C')
  538. {
  539. outputSeq.append("N");
  540. }
  541. else
  542. {
  543. outputSeq.append("K");
  544. }
  545. }
  546. else if(inputSeq[i+1] == 'g' ||
  547. inputSeq[i+1] == 'G')
  548. {
  549. if(inputSeq[i+2] == 't' ||
  550. inputSeq[i+2] == 'T' ||
  551. inputSeq[i+2] == 'c' ||
  552. inputSeq[i+2] == 'C')
  553. {
  554. outputSeq.append("S");
  555. }
  556. else
  557. {
  558. outputSeq.append("R");
  559. }
  560. }
  561. } // sequence begins with 'a' ends
  562. else if(inputSeq[i] == 'g' ||
  563. inputSeq[i] == 'G')
  564. {
  565. if(inputSeq[i+1] == 't' ||
  566. inputSeq[i+1] == 'T')
  567. {
  568. outputSeq.append("V");
  569. }
  570. else if(inputSeq[i+1] == 'c' ||
  571. inputSeq[i+1] == 'C')
  572. {
  573. outputSeq.append("A");
  574. }
  575. else if(inputSeq[i+1] == 'a' ||
  576. inputSeq[i+1] == 'A')
  577. {
  578. if(inputSeq[i+2] == 't' ||
  579. inputSeq[i+2] == 'T' ||
  580. inputSeq[i+2] == 'c' ||
  581. inputSeq[i+2] == 'C')
  582. {
  583. outputSeq.append("D");
  584. }
  585. else
  586. {
  587. outputSeq.append("E");
  588. }
  589. }
  590. else if(inputSeq[i+1] == 'g' ||
  591. inputSeq[i+1] == 'G')
  592. {
  593. outputSeq.append("G");
  594. }
  595. } // codon begins with 'g' ends
  596. else
  597. {
  598. outputSeq.append("-");
  599. stopCount++;
  600. }
  601. } // for loop ends
  602. Out.open("aminoAcidSeq.txt");
  603. Out.clear();
  604. Out << "Protein from one DNA frame" << "\n";
  605. Out << outputSeq;
  606. Out.close();
  607. } // function ends
  608. void predictDomains(string aminoAcidSeq,
  609. string &predictedSubstrates,
  610. QueueClass< UnitClass2 > &aaNameList,
  611. const bool &allFramesRead,
  612. const int &frame, const string &nucSeq,
  613. string &allClusters, int &NRPScount, int &PKScount,
  614. int &mixedNRPSPKScount, int &transATcount, int &mepCount,
  615. int &mvaCount )
  616. {
  617. QueueClass <UnitClass2> sortedListOfDomains;
  618. QueueClass <UnitClass2> sortedListOfMods;
  619. QueueClass <UnitClass2> sortedListOfATKS;
  620. UnitClass2 *domain;
  621. ifstream In;
  622. ifstream InMods;
  623. ifstream tempIn;
  624. ifstream aaSeqIn;
  625. ofstream Out;
  626. ofstream tempOut;
  627. string segment;
  628. float score;
  629. string scoreString;
  630. float modsScore;
  631. float modsScoreMax = 0.1;
  632. float scoreMax = 0.1; // minimum for domain synthetase matching
  633. float idenScore;
  634. float idenScoreMin = 120; // minimum for mods alignment
  635. float length;
  636. string SMILE;
  637. string formula;
  638. string mod;
  639. string substrate;
  640. float siteToAnalyze;
  641. int siteCount;
  642. bool siteFound;
  643. int whileCount = 0;
  644. int queryOrSbjctLength = 6;
  645. string aaSeq;
  646. char *arg[21];
  647. arg[0] = "./blastall"; // program to execute
  648. arg[1] = "blastp"; // protein blast
  649. arg[2] = "synthetases.fasta"; // control A-domain
  650. arg[3] = "mods.fasta"; // output file
  651. arg[4] = "100.0";
  652. arg[6] = "aaSeq.txt";
  653. arg[7] = "blast2.out";
  654. int AdomainStart = 5;
  655. int AdomainEnd = 5;
  656. // A-domains to analyze
  657. bool substrateKnown;
  658. string query;
  659. string sbjct;
  660. string predictedSubstrate;
  661. string proposedSubstrate;
  662. bool foundSubstrate;
  663. bool stringsCompare;
  664. int hits = 0;
  665. int misses = 0;
  666. int maxChars = 10000;
  667. // NRPS key positions
  668. int p235 = 11;
  669. int p236 = 12;
  670. int p239 = 15;
  671. int p278 = 10;
  672. int p299 = 1;
  673. int p301 = 3;
  674. int p322 = 1;
  675. int p330 = 9;
  676. int p331 = 10;
  677. string pos235;
  678. string pos236;
  679. string pos239;
  680. string pos278;
  681. string pos299;
  682. string pos301;
  683. string pos322;
  684. string pos330;
  685. string pos331;
  686. int Aad = 0;
  687. int Ala = 0;
  688. int Asn = 0;
  689. int Asp = 0;
  690. int Cys = 0;
  691. int Dab = 0;
  692. int Dhb = 0;
  693. int Gln = 0;
  694. int Glu = 0;
  695. int Ile = 0;
  696. int Leu = 0;
  697. int Orn = 0;
  698. int Phe = 0;
  699. int Phg = 0;
  700. int Pip = 0;
  701. int Pro = 0;
  702. int Ser = 0;
  703. int Thr = 0;
  704. int Tyr = 0;
  705. int Val = 0;
  706. int His = 0;
  707. int Lys = 0;
  708. int Gly = 0;
  709. int Gly1 = 0;
  710. int Gly2 = 0;
  711. int Gly3 = 0;
  712. int Leu1 = 0;
  713. int Leu2 = 0;
  714. int Leu3 = 0;
  715. int Leu4 = 0;
  716. int Tyr1 = 0;
  717. int Tyr2 = 0;
  718. int Tyr3 = 0;
  719. int Val1 = 0;
  720. int Val2 = 0;
  721. int Val3 = 0;
  722. int mal = 0;
  723. int mmal = 0;
  724. int emal = 0;
  725. int omal = 0;
  726. char FDxAA;
  727. char FD4xAA;
  728. char IT3xAA;
  729. char x1xG299;
  730. char x1xG301;
  731. char x7xxY322;
  732. char x7xxY330;
  733. char x7xxY331;
  734. // PKS key positions
  735. int p58 = 0;
  736. int p59 = 1;
  737. int p60 = 2;
  738. int p61 = 3;
  739. int p62 = 4;
  740. int p70 = 0;
  741. int p72 = 2;
  742. int p74 = 4;
  743. int p75 = 5;
  744. int p77 = 7;
  745. int p93 = 3;
  746. int p189 = 1;
  747. int p191 = 3;
  748. int p198 = 0;
  749. int p199 = 1;
  750. int p200 = 2;
  751. string pos58;
  752. string pos59;
  753. string pos60;
  754. string pos61;
  755. string pos62;
  756. string pos70;
  757. string pos72;
  758. string pos74;
  759. string pos75;
  760. string pos77;
  761. string pos93;
  762. string pos189;
  763. string pos191;
  764. string pos198;
  765. string pos199;
  766. string pos200;
  767. string sigSeq;
  768. float position;
  769. bool NRPS = 0;
  770. bool PKS = 0;
  771. string residue;
  772. string fname;
  773. bool success;
  774. string smileName;
  775. string inString;
  776. UnitClass2 *aaName;
  777. float currentDomainPos;
  778. float nextDomainPos;
  779. float thirdDomainPos;
  780. int PKSmodRange = 7000;
  781. bool KR = 0;
  782. bool DH = 0;
  783. bool ER = 0;
  784. string endSiteString;
  785. string prevSegment;
  786. int endSite;
  787. int startSite;
  788. int extraRange = 1000;
  789. int lastPKSkey = 3;
  790. string scoredSubstrate;
  791. string proteinID;
  792. string proteinFrame;
  793. string aminoAcids;
  794. int nucStartSite;
  795. int nucEndSite;
  796. cout << "Analyzing data..." << endl;
  797. //get every NRPS/PKS domain until no further domains are detected;
  798. // does blast alignment of cluster
  799. spawnl( 1, "./blastall", "./blastall",
  800. "-g", "T", "-p", arg[1], "-d", arg[2], "-i", aminoAcidSeq.c_str(),
  801. "-F", "F", "-C", "0", "-o", "blast.out", NULL);
  802. //cout << "called blast" << endl;
  803. spawnl( 1, "./blastall", "./blastall",
  804. "-g", "T", "-p", arg[1], "-d", arg[3], "-i", aminoAcidSeq.c_str(),
  805. "-F", "F", "-C", "0", "-o", "modsBlast.out", NULL);
  806. //cout << "called modblast" << endl;
  807. //cout << "frame: " << frame << endl;
  808. if(frame == 0)
  809. {
  810. system("cp blast.out frame0.out");
  811. //cout << "copied to frame0.out" << endl;
  812. system("cp modsBlast.out modsFrame0.out");
  813. //cout << "copied to modsFrame0.out" << endl;
  814. system("cp aminoAcidSeq.txt aminoAcidSeq0.txt");
  815. //cout << "aminoAcidSeq0 copied" << endl;
  816. }
  817. if(frame == 1)
  818. {
  819. system("cp blast.out frame1.out");
  820. //cout << "copied to frame1.out" << endl;
  821. system("cp modsBlast.out modsFrame1.out");
  822. //cout << "copied to modsFrame1.out" << endl;
  823. system("cp aminoAcidSeq.txt aminoAcidSeq1.txt");
  824. //cout << "aminoAcidSeq1 copied" << endl;
  825. }
  826. if(frame == 2)
  827. {
  828. system("cp blast.out frame2.out");
  829. //cout << "copied to frame2.out" << endl;
  830. system("cp modsBlast.out modsFrame2.out");
  831. //cout << "copied to modsFrame2.out" << endl;
  832. system("cat frame0.out frame1.out frame2.out > blast.out");
  833. //cout << "concatenated all domain frames" << endl;
  834. system("cat modsFrame0.out modsFrame1.out modsFrame2.out > modsBlast.out");
  835. //cout << "concatenated all mod frames" << endl;
  836. system("rm frame0.out frame1.out frame2.out modsFrame1.out modsFrame2.out");
  837. //cout << "last aminoAcidSeq" << endl;
  838. }
  839. //cout << "after frames copies" << endl;
  840. if(allFramesRead)
  841. {
  842. //cout << "ALL FRAMES READ" << endl;
  843. score = 0;
  844. // open NRPS/PKS alignment file
  845. In.open("blast.out");
  846. In >> segment;
  847. while( !In.fail())
  848. {
  849. //cout << "read blast.out" << endl;
  850. if(segment[0] == '*')
  851. {
  852. In >> segment;
  853. substrate = segment;
  854. }
  855. if(segment[0] == 'E' && segment[1] == 'x'
  856. && segment[2] == 'p')
  857. {
  858. In >> segment; // "="
  859. In >> segment; // expectation
  860. for(int i = 0; i < segment.length(); i++)
  861. {
  862. if(segment[i] == 'e')
  863. {
  864. i = segment.length();
  865. }
  866. else
  867. {
  868. scoreString.append(1, segment[i]);
  869. }
  870. }
  871. score = getFloatVal(scoreString);
  872. //cout << "scoreString: " << scoreString << endl;
  873. scoreString.clear();
  874. for(int i = 0; i < segment.length(); i++)
  875. {
  876. if(segment[i] == 'e')
  877. {
  878. for(int j = i+1; j < segment.length(); j++)
  879. {
  880. scoreString.append(1, segment[j]);
  881. }
  882. }
  883. }
  884. score = score * pow(10.0,(static_cast<double>(getFloatVal(scoreString))));
  885. //cout << "segment: " << segment << endl;
  886. //cout << "score: " << score << endl;
  887. //cout << "scoreString float: " << getFloatVal(scoreString) << endl;
  888. scoreString.clear();
  889. }
  890. if(segment[0] == 'I' && segment[1] == 'd'
  891. && segment[2] == 'e')
  892. {
  893. In >> segment; // "="
  894. In >> segment; // identities
  895. // collect denominator of identities value
  896. for(int i = 0; i < segment.length(); i++)
  897. {
  898. if(segment[i] == '/')
  899. {
  900. for(int j = i+1; j < segment.length(); j++)
  901. {
  902. scoreString.append(1, segment[j]);
  903. i = j;
  904. }
  905. }
  906. }
  907. idenScore = getIntVal(scoreString);
  908. scoreString.clear();
  909. //cout << "score: " << " idenScore: " << idenScore << endl;
  910. if(score < scoreMax && idenScore > idenScoreMin)
  911. {
  912. //cout << "score: " << score << " idenScore: " << idenScore << endl;
  913. while(segment[0] != 'Q' && segment[1] != 'u')
  914. {
  915. In >> segment;
  916. }
  917. In >> segment; // start site of domain on protein
  918. startSite = getIntVal(segment);
  919. //cout << "start site: " << startSite << endl;
  920. In >> segment;
  921. // get protein frame and find where adenylation domain begins
  922. proteinID = segment;
  923. for(int i = 0; i < proteinID.length(); i++)
  924. {
  925. while(proteinID[i] == '-')
  926. {
  927. proteinID.erase(i, 1);
  928. }
  929. }
  930. //cout << "PROTEINID: " << proteinID << endl;
  931. proteinFrame.clear();
  932. aaSeqIn.open("aminoAcidSeq.txt");
  933. aaSeqIn >> aminoAcids;
  934. while(!aaSeqIn.fail())
  935. {
  936. if(aminoAcids.length() > 1)
  937. {
  938. proteinFrame.append(aminoAcids);
  939. }
  940. aaSeqIn >> aminoAcids;
  941. }
  942. aaSeqIn.clear();
  943. aaSeqIn.close();
  944. //cout << "proteinFrame.length(): " << proteinFrame.length() << endl;
  945. nucStartSite = proteinFrame.find(proteinID, startSite - 100);
  946. if(nucStartSite < 0)
  947. {
  948. proteinFrame.clear();
  949. aaSeqIn.open("aminoAcidSeq0.txt");
  950. aaSeqIn >> aminoAcids;
  951. while(!aaSeqIn.fail())
  952. {
  953. if(aminoAcids.length() > 1)
  954. {
  955. proteinFrame.append(aminoAcids);
  956. }
  957. aaSeqIn >> aminoAcids;
  958. }
  959. aaSeqIn.clear();
  960. aaSeqIn.close();
  961. nucStartSite = proteinFrame.find(proteinID, startSite - 100);
  962. //cout << "NOT IN LAST AA FRAME" << endl;
  963. }
  964. if(nucStartSite < 0)
  965. {
  966. proteinFrame.clear();
  967. aaSeqIn.open("aminoAcidSeq1.txt");
  968. aaSeqIn >> aminoAcids;
  969. while(!aaSeqIn.fail())
  970. {
  971. if(aminoAcids.length() > 1)
  972. {
  973. proteinFrame.append(aminoAcids);
  974. }
  975. aaSeqIn >> aminoAcids;
  976. }
  977. aaSeqIn.clear();
  978. aaSeqIn.close();
  979. nucStartSite = proteinFrame.find(proteinID, startSite - 100);
  980. // cout << "NOT IN FIRST AA FRAME" << endl;
  981. }
  982. if(nucStartSite < 0)
  983. {
  984. //cout << "NOT IN SECOND AA FRAME" << endl;
  985. }
  986. nucStartSite = nucStartSite * 3; // translate amino acid position
  987. // into nucleotide position
  988. //cout << "NUCSTARTSITE: " << nucStartSite << endl;
  989. while((segment[0] != 'S' || segment[4] != 'e') && segment[0] != '>'
  990. && !In.fail())
  991. {
  992. prevSegment = segment;
  993. In >> segment;
  994. }
  995. endSiteString = prevSegment;
  996. endSite = getIntVal(prevSegment) + startSite + extraRange;
  997. domain = new UnitClass2;
  998. domain->setStartSite(startSite);
  999. domain->setFormula(substrate);
  1000. domain->setEndSite(endSite);
  1001. domain->setNucStartSite(nucStartSite);
  1002. sortedListOfDomains.insertValue(*domain);
  1003. delete domain;
  1004. }
  1005. score = 1;
  1006. idenScore = 0;
  1007. }
  1008. In >> segment;
  1009. }
  1010. In.close();
  1011. if(sortedListOfDomains.getNumElems() > 0)
  1012. {
  1013. //sortedListOfDomains.printForward();
  1014. }
  1015. else
  1016. {
  1017. cout << "No NRPS-A or PKS-AT domains detected" << endl;
  1018. }
  1019. // detect modifications in cluster and their locations
  1020. InMods.open("modsBlast.out");
  1021. InMods >> segment;
  1022. //cout << "InMods: " << InMods << " segment: " << segment << endl;
  1023. while( !InMods.fail())
  1024. {
  1025. //cout << "in InMods loop" << endl;
  1026. if(segment[0] == '*')
  1027. {
  1028. InMods >> segment;
  1029. mod = segment;
  1030. }
  1031. if(segment[0] == 'E' && segment[1] == 'x'
  1032. && segment[2] == 'p')
  1033. {
  1034. InMods >> segment; // "="
  1035. InMods >> segment; // expectation
  1036. //cout << "segment(expectation): " << segment << endl;
  1037. for(int i = 0; i < segment.length(); i++)
  1038. {
  1039. if(segment[i] == 'e')
  1040. {
  1041. i = segment.length();
  1042. }
  1043. else
  1044. {
  1045. scoreString.append(1, segment[i]);
  1046. }
  1047. }
  1048. modsScore = getFloatVal(scoreString);
  1049. scoreString.clear();
  1050. for(int i = 0; i < segment.length(); i++)
  1051. {
  1052. if(segment[i] == 'e')
  1053. {
  1054. for(int j = i+1; j < segment.length(); j++)
  1055. {
  1056. scoreString.append(1, segment[j]);
  1057. i = j;
  1058. }
  1059. }
  1060. }
  1061. }
  1062. modsScore = modsScore * pow(10.0,(static_cast<double>(getFloatVal(scoreString))));
  1063. scoreString.clear();
  1064. if(segment[0] == 'I' && segment[1] == 'd'
  1065. && segment[2] == 'e')
  1066. {
  1067. InMods >> segment; // "="
  1068. InMods >> segment; // identities
  1069. // collect denominator of identities value
  1070. for(int i = 0; i < segment.length(); i++)
  1071. {
  1072. if(segment[i] == '/')
  1073. {
  1074. for(int j = i+1; j < segment.length(); j++)
  1075. {
  1076. scoreString.append(1, segment[j]);
  1077. i = j;
  1078. }
  1079. }
  1080. }
  1081. idenScore = getIntVal(scoreString);
  1082. scoreString.clear();
  1083. //cout << "/identities: " << score << endl;
  1084. if(modsScore < modsScoreMax and (idenScore > idenScoreMin
  1085. || mod[0] == 'T' && mod[1] == 'E'))
  1086. {
  1087. //cout << "modsScore: " << modsScore << " idenScore: " << idenScore << endl;
  1088. while( segment[0] != 'Q')
  1089. {
  1090. InMods >> segment;
  1091. }
  1092. InMods >> segment; // start site of domain on protein
  1093. startSite = getIntVal(segment);
  1094. InMods >> segment; // start site of domain on protein
  1095. // get protein frame and find where adenylation domains begins
  1096. proteinID = segment;
  1097. for(int i = 0; i < proteinID.length(); i++)
  1098. {
  1099. while(proteinID[i] == '-')
  1100. {
  1101. proteinID.erase(i, 1);
  1102. }
  1103. }
  1104. // cout << "PROTEINID: " << proteinID << endl;
  1105. proteinFrame.clear();
  1106. aaSeqIn.open("aminoAcidSeq.txt");
  1107. aaSeqIn >> aminoAcids;
  1108. while(!aaSeqIn.fail())
  1109. {
  1110. if(aminoAcids.length() > 1)
  1111. {
  1112. proteinFrame.append(aminoAcids);
  1113. }
  1114. aaSeqIn >> aminoAcids;
  1115. }
  1116. aaSeqIn.clear();
  1117. aaSeqIn.close();
  1118. //cout << "proteinFrame.length(): " << proteinFrame.length() << endl;
  1119. nucStartSite = proteinFrame.find(proteinID, startSite);
  1120. if(nucStartSite < 0)
  1121. {
  1122. proteinFrame.clear();
  1123. aaSeqIn.open("aminoAcidSeq0.txt");
  1124. aaSeqIn >> aminoAcids;
  1125. while(!aaSeqIn.fail())
  1126. {
  1127. if(aminoAcids.length() > 1)
  1128. {
  1129. proteinFrame.append(aminoAcids);
  1130. }
  1131. aaSeqIn >> aminoAcids;
  1132. }
  1133. aaSeqIn.clear();
  1134. aaSeqIn.close();
  1135. nucStartSite = proteinFrame.find(proteinID, startSite);
  1136. // cout << "NOT IN LAST AA FRAME" << endl;
  1137. }
  1138. if(nucStartSite < 0)
  1139. {
  1140. proteinFrame.clear();
  1141. aaSeqIn.open("aminoAcidSeq1.txt");
  1142. aaSeqIn >> aminoAcids;
  1143. while(!aaSeqIn.fail())
  1144. {
  1145. if(aminoAcids.length() > 1)
  1146. {
  1147. proteinFrame.append(aminoAcids);
  1148. }
  1149. aaSeqIn >> aminoAcids;
  1150. }
  1151. aaSeqIn.clear();
  1152. aaSeqIn.close();
  1153. nucStartSite = proteinFrame.find(proteinID, startSite);
  1154. // cout << "NOT IN FIRST AA FRAME" << endl;
  1155. }
  1156. if(nucStartSite < 0)
  1157. {
  1158. // cout << "NOT IN SECOND AA FRAME" << endl;
  1159. }
  1160. nucStartSite = nucStartSite * 3; // translate amino acid position
  1161. // into nucleotide position
  1162. domain = new UnitClass2;
  1163. domain->setStartSite(startSite);
  1164. domain->setNucStartSite(nucStartSite);
  1165. domain->setFormula(mod);
  1166. sortedListOfMods.insertValue(*domain);
  1167. delete domain;
  1168. }
  1169. score = 1;
  1170. idenScore = 0;
  1171. }
  1172. InMods >> segment;
  1173. }
  1174. InMods.clear();
  1175. InMods.close();
  1176. if(sortedListOfMods.getNumElems() > 0)
  1177. {
  1178. //sortedListOfMods.printForward();
  1179. }
  1180. else
  1181. {
  1182. cout << "No auxiliary domains detected" << endl;
  1183. }
  1184. siteCount = 0;
  1185. // go through each domain and collect and align each
  1186. while(siteCount < sortedListOfDomains.getNumElems() && whileCount < 1000000)
  1187. {
  1188. In.close();
  1189. In.open("blast.out");
  1190. In.clear();
  1191. In >> segment;
  1192. siteFound = 0;
  1193. while( !In.fail() && siteFound == 0 && whileCount < 1000000)
  1194. {
  1195. //cout << "inner while" << endl;
  1196. siteToAnalyze = sortedListOfDomains.getElemAtIndex(siteCount).getStartSite();
  1197. //cout << "siteToAnalyze: " << siteToAnalyze << endl;
  1198. if(segment[0] == 'Q' && segment[1] == 'u'
  1199. && segment[queryOrSbjctLength - 1] == ':')
  1200. {
  1201. In >> segment;
  1202. if(getIntVal(segment) == siteToAnalyze)
  1203. {
  1204. // analyze this segment only and add to aaNameList
  1205. // store any modifications between this domain and the next
  1206. // cout << "segment " << segment << " found " << endl;
  1207. //cout << segment << " ";
  1208. cout << sortedListOfDomains.getElemAtIndex(siteCount).getNucStartSite() << " ";
  1209. cout << sortedListOfDomains.getElemAtIndex(siteCount).getNucStartSite() + 10000
  1210. << " ";
  1211. position = getIntVal(segment);
  1212. siteFound = 1;
  1213. siteCount++;
  1214. In >> segment;
  1215. aaSeq.append(segment);
  1216. In >> segment;
  1217. while((segment[0] != 'S' || segment[1] != 'c') && !In.fail())
  1218. {
  1219. if(segment[0] == 'Q' && segment[1] == 'u'
  1220. && segment[queryOrSbjctLength - 1] == ':')
  1221. {
  1222. In >> segment;
  1223. In >> segment;
  1224. for(int i = 0; i < segment.length(); i++)
  1225. {
  1226. if((static_cast< unsigned int >(segment[i]) < 65
  1227. || static_cast< unsigned int >(segment[i]) > 90)
  1228. && (static_cast< unsigned int >(segment[i]) < 97
  1229. || static_cast< unsigned int >(segment[i]) > 122))
  1230. {
  1231. segment.erase(i,1);
  1232. i--;
  1233. }
  1234. }
  1235. aaSeq.append(segment);
  1236. }
  1237. In >> segment;
  1238. }
  1239. // recognize amino acid sequence here
  1240. //cout << "aaSeq: " << aaSeq << endl;
  1241. Out.open("aaSeq.txt");
  1242. Out.clear();
  1243. Out << "> " << "aaSeq" << '\n' << aaSeq;
  1244. Out.close();
  1245. aaSeq.clear();
  1246. // does blast alignment of individual domain to recognize
  1247. // conserved motifs and substrate specificity
  1248. spawnl( 1, "./blastall", "blastall",
  1249. "-g", "T", "-p", arg[1], "-d", arg[2], "-i", "aaSeq.txt",
  1250. "-F", "F", "-C", "0", "-o", "blast2.out", NULL);
  1251. substrateKnown = 0;
  1252. In.close();
  1253. In.open("blast2.out");
  1254. In.clear();
  1255. segment.clear();
  1256. In >> segment;
  1257. while(!In.fail())
  1258. {
  1259. //cout << !In.fail() << " ";
  1260. //cout << segment << endl;
  1261. if(segment == "NRPS" && PKS == 0 && NRPS == 0)
  1262. {
  1263. NRPS = 1;
  1264. //cout << "NRPS detected" << endl;
  1265. }
  1266. if(segment == "PKS" && NRPS == 0 && PKS == 0)
  1267. {
  1268. PKS = 1;
  1269. //cout << "PKS detected" << endl;
  1270. }
  1271. if(segment.length() == queryOrSbjctLength)
  1272. {
  1273. if(segment[0] == 'Q' && segment[1] == 'u'
  1274. && segment[queryOrSbjctLength - 1] == ':')
  1275. {
  1276. In >> segment;
  1277. In >> segment;
  1278. query.append(segment);
  1279. //cout << "Query: " << segment << endl;
  1280. }
  1281. if(segment[0] == 'S' && segment[1] == 'b')
  1282. {
  1283. In >> segment;
  1284. In >> segment;
  1285. sbjct.append(segment);
  1286. //cout << "Sbjct: " << segment << endl;
  1287. }
  1288. }
  1289. In >> segment;
  1290. }
  1291. In.clear();
  1292. In.close();
  1293. segment.clear();
  1294. // search for first FDX motif containing residues 235,236,239
  1295. if(NRPS)
  1296. {
  1297. if(sbjct.find("SRVLQFASFSFD") > 0
  1298. && sbjct.find("SRVLQFASFSFD") < maxChars )
  1299. {
  1300. pos235 = query[sbjct.find("SRVLQFASFSFD")+p235];
  1301. pos236 = query[sbjct.find("SRVLQFASFSFD")+p236];
  1302. pos239 = query[sbjct.find("SRVLQFASFSFD")+p239];
  1303. sigSeq.append(pos235);
  1304. sigSeq.append(pos236);
  1305. sigSeq.append(pos239);
  1306. }
  1307. // search for IT motif containing the residue 278
  1308. if(sbjct.find("RLKNDGITHVTLPP") > 0
  1309. && sbjct.find("RLKNDGITHVTLPP") < maxChars )
  1310. {
  1311. pos278 = query[sbjct.find("RLKNDGITHVTLPP")+p278];
  1312. sigSeq.append(pos278);
  1313. }
  1314. // search for GE motif containing the residues 299,301
  1315. if(sbjct.find("IVVQGEACP") > 0
  1316. && sbjct.find("IVVAGEACP") < maxChars )
  1317. {
  1318. pos299 = query[sbjct.find("IVVAGEACP")+p299];
  1319. pos301 = query[sbjct.find("IVVAGEACP")+p301];
  1320. sigSeq.append(pos299);
  1321. sigSeq.append(pos301);
  1322. }
  1323. // search for YGPTE motif containing the residues 322,330,331
  1324. if(sbjct.find("NAYGPTE") > 0
  1325. && sbjct.find("NAYGPTE") < maxChars )
  1326. {
  1327. pos322 = query[sbjct.find("NAYGPTE")+p322];
  1328. pos330 = query[sbjct.find("NAYGPTE")+p330];
  1329. pos331 = query[sbjct.find("NAYGPTE")+p331];
  1330. sigSeq.append(pos322);
  1331. sigSeq.append(pos330);
  1332. sigSeq.append(pos331);
  1333. }
  1334. FDxAA = pos236[0];
  1335. FD4xAA = pos239[0];
  1336. IT3xAA = pos278[0];
  1337. x1xG299 = pos299[0];
  1338. x1xG301 = pos301[0];
  1339. x7xxY322 = pos322[0];
  1340. x7xxY330 = pos330[0];
  1341. x7xxY331 = pos331[0];
  1342. }
  1343. if(PKS)
  1344. {
  1345. if(sbjct.find("DVPLV") > 0
  1346. && sbjct.find("DVPLV") < maxChars )
  1347. {
  1348. pos58 = query[sbjct.find("DVPLV")+p58];
  1349. pos59 = query[sbjct.find("DVPLV")+p59];
  1350. pos60 = query[sbjct.find("DVPLV")+p60];
  1351. pos61 = query[sbjct.find("DVPLV")+p61];
  1352. pos62 = query[sbjct.find("DVPLV")+p62];
  1353. sigSeq.append(pos58);
  1354. sigSeq.append(pos59);
  1355. sigSeq.append(pos60);
  1356. sigSeq.append(pos61);
  1357. sigSeq.append(pos62);
  1358. }
  1359. if(sbjct.find("QVAL") > 0
  1360. && sbjct.find("QVAL") < maxChars )
  1361. {
  1362. pos70 = query[sbjct.find("QVAL")+p70];
  1363. pos72 = query[sbjct.find("QVAL")+p72];
  1364. pos74 = query[sbjct.find("QVAL")+p74];
  1365. pos75 = query[sbjct.find("QVAL")+p75];
  1366. pos77 = query[sbjct.find("QVAL")+p77];
  1367. sigSeq.append(pos70);
  1368. sigSeq.append(pos72);
  1369. sigSeq.append(pos74);
  1370. sigSeq.append(pos75);
  1371. sigSeq.append(pos77);
  1372. }
  1373. if(sbjct.find("GQSMGE") > 0
  1374. && sbjct.find("GQSMGE") < maxChars )
  1375. {
  1376. pos93 = query[sbjct.find("GQSMGE")+p93];
  1377. sigSeq.append(pos93);
  1378. }
  1379. if(sbjct.find("ITVRRI") > 0
  1380. && sbjct.find("ITVRRI") < maxChars )
  1381. {
  1382. pos189 = query[sbjct.find("ITVRRI")+p189];
  1383. sigSeq.append(pos189);
  1384. pos191 = query[sbjct.find("ITVRRI")+p191];
  1385. sigSeq.append(pos191);
  1386. }
  1387. if(sbjct.find("YASH") > 0
  1388. && sbjct.find("YASH") < maxChars )
  1389. {
  1390. pos198 = query[sbjct.find("YASH")+p198];
  1391. pos199 = query[sbjct.find("YASH")+p199];
  1392. pos200 = query[sbjct.find("YASH")+p200];
  1393. sigSeq.append(pos198);
  1394. sigSeq.append(pos199);
  1395. sigSeq.append(pos200);
  1396. }
  1397. if(pos58[0] == 'E' || pos58[0] == 'D' || pos58[0] == 'Q'
  1398. || pos58[0] == 'S' || pos58[0] == 'K')
  1399. {
  1400. mal++;
  1401. }
  1402. if(pos58[0] == 'R' || pos58[0] == 'G')
  1403. {
  1404. mmal++;
  1405. emal++;
  1406. omal++;
  1407. }
  1408. if(pos59[0] == 'T' || pos59[0] == 'S')
  1409. {
  1410. mal++;
  1411. }
  1412. if(pos59[0] == 'V' || pos59[0] == 'A')
  1413. {
  1414. mmal++;
  1415. emal++;
  1416. omal++;
  1417. }
  1418. if(pos60[0] == 'G' || pos60[0] == 'P' || pos60[0] == 'A'
  1419. || pos60[0] == 'L' || pos60[0] == 'R')
  1420. {
  1421. mal++;
  1422. }
  1423. if(pos60[0] == 'D' || pos60[0] == 'E')
  1424. {
  1425. mmal++;
  1426. emal++;
  1427. omal++;
  1428. }
  1429. if(pos61[0] == 'Y' || pos61[0] == 'F')
  1430. {
  1431. mal++;
  1432. }
  1433. if(pos61[0] == 'V')
  1434. {
  1435. mmal++;
  1436. emal++;
  1437. omal++;
  1438. }
  1439. if(pos62[0] == 'A' || pos62[0] == 'T')
  1440. {
  1441. mal++;
  1442. }
  1443. if(pos62[0] == 'V')
  1444. {
  1445. mmal++;
  1446. emal++;
  1447. omal++;
  1448. }
  1449. if(pos70[0] == 'Q')
  1450. {
  1451. mal++;
  1452. }
  1453. if(pos70[0] == 'M')
  1454. {
  1455. mmal++;
  1456. emal++;
  1457. omal++;
  1458. }
  1459. if(pos72[0] == 'A')
  1460. {
  1461. mal++;
  1462. }
  1463. if(pos72[0] == 'S')
  1464. {
  1465. mmal++;
  1466. }
  1467. if(pos74[0] == 'F')
  1468. {
  1469. mal++;
  1470. }
  1471. if(pos74[0] == 'A')
  1472. {
  1473. mmal++;
  1474. emal++;
  1475. omal++;
  1476. }
  1477. if(pos75[0] == 'G')
  1478. {
  1479. mal++;
  1480. }
  1481. if(pos75[0] == 'A')
  1482. {
  1483. mmal++;
  1484. emal++;
  1485. omal++;
  1486. }
  1487. if(pos77[0] == 'L')
  1488. {
  1489. mal++;
  1490. }
  1491. if(pos77[0] == 'W')
  1492. {
  1493. mmal++;
  1494. emal++;
  1495. omal++;
  1496. }
  1497. if(pos93[0] == 'L' || pos93[0] == 'V' || pos93[0] == 'I')
  1498. {
  1499. mal++;
  1500. }
  1501. if(pos93[0] == 'Q')
  1502. {
  1503. mmal++;
  1504. emal++;
  1505. omal++;
  1506. }
  1507. if(pos189[0] == 'M' || pos189[0] == 'L' || pos189[0] == 'Y')
  1508. {
  1509. omal++;
  1510. }
  1511. else
  1512. {
  1513. mal++;
  1514. mmal++;
  1515. emal++;
  1516. }
  1517. if(pos191[0] == 'W')
  1518. {
  1519. omal++;
  1520. }
  1521. else
  1522. {
  1523. mal++;
  1524. mmal++;
  1525. emal++;
  1526. }
  1527. if(pos198[0] == 'H')
  1528. {
  1529. mal = mal + lastPKSkey;
  1530. }
  1531. if(pos198[0] == 'Y')
  1532. {
  1533. mmal = mmal + lastPKSkey;
  1534. }
  1535. if(pos198[0] == 'T' || pos198[0] == 'C')
  1536. {
  1537. emal = emal + lastPKSkey;
  1538. }
  1539. if(pos199[0] == 'P')
  1540. {
  1541. emal++;
  1542. }
  1543. if(pos200[0] == 'F')
  1544. {
  1545. mal = mal + lastPKSkey;
  1546. }
  1547. if(pos200[0] == 'S')
  1548. {
  1549. mmal = mmal + lastPKSkey;
  1550. }
  1551. if(pos200[0] == 'G')
  1552. {
  1553. emal = emal + lastPKSkey;
  1554. }
  1555. if(pos200[0] == 'T')
  1556. {
  1557. emal++;
  1558. }
  1559. //outputSpecificitiesPKS( mal, mmal, emal, omal);
  1560. scoredSubstrate = maxSpecificityPKS(mal, mmal, emal, omal);
  1561. //cout << "scoredSubstrate: " << scoredSubstrate << endl;
  1562. mal = 0;
  1563. mmal = 0;
  1564. emal = 0;
  1565. omal = 0;
  1566. clearPositions( pos235, pos236, pos239,
  1567. pos278, pos299, pos301, pos322, pos330,
  1568. pos331, pos58, pos59, pos60, pos61, pos62,
  1569. pos70, pos72, pos74, pos75, pos77, pos93,
  1570. pos189, pos191, pos198, pos199, pos200 );
  1571. }
  1572. //cout << "signature sequence: " << sigSeq << " ";
  1573. cout << sigSeq << " ";
  1574. Out.open("sigSeq.txt");
  1575. Out.clear();
  1576. Out << "> " << "SigSeq" << '\n'
  1577. << sigSeq;
  1578. Out.close();
  1579. // align with NRPS/PKS signatures
  1580. if(PKS)
  1581. {
  1582. spawnl(1, "./blastall", "blastall", "-p", "blastp", "-d",
  1583. "PKSsignatures.fasta", "-i", "sigSeq.txt",
  1584. "-e", arg[4], "-F", "F", "-C", "0", "-e", "10.0",
  1585. "-o", "blast3.out", NULL);
  1586. }
  1587. if(NRPS)
  1588. {
  1589. spawnl(1, "./blastall", "blastall", "-p", "blastp", "-d",
  1590. "NRPSsignatures.fasta", "-i", "sigSeq.txt",
  1591. "-e", arg[4], "-F", "F", "-C", "0", "-e", "10.0",
  1592. "-o", "blast3.out", NULL);
  1593. if(FDxAA == 'P' || FDxAA == 'p')
  1594. {
  1595. Aad++;
  1596. }
  1597. if(FDxAA == 'L' || FDxAA == 'l')
  1598. {
  1599. Ala++;
  1600. Asn++;
  1601. Asp++;
  1602. Cys++;
  1603. Dab++;
  1604. Dhb++;
  1605. Gly3++;
  1606. }
  1607. if(FDxAA == 'H' || FDxAA == 'h')
  1608. {
  1609. Cys++;
  1610. }
  1611. if(FDxAA == 'A' || FDxAA == 'a')
  1612. {
  1613. Gln++;
  1614. Glu++;
  1615. Ile++;
  1616. Leu1++;
  1617. Leu2++;
  1618. Leu4++;
  1619. Phe++;
  1620. Tyr2++;
  1621. Tyr3++;
  1622. Val1++;
  1623. Val3++;
  1624. Lys++;
  1625. }
  1626. if(FDxAA == 'G' || FDxAA == 'g')
  1627. {
  1628. Ile++;
  1629. Leu3++;
  1630. Tyr1++;
  1631. }
  1632. if(FDxAA == 'M' || FDxAA == 'm')
  1633. {
  1634. Orn++;
  1635. }
  1636. if(FDxAA == 'V' || FDxAA == 'v')
  1637. {
  1638. Orn++;
  1639. Pro++;
  1640. Ser++;
  1641. }
  1642. if(FDxAA == 'I' || FDxAA == 'i')
  1643. {
  1644. Phg++;
  1645. Gly1++;
  1646. Gly2++;
  1647. }
  1648. if(FDxAA == 'F' || FDxAA == 'f')
  1649. {
  1650. Phe++;
  1651. Thr++;
  1652. Val2++;
  1653. }
  1654. if(FDxAA == 'S' || FDxAA == 's')
  1655. {
  1656. His++;
  1657. }
  1658. if(FD4xAA == 'R' || FD4xAA == 'r')
  1659. {
  1660. Aad++;
  1661. }
  1662. if(FD4xAA == 'L' || FD4xAA == 'l')
  1663. {
  1664. Ala++;
  1665. Tyr2++;
  1666. Gly1++;
  1667. }
  1668. if(FD4xAA == 'T' || FD4xAA == 't')
  1669. {
  1670. Asn++;
  1671. Asp++;
  1672. Tyr1++;
  1673. }
  1674. if(FD4xAA == 'E' || FD4xAA == 'e')
  1675. {
  1676. Cys++;
  1677. Dab++;
  1678. Orn++;
  1679. Val2++;
  1680. His++;
  1681. Lys++;
  1682. }
  1683. if(FD4xAA == 'Y' || FD4xAA == 'y')
  1684. {
  1685. Cys++;
  1686. }
  1687. if(FD4xAA == 'P' || FD4xAA == 'p')
  1688. {
  1689. Dhb++;
  1690. }
  1691. if(FD4xAA == 'Q' || FD4xAA == 'q')
  1692. {
  1693. Gln++;
  1694. Pip++;
  1695. Pro++;
  1696. Gly2++;
  1697. }
  1698. if(FD4xAA == 'W' || FD4xAA == 'w')
  1699. {
  1700. Glu++;
  1701. Leu1++;
  1702. Leu2++;
  1703. Phe++;
  1704. Ser++;
  1705. Thr++;
  1706. Val3++;
  1707. Gln++;
  1708. }
  1709. if(FD4xAA == 'K' || FD4xAA == 'k')
  1710. {
  1711. Glu++;
  1712. }
  1713. if(FD4xAA == 'F' || FD4xAA == 'f')
  1714. {
  1715. Ile++;
  1716. Leu4++;
  1717. Gly3++;
  1718. Val1++;
  1719. }
  1720. if(FD4xAA == 'A' || FD4xAA == 'a')
  1721. {
  1722. Leu3++;
  1723. }
  1724. if(FD4xAA == 'G' || FD4xAA == 'g')
  1725. {
  1726. Orn++;
  1727. }
  1728. if(FD4xAA == 'S' || FD4xAA == 's')
  1729. {
  1730. Tyr3++;
  1731. }
  1732. // score IT3xAA
  1733. if(IT3xAA == 'N' || IT3xAA == 'n')
  1734. {
  1735. Aad++;
  1736. Cys++;
  1737. Orn++;
  1738. Thr++;
  1739. Gly3++;
  1740. }
  1741. if(IT3xAA == 'F' || IT3xAA == 'f')
  1742. {
  1743. Ala++;
  1744. Ile++;
  1745. Leu++;
  1746. }
  1747. if(IT3xAA == 'K' || IT3xAA == 'k')
  1748. {
  1749. Asn++;
  1750. Asp++;
  1751. }
  1752. if(IT3xAA == 'S' || IT3xAA == 's')
  1753. {
  1754. Cys++;
  1755. Val++;
  1756. Lys++;
  1757. }
  1758. if(IT3xAA == 'H' || IT3xAA == 'h')
  1759. {
  1760. Dab++;
  1761. Glu++;
  1762. Ser++;
  1763. }
  1764. if(IT3xAA == 'A' || IT3xAA == 'a')
  1765. {
  1766. Dhb++;
  1767. His++;
  1768. }
  1769. if(IT3xAA == 'D' || IT3xAA == 'd')
  1770. {
  1771. Gln++;
  1772. Glu++;
  1773. }
  1774. if(IT3xAA == 'L' || IT3xAA == 'l')
  1775. {
  1776. Leu++;
  1777. Phg++;
  1778. Pip++;
  1779. Pro++;
  1780. }
  1781. if(IT3xAA == 'Y' || IT3xAA == 'y')
  1782. {
  1783. Leu++;
  1784. }
  1785. if(IT3xAA == 'M' || IT3xAA == 'm')
  1786. {
  1787. Leu++;
  1788. Val++;
  1789. Gly2++;
  1790. }
  1791. if(IT3xAA == 'E' || IT3xAA == 'e')
  1792. {
  1793. Orn++;
  1794. }
  1795. if(IT3xAA == 'T' || IT3xAA == 't')
  1796. {
  1797. Phe++;
  1798. Tyr++;
  1799. }
  1800. if(IT3xAA == 'I' || IT3xAA == 'i')
  1801. {
  1802. Tyr++;
  1803. }
  1804. if(IT3xAA == 'V' || IT3xAA == 'v')
  1805. {
  1806. Tyr++;
  1807. }
  1808. if(IT3xAA == 'W' || IT3xAA == 'w')
  1809. {
  1810. Val++;
  1811. }
  1812. if(IT3xAA == 'Q' || IT3xAA == 'q')
  1813. {
  1814. Gly1++;
  1815. Gln++;
  1816. }
  1817. if(x1xG299 == 'I' || x1xG299 == 'i')
  1818. {
  1819. Aad++;
  1820. Orn++;
  1821. Phe++;
  1822. Pro++;
  1823. Thr++;
  1824. Val1++;
  1825. }
  1826. if(x1xG299 == 'G' || x1xG299 == 'g')
  1827. {
  1828. Ala++;
  1829. }
  1830. if(x1xG299 == 'L' || x1xG299 == 'l')
  1831. {
  1832. Asn++;
  1833. Cys++;
  1834. Glu++;
  1835. Ile++;
  1836. Leu1++;
  1837. Leu4++;
  1838. Orn++;
  1839. Phg++;
  1840. Pip++;
  1841. Ser++;
  1842. Gly1++;
  1843. }
  1844. if(x1xG299 == 'V' || x1xG299 == 'v')
  1845. {
  1846. Asp++;
  1847. Phe++; // added from Challis
  1848. Tyr3++;
  1849. }
  1850. if(x1xG299 == 'D' || x1xG299 == 'd')
  1851. {
  1852. Cys++;
  1853. }
  1854. if(x1xG299 == 'N' || x1xG299 == 'n')
  1855. {
  1856. Dab++;
  1857. Gly3++;
  1858. }
  1859. if(x1xG299 == 'Q' || x1xG299 == 'q')
  1860. {
  1861. Dhb++;
  1862. }
  1863. if(x1xG299 == 'F' || x1xG299 == 'f')
  1864. {
  1865. Glu++;
  1866. Val3++;
  1867. Gly2++;
  1868. Gln++;
  1869. }
  1870. if(x1xG299 == 'Y' || x1xG299 == 'y')
  1871. {
  1872. Ile++;
  1873. Leu2++;
  1874. }
  1875. if(x1xG299 == 'T' || x1xG299 == 't')
  1876. {
  1877. Leu3++;
  1878. Tyr1++;
  1879. Tyr2++;
  1880. Val2++;
  1881. His++;
  1882. }
  1883. // detect 301 signature
  1884. if(x1xG301 == 'V' || x1xG301 == 'v')
  1885. {
  1886. Aad++;
  1887. Cys++;
  1888. Gly2++;
  1889. }
  1890. if(x1xG301 == 'I' || x1xG301 == 'i')
  1891. {
  1892. Ala++;
  1893. Cys++;
  1894. Lys++;
  1895. }
  1896. if(x1xG301 == 'G' || x1xG301 == 'g')
  1897. {
  1898. Asn++;
  1899. Asp++;
  1900. Dhb++;
  1901. Gln++;
  1902. Glu++;
  1903. Ile++;
  1904. Leu1++;
  1905. Leu2++;
  1906. Leu3++;
  1907. Leu4++;
  1908. Orn++;
  1909. Phg++;
  1910. Phe++; // extra
  1911. Pip++;
  1912. Thr++;
  1913. Tyr2++;
  1914. Val1++;
  1915. Lys++;
  1916. Gly1++;
  1917. }
  1918. if(x1xG301 == 'S' || x1xG301 == 's')
  1919. {
  1920. Cys++;
  1921. Ser++;
  1922. }
  1923. if(x1xG301 == 'T' || x1xG301 == 't')
  1924. {
  1925. Dab++;
  1926. }
  1927. if(x1xG301 == 'A' || x1xG301 == 'a')
  1928. {
  1929. Phe++;
  1930. Pro++;
  1931. Tyr1++;
  1932. Tyr3++;
  1933. Val2++;
  1934. Val3++;
  1935. His++;
  1936. Gly3++;
  1937. }
  1938. // analyze 322 residue
  1939. if(x7xxY322 == 'E' || x7xxY322 == 'e')
  1940. {
  1941. Aad++;
  1942. Asn++;
  1943. Leu3++;
  1944. Tyr1++;
  1945. His++;
  1946. }
  1947. if(x7xxY322 == 'A' || x7xxY322 == 'a')
  1948. {
  1949. Ala++;
  1950. Leu2++;
  1951. Phe++;
  1952. Tyr2++;
  1953. Tyr3++;
  1954. Val2++;
  1955. Val3++;
  1956. Gly2++;
  1957. }
  1958. if(x7xxY322 == 'H' || x7xxY322 == 'h')
  1959. {
  1960. Asp++;
  1961. Pro++;
  1962. }
  1963. if(x7xxY322 == 'G' || x7xxY322 == 'g')
  1964. {
  1965. Cys++;
  1966. Glu++;
  1967. Val1++;
  1968. Phe++; // extra
  1969. }
  1970. if(x7xxY322 == 'L' || x7xxY322 == 'l')
  1971. {
  1972. Cys++;
  1973. Orn++;
  1974. Phg++;
  1975. Ser++;
  1976. Gly1++;
  1977. Gly3++;
  1978. Gln++;
  1979. }
  1980. if(x7xxY322 == 'T' || x7xxY322 == 't')
  1981. {
  1982. Dab++;
  1983. }
  1984. if(x7xxY322 == 'V' || x7xxY322 == 'v')
  1985. {
  1986. D

Large files files are truncated, but you can click here to view the full file