PageRenderTime 77ms CodeModel.GetById 34ms RepoModel.GetById 0ms app.codeStats 1ms

/NRPeditor/clusterFinderBackup2.cpp

https://bitbucket.org/antismash/antismash1
C++ | 3090 lines | 2741 code | 186 blank | 163 comment | 913 complexity | 48ebfb274863f5b454818c984134eb3a MD5 | raw file

Large files files are truncated, but you can click here to view the full file

  1. #include <iostream>
  2. #include <string>
  3. #include <fstream>
  4. #include <stdlib.h>
  5. #include <sstream>
  6. using namespace std;
  7. #include "constants.h"
  8. #include "UnitClass2.h"
  9. #include "QueueClass.h"
  10. #include "ReactionClass.h"
  11. #include "spawnl.h"
  12. // function declarations
  13. // overloading operator for outputing nodes
  14. ostream& operator <<(ostream& outputStream, const UnitClass2& domain );
  15. ostream& operator <<(ostream& outputStream, const UnitClass2& domain )
  16. {
  17. if( domain.getStartSite() == 0 )
  18. {
  19. outputStream << domain.getUnit() << " "
  20. << domain.getSmile() << endl;
  21. }
  22. else
  23. {
  24. outputStream << " Formula: "
  25. << domain.getFormula()
  26. << " start site: "
  27. << domain.getStartSite() << endl
  28. << " " << domain.getSmile() << endl;
  29. }
  30. return outputStream;
  31. }
  32. void clearPositions( string &pos235, string &pos236, string &pos239,
  33. string &pos278, string &pos299, string &pos301, string &pos322,
  34. string &pos330, string &pos331, string &pos58, string &pos59,
  35. string &pos60, string &pos61, string &pos62, string &pos70,
  36. string &pos72, string &pos74, string &pos75, string &pos77,
  37. string &pos93, string &pos189, string &pos191, string &pos198,
  38. string &pos199, string &pos200 );
  39. void outputSpecificitiesNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  40. int Dab, int Dhb, int Gln, int Glu, int Ile,
  41. int Leu, int Orn, int Phe, int Phg, int Pip,
  42. int Pro, int Ser, int Thr, int Tyr, int His,
  43. int Lys, int Gly, int Val);
  44. void outputSpecificitiesPKS( int mal, int mmal, int emal, int omal);
  45. string maxSpecificityNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  46. int Dab, int Dhb, int Gln, int Glu, int Ile,
  47. int Leu, int Orn, int Phe, int Phg, int Pip,
  48. int Pro, int Ser, int Thr, int Tyr, int His,
  49. int Lys, int Gly, int Val);
  50. string maxSpecificityPKS( int mal, int mmal, int emal, int omal);
  51. int getIntVal(string strConvert);
  52. float getFloatVal(string strConvert);
  53. void translateSeq( string &inSeq, string &outputSeq, int &frame);
  54. void predictDomains( string aminoAcidSeq,
  55. string &predictedSubstrates,
  56. QueueClass< UnitClass2 > &aaNameList,
  57. const bool &allFramesRead,
  58. const int &frame, const string &nucSeq,
  59. string &allClusters, int&NRPScount, int &PKScount,
  60. int &mixedNRPSPKScount, int &transATcount, int &mepCount,
  61. int &mvaCount);
  62. // function bodies
  63. void outputSpecificitiesPKS( int mal, int mmal, int emal, int omal)
  64. {
  65. cout << "PKS specificity scores: " << endl;
  66. cout << "mal: " << mal << " ";
  67. cout << "mmal: " << mmal << " ";
  68. cout << "emal: " << emal << " ";
  69. cout << "omal: " << omal << " ";
  70. cout << endl;
  71. }
  72. void outputSpecificitiesNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  73. int Dab, int Dhb, int Gln, int Glu, int Ile,
  74. int Leu, int Orn, int Phe, int Phg, int Pip,
  75. int Pro, int Ser, int Thr, int Tyr, int His,
  76. int Lys, int Gly, int Val)
  77. {
  78. cout << "NRPS specificity scores: " << endl;
  79. cout << "Aad: " << Aad << " ";
  80. cout << "Ala: " << Ala << " ";
  81. cout << "Asn: " << Asn << " ";
  82. cout << "Asp: " << Asp << " ";
  83. cout << "Cys: " << Cys << " ";
  84. cout << "Dab: " << Dab << " ";
  85. cout << "Dhb: " << Dhb << " ";
  86. cout << "Gln: " << Gln << " ";
  87. cout << "Glu: " << Glu << " ";
  88. cout << "Gly: " << Gly << " ";
  89. cout << "His: " << His << endl;
  90. cout << "Ile: " << Ile << " ";
  91. cout << "Leu: " << Leu << " ";
  92. cout << "Lys: " << Lys << " ";
  93. cout << "Orn: " << Orn << " ";
  94. cout << "Phe: " << Phe << " ";
  95. cout << "Phg: " << Phg << " ";
  96. cout << "Pip: " << Pip << " ";
  97. cout << "Pro: " << Pro << " ";
  98. cout << "Ser: " << Ser << " ";
  99. cout << "Thr: " << Thr << " ";
  100. cout << "Tyr: " << Tyr << endl;
  101. cout << "Val: " << Val << " ";
  102. cout << endl;
  103. }
  104. string maxSpecificityPKS( int mal, int mmal, int emal, int omal)
  105. {
  106. int max;
  107. int distinguishingFactor = 5;
  108. string CA;
  109. max = mal;
  110. CA = "mal";
  111. if(mmal > mal)
  112. {
  113. max = mmal;
  114. CA = "mmal";
  115. }
  116. if(emal > max)
  117. {
  118. max = emal;
  119. CA = "emal";
  120. }
  121. if(omal > max)
  122. {
  123. max = omal;
  124. CA = "omal";
  125. }
  126. if(max < distinguishingFactor)
  127. {
  128. CA = "pk";
  129. }
  130. return CA;
  131. }
  132. string maxSpecificityNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
  133. int Dab, int Dhb, int Gln, int Glu, int Ile,
  134. int Leu, int Orn, int Phe, int Phg, int Pip,
  135. int Pro, int Ser, int Thr, int Tyr, int His,
  136. int Lys, int Gly, int Val)
  137. {
  138. int max;
  139. int distinguishingFactor = 3;
  140. string AA;
  141. max = Aad;
  142. AA = "Aad";
  143. if(Ala >= Aad)
  144. {
  145. max = Ala;
  146. AA = "ala";
  147. }
  148. if(Asn >= max)
  149. {
  150. max = Asn;
  151. AA = "asn";
  152. }
  153. if(Asp >= max)
  154. {
  155. max = Asp;
  156. AA = "asp";
  157. }
  158. if(Cys >= max)
  159. {
  160. max = Cys;
  161. AA = "cys";
  162. }
  163. if(Dab >= max)
  164. {
  165. max = Dab;
  166. AA = "dab";
  167. }
  168. if(Dhb >= max)
  169. {
  170. max = Dhb;
  171. AA = "dhb";
  172. }
  173. if(Gln >= max)
  174. {
  175. max = Gln;
  176. AA = "gln";
  177. }
  178. if(Glu >= max)
  179. {
  180. max = Glu;
  181. AA = "glu";
  182. }
  183. if(Ile >= max)
  184. {
  185. max = Ile;
  186. AA = "ile";
  187. }
  188. if(Leu >= max)
  189. {
  190. max = Leu;
  191. AA = "leu";
  192. }
  193. if(Orn >= max)
  194. {
  195. max = Orn;
  196. AA = "orn";
  197. }
  198. if(Phe >= max)
  199. {
  200. max = Phe;
  201. AA = "phe";
  202. }
  203. if(Phg >= max)
  204. {
  205. max = Phg;
  206. AA = "phg";
  207. }
  208. if(Pip >= max)
  209. {
  210. max = Pip;
  211. AA = "pip";
  212. }
  213. if(Pro >= max)
  214. {
  215. max = Pro;
  216. AA = "pro";
  217. }
  218. if(Ser >= max)
  219. {
  220. max = Ser;
  221. AA = "ser";
  222. }
  223. if(Thr >= max)
  224. {
  225. max = Thr;
  226. AA = "thr";
  227. }
  228. if(Tyr >= max)
  229. {
  230. max = Tyr;
  231. AA = "tyr";
  232. }
  233. if(His >= max)
  234. {
  235. max = His;
  236. AA = "his";
  237. }
  238. if(Lys >= max)
  239. {
  240. max = Lys;
  241. AA = "lys";
  242. }
  243. if(Gly >= max)
  244. {
  245. max = Gly;
  246. AA = "gly";
  247. }
  248. if(Val >= max)
  249. {
  250. max = Val;
  251. AA = "val";
  252. }
  253. if( max < distinguishingFactor)
  254. {
  255. AA = "nrp";
  256. }
  257. return AA;
  258. }
  259. int getIntVal(string strConvert)
  260. {
  261. int intReturn;
  262. // NOTE: You should probably do some checks to ensure that
  263. // this string contains only numbers. If the string is not
  264. // a valid integer, zero will be returned.
  265. intReturn = atoi(strConvert.c_str());
  266. return(intReturn);
  267. }
  268. float getFloatVal(string strConvert)
  269. {
  270. float floatReturn;
  271. // NOTE: You should probably do some checks to ensure that
  272. // this string contains only numbers. If the string is not
  273. // a valid integer, zero will be returned.
  274. floatReturn = atof(strConvert.c_str());
  275. return(floatReturn);
  276. }
  277. void clearPositions( string &pos235, string &pos236, string &pos239,
  278. string &pos278, string &pos299, string &pos301, string &pos322,
  279. string &pos330, string &pos331, string &pos58, string &pos59,
  280. string &pos60, string &pos61, string &pos62, string &pos70,
  281. string &pos72, string &pos74, string &pos75, string &pos77,
  282. string &pos93, string &pos189, string &pos191, string &pos198,
  283. string &pos199, string &pos200 )
  284. {
  285. pos235 = "";
  286. pos236 = "";
  287. pos239 = "";
  288. pos278 = "";
  289. pos299 = "";
  290. pos301 = "";
  291. pos322 = "";
  292. pos330 = "";
  293. pos331 = "";
  294. pos58 = "";
  295. pos59 = "";
  296. pos60 = "";
  297. pos61 = "";
  298. pos62 = "";
  299. pos70 = "";
  300. pos72 = "";
  301. pos74 = "";
  302. pos75 = "";
  303. pos77 = "";
  304. pos93 = "";
  305. pos189 = "";
  306. pos191 = "";
  307. pos198 = "";
  308. pos199 = "";
  309. pos200 = "";
  310. }
  311. // finds and returns the reverse complement sequence
  312. string revComp( const string &inSeq, const string &nucleotideSeq);
  313. string revComp( const string &inSeq, const string &nucleotideSeq)
  314. {
  315. string inputSeq;
  316. string outputSeq;
  317. string segment;
  318. string outputFilename;
  319. //string outFile = "rev-" + inSeq;
  320. int seqMin = 50; // minimum length of nucleotide line
  321. ifstream In;
  322. ofstream Out;
  323. int lineCharsCount = 0;
  324. int lineCharsMax = 70;
  325. In.open(inSeq.c_str());
  326. //outputFilename = "genomes/rev-"
  327. outputFilename = "genomes/rev-"
  328. + nucleotideSeq;
  329. In >> segment;
  330. // bypass initial FASTA details
  331. while(segment.length() < seqMin && !In.fail())
  332. {
  333. In >> segment;
  334. }
  335. // stores nucleotide
  336. inputSeq.append(segment);
  337. while(!In.fail())
  338. {
  339. In >> segment;
  340. inputSeq.append(segment);
  341. }
  342. In.close();
  343. for(int i = inputSeq.length(); i > 0; i--)
  344. {
  345. if(inputSeq[i] == 'A')
  346. {
  347. outputSeq.append("T");
  348. }
  349. if(inputSeq[i] == 'T')
  350. {
  351. outputSeq.append("A");
  352. }
  353. if(inputSeq[i] == 'C')
  354. {
  355. outputSeq.append("G");
  356. }
  357. if(inputSeq[i] == 'G')
  358. {
  359. outputSeq.append("C");
  360. }
  361. lineCharsCount++;
  362. if(lineCharsCount == lineCharsMax)
  363. {
  364. outputSeq.append("\n");
  365. lineCharsCount = 0;
  366. }
  367. }
  368. Out.open(outputFilename.c_str());
  369. Out << ">revComp-" << nucleotideSeq << "\n";
  370. Out << outputSeq;
  371. Out.clear();
  372. Out.close();
  373. return outputSeq;
  374. }
  375. // translates DNA to amino acid through common code
  376. void translateSeq( string &inSeq, string &outputSeq, int &frame)
  377. {
  378. int stopPrev = 0;
  379. int stopPres = 0;
  380. int stopLimit = 50;
  381. int start = 0;
  382. int choice;
  383. int stopCount = 0;
  384. int seqMin = 50;
  385. ifstream In;
  386. ofstream Out;
  387. string segment;
  388. outputSeq.clear();
  389. In.open(inSeq.c_str());
  390. string inputSeq;
  391. int charCount = 0;
  392. int charCountMax = 70;
  393. In >> segment;
  394. //cout << "segment: " << segment << endl;
  395. while(segment.length() < seqMin && !In.fail())
  396. {
  397. In >> segment;
  398. }
  399. inputSeq.append(segment);
  400. while(!In.fail())
  401. {
  402. In >> segment;
  403. inputSeq.append(segment);
  404. }
  405. //cout << "FASTA of DNA stored" << endl;
  406. In.close();
  407. //cout << "Frame: " << frame << endl;
  408. for(int i = frame; i < inputSeq.length()-3; i=i+3)
  409. {
  410. charCount++;
  411. if(charCount == charCountMax)
  412. {
  413. outputSeq.append("\n");
  414. charCount = 0;
  415. }
  416. if(inputSeq[i] == 't' ||
  417. inputSeq[i] == 'T')
  418. {
  419. if(inputSeq[i+1] == 't' ||
  420. inputSeq[i+1] == 'T')
  421. {
  422. if(inputSeq[i+2] == 't' ||
  423. inputSeq[i+2] == 'T' ||
  424. inputSeq[i+2] == 'c' ||
  425. inputSeq[i+2] == 'C')
  426. {
  427. outputSeq.append("F");
  428. }
  429. else
  430. {
  431. outputSeq.append("L");
  432. }
  433. }
  434. else if(inputSeq[i+1] == 'c' ||
  435. inputSeq[i+1] == 'C')
  436. {
  437. outputSeq.append("S");
  438. }
  439. else if(inputSeq[i+1] == 'a' ||
  440. inputSeq[i+1] == 'A')
  441. {
  442. if(inputSeq[i+2] == 't' ||
  443. inputSeq[i+2] == 'T' ||
  444. inputSeq[i+2] == 'c' ||
  445. inputSeq[i+2] == 'C')
  446. {
  447. outputSeq.append("Y");
  448. }
  449. else
  450. {
  451. outputSeq.append("-");
  452. stopCount++; // stop codon
  453. }
  454. }
  455. else if(inputSeq[i+1] == 'g' ||
  456. inputSeq[i+1] == 'G')
  457. {
  458. if(inputSeq[i+2] == 't' ||
  459. inputSeq[i+2] == 'T' ||
  460. inputSeq[i+2] == 'c' ||
  461. inputSeq[i+2] == 'C')
  462. {
  463. outputSeq.append("C");
  464. }
  465. else if(inputSeq[i+2] == 'g' ||
  466. inputSeq[i+2] == 'G')
  467. {
  468. outputSeq.append("W");
  469. }
  470. else
  471. {
  472. outputSeq.append("-");
  473. stopCount++; // stop codon
  474. }
  475. }
  476. } // sequence begins with 't' ends
  477. else if(inputSeq[i] == 'c' ||
  478. inputSeq[i] == 'C')
  479. {
  480. if(inputSeq[i+1] == 't' ||
  481. inputSeq[i+1] == 'T')
  482. {
  483. outputSeq.append("L");
  484. }
  485. else if(inputSeq[i+1] == 'c' ||
  486. inputSeq[i+1] == 'C')
  487. {
  488. outputSeq.append("P");
  489. }
  490. else if(inputSeq[i+1] == 'a' ||
  491. inputSeq[i+1] == 'A')
  492. {
  493. if(inputSeq[i+2] == 't' ||
  494. inputSeq[i+2] == 'T' ||
  495. inputSeq[i+2] == 'c' ||
  496. inputSeq[i+2] == 'C')
  497. {
  498. outputSeq.append("H");
  499. }
  500. else
  501. {
  502. outputSeq.append("Q");
  503. }
  504. }
  505. else if(inputSeq[i+1] == 'g' ||
  506. inputSeq[i+1] == 'G')
  507. {
  508. outputSeq.append("R");
  509. }
  510. } // sequence begins with 'c' ends
  511. else if(inputSeq[i] == 'a' ||
  512. inputSeq[i] == 'A')
  513. {
  514. if(inputSeq[i+1] == 't' ||
  515. inputSeq[i+1] == 'T')
  516. {
  517. if(inputSeq[i+2] != 'g' &&
  518. inputSeq[i+2] != 'G')
  519. {
  520. outputSeq.append("I");
  521. }
  522. else
  523. {
  524. outputSeq.append("M");
  525. }
  526. }
  527. else if(inputSeq[i+1] == 'c' ||
  528. inputSeq[i+1] == 'C')
  529. {
  530. outputSeq.append("T");
  531. }
  532. else if(inputSeq[i+1] == 'a' ||
  533. inputSeq[i+1] == 'A')
  534. {
  535. if(inputSeq[i+2] == 't' ||
  536. inputSeq[i+2] == 'T' ||
  537. inputSeq[i+2] == 'c' ||
  538. inputSeq[i+2] == 'C')
  539. {
  540. outputSeq.append("N");
  541. }
  542. else
  543. {
  544. outputSeq.append("K");
  545. }
  546. }
  547. else if(inputSeq[i+1] == 'g' ||
  548. inputSeq[i+1] == 'G')
  549. {
  550. if(inputSeq[i+2] == 't' ||
  551. inputSeq[i+2] == 'T' ||
  552. inputSeq[i+2] == 'c' ||
  553. inputSeq[i+2] == 'C')
  554. {
  555. outputSeq.append("S");
  556. }
  557. else
  558. {
  559. outputSeq.append("R");
  560. }
  561. }
  562. } // sequence begins with 'a' ends
  563. else if(inputSeq[i] == 'g' ||
  564. inputSeq[i] == 'G')
  565. {
  566. if(inputSeq[i+1] == 't' ||
  567. inputSeq[i+1] == 'T')
  568. {
  569. outputSeq.append("V");
  570. }
  571. else if(inputSeq[i+1] == 'c' ||
  572. inputSeq[i+1] == 'C')
  573. {
  574. outputSeq.append("A");
  575. }
  576. else if(inputSeq[i+1] == 'a' ||
  577. inputSeq[i+1] == 'A')
  578. {
  579. if(inputSeq[i+2] == 't' ||
  580. inputSeq[i+2] == 'T' ||
  581. inputSeq[i+2] == 'c' ||
  582. inputSeq[i+2] == 'C')
  583. {
  584. outputSeq.append("D");
  585. }
  586. else
  587. {
  588. outputSeq.append("E");
  589. }
  590. }
  591. else if(inputSeq[i+1] == 'g' ||
  592. inputSeq[i+1] == 'G')
  593. {
  594. outputSeq.append("G");
  595. }
  596. } // codon begins with 'g' ends
  597. else
  598. {
  599. outputSeq.append("-");
  600. stopCount++;
  601. }
  602. } // for loop ends
  603. Out.open("aminoAcidSeq.txt");
  604. Out.clear();
  605. Out << "Protein from one DNA frame" << "\n";
  606. Out << outputSeq;
  607. Out.close();
  608. } // function ends
  609. void predictDomains(string aminoAcidSeq,
  610. string &predictedSubstrates,
  611. QueueClass< UnitClass2 > &aaNameList,
  612. const bool &allFramesRead,
  613. const int &frame, const string &nucSeq,
  614. string &allClusters, int &NRPScount, int &PKScount,
  615. int &mixedNRPSPKScount, int &transATcount, int &mepCount,
  616. int &mvaCount )
  617. {
  618. QueueClass <UnitClass2> sortedListOfDomains;
  619. QueueClass <UnitClass2> sortedListOfMods;
  620. QueueClass <UnitClass2> sortedListOfATKS;
  621. UnitClass2 *domain;
  622. ifstream In;
  623. ifstream InMods;
  624. ifstream tempIn;
  625. ifstream aaSeqIn;
  626. ofstream Out;
  627. ofstream tempOut;
  628. string segment;
  629. float score;
  630. string scoreString;
  631. float modsScore;
  632. float modsScoreMax = 0.1;
  633. float scoreMax = 0.1; // minimum for domain synthetase matching
  634. float idenScore;
  635. float idenScoreMin = 120; // minimum for mods alignment
  636. float length;
  637. string SMILE;
  638. string formula;
  639. string mod;
  640. string substrate;
  641. float siteToAnalyze;
  642. int siteCount;
  643. bool siteFound;
  644. int whileCount = 0;
  645. int queryOrSbjctLength = 6;
  646. string aaSeq;
  647. char *arg[21];
  648. arg[0] = "./blastall"; // program to execute
  649. arg[1] = "blastp"; // protein blast
  650. arg[2] = "synthetases.fasta"; // control A-domain
  651. arg[3] = "mods.fasta"; // output file
  652. arg[4] = "100.0";
  653. arg[6] = "aaSeq.txt";
  654. arg[7] = "blast2.out";
  655. int AdomainStart = 5;
  656. int AdomainEnd = 5;
  657. // A-domains to analyze
  658. bool substrateKnown;
  659. string query;
  660. string sbjct;
  661. string predictedSubstrate;
  662. string proposedSubstrate;
  663. bool foundSubstrate;
  664. bool stringsCompare;
  665. int hits = 0;
  666. int misses = 0;
  667. int maxChars = 10000;
  668. // NRPS key positions
  669. int p235 = 11;
  670. int p236 = 12;
  671. int p239 = 15;
  672. int p278 = 10;
  673. int p299 = 1;
  674. int p301 = 3;
  675. int p322 = 1;
  676. int p330 = 9;
  677. int p331 = 10;
  678. string pos235;
  679. string pos236;
  680. string pos239;
  681. string pos278;
  682. string pos299;
  683. string pos301;
  684. string pos322;
  685. string pos330;
  686. string pos331;
  687. int Aad = 0;
  688. int Ala = 0;
  689. int Asn = 0;
  690. int Asp = 0;
  691. int Cys = 0;
  692. int Dab = 0;
  693. int Dhb = 0;
  694. int Gln = 0;
  695. int Glu = 0;
  696. int Ile = 0;
  697. int Leu = 0;
  698. int Orn = 0;
  699. int Phe = 0;
  700. int Phg = 0;
  701. int Pip = 0;
  702. int Pro = 0;
  703. int Ser = 0;
  704. int Thr = 0;
  705. int Tyr = 0;
  706. int Val = 0;
  707. int His = 0;
  708. int Lys = 0;
  709. int Gly = 0;
  710. int Gly1 = 0;
  711. int Gly2 = 0;
  712. int Gly3 = 0;
  713. int Leu1 = 0;
  714. int Leu2 = 0;
  715. int Leu3 = 0;
  716. int Leu4 = 0;
  717. int Tyr1 = 0;
  718. int Tyr2 = 0;
  719. int Tyr3 = 0;
  720. int Val1 = 0;
  721. int Val2 = 0;
  722. int Val3 = 0;
  723. int mal = 0;
  724. int mmal = 0;
  725. int emal = 0;
  726. int omal = 0;
  727. char FDxAA;
  728. char FD4xAA;
  729. char IT3xAA;
  730. char x1xG299;
  731. char x1xG301;
  732. char x7xxY322;
  733. char x7xxY330;
  734. char x7xxY331;
  735. // PKS key positions
  736. int p58 = 0;
  737. int p59 = 1;
  738. int p60 = 2;
  739. int p61 = 3;
  740. int p62 = 4;
  741. int p70 = 0;
  742. int p72 = 2;
  743. int p74 = 4;
  744. int p75 = 5;
  745. int p77 = 7;
  746. int p93 = 3;
  747. int p189 = 1;
  748. int p191 = 3;
  749. int p198 = 0;
  750. int p199 = 1;
  751. int p200 = 2;
  752. string pos58;
  753. string pos59;
  754. string pos60;
  755. string pos61;
  756. string pos62;
  757. string pos70;
  758. string pos72;
  759. string pos74;
  760. string pos75;
  761. string pos77;
  762. string pos93;
  763. string pos189;
  764. string pos191;
  765. string pos198;
  766. string pos199;
  767. string pos200;
  768. string sigSeq;
  769. float position;
  770. bool NRPS = 0;
  771. bool PKS = 0;
  772. string residue;
  773. string fname;
  774. bool success;
  775. string smileName;
  776. string inString;
  777. UnitClass2 *aaName;
  778. float currentDomainPos;
  779. float nextDomainPos;
  780. float thirdDomainPos;
  781. int PKSmodRange = 7000;
  782. bool KR = 0;
  783. bool DH = 0;
  784. bool ER = 0;
  785. string endSiteString;
  786. string prevSegment;
  787. int endSite;
  788. int startSite;
  789. int extraRange = 1000;
  790. int lastPKSkey = 3;
  791. string scoredSubstrate;
  792. string proteinID;
  793. string proteinFrame;
  794. string aminoAcids;
  795. int nucStartSite;
  796. int nucEndSite;
  797. cout << "Analyzing data..." << endl;
  798. //get every NRPS/PKS domain until no further domains are detected;
  799. // does blast alignment of cluster
  800. spawnl( 1, "./blastall", "./blastall",
  801. "-g", "T", "-p", arg[1], "-d", arg[2], "-i", aminoAcidSeq.c_str(),
  802. "-F", "F", "-C", "0", "-o", "blast.out", NULL);
  803. //cout << "called blast" << endl;
  804. spawnl( 1, "./blastall", "./blastall",
  805. "-g", "T", "-p", arg[1], "-d", arg[3], "-i", aminoAcidSeq.c_str(),
  806. "-F", "F", "-C", "0", "-o", "modsBlast.out", NULL);
  807. //cout << "called modblast" << endl;
  808. //cout << "frame: " << frame << endl;
  809. if(frame == 0)
  810. {
  811. system("cp blast.out frame0.out");
  812. //cout << "copied to frame0.out" << endl;
  813. system("cp modsBlast.out modsFrame0.out");
  814. //cout << "copied to modsFrame0.out" << endl;
  815. system("cp aminoAcidSeq.txt aminoAcidSeq0.txt");
  816. //cout << "aminoAcidSeq0 copied" << endl;
  817. }
  818. if(frame == 1)
  819. {
  820. system("cp blast.out frame1.out");
  821. //cout << "copied to frame1.out" << endl;
  822. system("cp modsBlast.out modsFrame1.out");
  823. //cout << "copied to modsFrame1.out" << endl;
  824. system("cp aminoAcidSeq.txt aminoAcidSeq1.txt");
  825. //cout << "aminoAcidSeq1 copied" << endl;
  826. }
  827. if(frame == 2)
  828. {
  829. system("cp blast.out frame2.out");
  830. //cout << "copied to frame2.out" << endl;
  831. system("cp modsBlast.out modsFrame2.out");
  832. //cout << "copied to modsFrame2.out" << endl;
  833. system("cat frame0.out frame1.out frame2.out > blast.out");
  834. //cout << "concatenated all domain frames" << endl;
  835. system("cat modsFrame0.out modsFrame1.out modsFrame2.out > modsBlast.out");
  836. //cout << "concatenated all mod frames" << endl;
  837. system("rm frame0.out frame1.out frame2.out modsFrame1.out modsFrame2.out");
  838. //cout << "last aminoAcidSeq" << endl;
  839. }
  840. //cout << "after frames copies" << endl;
  841. if(allFramesRead)
  842. {
  843. //cout << "ALL FRAMES READ" << endl;
  844. score = 0;
  845. // open NRPS/PKS alignment file
  846. In.open("blast.out");
  847. In >> segment;
  848. while( !In.fail())
  849. {
  850. //cout << "read blast.out" << endl;
  851. if(segment[0] == '*')
  852. {
  853. In >> segment;
  854. substrate = segment;
  855. }
  856. if(segment[0] == 'E' && segment[1] == 'x'
  857. && segment[2] == 'p')
  858. {
  859. In >> segment; // "="
  860. In >> segment; // expectation
  861. for(int i = 0; i < segment.length(); i++)
  862. {
  863. if(segment[i] == 'e')
  864. {
  865. i = segment.length();
  866. }
  867. else
  868. {
  869. scoreString.append(1, segment[i]);
  870. }
  871. }
  872. score = getFloatVal(scoreString);
  873. //cout << "scoreString: " << scoreString << endl;
  874. scoreString.clear();
  875. for(int i = 0; i < segment.length(); i++)
  876. {
  877. if(segment[i] == 'e')
  878. {
  879. for(int j = i+1; j < segment.length(); j++)
  880. {
  881. scoreString.append(1, segment[j]);
  882. }
  883. }
  884. }
  885. score = score * pow(10.0,(static_cast<double>(getFloatVal(scoreString))));
  886. //cout << "segment: " << segment << endl;
  887. //cout << "score: " << score << endl;
  888. //cout << "scoreString float: " << getFloatVal(scoreString) << endl;
  889. scoreString.clear();
  890. }
  891. if(segment[0] == 'I' && segment[1] == 'd'
  892. && segment[2] == 'e')
  893. {
  894. In >> segment; // "="
  895. In >> segment; // identities
  896. // collect denominator of identities value
  897. for(int i = 0; i < segment.length(); i++)
  898. {
  899. if(segment[i] == '/')
  900. {
  901. for(int j = i+1; j < segment.length(); j++)
  902. {
  903. scoreString.append(1, segment[j]);
  904. i = j;
  905. }
  906. }
  907. }
  908. idenScore = getIntVal(scoreString);
  909. scoreString.clear();
  910. //cout << "score: " << " idenScore: " << idenScore << endl;
  911. if(score < scoreMax && idenScore > idenScoreMin)
  912. {
  913. //cout << "score: " << score << " idenScore: " << idenScore << endl;
  914. while(segment[0] != 'Q' && segment[1] != 'u')
  915. {
  916. In >> segment;
  917. }
  918. In >> segment; // start site of domain on protein
  919. startSite = getIntVal(segment);
  920. //cout << "start site: " << startSite << endl;
  921. In >> segment;
  922. // get protein frame and find where adenylation domain begins
  923. proteinID = segment;
  924. for(int i = 0; i < proteinID.length(); i++)
  925. {
  926. while(proteinID[i] == '-')
  927. {
  928. proteinID.erase(i, 1);
  929. }
  930. }
  931. //cout << "PROTEINID: " << proteinID << endl;
  932. proteinFrame.clear();
  933. aaSeqIn.open("aminoAcidSeq.txt");
  934. aaSeqIn >> aminoAcids;
  935. while(!aaSeqIn.fail())
  936. {
  937. if(aminoAcids.length() > 1)
  938. {
  939. proteinFrame.append(aminoAcids);
  940. }
  941. aaSeqIn >> aminoAcids;
  942. }
  943. aaSeqIn.clear();
  944. aaSeqIn.close();
  945. //cout << "proteinFrame.length(): " << proteinFrame.length() << endl;
  946. nucStartSite = proteinFrame.find(proteinID, startSite - 100);
  947. if(nucStartSite < 0)
  948. {
  949. proteinFrame.clear();
  950. aaSeqIn.open("aminoAcidSeq0.txt");
  951. aaSeqIn >> aminoAcids;
  952. while(!aaSeqIn.fail())
  953. {
  954. if(aminoAcids.length() > 1)
  955. {
  956. proteinFrame.append(aminoAcids);
  957. }
  958. aaSeqIn >> aminoAcids;
  959. }
  960. aaSeqIn.clear();
  961. aaSeqIn.close();
  962. nucStartSite = proteinFrame.find(proteinID, startSite - 100);
  963. //cout << "NOT IN LAST AA FRAME" << endl;
  964. }
  965. if(nucStartSite < 0)
  966. {
  967. proteinFrame.clear();
  968. aaSeqIn.open("aminoAcidSeq1.txt");
  969. aaSeqIn >> aminoAcids;
  970. while(!aaSeqIn.fail())
  971. {
  972. if(aminoAcids.length() > 1)
  973. {
  974. proteinFrame.append(aminoAcids);
  975. }
  976. aaSeqIn >> aminoAcids;
  977. }
  978. aaSeqIn.clear();
  979. aaSeqIn.close();
  980. nucStartSite = proteinFrame.find(proteinID, startSite - 100);
  981. // cout << "NOT IN FIRST AA FRAME" << endl;
  982. }
  983. if(nucStartSite < 0)
  984. {
  985. //cout << "NOT IN SECOND AA FRAME" << endl;
  986. }
  987. nucStartSite = nucStartSite * 3; // translate amino acid position
  988. // into nucleotide position
  989. //cout << "NUCSTARTSITE: " << nucStartSite << endl;
  990. while((segment[0] != 'S' || segment[4] != 'e') && segment[0] != '>'
  991. && !In.fail())
  992. {
  993. prevSegment = segment;
  994. In >> segment;
  995. }
  996. endSiteString = prevSegment;
  997. endSite = getIntVal(prevSegment) + startSite + extraRange;
  998. domain = new UnitClass2;
  999. domain->setStartSite(startSite);
  1000. domain->setFormula(substrate);
  1001. domain->setEndSite(endSite);
  1002. domain->setNucStartSite(nucStartSite);
  1003. sortedListOfDomains.insertValue(*domain);
  1004. delete domain;
  1005. }
  1006. score = 1;
  1007. idenScore = 0;
  1008. }
  1009. In >> segment;
  1010. }
  1011. In.close();
  1012. if(sortedListOfDomains.getNumElems() > 0)
  1013. {
  1014. //sortedListOfDomains.printForward();
  1015. }
  1016. else
  1017. {
  1018. cout << "No NRPS-A or PKS-AT domains detected" << endl;
  1019. }
  1020. // detect modifications in cluster and their locations
  1021. InMods.open("modsBlast.out");
  1022. InMods >> segment;
  1023. //cout << "InMods: " << InMods << " segment: " << segment << endl;
  1024. while( !InMods.fail())
  1025. {
  1026. //cout << "in InMods loop" << endl;
  1027. if(segment[0] == '*')
  1028. {
  1029. InMods >> segment;
  1030. mod = segment;
  1031. }
  1032. if(segment[0] == 'E' && segment[1] == 'x'
  1033. && segment[2] == 'p')
  1034. {
  1035. InMods >> segment; // "="
  1036. InMods >> segment; // expectation
  1037. //cout << "segment(expectation): " << segment << endl;
  1038. for(int i = 0; i < segment.length(); i++)
  1039. {
  1040. if(segment[i] == 'e')
  1041. {
  1042. i = segment.length();
  1043. }
  1044. else
  1045. {
  1046. scoreString.append(1, segment[i]);
  1047. }
  1048. }
  1049. modsScore = getFloatVal(scoreString);
  1050. scoreString.clear();
  1051. for(int i = 0; i < segment.length(); i++)
  1052. {
  1053. if(segment[i] == 'e')
  1054. {
  1055. for(int j = i+1; j < segment.length(); j++)
  1056. {
  1057. scoreString.append(1, segment[j]);
  1058. i = j;
  1059. }
  1060. }
  1061. }
  1062. }
  1063. modsScore = modsScore * pow(10.0,(static_cast<double>(getFloatVal(scoreString))));
  1064. scoreString.clear();
  1065. if(segment[0] == 'I' && segment[1] == 'd'
  1066. && segment[2] == 'e')
  1067. {
  1068. InMods >> segment; // "="
  1069. InMods >> segment; // identities
  1070. // collect denominator of identities value
  1071. for(int i = 0; i < segment.length(); i++)
  1072. {
  1073. if(segment[i] == '/')
  1074. {
  1075. for(int j = i+1; j < segment.length(); j++)
  1076. {
  1077. scoreString.append(1, segment[j]);
  1078. i = j;
  1079. }
  1080. }
  1081. }
  1082. idenScore = getIntVal(scoreString);
  1083. scoreString.clear();
  1084. //cout << "/identities: " << score << endl;
  1085. if(modsScore < modsScoreMax and (idenScore > idenScoreMin
  1086. || mod[0] == 'T' && mod[1] == 'E'))
  1087. {
  1088. //cout << "modsScore: " << modsScore << " idenScore: " << idenScore << endl;
  1089. while( segment[0] != 'Q')
  1090. {
  1091. InMods >> segment;
  1092. }
  1093. InMods >> segment; // start site of domain on protein
  1094. startSite = getIntVal(segment);
  1095. InMods >> segment; // start site of domain on protein
  1096. // get protein frame and find where adenylation domains begins
  1097. proteinID = segment;
  1098. for(int i = 0; i < proteinID.length(); i++)
  1099. {
  1100. while(proteinID[i] == '-')
  1101. {
  1102. proteinID.erase(i, 1);
  1103. }
  1104. }
  1105. // cout << "PROTEINID: " << proteinID << endl;
  1106. proteinFrame.clear();
  1107. aaSeqIn.open("aminoAcidSeq.txt");
  1108. aaSeqIn >> aminoAcids;
  1109. while(!aaSeqIn.fail())
  1110. {
  1111. if(aminoAcids.length() > 1)
  1112. {
  1113. proteinFrame.append(aminoAcids);
  1114. }
  1115. aaSeqIn >> aminoAcids;
  1116. }
  1117. aaSeqIn.clear();
  1118. aaSeqIn.close();
  1119. //cout << "proteinFrame.length(): " << proteinFrame.length() << endl;
  1120. nucStartSite = proteinFrame.find(proteinID, startSite);
  1121. if(nucStartSite < 0)
  1122. {
  1123. proteinFrame.clear();
  1124. aaSeqIn.open("aminoAcidSeq0.txt");
  1125. aaSeqIn >> aminoAcids;
  1126. while(!aaSeqIn.fail())
  1127. {
  1128. if(aminoAcids.length() > 1)
  1129. {
  1130. proteinFrame.append(aminoAcids);
  1131. }
  1132. aaSeqIn >> aminoAcids;
  1133. }
  1134. aaSeqIn.clear();
  1135. aaSeqIn.close();
  1136. nucStartSite = proteinFrame.find(proteinID, startSite);
  1137. // cout << "NOT IN LAST AA FRAME" << endl;
  1138. }
  1139. if(nucStartSite < 0)
  1140. {
  1141. proteinFrame.clear();
  1142. aaSeqIn.open("aminoAcidSeq1.txt");
  1143. aaSeqIn >> aminoAcids;
  1144. while(!aaSeqIn.fail())
  1145. {
  1146. if(aminoAcids.length() > 1)
  1147. {
  1148. proteinFrame.append(aminoAcids);
  1149. }
  1150. aaSeqIn >> aminoAcids;
  1151. }
  1152. aaSeqIn.clear();
  1153. aaSeqIn.close();
  1154. nucStartSite = proteinFrame.find(proteinID, startSite);
  1155. // cout << "NOT IN FIRST AA FRAME" << endl;
  1156. }
  1157. if(nucStartSite < 0)
  1158. {
  1159. // cout << "NOT IN SECOND AA FRAME" << endl;
  1160. }
  1161. nucStartSite = nucStartSite * 3; // translate amino acid position
  1162. // into nucleotide position
  1163. domain = new UnitClass2;
  1164. domain->setStartSite(startSite);
  1165. domain->setNucStartSite(nucStartSite);
  1166. domain->setFormula(mod);
  1167. sortedListOfMods.insertValue(*domain);
  1168. delete domain;
  1169. }
  1170. score = 1;
  1171. idenScore = 0;
  1172. }
  1173. InMods >> segment;
  1174. }
  1175. InMods.clear();
  1176. InMods.close();
  1177. if(sortedListOfMods.getNumElems() > 0)
  1178. {
  1179. //sortedListOfMods.printForward();
  1180. }
  1181. else
  1182. {
  1183. cout << "No auxiliary domains detected" << endl;
  1184. }
  1185. siteCount = 0;
  1186. // go through each domain and collect and align each
  1187. while(siteCount < sortedListOfDomains.getNumElems() && whileCount < 1000000)
  1188. {
  1189. In.close();
  1190. In.open("blast.out");
  1191. In.clear();
  1192. In >> segment;
  1193. siteFound = 0;
  1194. while( !In.fail() && siteFound == 0 && whileCount < 1000000)
  1195. {
  1196. //cout << "inner while" << endl;
  1197. siteToAnalyze = sortedListOfDomains.getElemAtIndex(siteCount).getStartSite();
  1198. //cout << "siteToAnalyze: " << siteToAnalyze << endl;
  1199. if(segment[0] == 'Q' && segment[1] == 'u'
  1200. && segment[queryOrSbjctLength - 1] == ':')
  1201. {
  1202. In >> segment;
  1203. if(getIntVal(segment) == siteToAnalyze)
  1204. {
  1205. // analyze this segment only and add to aaNameList
  1206. // store any modifications between this domain and the next
  1207. // cout << "segment " << segment << " found " << endl;
  1208. //cout << segment << " ";
  1209. cout << sortedListOfDomains.getElemAtIndex(siteCount).getNucStartSite() << " ";
  1210. cout << sortedListOfDomains.getElemAtIndex(siteCount).getNucStartSite() + 10000
  1211. << " ";
  1212. position = getIntVal(segment);
  1213. siteFound = 1;
  1214. siteCount++;
  1215. In >> segment;
  1216. aaSeq.append(segment);
  1217. In >> segment;
  1218. while((segment[0] != 'S' || segment[1] != 'c') && !In.fail())
  1219. {
  1220. if(segment[0] == 'Q' && segment[1] == 'u'
  1221. && segment[queryOrSbjctLength - 1] == ':')
  1222. {
  1223. In >> segment;
  1224. In >> segment;
  1225. for(int i = 0; i < segment.length(); i++)
  1226. {
  1227. if((static_cast< unsigned int >(segment[i]) < 65
  1228. || static_cast< unsigned int >(segment[i]) > 90)
  1229. && (static_cast< unsigned int >(segment[i]) < 97
  1230. || static_cast< unsigned int >(segment[i]) > 122))
  1231. {
  1232. segment.erase(i,1);
  1233. i--;
  1234. }
  1235. }
  1236. aaSeq.append(segment);
  1237. }
  1238. In >> segment;
  1239. }
  1240. // recognize amino acid sequence here
  1241. //cout << "aaSeq: " << aaSeq << endl;
  1242. Out.open("aaSeq.txt");
  1243. Out.clear();
  1244. Out << "> " << "aaSeq" << '\n' << aaSeq;
  1245. Out.close();
  1246. aaSeq.clear();
  1247. // does blast alignment of individual domain to recognize
  1248. // conserved motifs and substrate specificity
  1249. spawnl( 1, "./blastall", "blastall",
  1250. "-g", "T", "-p", arg[1], "-d", arg[2], "-i", "aaSeq.txt",
  1251. "-F", "F", "-C", "0", "-o", "blast2.out", NULL);
  1252. substrateKnown = 0;
  1253. In.close();
  1254. In.open("blast2.out");
  1255. In.clear();
  1256. segment.clear();
  1257. In >> segment;
  1258. while(!In.fail())
  1259. {
  1260. //cout << !In.fail() << " ";
  1261. //cout << segment << endl;
  1262. if(segment == "NRPS" && PKS == 0 && NRPS == 0)
  1263. {
  1264. NRPS = 1;
  1265. //cout << "NRPS detected" << endl;
  1266. }
  1267. if(segment == "PKS" && NRPS == 0 && PKS == 0)
  1268. {
  1269. PKS = 1;
  1270. //cout << "PKS detected" << endl;
  1271. }
  1272. if(segment.length() == queryOrSbjctLength)
  1273. {
  1274. if(segment[0] == 'Q' && segment[1] == 'u'
  1275. && segment[queryOrSbjctLength - 1] == ':')
  1276. {
  1277. In >> segment;
  1278. In >> segment;
  1279. query.append(segment);
  1280. //cout << "Query: " << segment << endl;
  1281. }
  1282. if(segment[0] == 'S' && segment[1] == 'b')
  1283. {
  1284. In >> segment;
  1285. In >> segment;
  1286. sbjct.append(segment);
  1287. //cout << "Sbjct: " << segment << endl;
  1288. }
  1289. }
  1290. In >> segment;
  1291. }
  1292. In.clear();
  1293. In.close();
  1294. segment.clear();
  1295. // search for first FDX motif containing residues 235,236,239
  1296. if(NRPS)
  1297. {
  1298. if(sbjct.find("SRVLQFASFSFD") > 0
  1299. && sbjct.find("SRVLQFASFSFD") < maxChars )
  1300. {
  1301. pos235 = query[sbjct.find("SRVLQFASFSFD")+p235];
  1302. pos236 = query[sbjct.find("SRVLQFASFSFD")+p236];
  1303. pos239 = query[sbjct.find("SRVLQFASFSFD")+p239];
  1304. sigSeq.append(pos235);
  1305. sigSeq.append(pos236);
  1306. sigSeq.append(pos239);
  1307. }
  1308. // search for IT motif containing the residue 278
  1309. if(sbjct.find("RLKNDGITHVTLPP") > 0
  1310. && sbjct.find("RLKNDGITHVTLPP") < maxChars )
  1311. {
  1312. pos278 = query[sbjct.find("RLKNDGITHVTLPP")+p278];
  1313. sigSeq.append(pos278);
  1314. }
  1315. // search for GE motif containing the residues 299,301
  1316. if(sbjct.find("IVVQGEACP") > 0
  1317. && sbjct.find("IVVAGEACP") < maxChars )
  1318. {
  1319. pos299 = query[sbjct.find("IVVAGEACP")+p299];
  1320. pos301 = query[sbjct.find("IVVAGEACP")+p301];
  1321. sigSeq.append(pos299);
  1322. sigSeq.append(pos301);
  1323. }
  1324. // search for YGPTE motif containing the residues 322,330,331
  1325. if(sbjct.find("NAYGPTE") > 0
  1326. && sbjct.find("NAYGPTE") < maxChars )
  1327. {
  1328. pos322 = query[sbjct.find("NAYGPTE")+p322];
  1329. pos330 = query[sbjct.find("NAYGPTE")+p330];
  1330. pos331 = query[sbjct.find("NAYGPTE")+p331];
  1331. sigSeq.append(pos322);
  1332. sigSeq.append(pos330);
  1333. sigSeq.append(pos331);
  1334. }
  1335. FDxAA = pos236[0];
  1336. FD4xAA = pos239[0];
  1337. IT3xAA = pos278[0];
  1338. x1xG299 = pos299[0];
  1339. x1xG301 = pos301[0];
  1340. x7xxY322 = pos322[0];
  1341. x7xxY330 = pos330[0];
  1342. x7xxY331 = pos331[0];
  1343. }
  1344. if(PKS)
  1345. {
  1346. if(sbjct.find("DVPLV") > 0
  1347. && sbjct.find("DVPLV") < maxChars )
  1348. {
  1349. pos58 = query[sbjct.find("DVPLV")+p58];
  1350. pos59 = query[sbjct.find("DVPLV")+p59];
  1351. pos60 = query[sbjct.find("DVPLV")+p60];
  1352. pos61 = query[sbjct.find("DVPLV")+p61];
  1353. pos62 = query[sbjct.find("DVPLV")+p62];
  1354. sigSeq.append(pos58);
  1355. sigSeq.append(pos59);
  1356. sigSeq.append(pos60);
  1357. sigSeq.append(pos61);
  1358. sigSeq.append(pos62);
  1359. }
  1360. if(sbjct.find("QVAL") > 0
  1361. && sbjct.find("QVAL") < maxChars )
  1362. {
  1363. pos70 = query[sbjct.find("QVAL")+p70];
  1364. pos72 = query[sbjct.find("QVAL")+p72];
  1365. pos74 = query[sbjct.find("QVAL")+p74];
  1366. pos75 = query[sbjct.find("QVAL")+p75];
  1367. pos77 = query[sbjct.find("QVAL")+p77];
  1368. sigSeq.append(pos70);
  1369. sigSeq.append(pos72);
  1370. sigSeq.append(pos74);
  1371. sigSeq.append(pos75);
  1372. sigSeq.append(pos77);
  1373. }
  1374. if(sbjct.find("GQSMGE") > 0
  1375. && sbjct.find("GQSMGE") < maxChars )
  1376. {
  1377. pos93 = query[sbjct.find("GQSMGE")+p93];
  1378. sigSeq.append(pos93);
  1379. }
  1380. if(sbjct.find("ITVRRI") > 0
  1381. && sbjct.find("ITVRRI") < maxChars )
  1382. {
  1383. pos189 = query[sbjct.find("ITVRRI")+p189];
  1384. sigSeq.append(pos189);
  1385. pos191 = query[sbjct.find("ITVRRI")+p191];
  1386. sigSeq.append(pos191);
  1387. }
  1388. if(sbjct.find("YASH") > 0
  1389. && sbjct.find("YASH") < maxChars )
  1390. {
  1391. pos198 = query[sbjct.find("YASH")+p198];
  1392. pos199 = query[sbjct.find("YASH")+p199];
  1393. pos200 = query[sbjct.find("YASH")+p200];
  1394. sigSeq.append(pos198);
  1395. sigSeq.append(pos199);
  1396. sigSeq.append(pos200);
  1397. }
  1398. if(pos58[0] == 'E' || pos58[0] == 'D' || pos58[0] == 'Q'
  1399. || pos58[0] == 'S' || pos58[0] == 'K')
  1400. {
  1401. mal++;
  1402. }
  1403. if(pos58[0] == 'R' || pos58[0] == 'G')
  1404. {
  1405. mmal++;
  1406. emal++;
  1407. omal++;
  1408. }
  1409. if(pos59[0] == 'T' || pos59[0] == 'S')
  1410. {
  1411. mal++;
  1412. }
  1413. if(pos59[0] == 'V' || pos59[0] == 'A')
  1414. {
  1415. mmal++;
  1416. emal++;
  1417. omal++;
  1418. }
  1419. if(pos60[0] == 'G' || pos60[0] == 'P' || pos60[0] == 'A'
  1420. || pos60[0] == 'L' || pos60[0] == 'R')
  1421. {
  1422. mal++;
  1423. }
  1424. if(pos60[0] == 'D' || pos60[0] == 'E')
  1425. {
  1426. mmal++;
  1427. emal++;
  1428. omal++;
  1429. }
  1430. if(pos61[0] == 'Y' || pos61[0] == 'F')
  1431. {
  1432. mal++;
  1433. }
  1434. if(pos61[0] == 'V')
  1435. {
  1436. mmal++;
  1437. emal++;
  1438. omal++;
  1439. }
  1440. if(pos62[0] == 'A' || pos62[0] == 'T')
  1441. {
  1442. mal++;
  1443. }
  1444. if(pos62[0] == 'V')
  1445. {
  1446. mmal++;
  1447. emal++;
  1448. omal++;
  1449. }
  1450. if(pos70[0] == 'Q')
  1451. {
  1452. mal++;
  1453. }
  1454. if(pos70[0] == 'M')
  1455. {
  1456. mmal++;
  1457. emal++;
  1458. omal++;
  1459. }
  1460. if(pos72[0] == 'A')
  1461. {
  1462. mal++;
  1463. }
  1464. if(pos72[0] == 'S')
  1465. {
  1466. mmal++;
  1467. }
  1468. if(pos74[0] == 'F')
  1469. {
  1470. mal++;
  1471. }
  1472. if(pos74[0] == 'A')
  1473. {
  1474. mmal++;
  1475. emal++;
  1476. omal++;
  1477. }
  1478. if(pos75[0] == 'G')
  1479. {
  1480. mal++;
  1481. }
  1482. if(pos75[0] == 'A')
  1483. {
  1484. mmal++;
  1485. emal++;
  1486. omal++;
  1487. }
  1488. if(pos77[0] == 'L')
  1489. {
  1490. mal++;
  1491. }
  1492. if(pos77[0] == 'W')
  1493. {
  1494. mmal++;
  1495. emal++;
  1496. omal++;
  1497. }
  1498. if(pos93[0] == 'L' || pos93[0] == 'V' || pos93[0] == 'I')
  1499. {
  1500. mal++;
  1501. }
  1502. if(pos93[0] == 'Q')
  1503. {
  1504. mmal++;
  1505. emal++;
  1506. omal++;
  1507. }
  1508. if(pos189[0] == 'M' || pos189[0] == 'L' || pos189[0] == 'Y')
  1509. {
  1510. omal++;
  1511. }
  1512. else
  1513. {
  1514. mal++;
  1515. mmal++;
  1516. emal++;
  1517. }
  1518. if(pos191[0] == 'W')
  1519. {
  1520. omal++;
  1521. }
  1522. else
  1523. {
  1524. mal++;
  1525. mmal++;
  1526. emal++;
  1527. }
  1528. if(pos198[0] == 'H')
  1529. {
  1530. mal = mal + lastPKSkey;
  1531. }
  1532. if(pos198[0] == 'Y')
  1533. {
  1534. mmal = mmal + lastPKSkey;
  1535. }
  1536. if(pos198[0] == 'T' || pos198[0] == 'C')
  1537. {
  1538. emal = emal + lastPKSkey;
  1539. }
  1540. if(pos199[0] == 'P')
  1541. {
  1542. emal++;
  1543. }
  1544. if(pos200[0] == 'F')
  1545. {
  1546. mal = mal + lastPKSkey;
  1547. }
  1548. if(pos200[0] == 'S')
  1549. {
  1550. mmal = mmal + lastPKSkey;
  1551. }
  1552. if(pos200[0] == 'G')
  1553. {
  1554. emal = emal + lastPKSkey;
  1555. }
  1556. if(pos200[0] == 'T')
  1557. {
  1558. emal++;
  1559. }
  1560. //outputSpecificitiesPKS( mal, mmal, emal, omal);
  1561. scoredSubstrate = maxSpecificityPKS(mal, mmal, emal, omal);
  1562. //cout << "scoredSubstrate: " << scoredSubstrate << endl;
  1563. mal = 0;
  1564. mmal = 0;
  1565. emal = 0;
  1566. omal = 0;
  1567. clearPositions( pos235, pos236, pos239,
  1568. pos278, pos299, pos301, pos322, pos330,
  1569. pos331, pos58, pos59, pos60, pos61, pos62,
  1570. pos70, pos72, pos74, pos75, pos77, pos93,
  1571. pos189, pos191, pos198, pos199, pos200 );
  1572. }
  1573. //cout << "signature sequence: " << sigSeq << " ";
  1574. cout << sigSeq << " ";
  1575. Out.open("sigSeq.txt");
  1576. Out.clear();
  1577. Out << "> " << "SigSeq" << '\n'
  1578. << sigSeq;
  1579. Out.close();
  1580. // align with NRPS/PKS signatures
  1581. if(PKS)
  1582. {
  1583. spawnl(1, "./blastall", "blastall", "-p", "blastp", "-d",
  1584. "PKSsignatures.fasta", "-i", "sigSeq.txt",
  1585. "-e", arg[4], "-F", "F", "-C", "0", "-e", "10.0",
  1586. "-o", "blast3.out", NULL);
  1587. }
  1588. if(NRPS)
  1589. {
  1590. spawnl(1, "./blastall", "blastall", "-p", "blastp", "-d",
  1591. "NRPSsignatures.fasta", "-i", "sigSeq.txt",
  1592. "-e", arg[4], "-F", "F", "-C", "0", "-e", "10.0",
  1593. "-o", "blast3.out", NULL);
  1594. if(FDxAA == 'P' || FDxAA == 'p')
  1595. {
  1596. Aad++;
  1597. }
  1598. if(FDxAA == 'L' || FDxAA == 'l')
  1599. {
  1600. Ala++;
  1601. Asn++;
  1602. Asp++;
  1603. Cys++;
  1604. Dab++;
  1605. Dhb++;
  1606. Gly3++;
  1607. }
  1608. if(FDxAA == 'H' || FDxAA == 'h')
  1609. {
  1610. Cys++;
  1611. }
  1612. if(FDxAA == 'A' || FDxAA == 'a')
  1613. {
  1614. Gln++;
  1615. Glu++;
  1616. Ile++;
  1617. Leu1++;
  1618. Leu2++;
  1619. Leu4++;
  1620. Phe++;
  1621. Tyr2++;
  1622. Tyr3++;
  1623. Val1++;
  1624. Val3++;
  1625. Lys++;
  1626. }
  1627. if(FDxAA == 'G' || FDxAA == 'g')
  1628. {
  1629. Ile++;
  1630. Leu3++;
  1631. Tyr1++;
  1632. }
  1633. if(FDxAA == 'M' || FDxAA == 'm')
  1634. {
  1635. Orn++;
  1636. }
  1637. if(FDxAA == 'V' || FDxAA == 'v')
  1638. {
  1639. Orn++;
  1640. Pro++;
  1641. Ser++;
  1642. }
  1643. if(FDxAA == 'I' || FDxAA == 'i')
  1644. {
  1645. Phg++;
  1646. Gly1++;
  1647. Gly2++;
  1648. }
  1649. if(FDxAA == 'F' || FDxAA == 'f')
  1650. {
  1651. Phe++;
  1652. Thr++;
  1653. Val2++;
  1654. }
  1655. if(FDxAA == 'S' || FDxAA == 's')
  1656. {
  1657. His++;
  1658. }
  1659. if(FD4xAA == 'R' || FD4xAA == 'r')
  1660. {
  1661. Aad++;
  1662. }
  1663. if(FD4xAA == 'L' || FD4xAA == 'l')
  1664. {
  1665. Ala++;
  1666. Tyr2++;
  1667. Gly1++;
  1668. }
  1669. if(FD4xAA == 'T' || FD4xAA == 't')
  1670. {
  1671. Asn++;
  1672. Asp++;
  1673. Tyr1++;
  1674. }
  1675. if(FD4xAA == 'E' || FD4xAA == 'e')
  1676. {
  1677. Cys++;
  1678. Dab++;
  1679. Orn++;
  1680. Val2++;
  1681. His++;
  1682. Lys++;
  1683. }
  1684. if(FD4xAA == 'Y' || FD4xAA == 'y')
  1685. {
  1686. Cys++;
  1687. }
  1688. if(FD4xAA == 'P' || FD4xAA == 'p')
  1689. {
  1690. Dhb++;
  1691. }
  1692. if(FD4xAA == 'Q' || FD4xAA == 'q')
  1693. {
  1694. Gln++;
  1695. Pip++;
  1696. Pro++;
  1697. Gly2++;
  1698. }
  1699. if(FD4xAA == 'W' || FD4xAA == 'w')
  1700. {
  1701. Glu++;
  1702. Leu1++;
  1703. Leu2++;
  1704. Phe++;
  1705. Ser++;
  1706. Thr++;
  1707. Val3++;
  1708. Gln++;
  1709. }
  1710. if(FD4xAA == 'K' || FD4xAA == 'k')
  1711. {
  1712. Glu++;
  1713. }
  1714. if(FD4xAA == 'F' || FD4xAA == 'f')
  1715. {
  1716. Ile++;
  1717. Leu4++;
  1718. Gly3++;
  1719. Val1++;
  1720. }
  1721. if(FD4xAA == 'A' || FD4xAA == 'a')
  1722. {
  1723. Leu3++;
  1724. }
  1725. if(FD4xAA == 'G' || FD4xAA == 'g')
  1726. {
  1727. Orn++;
  1728. }
  1729. if(FD4xAA == 'S' || FD4xAA == 's')
  1730. {
  1731. Tyr3++;
  1732. }
  1733. // score IT3xAA
  1734. if(IT3xAA == 'N' || IT3xAA == 'n')
  1735. {
  1736. Aad++;
  1737. Cys++;
  1738. Orn++;
  1739. Thr++;
  1740. Gly3++;
  1741. }
  1742. if(IT3xAA == 'F' || IT3xAA == 'f')
  1743. {
  1744. Ala++;
  1745. Ile++;
  1746. Leu++;
  1747. }
  1748. if(IT3xAA == 'K' || IT3xAA == 'k')
  1749. {
  1750. Asn++;
  1751. Asp++;
  1752. }
  1753. if(IT3xAA == 'S' || IT3xAA == 's')
  1754. {
  1755. Cys++;
  1756. Val++;
  1757. Lys++;
  1758. }
  1759. if(IT3xAA == 'H' || IT3xAA == 'h')
  1760. {
  1761. Dab++;
  1762. Glu++;
  1763. Ser++;
  1764. }
  1765. if(IT3xAA == 'A' || IT3xAA == 'a')
  1766. {
  1767. Dhb++;
  1768. His++;
  1769. }
  1770. if(IT3xAA == 'D' || IT3xAA == 'd')
  1771. {
  1772. Gln++;
  1773. Glu++;
  1774. }
  1775. if(IT3xAA == 'L' || IT3xAA == 'l')
  1776. {
  1777. Leu++;
  1778. Phg++;
  1779. Pip++;
  1780. Pro++;
  1781. }
  1782. if(IT3xAA == 'Y' || IT3xAA == 'y')
  1783. {
  1784. Leu++;
  1785. }
  1786. if(IT3xAA == 'M' || IT3xAA == 'm')
  1787. {
  1788. Leu++;
  1789. Val++;
  1790. Gly2++;
  1791. }
  1792. if(IT3xAA == 'E' || IT3xAA == 'e')
  1793. {
  1794. Orn++;
  1795. }
  1796. if(IT3xAA == 'T' || IT3xAA == 't')
  1797. {
  1798. Phe++;
  1799. Tyr++;
  1800. }
  1801. if(IT3xAA == 'I' || IT3xAA == 'i')
  1802. {
  1803. Tyr++;
  1804. }
  1805. if(IT3xAA == 'V' || IT3xAA == 'v')
  1806. {
  1807. Tyr++;
  1808. }
  1809. if(IT3xAA == 'W' || IT3xAA == 'w')
  1810. {
  1811. Val++;
  1812. }
  1813. if(IT3xAA == 'Q' || IT3xAA == 'q')
  1814. {
  1815. Gly1++;
  1816. Gln++;
  1817. }
  1818. if(x1xG299 == 'I' || x1xG299 == 'i')
  1819. {
  1820. Aad++;
  1821. Orn++;
  1822. Phe++;
  1823. Pro++;
  1824. Thr++;
  1825. Val1++;
  1826. }
  1827. if(x1xG299 == 'G' || x1xG299 == 'g')
  1828. {
  1829. Ala++;
  1830. }
  1831. if(x1xG299 == 'L' || x1xG299 == 'l')
  1832. {
  1833. Asn++;
  1834. Cys++;
  1835. Glu++;
  1836. Ile++;
  1837. Leu1++;
  1838. Leu4++;
  1839. Orn++;
  1840. Phg++;
  1841. Pip++;
  1842. Ser++;
  1843. Gly1++;
  1844. }
  1845. if(x1xG299 == 'V' || x1xG299 == 'v')
  1846. {
  1847. Asp++;
  1848. Phe++; // added from Challis
  1849. Tyr3++;
  1850. }
  1851. if(x1xG299 == 'D' || x1xG299 == 'd')
  1852. {
  1853. Cys++;
  1854. }
  1855. if(x1xG299 == 'N' || x1xG299 == 'n')
  1856. {
  1857. Dab++;
  1858. Gly3++;
  1859. }
  1860. if(x1xG299 == 'Q' || x1xG299 == 'q')
  1861. {
  1862. Dhb++;
  1863. }
  1864. if(x1xG299 == 'F' || x1xG299 == 'f')
  1865. {
  1866. Glu++;
  1867. Val3++;
  1868. Gly2++;
  1869. Gln++;
  1870. }
  1871. if(x1xG299 == 'Y' || x1xG299 == 'y')
  1872. {
  1873. Ile++;
  1874. Leu2++;
  1875. }
  1876. if(x1xG299 == 'T' || x1xG299 == 't')
  1877. {
  1878. Leu3++;
  1879. Tyr1++;
  1880. Tyr2++;
  1881. Val2++;
  1882. His++;
  1883. }
  1884. // detect 301 signature
  1885. if(x1xG301 == 'V' || x1xG301 == 'v')
  1886. {
  1887. Aad++;
  1888. Cys++;
  1889. Gly2++;
  1890. }
  1891. if(x1xG301 == 'I' || x1xG301 == 'i')
  1892. {
  1893. Ala++;
  1894. Cys++;
  1895. Lys++;
  1896. }
  1897. if(x1xG301 == 'G' || x1xG301 == 'g')
  1898. {
  1899. Asn++;
  1900. Asp++;
  1901. Dhb++;
  1902. Gln++;
  1903. Glu++;
  1904. Ile++;
  1905. Leu1++;
  1906. Leu2++;
  1907. Leu3++;
  1908. Leu4++;
  1909. Orn++;
  1910. Phg++;
  1911. Phe++; // extra
  1912. Pip++;
  1913. Thr++;
  1914. Tyr2++;
  1915. Val1++;
  1916. Lys++;
  1917. Gly1++;
  1918. }
  1919. if(x1xG301 == 'S' || x1xG301 == 's')
  1920. {
  1921. Cys++;
  1922. Ser++;
  1923. }
  1924. if(x1xG301 == 'T' || x1xG301 == 't')
  1925. {
  1926. Dab++;
  1927. }
  1928. if(x1xG301 == 'A' || x1xG301 == 'a')
  1929. {
  1930. Phe++;
  1931. Pro++;
  1932. Tyr1++;
  1933. Tyr3++;
  1934. Val2++;
  1935. Val3++;
  1936. His++;
  1937. Gly3++;
  1938. }
  1939. // analyze 322 residue
  1940. if(x7xxY322 == 'E' || x7xxY322 == 'e')
  1941. {
  1942. Aad++;
  1943. Asn++;
  1944. Leu3++;
  1945. Tyr1++;
  1946. His++;
  1947. }
  1948. if(x7xxY322 == 'A' || x7xxY322 == 'a')
  1949. {
  1950. Ala++;
  1951. Leu2++;
  1952. Phe++;
  1953. Tyr2++;
  1954. Tyr3++;
  1955. Val2++;
  1956. Val3++;
  1957. Gly2++;
  1958. }
  1959. if(x7xxY322 == 'H' || x7xxY322 == 'h')
  1960. {
  1961. Asp++;
  1962. Pro++;
  1963. }
  1964. if(x7xxY322 == 'G' || x7xxY322 == 'g')
  1965. {
  1966. Cys++;
  1967. Glu++;
  1968. Val1++;
  1969. Phe++; // extra
  1970. }
  1971. if(x7xxY322 == 'L' || x7xxY322 == 'l')
  1972. {
  1973. Cys++;
  1974. Orn++;
  1975. Phg++;
  1976. Ser++;
  1977. Gly1++;
  1978. Gly3++;
  1979. Gln++;
  1980. }
  1981. if(x7xxY322 == 'T' || x7xxY322 == 't')
  1982. {
  1983. Dab++;
  1984. }
  1985. if(x7xxY322 == '…

Large files files are truncated, but you can click here to view the full file