/NRPeditor/clusterFinderBackup.cpp
C++ | 3112 lines | 2732 code | 187 blank | 193 comment | 914 complexity | 213bc788573265f9dac075f3eb22d682 MD5 | raw file
- #include <iostream>
- #include <string>
- #include <fstream>
- #include <stdlib.h>
- #include <sstream>
- using namespace std;
- #include "constants.h"
- #include "UnitClass2.h"
- #include "QueueClass.h"
- #include "ReactionClass.h"
- #include "spawnl.h"
- // function declarations
- // overloading operator for outputing nodes
- ostream& operator <<(ostream& outputStream, const UnitClass2& domain );
- ostream& operator <<(ostream& outputStream, const UnitClass2& domain )
- {
- if( domain.getStartSite() == 0 )
- {
- outputStream << domain.getUnit() << " "
- << domain.getSmile() << endl;
- }
- else
- {
- outputStream << " Formula: "
- << domain.getFormula()
- << " start site: "
- << domain.getStartSite() << endl
- << " " << domain.getSmile() << endl;
- }
- return outputStream;
- }
- void clearPositions( string &pos235, string &pos236, string &pos239,
- string &pos278, string &pos299, string &pos301, string &pos322,
- string &pos330, string &pos331, string &pos58, string &pos59,
- string &pos60, string &pos61, string &pos62, string &pos70,
- string &pos72, string &pos74, string &pos75, string &pos77,
- string &pos93, string &pos189, string &pos191, string &pos198,
- string &pos199, string &pos200 );
- void outputSpecificitiesNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
- int Dab, int Dhb, int Gln, int Glu, int Ile,
- int Leu, int Orn, int Phe, int Phg, int Pip,
- int Pro, int Ser, int Thr, int Tyr, int His,
- int Lys, int Gly, int Val);
- void outputSpecificitiesPKS( int mal, int mmal, int emal, int omal);
- string maxSpecificityNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
- int Dab, int Dhb, int Gln, int Glu, int Ile,
- int Leu, int Orn, int Phe, int Phg, int Pip,
- int Pro, int Ser, int Thr, int Tyr, int His,
- int Lys, int Gly, int Val);
- string maxSpecificityPKS( int mal, int mmal, int emal, int omal);
- int getIntVal(string strConvert);
- float getFloatVal(string strConvert);
- void translateSeq( string &inSeq, string &outputSeq, int &frame);
- void predictDomains( string aminoAcidSeq,
- string &predictedSubstrates,
- QueueClass< UnitClass2 > &aaNameList,
- const bool &allFramesRead,
- const int &frame, const string &nucSeq,
- string &allClusters, int&NRPScount, int &PKScount,
- int &mixedNRPSPKScount, int &transATcount, int &mepCount,
- int &mvaCount);
- // function bodies
- void outputSpecificitiesPKS( int mal, int mmal, int emal, int omal)
- {
- cout << "PKS specificity scores: " << endl;
- cout << "mal: " << mal << " ";
- cout << "mmal: " << mmal << " ";
- cout << "emal: " << emal << " ";
- cout << "omal: " << omal << " ";
- cout << endl;
- }
- void outputSpecificitiesNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
- int Dab, int Dhb, int Gln, int Glu, int Ile,
- int Leu, int Orn, int Phe, int Phg, int Pip,
- int Pro, int Ser, int Thr, int Tyr, int His,
- int Lys, int Gly, int Val)
- {
- cout << "NRPS specificity scores: " << endl;
- cout << "Aad: " << Aad << " ";
- cout << "Ala: " << Ala << " ";
- cout << "Asn: " << Asn << " ";
- cout << "Asp: " << Asp << " ";
- cout << "Cys: " << Cys << " ";
- cout << "Dab: " << Dab << " ";
- cout << "Dhb: " << Dhb << " ";
- cout << "Gln: " << Gln << " ";
- cout << "Glu: " << Glu << " ";
- cout << "Gly: " << Gly << " ";
- cout << "His: " << His << endl;
- cout << "Ile: " << Ile << " ";
- cout << "Leu: " << Leu << " ";
- cout << "Lys: " << Lys << " ";
- cout << "Orn: " << Orn << " ";
- cout << "Phe: " << Phe << " ";
- cout << "Phg: " << Phg << " ";
- cout << "Pip: " << Pip << " ";
- cout << "Pro: " << Pro << " ";
- cout << "Ser: " << Ser << " ";
- cout << "Thr: " << Thr << " ";
- cout << "Tyr: " << Tyr << endl;
- cout << "Val: " << Val << " ";
- cout << endl;
- }
- string maxSpecificityPKS( int mal, int mmal, int emal, int omal)
- {
- int max;
- int distinguishingFactor = 5;
- string CA;
- max = mal;
- CA = "mal";
- if(mmal > mal)
- {
- max = mmal;
- CA = "mmal";
- }
- if(emal > max)
- {
- max = emal;
- CA = "emal";
- }
- if(omal > max)
- {
- max = omal;
- CA = "omal";
- }
- if(max < distinguishingFactor)
- {
- CA = "pk";
- }
- return CA;
- }
- string maxSpecificityNRPS( int Aad, int Ala, int Asn, int Asp, int Cys,
- int Dab, int Dhb, int Gln, int Glu, int Ile,
- int Leu, int Orn, int Phe, int Phg, int Pip,
- int Pro, int Ser, int Thr, int Tyr, int His,
- int Lys, int Gly, int Val)
- {
- int max;
- int distinguishingFactor = 3;
- string AA;
- max = Aad;
- AA = "Aad";
- if(Ala >= Aad)
- {
- max = Ala;
- AA = "ala";
- }
- if(Asn >= max)
- {
- max = Asn;
- AA = "asn";
- }
- if(Asp >= max)
- {
- max = Asp;
- AA = "asp";
- }
- if(Cys >= max)
- {
- max = Cys;
- AA = "cys";
- }
- if(Dab >= max)
- {
- max = Dab;
- AA = "dab";
- }
- if(Dhb >= max)
- {
- max = Dhb;
- AA = "dhb";
- }
- if(Gln >= max)
- {
- max = Gln;
- AA = "gln";
- }
- if(Glu >= max)
- {
- max = Glu;
- AA = "glu";
- }
- if(Ile >= max)
- {
- max = Ile;
- AA = "ile";
- }
- if(Leu >= max)
- {
- max = Leu;
- AA = "leu";
- }
- if(Orn >= max)
- {
- max = Orn;
- AA = "orn";
- }
- if(Phe >= max)
- {
- max = Phe;
- AA = "phe";
- }
- if(Phg >= max)
- {
- max = Phg;
- AA = "phg";
- }
- if(Pip >= max)
- {
- max = Pip;
- AA = "pip";
- }
- if(Pro >= max)
- {
- max = Pro;
- AA = "pro";
- }
- if(Ser >= max)
- {
- max = Ser;
- AA = "ser";
- }
- if(Thr >= max)
- {
- max = Thr;
- AA = "thr";
- }
- if(Tyr >= max)
- {
- max = Tyr;
- AA = "tyr";
- }
- if(His >= max)
- {
- max = His;
- AA = "his";
- }
- if(Lys >= max)
- {
- max = Lys;
- AA = "lys";
- }
- if(Gly >= max)
- {
- max = Gly;
- AA = "gly";
- }
- if(Val >= max)
- {
- max = Val;
- AA = "val";
- }
- if( max < distinguishingFactor)
- {
- AA = "nrp";
- }
- return AA;
- }
- int getIntVal(string strConvert)
- {
- int intReturn;
- // NOTE: You should probably do some checks to ensure that
- // this string contains only numbers. If the string is not
- // a valid integer, zero will be returned.
- intReturn = atoi(strConvert.c_str());
- return(intReturn);
- }
- float getFloatVal(string strConvert)
- {
- float floatReturn;
- // NOTE: You should probably do some checks to ensure that
- // this string contains only numbers. If the string is not
- // a valid integer, zero will be returned.
- floatReturn = atof(strConvert.c_str());
- return(floatReturn);
- }
- void clearPositions( string &pos235, string &pos236, string &pos239,
- string &pos278, string &pos299, string &pos301, string &pos322,
- string &pos330, string &pos331, string &pos58, string &pos59,
- string &pos60, string &pos61, string &pos62, string &pos70,
- string &pos72, string &pos74, string &pos75, string &pos77,
- string &pos93, string &pos189, string &pos191, string &pos198,
- string &pos199, string &pos200 )
- {
- pos235 = "";
- pos236 = "";
- pos239 = "";
- pos278 = "";
- pos299 = "";
- pos301 = "";
- pos322 = "";
- pos330 = "";
- pos331 = "";
- pos58 = "";
- pos59 = "";
- pos60 = "";
- pos61 = "";
- pos62 = "";
- pos70 = "";
- pos72 = "";
- pos74 = "";
- pos75 = "";
- pos77 = "";
- pos93 = "";
- pos189 = "";
- pos191 = "";
- pos198 = "";
- pos199 = "";
- pos200 = "";
- }
- // finds and returns the reverse complement sequence
- string revComp( const string &inSeq, const string &nucleotideSeq);
- string revComp( const string &inSeq, const string &nucleotideSeq)
- {
- string inputSeq;
- string outputSeq;
- string segment;
- string outputFilename;
- //string outFile = "rev-" + inSeq;
- int seqMin = 50; // minimum length of nucleotide line
- ifstream In;
- ofstream Out;
- int lineCharsCount = 0;
- int lineCharsMax = 70;
- In.open(inSeq.c_str());
- //outputFilename = "genomes/rev-"
- outputFilename = "genomes/rev-"
- + nucleotideSeq;
- In >> segment;
-
- // bypass initial FASTA details
- while(segment.length() < seqMin && !In.fail())
- {
- In >> segment;
- }
-
- // stores nucleotide
- inputSeq.append(segment);
- while(!In.fail())
- {
- In >> segment;
- inputSeq.append(segment);
- }
- In.close();
- for(int i = inputSeq.length(); i > 0; i--)
- {
- if(inputSeq[i] == 'A')
- {
- outputSeq.append("T");
- }
- if(inputSeq[i] == 'T')
- {
- outputSeq.append("A");
- }
- if(inputSeq[i] == 'C')
- {
- outputSeq.append("G");
- }
- if(inputSeq[i] == 'G')
- {
- outputSeq.append("C");
- }
-
- lineCharsCount++;
- if(lineCharsCount == lineCharsMax)
- {
- outputSeq.append("\n");
- lineCharsCount = 0;
- }
- }
- Out.open(outputFilename.c_str());
- Out << ">revComp-" << nucleotideSeq << "\n";
- Out << outputSeq;
- Out.clear();
- Out.close();
- return outputSeq;
- }
- // translates DNA to amino acid through common code
- void translateSeq( string &inSeq, string &outputSeq, int &frame)
- {
- int stopPrev = 0;
- int stopPres = 0;
- int stopLimit = 50;
- int start = 0;
- int choice;
- int stopCount = 0;
- int seqMin = 50;
- ifstream In;
- ofstream Out;
- string segment;
- outputSeq.clear();
- In.open(inSeq.c_str());
- string inputSeq;
- int charCount = 0;
- int charCountMax = 70;
-
- In >> segment;
- //cout << "segment: " << segment << endl;
- while(segment.length() < seqMin && !In.fail())
- {
- In >> segment;
- }
- inputSeq.append(segment);
- while(!In.fail())
- {
- In >> segment;
- inputSeq.append(segment);
- }
- //cout << "FASTA of DNA stored" << endl;
- In.close();
- //cout << "Frame: " << frame << endl;
- for(int i = frame; i < inputSeq.length()-3; i=i+3)
- {
- charCount++;
- if(charCount == charCountMax)
- {
- outputSeq.append("\n");
- charCount = 0;
- }
- if(inputSeq[i] == 't' ||
- inputSeq[i] == 'T')
- {
- if(inputSeq[i+1] == 't' ||
- inputSeq[i+1] == 'T')
- {
- if(inputSeq[i+2] == 't' ||
- inputSeq[i+2] == 'T' ||
- inputSeq[i+2] == 'c' ||
- inputSeq[i+2] == 'C')
- {
- outputSeq.append("F");
- }
- else
- {
- outputSeq.append("L");
- }
- }
- else if(inputSeq[i+1] == 'c' ||
- inputSeq[i+1] == 'C')
- {
- outputSeq.append("S");
- }
- else if(inputSeq[i+1] == 'a' ||
- inputSeq[i+1] == 'A')
- {
- if(inputSeq[i+2] == 't' ||
- inputSeq[i+2] == 'T' ||
- inputSeq[i+2] == 'c' ||
- inputSeq[i+2] == 'C')
- {
- outputSeq.append("Y");
- }
- else
- {
- outputSeq.append("-");
- stopCount++; // stop codon
- }
- }
- else if(inputSeq[i+1] == 'g' ||
- inputSeq[i+1] == 'G')
- {
- if(inputSeq[i+2] == 't' ||
- inputSeq[i+2] == 'T' ||
- inputSeq[i+2] == 'c' ||
- inputSeq[i+2] == 'C')
- {
- outputSeq.append("C");
- }
- else if(inputSeq[i+2] == 'g' ||
- inputSeq[i+2] == 'G')
- {
- outputSeq.append("W");
- }
- else
- {
- outputSeq.append("-");
- stopCount++; // stop codon
- }
- }
- } // sequence begins with 't' ends
- else if(inputSeq[i] == 'c' ||
- inputSeq[i] == 'C')
- {
- if(inputSeq[i+1] == 't' ||
- inputSeq[i+1] == 'T')
- {
- outputSeq.append("L");
- }
- else if(inputSeq[i+1] == 'c' ||
- inputSeq[i+1] == 'C')
- {
- outputSeq.append("P");
- }
- else if(inputSeq[i+1] == 'a' ||
- inputSeq[i+1] == 'A')
- {
- if(inputSeq[i+2] == 't' ||
- inputSeq[i+2] == 'T' ||
- inputSeq[i+2] == 'c' ||
- inputSeq[i+2] == 'C')
- {
- outputSeq.append("H");
- }
- else
- {
- outputSeq.append("Q");
- }
- }
- else if(inputSeq[i+1] == 'g' ||
- inputSeq[i+1] == 'G')
- {
- outputSeq.append("R");
- }
- } // sequence begins with 'c' ends
- else if(inputSeq[i] == 'a' ||
- inputSeq[i] == 'A')
- {
- if(inputSeq[i+1] == 't' ||
- inputSeq[i+1] == 'T')
- {
- if(inputSeq[i+2] != 'g' &&
- inputSeq[i+2] != 'G')
- {
- outputSeq.append("I");
- }
- else
- {
- outputSeq.append("M");
- }
- }
- else if(inputSeq[i+1] == 'c' ||
- inputSeq[i+1] == 'C')
- {
- outputSeq.append("T");
- }
- else if(inputSeq[i+1] == 'a' ||
- inputSeq[i+1] == 'A')
- {
- if(inputSeq[i+2] == 't' ||
- inputSeq[i+2] == 'T' ||
- inputSeq[i+2] == 'c' ||
- inputSeq[i+2] == 'C')
- {
- outputSeq.append("N");
- }
- else
- {
- outputSeq.append("K");
- }
- }
- else if(inputSeq[i+1] == 'g' ||
- inputSeq[i+1] == 'G')
- {
- if(inputSeq[i+2] == 't' ||
- inputSeq[i+2] == 'T' ||
- inputSeq[i+2] == 'c' ||
- inputSeq[i+2] == 'C')
- {
- outputSeq.append("S");
- }
- else
- {
- outputSeq.append("R");
- }
- }
- } // sequence begins with 'a' ends
- else if(inputSeq[i] == 'g' ||
- inputSeq[i] == 'G')
- {
- if(inputSeq[i+1] == 't' ||
- inputSeq[i+1] == 'T')
- {
- outputSeq.append("V");
- }
- else if(inputSeq[i+1] == 'c' ||
- inputSeq[i+1] == 'C')
- {
- outputSeq.append("A");
- }
- else if(inputSeq[i+1] == 'a' ||
- inputSeq[i+1] == 'A')
- {
- if(inputSeq[i+2] == 't' ||
- inputSeq[i+2] == 'T' ||
- inputSeq[i+2] == 'c' ||
- inputSeq[i+2] == 'C')
- {
- outputSeq.append("D");
- }
- else
- {
- outputSeq.append("E");
- }
- }
- else if(inputSeq[i+1] == 'g' ||
- inputSeq[i+1] == 'G')
- {
- outputSeq.append("G");
- }
- } // codon begins with 'g' ends
- else
- {
- outputSeq.append("-");
- stopCount++;
- }
- } // for loop ends
- Out.open("aminoAcidSeq.txt");
- Out.clear();
- Out << "Protein from one DNA frame" << "\n";
- Out << outputSeq;
- Out.close();
-
- } // function ends
- void predictDomains(string aminoAcidSeq,
- string &predictedSubstrates,
- QueueClass< UnitClass2 > &aaNameList,
- const bool &allFramesRead,
- const int &frame, const string &nucSeq,
- string &allClusters, int &NRPScount, int &PKScount,
- int &mixedNRPSPKScount, int &transATcount, int &mepCount,
- int &mvaCount )
- {
- QueueClass <UnitClass2> sortedListOfDomains;
- QueueClass <UnitClass2> sortedListOfMods;
- QueueClass <UnitClass2> sortedListOfATKS;
- UnitClass2 *domain;
- ifstream In;
- ifstream InMods;
- ifstream tempIn;
- ifstream aaSeqIn;
- ofstream Out;
- ofstream tempOut;
- string segment;
- float score;
- string scoreString;
- float modsScore;
- float modsScoreMax = 0.1;
- float scoreMax = 0.1; // minimum for domain synthetase matching
- float idenScore;
- float idenScoreMin = 120; // minimum for mods alignment
- float length;
- string SMILE;
- string formula;
- string mod;
- string substrate;
- float siteToAnalyze;
- int siteCount;
- bool siteFound;
- int whileCount = 0;
- int queryOrSbjctLength = 6;
- string aaSeq;
- char *arg[21];
- arg[0] = "./blastall"; // program to execute
- arg[1] = "blastp"; // protein blast
- arg[2] = "synthetases.fasta"; // control A-domain
- arg[3] = "mods.fasta"; // output file
- arg[4] = "100.0";
- arg[6] = "aaSeq.txt";
- arg[7] = "blast2.out";
- int AdomainStart = 5;
- int AdomainEnd = 5;
- // A-domains to analyze
- bool substrateKnown;
- string query;
- string sbjct;
- string predictedSubstrate;
- string proposedSubstrate;
- bool foundSubstrate;
- bool stringsCompare;
- int hits = 0;
- int misses = 0;
- int maxChars = 10000;
- // NRPS key positions
- int p235 = 11;
- int p236 = 12;
- int p239 = 15;
- int p278 = 10;
- int p299 = 1;
- int p301 = 3;
- int p322 = 1;
- int p330 = 9;
- int p331 = 10;
- string pos235;
- string pos236;
- string pos239;
- string pos278;
- string pos299;
- string pos301;
- string pos322;
- string pos330;
- string pos331;
- int Aad = 0;
- int Ala = 0;
- int Asn = 0;
- int Asp = 0;
- int Cys = 0;
- int Dab = 0;
- int Dhb = 0;
- int Gln = 0;
- int Glu = 0;
- int Ile = 0;
- int Leu = 0;
- int Orn = 0;
- int Phe = 0;
- int Phg = 0;
- int Pip = 0;
- int Pro = 0;
- int Ser = 0;
- int Thr = 0;
- int Tyr = 0;
- int Val = 0;
- int His = 0;
- int Lys = 0;
- int Gly = 0;
- int Gly1 = 0;
- int Gly2 = 0;
- int Gly3 = 0;
- int Leu1 = 0;
- int Leu2 = 0;
- int Leu3 = 0;
- int Leu4 = 0;
- int Tyr1 = 0;
- int Tyr2 = 0;
- int Tyr3 = 0;
- int Val1 = 0;
- int Val2 = 0;
- int Val3 = 0;
- int mal = 0;
- int mmal = 0;
- int emal = 0;
- int omal = 0;
- char FDxAA;
- char FD4xAA;
- char IT3xAA;
- char x1xG299;
- char x1xG301;
- char x7xxY322;
- char x7xxY330;
- char x7xxY331;
- // PKS key positions
- int p58 = 0;
- int p59 = 1;
- int p60 = 2;
- int p61 = 3;
- int p62 = 4;
- int p70 = 0;
- int p72 = 2;
- int p74 = 4;
- int p75 = 5;
- int p77 = 7;
- int p93 = 3;
- int p189 = 1;
- int p191 = 3;
- int p198 = 0;
- int p199 = 1;
- int p200 = 2;
- string pos58;
- string pos59;
- string pos60;
- string pos61;
- string pos62;
- string pos70;
- string pos72;
- string pos74;
- string pos75;
- string pos77;
- string pos93;
- string pos189;
- string pos191;
- string pos198;
- string pos199;
- string pos200;
- string sigSeq;
- float position;
- bool NRPS = 0;
- bool PKS = 0;
- string residue;
- string fname;
- bool success;
- string smileName;
- string inString;
- UnitClass2 *aaName;
- float currentDomainPos;
- float nextDomainPos;
- float thirdDomainPos;
- int PKSmodRange = 7000;
- bool KR = 0;
- bool DH = 0;
- bool ER = 0;
- string endSiteString;
- string prevSegment;
- int endSite;
- int startSite;
- int extraRange = 1000;
- int lastPKSkey = 3;
- string scoredSubstrate;
- string proteinID;
- string proteinFrame;
- string aminoAcids;
- int nucStartSite;
- int nucEndSite;
- cout << "Analyzing data..." << endl;
- //get every NRPS/PKS domain until no further domains are detected;
-
- // does blast alignment of cluster
- spawnl( 1, "./blastall", "./blastall",
- "-g", "T", "-p", arg[1], "-d", arg[2], "-i", aminoAcidSeq.c_str(),
- "-F", "F", "-C", "0", "-o", "blast.out", NULL);
- //cout << "called blast" << endl;
- spawnl( 1, "./blastall", "./blastall",
- "-g", "T", "-p", arg[1], "-d", arg[3], "-i", aminoAcidSeq.c_str(),
- "-F", "F", "-C", "0", "-o", "modsBlast.out", NULL);
- //cout << "called modblast" << endl;
- //cout << "frame: " << frame << endl;
- if(frame == 0)
- {
- system("cp blast.out frame0.out");
- //cout << "copied to frame0.out" << endl;
- system("cp modsBlast.out modsFrame0.out");
- //cout << "copied to modsFrame0.out" << endl;
- system("cp aminoAcidSeq.txt aminoAcidSeq0.txt");
- //cout << "aminoAcidSeq0 copied" << endl;
- }
- if(frame == 1)
- {
- system("cp blast.out frame1.out");
- //cout << "copied to frame1.out" << endl;
- system("cp modsBlast.out modsFrame1.out");
- //cout << "copied to modsFrame1.out" << endl;
- system("cp aminoAcidSeq.txt aminoAcidSeq1.txt");
- //cout << "aminoAcidSeq1 copied" << endl;
- }
- if(frame == 2)
- {
- system("cp blast.out frame2.out");
- //cout << "copied to frame2.out" << endl;
- system("cp modsBlast.out modsFrame2.out");
- //cout << "copied to modsFrame2.out" << endl;
- system("cat frame0.out frame1.out frame2.out > blast.out");
- //cout << "concatenated all domain frames" << endl;
- system("cat modsFrame0.out modsFrame1.out modsFrame2.out > modsBlast.out");
- //cout << "concatenated all mod frames" << endl;
- system("rm frame0.out frame1.out frame2.out modsFrame1.out modsFrame2.out");
- //cout << "last aminoAcidSeq" << endl;
- }
- //cout << "after frames copies" << endl;
- if(allFramesRead)
- {
- //cout << "ALL FRAMES READ" << endl;
- score = 0;
- // open NRPS/PKS alignment file
- In.open("blast.out");
- In >> segment;
- while( !In.fail())
- {
- //cout << "read blast.out" << endl;
- if(segment[0] == '*')
- {
- In >> segment;
- substrate = segment;
- }
- if(segment[0] == 'E' && segment[1] == 'x'
- && segment[2] == 'p')
- {
- In >> segment; // "="
- In >> segment; // expectation
- for(int i = 0; i < segment.length(); i++)
- {
- if(segment[i] == 'e')
- {
- i = segment.length();
- }
- else
- {
- scoreString.append(1, segment[i]);
- }
- }
-
- score = getFloatVal(scoreString);
- //cout << "scoreString: " << scoreString << endl;
- scoreString.clear();
- for(int i = 0; i < segment.length(); i++)
- {
- if(segment[i] == 'e')
- {
- for(int j = i+1; j < segment.length(); j++)
- {
- scoreString.append(1, segment[j]);
- }
- }
- }
- score = score * pow(10.0,(static_cast<double>(getFloatVal(scoreString))));
- //cout << "segment: " << segment << endl;
- //cout << "score: " << score << endl;
- //cout << "scoreString float: " << getFloatVal(scoreString) << endl;
- scoreString.clear();
- }
-
-
- if(segment[0] == 'I' && segment[1] == 'd'
- && segment[2] == 'e')
- {
- In >> segment; // "="
- In >> segment; // identities
-
- // collect denominator of identities value
- for(int i = 0; i < segment.length(); i++)
- {
- if(segment[i] == '/')
- {
- for(int j = i+1; j < segment.length(); j++)
- {
- scoreString.append(1, segment[j]);
- i = j;
- }
- }
- }
- idenScore = getIntVal(scoreString);
- scoreString.clear();
- //cout << "score: " << " idenScore: " << idenScore << endl;
- if(score < scoreMax && idenScore > idenScoreMin)
- {
- //cout << "score: " << score << " idenScore: " << idenScore << endl;
- while(segment[0] != 'Q' && segment[1] != 'u')
- {
- In >> segment;
- }
- In >> segment; // start site of domain on protein
- startSite = getIntVal(segment);
- //cout << "start site: " << startSite << endl;
-
- In >> segment;
- // get protein frame and find where adenylation domain begins
- proteinID = segment;
- for(int i = 0; i < proteinID.length(); i++)
- {
- while(proteinID[i] == '-')
- {
- proteinID.erase(i, 1);
- }
- }
- //cout << "PROTEINID: " << proteinID << endl;
-
- proteinFrame.clear();
-
- aaSeqIn.open("aminoAcidSeq.txt");
-
- aaSeqIn >> aminoAcids;
- while(!aaSeqIn.fail())
- {
- if(aminoAcids.length() > 1)
- {
- proteinFrame.append(aminoAcids);
- }
- aaSeqIn >> aminoAcids;
- }
- aaSeqIn.clear();
- aaSeqIn.close();
- //cout << "proteinFrame.length(): " << proteinFrame.length() << endl;
- nucStartSite = proteinFrame.find(proteinID, startSite - 100);
- if(nucStartSite < 0)
- {
- proteinFrame.clear();
- aaSeqIn.open("aminoAcidSeq0.txt");
-
- aaSeqIn >> aminoAcids;
- while(!aaSeqIn.fail())
- {
- if(aminoAcids.length() > 1)
- {
- proteinFrame.append(aminoAcids);
- }
- aaSeqIn >> aminoAcids;
- }
- aaSeqIn.clear();
- aaSeqIn.close();
- nucStartSite = proteinFrame.find(proteinID, startSite - 100);
- //cout << "NOT IN LAST AA FRAME" << endl;
- }
- if(nucStartSite < 0)
- {
- proteinFrame.clear();
- aaSeqIn.open("aminoAcidSeq1.txt");
-
- aaSeqIn >> aminoAcids;
- while(!aaSeqIn.fail())
- {
- if(aminoAcids.length() > 1)
- {
- proteinFrame.append(aminoAcids);
- }
- aaSeqIn >> aminoAcids;
- }
- aaSeqIn.clear();
- aaSeqIn.close();
- nucStartSite = proteinFrame.find(proteinID, startSite - 100);
- // cout << "NOT IN FIRST AA FRAME" << endl;
- }
-
- if(nucStartSite < 0)
- {
- //cout << "NOT IN SECOND AA FRAME" << endl;
- }
- nucStartSite = nucStartSite * 3; // translate amino acid position
- // into nucleotide position
- //cout << "NUCSTARTSITE: " << nucStartSite << endl;
- while((segment[0] != 'S' || segment[4] != 'e') && segment[0] != '>'
- && !In.fail())
- {
- prevSegment = segment;
- In >> segment;
- }
-
- endSiteString = prevSegment;
- endSite = getIntVal(prevSegment) + startSite + extraRange;
- domain = new UnitClass2;
- domain->setStartSite(startSite);
- domain->setFormula(substrate);
- domain->setEndSite(endSite);
- domain->setNucStartSite(nucStartSite);
- sortedListOfDomains.insertValue(*domain);
- delete domain;
- }
- score = 1;
- idenScore = 0;
- }
- In >> segment;
- }
- In.close();
- if(sortedListOfDomains.getNumElems() > 0)
- {
- //sortedListOfDomains.printForward();
- }
- else
- {
- cout << "No NRPS-A or PKS-AT domains detected" << endl;
- }
- // detect modifications in cluster and their locations
- InMods.open("modsBlast.out");
- InMods >> segment;
- //cout << "InMods: " << InMods << " segment: " << segment << endl;
- while( !InMods.fail())
- {
- //cout << "in InMods loop" << endl;
- if(segment[0] == '*')
- {
- InMods >> segment;
- mod = segment;
- }
- if(segment[0] == 'E' && segment[1] == 'x'
- && segment[2] == 'p')
- {
- InMods >> segment; // "="
- InMods >> segment; // expectation
- //cout << "segment(expectation): " << segment << endl;
- for(int i = 0; i < segment.length(); i++)
- {
- if(segment[i] == 'e')
- {
- i = segment.length();
- }
- else
- {
- scoreString.append(1, segment[i]);
- }
- }
- modsScore = getFloatVal(scoreString);
- scoreString.clear();
- for(int i = 0; i < segment.length(); i++)
- {
- if(segment[i] == 'e')
- {
- for(int j = i+1; j < segment.length(); j++)
- {
- scoreString.append(1, segment[j]);
- i = j;
- }
- }
- }
- }
- modsScore = modsScore * pow(10.0,(static_cast<double>(getFloatVal(scoreString))));
- scoreString.clear();
- if(segment[0] == 'I' && segment[1] == 'd'
- && segment[2] == 'e')
- {
- InMods >> segment; // "="
- InMods >> segment; // identities
-
- // collect denominator of identities value
- for(int i = 0; i < segment.length(); i++)
- {
- if(segment[i] == '/')
- {
- for(int j = i+1; j < segment.length(); j++)
- {
- scoreString.append(1, segment[j]);
- i = j;
- }
- }
- }
- idenScore = getIntVal(scoreString);
- scoreString.clear();
- //cout << "/identities: " << score << endl;
- if(modsScore < modsScoreMax and (idenScore > idenScoreMin
- || mod[0] == 'T' && mod[1] == 'E'))
- {
- //cout << "modsScore: " << modsScore << " idenScore: " << idenScore << endl;
- while( segment[0] != 'Q')
- {
- InMods >> segment;
- }
- InMods >> segment; // start site of domain on protein
- startSite = getIntVal(segment);
- InMods >> segment; // start site of domain on protein
- // get protein frame and find where adenylation domains begins
- proteinID = segment;
-
- for(int i = 0; i < proteinID.length(); i++)
- {
- while(proteinID[i] == '-')
- {
- proteinID.erase(i, 1);
- }
- }
- // cout << "PROTEINID: " << proteinID << endl;
- proteinFrame.clear();
- aaSeqIn.open("aminoAcidSeq.txt");
- aaSeqIn >> aminoAcids;
- while(!aaSeqIn.fail())
- {
- if(aminoAcids.length() > 1)
- {
- proteinFrame.append(aminoAcids);
- }
- aaSeqIn >> aminoAcids;
- }
- aaSeqIn.clear();
- aaSeqIn.close();
- //cout << "proteinFrame.length(): " << proteinFrame.length() << endl;
- nucStartSite = proteinFrame.find(proteinID, startSite);
- if(nucStartSite < 0)
- {
- proteinFrame.clear();
- aaSeqIn.open("aminoAcidSeq0.txt");
- aaSeqIn >> aminoAcids;
- while(!aaSeqIn.fail())
- {
- if(aminoAcids.length() > 1)
- {
- proteinFrame.append(aminoAcids);
- }
- aaSeqIn >> aminoAcids;
- }
- aaSeqIn.clear();
- aaSeqIn.close();
- nucStartSite = proteinFrame.find(proteinID, startSite);
- // cout << "NOT IN LAST AA FRAME" << endl;
- }
- if(nucStartSite < 0)
- {
- proteinFrame.clear();
- aaSeqIn.open("aminoAcidSeq1.txt");
- aaSeqIn >> aminoAcids;
- while(!aaSeqIn.fail())
- {
- if(aminoAcids.length() > 1)
- {
- proteinFrame.append(aminoAcids);
- }
- aaSeqIn >> aminoAcids;
- }
- aaSeqIn.clear();
- aaSeqIn.close();
- nucStartSite = proteinFrame.find(proteinID, startSite);
- // cout << "NOT IN FIRST AA FRAME" << endl;
- }
-
- if(nucStartSite < 0)
- {
- // cout << "NOT IN SECOND AA FRAME" << endl;
- }
- nucStartSite = nucStartSite * 3; // translate amino acid position
- // into nucleotide position
- domain = new UnitClass2;
- domain->setStartSite(startSite);
- domain->setNucStartSite(nucStartSite);
- domain->setFormula(mod);
- sortedListOfMods.insertValue(*domain);
- delete domain;
- }
- score = 1;
- idenScore = 0;
- }
- InMods >> segment;
- }
- InMods.clear();
- InMods.close();
- if(sortedListOfMods.getNumElems() > 0)
- {
- //sortedListOfMods.printForward();
- }
- else
- {
- cout << "No auxiliary domains detected" << endl;
- }
- siteCount = 0;
- // go through each domain and collect and align each
- while(siteCount < sortedListOfDomains.getNumElems() && whileCount < 1000000)
- {
- In.close();
- In.open("blast.out");
- In.clear();
- In >> segment;
- siteFound = 0;
- while( !In.fail() && siteFound == 0 && whileCount < 1000000)
- {
- //cout << "inner while" << endl;
- siteToAnalyze = sortedListOfDomains.getElemAtIndex(siteCount).getStartSite();
- //cout << "siteToAnalyze: " << siteToAnalyze << endl;
- if(segment[0] == 'Q' && segment[1] == 'u'
- && segment[queryOrSbjctLength - 1] == ':')
- {
- In >> segment;
-
- if(getIntVal(segment) == siteToAnalyze)
- {
- // analyze this segment only and add to aaNameList
- // store any modifications between this domain and the next
- // cout << "segment " << segment << " found " << endl;
- //cout << segment << " ";
- cout << sortedListOfDomains.getElemAtIndex(siteCount).getNucStartSite() << " ";
- cout << sortedListOfDomains.getElemAtIndex(siteCount).getNucStartSite() + 10000
- << " ";
- position = getIntVal(segment);
- siteFound = 1;
- siteCount++;
- In >> segment;
- aaSeq.append(segment);
- In >> segment;
- while((segment[0] != 'S' || segment[1] != 'c') && !In.fail())
- {
-
- if(segment[0] == 'Q' && segment[1] == 'u'
- && segment[queryOrSbjctLength - 1] == ':')
- {
- In >> segment;
- In >> segment;
-
- for(int i = 0; i < segment.length(); i++)
- {
- if((static_cast< unsigned int >(segment[i]) < 65
- || static_cast< unsigned int >(segment[i]) > 90)
- && (static_cast< unsigned int >(segment[i]) < 97
- || static_cast< unsigned int >(segment[i]) > 122))
- {
- segment.erase(i,1);
- i--;
- }
- }
- aaSeq.append(segment);
- }
- In >> segment;
- }
-
- // recognize amino acid sequence here
- //cout << "aaSeq: " << aaSeq << endl;
- Out.open("aaSeq.txt");
- Out.clear();
- Out << "> " << "aaSeq" << '\n' << aaSeq;
- Out.close();
- aaSeq.clear();
- // does blast alignment of individual domain to recognize
- // conserved motifs and substrate specificity
- spawnl( 1, "./blastall", "blastall",
- "-g", "T", "-p", arg[1], "-d", arg[2], "-i", "aaSeq.txt",
- "-F", "F", "-C", "0", "-o", "blast2.out", NULL);
- substrateKnown = 0;
- In.close();
- In.open("blast2.out");
- In.clear();
- segment.clear();
- In >> segment;
- while(!In.fail())
- {
- //cout << !In.fail() << " ";
- //cout << segment << endl;
- if(segment == "NRPS" && PKS == 0 && NRPS == 0)
- {
- NRPS = 1;
- //cout << "NRPS detected" << endl;
- }
- if(segment == "PKS" && NRPS == 0 && PKS == 0)
- {
- PKS = 1;
- //cout << "PKS detected" << endl;
- }
- if(segment.length() == queryOrSbjctLength)
- {
- if(segment[0] == 'Q' && segment[1] == 'u'
- && segment[queryOrSbjctLength - 1] == ':')
- {
- In >> segment;
- In >> segment;
- query.append(segment);
- //cout << "Query: " << segment << endl;
- }
- if(segment[0] == 'S' && segment[1] == 'b')
- {
- In >> segment;
- In >> segment;
- sbjct.append(segment);
- //cout << "Sbjct: " << segment << endl;
- }
- }
- In >> segment;
- }
- In.clear();
- In.close();
- segment.clear();
- // search for first FDX motif containing residues 235,236,239
-
- if(NRPS)
- {
- if(sbjct.find("SRVLQFASFSFD") > 0
- && sbjct.find("SRVLQFASFSFD") < maxChars )
- {
- pos235 = query[sbjct.find("SRVLQFASFSFD")+p235];
- pos236 = query[sbjct.find("SRVLQFASFSFD")+p236];
- pos239 = query[sbjct.find("SRVLQFASFSFD")+p239];
- sigSeq.append(pos235);
- sigSeq.append(pos236);
- sigSeq.append(pos239);
- }
- // search for IT motif containing the residue 278
- if(sbjct.find("RLKNDGITHVTLPP") > 0
- && sbjct.find("RLKNDGITHVTLPP") < maxChars )
- {
- pos278 = query[sbjct.find("RLKNDGITHVTLPP")+p278];
- sigSeq.append(pos278);
- }
- // search for GE motif containing the residues 299,301
- if(sbjct.find("IVVQGEACP") > 0
- && sbjct.find("IVVAGEACP") < maxChars )
- {
- pos299 = query[sbjct.find("IVVAGEACP")+p299];
- pos301 = query[sbjct.find("IVVAGEACP")+p301];
- sigSeq.append(pos299);
- sigSeq.append(pos301);
- }
- // search for YGPTE motif containing the residues 322,330,331
- if(sbjct.find("NAYGPTE") > 0
- && sbjct.find("NAYGPTE") < maxChars )
- {
- pos322 = query[sbjct.find("NAYGPTE")+p322];
- pos330 = query[sbjct.find("NAYGPTE")+p330];
- pos331 = query[sbjct.find("NAYGPTE")+p331];
- sigSeq.append(pos322);
- sigSeq.append(pos330);
- sigSeq.append(pos331);
- }
- FDxAA = pos236[0];
- FD4xAA = pos239[0];
- IT3xAA = pos278[0];
- x1xG299 = pos299[0];
- x1xG301 = pos301[0];
- x7xxY322 = pos322[0];
- x7xxY330 = pos330[0];
- x7xxY331 = pos331[0];
- }
- if(PKS)
- {
- if(sbjct.find("DVPLV") > 0
- && sbjct.find("DVPLV") < maxChars )
- {
- pos58 = query[sbjct.find("DVPLV")+p58];
- pos59 = query[sbjct.find("DVPLV")+p59];
- pos60 = query[sbjct.find("DVPLV")+p60];
- pos61 = query[sbjct.find("DVPLV")+p61];
- pos62 = query[sbjct.find("DVPLV")+p62];
- sigSeq.append(pos58);
- sigSeq.append(pos59);
- sigSeq.append(pos60);
- sigSeq.append(pos61);
- sigSeq.append(pos62);
- }
- if(sbjct.find("QVAL") > 0
- && sbjct.find("QVAL") < maxChars )
- {
- pos70 = query[sbjct.find("QVAL")+p70];
- pos72 = query[sbjct.find("QVAL")+p72];
- pos74 = query[sbjct.find("QVAL")+p74];
- pos75 = query[sbjct.find("QVAL")+p75];
- pos77 = query[sbjct.find("QVAL")+p77];
- sigSeq.append(pos70);
- sigSeq.append(pos72);
- sigSeq.append(pos74);
- sigSeq.append(pos75);
- sigSeq.append(pos77);
- }
- if(sbjct.find("GQSMGE") > 0
- && sbjct.find("GQSMGE") < maxChars )
- {
- pos93 = query[sbjct.find("GQSMGE")+p93];
- sigSeq.append(pos93);
- }
- if(sbjct.find("ITVRRI") > 0
- && sbjct.find("ITVRRI") < maxChars )
- {
- pos189 = query[sbjct.find("ITVRRI")+p189];
- sigSeq.append(pos189);
- pos191 = query[sbjct.find("ITVRRI")+p191];
- sigSeq.append(pos191);
- }
- if(sbjct.find("YASH") > 0
- && sbjct.find("YASH") < maxChars )
- {
- pos198 = query[sbjct.find("YASH")+p198];
- pos199 = query[sbjct.find("YASH")+p199];
- pos200 = query[sbjct.find("YASH")+p200];
- sigSeq.append(pos198);
- sigSeq.append(pos199);
- sigSeq.append(pos200);
- }
-
- if(pos58[0] == 'E' || pos58[0] == 'D' || pos58[0] == 'Q'
- || pos58[0] == 'S' || pos58[0] == 'K')
- {
- mal++;
- }
- if(pos58[0] == 'R' || pos58[0] == 'G')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos59[0] == 'T' || pos59[0] == 'S')
- {
- mal++;
- }
- if(pos59[0] == 'V' || pos59[0] == 'A')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos60[0] == 'G' || pos60[0] == 'P' || pos60[0] == 'A'
- || pos60[0] == 'L' || pos60[0] == 'R')
- {
- mal++;
- }
- if(pos60[0] == 'D' || pos60[0] == 'E')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos61[0] == 'Y' || pos61[0] == 'F')
- {
- mal++;
- }
- if(pos61[0] == 'V')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos62[0] == 'A' || pos62[0] == 'T')
- {
- mal++;
- }
- if(pos62[0] == 'V')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos70[0] == 'Q')
- {
- mal++;
- }
- if(pos70[0] == 'M')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos72[0] == 'A')
- {
- mal++;
- }
- if(pos72[0] == 'S')
- {
- mmal++;
- }
- if(pos74[0] == 'F')
- {
- mal++;
- }
- if(pos74[0] == 'A')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos75[0] == 'G')
- {
- mal++;
- }
- if(pos75[0] == 'A')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos77[0] == 'L')
- {
- mal++;
- }
- if(pos77[0] == 'W')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos93[0] == 'L' || pos93[0] == 'V' || pos93[0] == 'I')
- {
- mal++;
- }
- if(pos93[0] == 'Q')
- {
- mmal++;
- emal++;
- omal++;
- }
- if(pos189[0] == 'M' || pos189[0] == 'L' || pos189[0] == 'Y')
- {
- omal++;
- }
- else
- {
- mal++;
- mmal++;
- emal++;
- }
- if(pos191[0] == 'W')
- {
- omal++;
- }
- else
- {
- mal++;
- mmal++;
- emal++;
- }
- if(pos198[0] == 'H')
- {
- mal = mal + lastPKSkey;
- }
- if(pos198[0] == 'Y')
- {
- mmal = mmal + lastPKSkey;
- }
- if(pos198[0] == 'T' || pos198[0] == 'C')
- {
- emal = emal + lastPKSkey;
- }
- if(pos199[0] == 'P')
- {
- emal++;
- }
- if(pos200[0] == 'F')
- {
- mal = mal + lastPKSkey;
- }
- if(pos200[0] == 'S')
- {
- mmal = mmal + lastPKSkey;
- }
- if(pos200[0] == 'G')
- {
- emal = emal + lastPKSkey;
- }
- if(pos200[0] == 'T')
- {
- emal++;
- }
-
- //outputSpecificitiesPKS( mal, mmal, emal, omal);
- scoredSubstrate = maxSpecificityPKS(mal, mmal, emal, omal);
- //cout << "scoredSubstrate: " << scoredSubstrate << endl;
- mal = 0;
- mmal = 0;
- emal = 0;
- omal = 0;
- clearPositions( pos235, pos236, pos239,
- pos278, pos299, pos301, pos322, pos330,
- pos331, pos58, pos59, pos60, pos61, pos62,
- pos70, pos72, pos74, pos75, pos77, pos93,
- pos189, pos191, pos198, pos199, pos200 );
- }
- //cout << "signature sequence: " << sigSeq << " ";
- cout << sigSeq << " ";
- Out.open("sigSeq.txt");
- Out.clear();
- Out << "> " << "SigSeq" << '\n'
- << sigSeq;
- Out.close();
- // align with NRPS/PKS signatures
- if(PKS)
- {
- spawnl(1, "./blastall", "blastall", "-p", "blastp", "-d",
- "PKSsignatures.fasta", "-i", "sigSeq.txt",
- "-e", arg[4], "-F", "F", "-C", "0", "-e", "10.0",
- "-o", "blast3.out", NULL);
- }
- if(NRPS)
- {
- spawnl(1, "./blastall", "blastall", "-p", "blastp", "-d",
- "NRPSsignatures.fasta", "-i", "sigSeq.txt",
- "-e", arg[4], "-F", "F", "-C", "0", "-e", "10.0",
- "-o", "blast3.out", NULL);
- if(FDxAA == 'P' || FDxAA == 'p')
- {
- Aad++;
- }
- if(FDxAA == 'L' || FDxAA == 'l')
- {
- Ala++;
- Asn++;
- Asp++;
- Cys++;
- Dab++;
- Dhb++;
- Gly3++;
- }
- if(FDxAA == 'H' || FDxAA == 'h')
- {
- Cys++;
- }
- if(FDxAA == 'A' || FDxAA == 'a')
- {
- Gln++;
- Glu++;
- Ile++;
- Leu1++;
- Leu2++;
- Leu4++;
- Phe++;
- Tyr2++;
- Tyr3++;
- Val1++;
- Val3++;
- Lys++;
- }
- if(FDxAA == 'G' || FDxAA == 'g')
- {
- Ile++;
- Leu3++;
- Tyr1++;
- }
- if(FDxAA == 'M' || FDxAA == 'm')
- {
- Orn++;
- }
- if(FDxAA == 'V' || FDxAA == 'v')
- {
- Orn++;
- Pro++;
- Ser++;
- }
- if(FDxAA == 'I' || FDxAA == 'i')
- {
- Phg++;
- Gly1++;
- Gly2++;
- }
- if(FDxAA == 'F' || FDxAA == 'f')
- {
- Phe++;
- Thr++;
- Val2++;
- }
- if(FDxAA == 'S' || FDxAA == 's')
- {
- His++;
- }
-
- if(FD4xAA == 'R' || FD4xAA == 'r')
- {
- Aad++;
- }
- if(FD4xAA == 'L' || FD4xAA == 'l')
- {
- Ala++;
- Tyr2++;
- Gly1++;
- }
- if(FD4xAA == 'T' || FD4xAA == 't')
- {
- Asn++;
- Asp++;
- Tyr1++;
- }
- if(FD4xAA == 'E' || FD4xAA == 'e')
- {
- Cys++;
- Dab++;
- Orn++;
- Val2++;
- His++;
- Lys++;
- }
- if(FD4xAA == 'Y' || FD4xAA == 'y')
- {
- Cys++;
- }
- if(FD4xAA == 'P' || FD4xAA == 'p')
- {
- Dhb++;
- }
- if(FD4xAA == 'Q' || FD4xAA == 'q')
- {
- Gln++;
- Pip++;
- Pro++;
- Gly2++;
- }
- if(FD4xAA == 'W' || FD4xAA == 'w')
- {
- Glu++;
- Leu1++;
- Leu2++;
- Phe++;
- Ser++;
- Thr++;
- Val3++;
- Gln++;
- }
- if(FD4xAA == 'K' || FD4xAA == 'k')
- {
- Glu++;
- }
- if(FD4xAA == 'F' || FD4xAA == 'f')
- {
- Ile++;
- Leu4++;
- Gly3++;
- Val1++;
- }
- if(FD4xAA == 'A' || FD4xAA == 'a')
- {
- Leu3++;
- }
- if(FD4xAA == 'G' || FD4xAA == 'g')
- {
- Orn++;
- }
- if(FD4xAA == 'S' || FD4xAA == 's')
- {
- Tyr3++;
- }
- // score IT3xAA
- if(IT3xAA == 'N' || IT3xAA == 'n')
- {
- Aad++;
- Cys++;
- Orn++;
- Thr++;
- Gly3++;
- }
- if(IT3xAA == 'F' || IT3xAA == 'f')
- {
- Ala++;
- Ile++;
- Leu++;
- }
- if(IT3xAA == 'K' || IT3xAA == 'k')
- {
- Asn++;
- Asp++;
- }
- if(IT3xAA == 'S' || IT3xAA == 's')
- {
- Cys++;
- Val++;
- Lys++;
- }
- if(IT3xAA == 'H' || IT3xAA == 'h')
- {
- Dab++;
- Glu++;
- Ser++;
- }
- if(IT3xAA == 'A' || IT3xAA == 'a')
- {
- Dhb++;
- His++;
- }
- if(IT3xAA == 'D' || IT3xAA == 'd')
- {
- Gln++;
- Glu++;
- }
- if(IT3xAA == 'L' || IT3xAA == 'l')
- {
- Leu++;
- Phg++;
- Pip++;
- Pro++;
- }
- if(IT3xAA == 'Y' || IT3xAA == 'y')
- {
- Leu++;
- }
- if(IT3xAA == 'M' || IT3xAA == 'm')
- {
- Leu++;
- Val++;
- Gly2++;
- }
- if(IT3xAA == 'E' || IT3xAA == 'e')
- {
- Orn++;
- }
- if(IT3xAA == 'T' || IT3xAA == 't')
- {
- Phe++;
- Tyr++;
- }
- if(IT3xAA == 'I' || IT3xAA == 'i')
- {
- Tyr++;
- }
- if(IT3xAA == 'V' || IT3xAA == 'v')
- {
- Tyr++;
- }
- if(IT3xAA == 'W' || IT3xAA == 'w')
- {
- Val++;
- }
- if(IT3xAA == 'Q' || IT3xAA == 'q')
- {
- Gly1++;
- Gln++;
- }
- if(x1xG299 == 'I' || x1xG299 == 'i')
- {
- Aad++;
- Orn++;
- Phe++;
- Pro++;
- Thr++;
- Val1++;
- }
- if(x1xG299 == 'G' || x1xG299 == 'g')
- {
- Ala++;
- }
- if(x1xG299 == 'L' || x1xG299 == 'l')
- {
- Asn++;
- Cys++;
- Glu++;
- Ile++;
- Leu1++;
- Leu4++;
- Orn++;
- Phg++;
- Pip++;
- Ser++;
- Gly1++;
- }
- if(x1xG299 == 'V' || x1xG299 == 'v')
- {
- Asp++;
- Phe++; // added from Challis
- Tyr3++;
- }
- if(x1xG299 == 'D' || x1xG299 == 'd')
- {
- Cys++;
- }
- if(x1xG299 == 'N' || x1xG299 == 'n')
- {
- Dab++;
- Gly3++;
- }
- if(x1xG299 == 'Q' || x1xG299 == 'q')
- {
- Dhb++;
- }
- if(x1xG299 == 'F' || x1xG299 == 'f')
- {
- Glu++;
- Val3++;
- Gly2++;
- Gln++;
- }
- if(x1xG299 == 'Y' || x1xG299 == 'y')
- {
- Ile++;
- Leu2++;
- }
- if(x1xG299 == 'T' || x1xG299 == 't')
- {
- Leu3++;
- Tyr1++;
- Tyr2++;
- Val2++;
- His++;
- }
- // detect 301 signature
- if(x1xG301 == 'V' || x1xG301 == 'v')
- {
- Aad++;
- Cys++;
- Gly2++;
- }
- if(x1xG301 == 'I' || x1xG301 == 'i')
- {
- Ala++;
- Cys++;
- Lys++;
- }
- if(x1xG301 == 'G' || x1xG301 == 'g')
- {
- Asn++;
- Asp++;
- Dhb++;
- Gln++;
- Glu++;
- Ile++;
- Leu1++;
- Leu2++;
- Leu3++;
- Leu4++;
- Orn++;
- Phg++;
- Phe++; // extra
- Pip++;
- Thr++;
- Tyr2++;
- Val1++;
- Lys++;
- Gly1++;
- }
- if(x1xG301 == 'S' || x1xG301 == 's')
- {
- Cys++;
- Ser++;
- }
- if(x1xG301 == 'T' || x1xG301 == 't')
- {
- Dab++;
- }
- if(x1xG301 == 'A' || x1xG301 == 'a')
- {
- Phe++;
- Pro++;
- Tyr1++;
- Tyr3++;
- Val2++;
- Val3++;
- His++;
- Gly3++;
- }
-
- // analyze 322 residue
- if(x7xxY322 == 'E' || x7xxY322 == 'e')
- {
- Aad++;
- Asn++;
- Leu3++;
- Tyr1++;
- His++;
- }
- if(x7xxY322 == 'A' || x7xxY322 == 'a')
- {
- Ala++;
- Leu2++;
- Phe++;
- Tyr2++;
- Tyr3++;
- Val2++;
- Val3++;
- Gly2++;
- }
- if(x7xxY322 == 'H' || x7xxY322 == 'h')
- {
- Asp++;
- Pro++;
- }
- if(x7xxY322 == 'G' || x7xxY322 == 'g')
- {
- Cys++;
- Glu++;
- Val1++;
- Phe++; // extra
- }
- if(x7xxY322 == 'L' || x7xxY322 == 'l')
- {
- Cys++;
- Orn++;
- Phg++;
- Ser++;
- Gly1++;
- Gly3++;
- Gln++;
- }
- if(x7xxY322 == 'T' || x7xxY322 == 't')
- {
- Dab++;
- }
- if(x7xxY322 == 'V' || x7xxY322 == 'v')
- {
- Dhb++;
- Gln++;
- Glu++;
- Ile++;
- Pip++;
- }
- if(x7xxY322 == 'I' || x7xxY322 == 'i')
- {
- Ile++;
- Gln++;
- }
- if(x7xxY322 == 'N' || x7xxY322 == 'n')
- {
- Leu1++;
- }
- if(x7xxY322 == 'M' || x7xxY322 == 'm')
- {
- Leu4++;
- Thr++;
- Gly2++;
- }
- if(x7xxY322 == 'S' || x7xxY322 == 's')
- {
- Orn++;
- Lys++;
- }
- // analyze 330 residue
- if(x7xxY330 == 'F' || x7xxY330 == 'f')
- {
- Aad++;
- }
- if(x7xxY330 == 'V' || x7xxY330 == 'v')
- {
- Ala++;
- Asn++;
- Dab++;
- Dhb++;
- Gln++;
- Glu++;
- Ile++;
- Leu1++;
- Leu2++;
- Leu3++;
- Leu4++;
- Phe++;
- Pro++;
- Thr++;
- Tyr1++;
- Tyr2++;
- Tyr3++;
- Val2++;
- Val3++;
- Lys++;
- }
- if(x7xxY330 == 'I' || x7xxY330 == 'i')
- {
- Asp++;
- Cys++;
- Orn++;
- Ser++;
- Gly1++;
- }
- if(x7xxY330 == 'T' || x7xxY330 == 't')
- {
- Ile++;
- Val1++;
- Gly3++;
- }
- if(x7xxY330 == 'L' || x7xxY330 == 'l')
- {
- Phg++;
- }
- if(x7xxY330 == 'A' || x7xxY330 == 'a')
- {
- Pip++;
- }
- if(x7xxY330 == 'M' || x7xxY330 == 'm')
- {
- Leu2++;
- Gly2++;
- }
- // analyze 331 residue
- if(x7xxY331 == 'V' || x7xxY331 == 'v')
- {
- Aad++;
- Leu1++;
- Leu3++;
- Pip++;
- Pro++;
- Tyr2++;
- His++;
- }
- if(x7xxY331 == 'L' || x7xxY331 == 'l')
- {
- Ala++;
- Val3++;
- }
- if(x7xxY331 == 'G' || x7xxY331 == 'g')
- {
- Asn++;
- Asp++;
- }
- if(x7xxY331 == 'T' || x7xxY331 == 't')
- {
- Cys++;
- }
- if(x7xxY331 == 'W' || x7xxY331 == 'w')
- {
- Cys++;
- }
- if(x7xxY331 == 'S' || x7xxY331 == 's')
- {
- Dab++;
- }
- if(x7xxY331 == 'N' || x7xxY331 == 'n')
- {
- Dhb++;
- }
- if(x7xxY331 == 'D' || x7xxY331 == 'd')
- {
- Gln++;
- Glu++;
- Orn++;
- Ser++;
- }
- if(x7xxY331 == 'Y' || x7xxY331 == 'y')
- {
- Ile++;
- Val2++;
- }
- if(x7xxY331 == 'F' || x7xxY331 == 'f')
- {
- Ile++;
- Leu4++;
- Val1++;
- }
- if(x7xxY331 == 'M' || x7xxY331 == 'm')
- {
- Leu1++;
- }
- if(x7xxY331 == 'C' || x7xxY331 == 'c')
- {
- Phe++;
- Phg++;
- Tyr3++;
- }
- if(x7xxY331 == 'H' || x7xxY331 == 'h')
- {
- Thr++;
- }
- if(x7xxY331 == 'A' || x7xxY331 == 'a')
- {
- Tyr1++;
- }
-
- // determine most specific Gly
- Gly = Gly1;
- if(Gly2 > Gly)
- {
- Gly = Gly2;
- }
- if(Gly3 > Gly)
- {
- Gly = Gly3;
- }
-
- // determine most specific Tyr
- Tyr = Tyr1;
- if(Tyr2 > Tyr)
- {
- Tyr = Tyr2;
- }
- if(Tyr3 > Tyr)
- {
- Tyr = Tyr3;
- }
- // determine most specific Leu
- Leu = Leu1;
- if(Leu2 > Leu)
- {
- Leu = Leu2;
- }
- if(Leu3 > Leu)
- {
- Leu = Leu3;
- }
- // determine most specific Val
- Val = Val1;
- if(Val2 > Val)
- {
- Val = Val2;
- }
- if(Val3 > Val)
- {
- Val = Val3;
- }
-
- /* outputSpecificitiesNRPS( Aad, Ala, Asn, Asp, Cys, Dab, Dhb, Gln, Glu,
- Ile, Leu, Orn, Phe, Phg, Pip, Pro, Ser, Thr,
- Tyr, His, Lys, Gly, Val);*/
- scoredSubstrate = maxSpecificityNRPS(Aad, Ala, Asn, Asp, Cys, Dab,
- Dhb, Gln, Glu, Ile, Leu, Orn,
- Phe, Phg, Pip, Pro, Ser, Thr,
- Tyr, His, Lys, Gly, Val);
- //cout << endl;
- //cout << "scoredSubstrate: " << scoredSubstrate << endl;
- clearPositions( pos235, pos236, pos239,
- pos278, pos299, pos301, pos322, pos330,
- pos331, pos58, pos59, pos60, pos61, pos62,
- pos70, pos72, pos74, pos75, pos77, pos93,
- pos189, pos191, pos198, pos199, pos200 );
-
- Aad = 0;
- Ala = 0;
- Asn = 0;
- Asp = 0;
- Cys = 0;
- Dab = 0;
- Dhb = 0;
- Gln = 0;
- Glu = 0;
- Ile = 0;
- Leu = 0;
- Orn = 0;
- Phe = 0;
- Phg = 0;
- Pip = 0;
- Pro = 0;
- Ser = 0;
- Thr = 0;
- Tyr = 0;
- Val = 0;
- His = 0;
- Lys = 0;
- Gly = 0;
- Gly1 = 0;
- Gly2 = 0;
- Gly3 = 0;
- Leu1 = 0;
- Leu2 = 0;
- Leu3 = 0;
- Leu4 = 0;
- Tyr1 = 0;
- Tyr2 = 0;
- Tyr3 = 0;
- Val1 = 0;
- Val2 = 0;
- Val3 = 0;
- }
- foundSubstrate = 0;
- In.open("blast3.out");
- In >> segment;
- while( !In.fail())
- {
- if(segment == "*" && foundSubstrate == 0)
- {
- In >> segment;
- predictedSubstrate = segment;
- //cout << "predictedSubstrate: " << predictedSubstrate << endl;
- foundSubstrate = 1;
- }
- In >> segment;
- }
- if(predictedSubstrate.length() == 0 && NRPS)
- {
- predictedSubstrate = scoredSubstrate; // unknown NRPS
- }
- if(predictedSubstrate.length() == 0 && PKS)
- {
- predictedSubstrate = scoredSubstrate; // unknown PKS
- }
-
- //reset
- sbjct.clear();
- query.clear();
- sigSeq.clear();
- In.close();
- //cout << "predicted substrate: " << predictedSubstrate << endl;
- cout << predictedSubstrate << endl;
- // string to keep track of substrates
- predictedSubstrates.append(predictedSubstrate);
- predictedSubstrates.append(" ");
- predictedSubstrate.clear();
- NRPS = 0;
- PKS = 0;
- mod.clear();
- }
- }
- In >> segment;
- whileCount++;
- }
- whileCount++;
- }
-
-
- // } previous allframesread end?
- //cout << "clusterwork" << endl;
- int prevStart;
- int clusterStart;
- int clusterEnd;
- int separationMax = 15000;
- int aStore = 0;
- int minLineLength = 60;
- int lineLength = 1;
- int startBuffer = 10000;
- int endBuffer = 10000;
- int domainWidth = 5000;
- string clusterOut;
- string aaGenome;
- string str;
- string str1;
- string str2;
- string strFile;
- //string inputFilename = "genomes/"
- string inputFilename = "genomes/"
- + nucSeq;
- int extraSpaceCount = 1;
- stringstream outStr1;
- stringstream outStr2;
- ifstream InGenome;
- ofstream OutCluster;
- ofstream clustersSummary;
- //InGenome.open(nucSeq.c_str());
- InGenome.open(inputFilename.c_str());
- InGenome.clear();
- InGenome >> segment;
- int minUnits = 3; // minimum domain of activating domains in a molecule
- aaGenome = "";
- aaGenome.clear();
- int tempStart;
- int tempRange;
- float AATcount = 0;
- float KScount = 0;
- float minTransATratio = 1.5; // ratio of KS/AT
- bool NRPSfound = 0;
- bool PKSfound = 0;
- // skip first lines
- while(segment.length() < minLineLength && !In.fail())
- {
- InGenome >> segment;
- }
- while(!InGenome.fail())
- {
- aaGenome.append(segment);
- aaGenome.append("\n");
- InGenome >> segment;
- if(segment.length() > minLineLength && lineLength == 1)
- {
- lineLength = segment.length();
- }
- }
- if(lineLength < minLineLength)
- {
- lineLength = 70; // length of FASTA line
- }
- //cout << "genome size: " << aaGenome.length() << endl;
- // searches and outputs clusters within a genome into files
- for(int i = 0; i < sortedListOfDomains.getNumElems(); i++)
- {
- if( i == 0) // the first clusterStart
- {
- clusterStart = sortedListOfDomains.getElemAtIndex(0).getNucStartSite();
- prevStart = clusterStart;
- //cout << "0. i: " << i << endl;
- }
- for(int a = i+1; a < sortedListOfDomains.getNumElems(); a++)
- {
- aStore = a;
- // if next domain is within a range of the previous domain, designate
- // it to be within the cluster
- if(sortedListOfDomains.getElemAtIndex(a).getNucStartSite() -
- prevStart < separationMax)
- {
- prevStart = sortedListOfDomains.getElemAtIndex(a).getNucStartSite();
- }
- // if next start site is significantly apart, designate it ias the
- // end of the cluster
- if( sortedListOfDomains.getElemAtIndex(a).getNucStartSite() -
- prevStart > separationMax )
- {
- clusterStart = sortedListOfDomains.getElemAtIndex(i).getNucStartSite();
- clusterEnd = sortedListOfDomains.getElemAtIndex(a-1).getNucStartSite() + domainWidth;
- cout << ".clusterStart: " << clusterStart
- << " .clusterEnd: " << clusterEnd << " " << a - i << endl;
- //cout << "1. a: " << a << " i: " << i << endl;
- if(a - i >= minUnits)
- {
-
- tempStart = clusterStart + clusterStart/lineLength - startBuffer;
- tempRange = clusterEnd - clusterStart + (clusterEnd - clusterStart)/lineLength + endBuffer;
- if(tempStart < 0)
- {
- tempStart = 1;
- }
- if(tempRange > (aaGenome.length() - tempStart))
- {
- tempRange = aaGenome.length() - tempStart - 1;
- }
- clusterOut = aaGenome.substr(tempStart, tempRange);
-
- for(int b = i; b < a; b++)
- {
-
- // cout << "!!"
- // << sortedListOfDomains.getElemAtIndex(b).getFormula();
- if(sortedListOfDomains.getElemAtIndex(b).getFormula()=="NRPS")
- {
- NRPSfound = 1;
- }
- if(sortedListOfDomains.getElemAtIndex(b).getFormula() == "PKS")
- {
- PKSfound = 1;
- }
- }
- if(NRPSfound && PKSfound)
- {
- mixedNRPSPKScount++;
- }
- else if(NRPSfound)
- {
- NRPScount++;
- }
- else if (PKSfound)
- {
- PKScount++;
- }
- NRPSfound = 0;
- PKSfound = 0;
- outStr1.clear();
- outStr2.clear();
- outStr1.str("");
- outStr2.str("");
- str1.clear();
- str2.clear();
- str.clear();
- outStr1 << clusterStart;
- str1 = outStr1.str();
- outStr2 << clusterEnd;
- str2 = outStr2.str();
- strFile = "clusters/"
- + nucSeq + "-" + str1 + "-" + str2 + ".txt";
- str = ">" + str1 + "-" + str2;
- OutCluster.open(strFile.c_str());
- OutCluster << str << "\n";
- OutCluster << clusterOut;
- allClusters.append(strFile);
- allClusters.append("\n");
- OutCluster.close();
- OutCluster.clear();
- }
-
- //as long as domain analyzed is within the total number of domains
- if(a < sortedListOfDomains.getNumElems())
- {
- clusterStart = sortedListOfDomains.getElemAtIndex(a).getNucStartSite();
- prevStart = clusterStart;
- }
- //cout << endl;
- //cout << "i: " << i << " a: " << a << endl;
- i = a - 1;
- aStore = a;
- a = sortedListOfDomains.getNumElems();
-
- // if next domain is last domain, iterate i since loop cannot iterate i
- if(aStore == sortedListOfDomains.getNumElems()-1)
- {
- i++;
- }
- }
- if(aStore == sortedListOfDomains.getNumElems()-1) // if domain is last domain in list
- {
- clusterEnd = sortedListOfDomains.getElemAtIndex(aStore).getNucStartSite() + domainWidth;
- cout << "clusterStart: " << clusterStart
- << " clusterEnd: " << clusterEnd << " " << aStore + 1 - i << endl;
- //cout << "2. a: " << aStore + 1 << " i: " << i << endl;
- if(a - i >= minUnits)
- {
- for(int b = i; b < aStore; b++)
- {
- //cout << "!!"
- // << sortedListOfDomains.getElemAtIndex(b).getFormula();
- if(sortedListOfDomains.getElemAtIndex(b).getFormula() == "NRPS")
- {
- NRPSfound = 1;
- }
- if(sortedListOfDomains.getElemAtIndex(b).getFormula() == "PKS")
- {
- PKSfound = 1;
- }
- }
- if(NRPSfound && PKSfound)
- {
- mixedNRPSPKScount++;
- }
- else if(NRPSfound)
- {
- NRPScount++;
- }
- else if(PKSfound)
- {
- PKScount++;
- }
- NRPSfound = 0;
- PKSfound = 0;
- tempStart = clusterStart + clusterStart/lineLength - startBuffer;
- tempRange = clusterEnd - clusterStart + (clusterEnd - clusterStart)/lineLength + endBuffer;
- if(tempStart < 0)
- {
- tempStart = 1;
- }
- if(tempRange > (aaGenome.length() - tempStart))
- {
- tempRange = aaGenome.length() - tempStart -1;
- }
- clusterOut = aaGenome.substr(tempStart, tempRange);
-
- outStr1.clear();
- outStr2.clear();
- outStr1.str("");
- outStr2.str("");
- str1.clear();
- str2.clear();
- str.clear();
- outStr1 << clusterStart;
- str1 = outStr1.str();
- outStr2 << clusterEnd;
- str2 = outStr2.str();
- strFile = "clusters/" + nucSeq + "-" + str1 + "-" + str2 + ".txt";
- str = ">" + str1 + "-" + str2;
- OutCluster.open(strFile.c_str());
- OutCluster << str << "\n";
- OutCluster << clusterOut;
- allClusters.append(strFile);
- allClusters.append("\n ");
- OutCluster.close();
- OutCluster.clear();
- }
- //cout << "2. i: " << i << " a: " << a << endl;
- i = a;
- aStore = a;
- a = sortedListOfDomains.getNumElems();
- }
- }
- } // cluster search complete
- InGenome.close();
-
- for(int i = 0; i < sortedListOfMods.getNumElems(); i++)
- {
- if(sortedListOfMods.getElemAtIndex(i).getFormula() == "IspH")
- {
- cout << sortedListOfMods.getElemAtIndex(i).getFormula()
- << " " << sortedListOfMods.getElemAtIndex(i).getStartSite()
- << endl;
- cout << "NON-MEVALONATE PATHWAY TERPENOID GENE IDENTIFIED." << endl;
- mepCount++;
-
- }
- if(sortedListOfMods.getElemAtIndex(i).getFormula() == "Mvd1")
- {
- cout << sortedListOfMods.getElemAtIndex(i).getFormula()
- << " " << sortedListOfMods.getElemAtIndex(i).getStartSite()
- << endl;
- cout << "MEVALONATE PATHWAY GENE IDENTIFIED (TERPENOID)." << endl;
- mvaCount++;
- }
- // Type of mevalonate pathway
- if(sortedListOfMods.getElemAtIndex(i).getFormula() == "IPP1" ||
- sortedListOfMods.getElemAtIndex(i).getFormula() == "IPP2")
- {
- // cout << sortedListOfMods.getElemAtIndex(i).getFormula()
- // << " " << sortedListOfMods.getElemAtIndex(i).getStartSite()
- // << endl;
- }
- }
- // collect AT or A and KS from previous lists into one new list
- for(int a = 0; a < sortedListOfDomains.getNumElems(); a++)
- {
- domain = new UnitClass2;
- domain->setStartSite(sortedListOfDomains.getElemAtIndex(a).getStartSite());
- domain->setFormula(sortedListOfDomains.getElemAtIndex(a).getFormula());
- domain->setEndSite(sortedListOfDomains.getElemAtIndex(a).getEndSite());
- domain->setNucStartSite(sortedListOfDomains.getElemAtIndex(a).getNucStartSite());
- sortedListOfATKS.insertValue(*domain);
- delete domain;
- }
- for(int a = 0; a < sortedListOfMods.getNumElems(); a++)
- {
- if(sortedListOfMods.getElemAtIndex(a).getFormula() == "KS")
- {
- domain = new UnitClass2;
- domain->setStartSite(sortedListOfMods.getElemAtIndex(a).getStartSite());
- domain->setFormula(sortedListOfMods.getElemAtIndex(a).getFormula());
- domain->setEndSite(sortedListOfMods.getElemAtIndex(a).getEndSite());
- domain->setNucStartSite(sortedListOfMods.getElemAtIndex(a).getNucStartSite());
- sortedListOfATKS.insertValue(*domain);
- delete domain;
- }
- }
-
- // check list
- /*for(int a = 0; a < sortedListOfATKS.getNumElems(); a++)
- {
- cout << sortedListOfATKS.getElemAtIndex(a).getFormula() << " "
- << sortedListOfATKS.getElemAtIndex(a).getNucStartSite() << endl;
- }*/
- // Searching for Trans AT clusters
-
- // searches and outputs clusters within a genome into files
- for(int i = 0; i < sortedListOfATKS.getNumElems(); i++)
- {
- if( i == 0) // the first clusterStart
- {
- clusterStart = sortedListOfATKS.getElemAtIndex(0).getNucStartSite();
- prevStart = clusterStart;
- if(sortedListOfATKS.getElemAtIndex(i).getFormula() == "NRPS" ||
- sortedListOfATKS.getElemAtIndex(i).getFormula() == "PKS")
- {
- AATcount++;
- }
- if(sortedListOfATKS.getElemAtIndex(i).getFormula() == "KS")
- {
- KScount++;
- }
- }
- for(int a = i+1; a < sortedListOfATKS.getNumElems(); a++)
- {
- aStore = a;
- // if next domain is within range of the previous domain, designate
- // it to be within the cluster
-
- if(sortedListOfATKS.getElemAtIndex(a).getFormula() == "NRPS" ||
- sortedListOfATKS.getElemAtIndex(a).getFormula() == "PKS")
- {
- AATcount++;
- }
- if(sortedListOfATKS.getElemAtIndex(a).getFormula() == "KS")
- {
- KScount++;
- }
- if(sortedListOfATKS.getElemAtIndex(a).getNucStartSite() -
- prevStart < separationMax)
- {
- prevStart = sortedListOfATKS.getElemAtIndex(a).getNucStartSite();
- }
-
- // if next start site is significantly apart, designate it as the
- // end of the cluster
-
- if(sortedListOfATKS.getElemAtIndex(a).getNucStartSite() -
- prevStart > separationMax)
- {
- clusterStart = sortedListOfATKS.getElemAtIndex(i).getNucStartSite();
- clusterEnd = sortedListOfATKS.getElemAtIndex(a-1).getNucStartSite() +
- domainWidth;
- //cout << ".clusterStart: " << clusterStart
- // <<" .clusterEnd: " << clusterEnd << " " << a - i << endl;
- if(a - i >= minUnits)
- {
- //cout << "KScount: " << KScount << " AATcount: " << AATcount << endl;
- cout << "clusterStart: " << clusterStart << " KS/AAT ratio: "
- << KScount/AATcount << endl;
- if(KScount/AATcount > minTransATratio)
- {
- cout << "TRANS AT PKS SYSTEM IDENTIFIED." << endl;
- transATcount++;
-
- clusterOut = aaGenome.substr(clusterStart + clusterStart/lineLength
- - startBuffer, clusterEnd - clusterStart + (clusterEnd -
- clusterStart)/lineLength + endBuffer);
- outStr1.clear();
- outStr2.clear();
- outStr1.str("");
- outStr2.str("");
- str1.clear();
- str2.clear();
- outStr1 << clusterStart;
- str1 = outStr1.str();
- outStr2 << clusterEnd;
- str2 = outStr2.str();
- strFile = "clusters/transAT/" + nucSeq + "-" + str1 + "-" + str2
- + ".txt";
- str = ">" + str1 + "-" + str2;
- OutCluster.open(strFile.c_str());
- OutCluster << str << "\n";
- OutCluster << clusterOut;
- allClusters.append(strFile);
- allClusters.append("\n");
- OutCluster.close();
- OutCluster.clear();
- } // end of if statement 'if(KScout/AATcount > minTransATratio...'
- }
- // as long as domain analyzed is within total number of domains
- if(a < sortedListOfATKS.getNumElems())
- {
- clusterStart = sortedListOfATKS.getElemAtIndex(a).getNucStartSite();
- prevStart = clusterStart;
- }
-
- i = a - 1;
- aStore = a;
- a = sortedListOfATKS.getNumElems();
- AATcount = 0;
- KScount = 0;
- }
- if(aStore == sortedListOfATKS.getNumElems()-1)
- {
- clusterEnd = sortedListOfATKS.getElemAtIndex(aStore).getNucStartSite()
- + domainWidth;
- //cout << "clusterStart: " << clusterStart
- // << " clusterEnd: " << clusterEnd << " " << a - i - 1 << endl;
- if(a - i - 1 >= minUnits)
- {
- if(KScount/AATcount > minTransATratio)
- {
-
- clusterOut = aaGenome.substr(clusterStart + clusterStart/lineLength
- - startBuffer, clusterEnd - clusterStart + (clusterEnd-clusterStart)/ lineLength + endBuffer);
-
- outStr1.clear();
- outStr2.clear();
- outStr1.str("");
- outStr2.str("");
- str1.clear();
- str2.clear();
- outStr1 << clusterStart;
- str1 = outStr1.str();
- outStr2 << clusterEnd;
- str2 = outStr2.str();
- strFile = "clusters/transAT/" + nucSeq + "-" + str1 + "-" + str2 +
- ".txt";
- str = ">" + str1 + "-" + str2;
- OutCluster.open(strFile.c_str());
- OutCluster << str << "\n";
- OutCluster << clusterOut;
- allClusters.append(strFile);
- allClusters.append("\n");
- OutCluster.close();
- OutCluster.clear();
- }
- }
- i = a;
- aStore = a;
- a = sortedListOfATKS.getNumElems();
- }
- }
- } // cluster search complete
- aaGenome.clear();
- sortedListOfDomains.clear();
- sortedListOfMods.clear();
- sortedListOfATKS.clear();
- std::stringstream ss;
- std::string NRPScountStr;
- ss << NRPScount;
- ss >> NRPScountStr;
-
- clustersSummary.open("SMILES/summary.html");
- clustersSummary << " <html><head><title></title></head> ";
- clustersSummary << " <body bgcolor = '#F4FFE4'> <font face='tahoma' color='#254117'>";
- clustersSummary << "<b>" << NRPScountStr << "</b>"
- << " modular NRPSs" << "</br>";
- clustersSummary << "<b>" << PKScount << "</b>" << " modular PKSs" << "</br>";
- clustersSummary << "<b>" << mixedNRPSPKScount << "</b>"
- << " mixed modular NRPS/PKSs" << "</br>" << "</br>";
- clustersSummary << "<b>" << transATcount << "</b>" << " trans AT PKSs"
- << "</br>" << "</br>";
- clustersSummary << "<b>" << mvaCount << "</b>"
- << " mevalonate terpenoid mva genes" << "</br>";
- clustersSummary << "<b>" << mepCount << "</b>"
- << " non-mevalonate terpenoid mep genes" << "</br>";
- clustersSummary.close();
-
- clustersSummary << "</font></body></html>";
- } // allframes read end
- }
- int main( int argc , char* argv[])
- {
- string nucleotideSeq;
- string predictedSubstrates;
- string outputSeq;
- string revCompSeq;
- string tempSeq;
- string inputFilename;
- ifstream In;
- ofstream allClustersFile;
- QueueClass< UnitClass2 > aaNameList;
- bool allFramesRead = 0;
- string allClusters;
- string clusters;
- string aReaction = "AA";
- float minMass;
- float maxMass;
- string outputFilename;
- string fileName;
- int NRPScount = 0;
- int PKScount = 0;
- int mixedNRPSPKScount = 0;
- int transATcount = 0;
- int mepCount = 0;
- int mvaCount = 0;
- int charCount = 0;
- ofstream runTime; // time it takes for program to run genome
- int charTime; // time estimated time needed to analyze genome
- int charTimeRatio = 3000000; // basepairs analyzed per minute
- string character;
-
- cout << "argc = " << argc << endl;
- for(int i = 0; i < argc; i++)
- {
- cout << "argv[" << i << "] = " << argv[i] << endl;
- }
-
- if(argc == 1)
- {
- cout << "no file name supplied" << endl;
- return 0;
- }
- if(argc == 2) // if a filename is typed but no limits on mass
- {
- minMass = 1; // default
- maxMass = 5000; // default
- }
- if(argc == 3) // if a min value is typed by not max
- {
- minMass = atof(argv[2]);
- if(minMass < 0 || minMass > 5000)
- {
- minMass = 1;
- }
- maxMass = 5000; // arbitrary max mass
- }
- if(argc > 3) // if both min and max values are typed
- {
- minMass = atof(argv[2]);
- if(minMass < 0 || minMass > 5000)
- {
- minMass = 1;
- }
- maxMass = atof(argv[3]);
- if(maxMass < 0 || maxMass > 5000)
- {
- maxMass = 5000;
- }
- }
- if(argc > 4) // if further reactions are done, store them
- {
- aReaction = argv[4];
- if(aReaction != "CY")
- {
- aReaction = "AA";
- }
- }
- nucleotideSeq = argv[1];
- inputFilename = "genomes/" + nucleotideSeq;
- /*
- cout << "inputFile: " << inputFilename << endl;
- In.open(inputFilename.c_str());
- if(In.fail())
- {
- cout << "Invalid file." << endl;
- return 0;
- }
- else
- {
- In >> character;
- while(!In.fail())
- {
- In >> character;
- charCount = charCount + character.length();
- }
- charTime = charCount / charTimeRatio;
- runTime.open("SMILES/runTime");
- runTime << charTime;
- runTime.close();
- }
- In.close();
- */
- //revCompSeq = revComp(nucleotideSeq);
- revCompSeq = revComp(inputFilename, nucleotideSeq);
- tempSeq = "rev-" + nucleotideSeq;
- cout << "Unchanged sequence: " << endl;
- for(int i = 0; i < 3; i++)
- {
- if(i == 2)
- {
- allFramesRead = 1;
- }
- //translateSeq(nucleotideSeq, outputSeq, i);
- translateSeq(inputFilename, outputSeq, i);
- predictDomains("aminoAcidSeq.txt",
- predictedSubstrates, aaNameList,
- allFramesRead, i, nucleotideSeq,
- allClusters, NRPScount, PKScount,
- mixedNRPSPKScount, transATcount,
- mepCount, mvaCount);
- }
- allFramesRead = 0;
- if(predictedSubstrates.length() > 0)
- {
- cout << predictedSubstrates << endl;
- }
- else
- {
- cout << "Unchanged sequence yielded no hits" << endl;
- }
- cout << endl;
- predictedSubstrates.clear();
- aaNameList.clear();
-
- cout << "Reverse complement sequence: " << endl;
- //inputFilename = "genomes/rev-"
- inputFilename = "genomes/rev-"
- + nucleotideSeq;
- for(int i = 0; i < 3; i++)
- {
- if(i == 2)
- {
- allFramesRead = 1;
- }
- //translateSeq(tempSeq, outputSeq, i);
- translateSeq(inputFilename, outputSeq, i);
- predictDomains("aminoAcidSeq.txt",
- predictedSubstrates, aaNameList,
- allFramesRead, i, tempSeq,
- allClusters, NRPScount, PKScount,
- mixedNRPSPKScount, transATcount,
- mepCount, mvaCount);
- }
- allFramesRead = 0;
-
- if(predictedSubstrates.length() > 0)
- {
- cout << predictedSubstrates << endl;
- }
- else
- {
- cout << "Reverse complement sequence yielded no hits" << endl;
- }
- allClustersFile.open("clusters.txt");
- allClustersFile << allClusters;
- allClustersFile.close();
- allClusters.clear();
- In.clear();
- In.open("clusters.txt");
- cout << "after opened clusters.txt" << endl;
- In >> outputFilename;
- cout << "outputFilename: " << outputFilename << endl;
- //string minNum = "1";
- //string maxNum = "5000";
- char *out1;
- char *out2;
- out1 = (char *)malloc(100);
- out2 = (char *)malloc(100);
- sprintf(out1, "%f\n", minMass);
- sprintf(out2, "%f\n", maxMass);
- printf(out1, "%f\n", minMass);
- printf(out2, "%f\n", maxMass);
- while(!In.fail())
- {
- cout << argv[2] << " " << argv[3] << endl;
- fileName = outputFilename.substr(9);
- spawnl(1, "./main.exe", "main.exe", fileName.c_str(),
- out1, out2, aReaction.c_str(), fileName.c_str(), NULL);
- In >> outputFilename;
- }
- In.close();
-
- return 0;
- }