/code_comm/wanal1.cpp
C++ | 1062 lines | 607 code | 357 blank | 98 comment | 85 complexity | 096464cc61c1dd0bb415586e7bd7e21e MD5 | raw file
-
- #include<stdiostream.h>
- #include<string.h>
- #include<stdlib.h>
- #include<ctype.h>
-
-
-
- #include <vector>
- #include<complex>
- using namespace std;
-
-
-
- //begin Power spectrum code
-
- class PowerSpectrum
- {
- public:
- PowerSpectrum();
- virtual ~PowerSpectrum();
- char * strCRegion;
- double *s;
- double *x;
- int doPowerSpectrumAnalysis(int);
- void setSequence(const char * seq);
-
- };
-
-
- PowerSpectrum::PowerSpectrum()
- {
- s=NULL;
- x=NULL;
- strCRegion=NULL;
-
- }
-
- PowerSpectrum::~PowerSpectrum()
- {
-
- delete []s;
- delete []x;
- delete []strCRegion;
-
- }
-
- void PowerSpectrum::setSequence(const char * seq)
- {
-
- delete []strCRegion;
-
- strCRegion=new char[ strlen(seq)+1 ];
-
- memset(strCRegion, '\0' , strlen(seq)+1 ) ;
-
- strcpy(strCRegion,seq);
-
- }
-
-
- int PowerSpectrum::doPowerSpectrumAnalysis(int pc)
- {
-
-
- const double PI = 3.14159274;
-
- complex<double> i1(0.0,1.0);
-
- long lenStr = strlen(strCRegion);
-
-
- delete []s;
- delete []x;
-
-
- //cout<<"Coding Region Length:"<<lenStr<<endl;
-
-
- s = new double[lenStr/2 + 1];
- x = new double[lenStr/2 + 1];
-
- char nuclt[5]= "ATGC";
- int const lenWindow = (lenStr/pc)*pc;
-
- double li=(1.0/lenWindow);
-
- double winsq = li+li*li;
- //long wincube = lenWindow*lenWindow*lenWindow;
-
-
- int countA =0;
- int countC =0;
- int countG =0;
- int countT= 0;
-
- complex<double> s1(0.0,0.0);
-
- double sq=0.0;
- double sq1=0.0;
- double s2=0.0;
- double sbar =0.0;
- double p=0.0;
-
-
- //ofstream out("power");
-
- for(long i =0; i < lenWindow; i++ )
- {
-
- if( strCRegion[i] == 'A')
- { countA++; continue;}
- if( strCRegion[i] == 'G')
- { countG++; continue;}
- if( strCRegion[i] == 'C')
- { countC++; continue;}
- if( strCRegion[i] == 'T')
- { countT++; continue;}
-
- }
-
- //out<<"Count A :"<<countA <<" countT :"<<countT <<"countG :"<<countG <<"countC :"<<countC <<endl;
-
-
- sq1 = 0.0+ pow(countA,2)+pow(countG,2) + pow(countT,2) + pow(countC,2);
-
- double f =0.0;
-
-
-
- long k;
- for(k = 1 ; k <=lenWindow/2 ; k++ )
- {
-
- s2 =0.0;
- sbar =0.0;
- p =0.0;
- f= ((1.0)*k)/lenWindow;
-
- // for each Nucleotide. A T G C
-
-
- for(int i = 0; i <4 ; i ++ )
- {
- s1= complex<double>(0.0,0.0);
-
-
- for (int bi=0; bi<lenWindow; bi++)
- {
- if(strCRegion[bi] == nuclt[i] )
- s1= s1+exp(-i1*2.0*PI*f*(bi+1.0));
- }
-
- s2 = s2+ (abs(s1)*abs(s1));
-
-
- }
-
- // to calculater power specturm and code region
-
- sq = sq1*li*li*li;
-
- sbar =(winsq-sq);
- sq = sbar * (lenWindow/2.0);
-
- // to calcuater signal to noise ratio
-
- s2 = s2/(sq*lenWindow*lenWindow);
-
- p= (s2/sbar);
-
-
- s[k-1]=f;
- x[k-1]=p;
-
-
- // out<<"\n Frequency:"<<f <<" Power : "<<p;
-
- } // outer for loop
-
-
-
- int numPeaks=0;
- int genePeak=0;
- //WindowAnalysis wobj;
-
- for( k = 1 ; k <= (lenWindow/2) ; k++ )
- {
-
- if( x[k-1] > 4.0)
- {
- //if( s[k-1] == 1.0/3.0 )] //added by Sreenu on mandars suggestion on 19/04/2002
- //if( ( s[k-1]>= 0.32 ) && ( s[k-1] <= 0.34 ))
-
- if(s[k-1]==(1.0/pc))
- genePeak=1;
- else
- numPeaks++;
- }
-
- }
-
-
- //delete []s ;
- //delete []x ;
- //s =NULL ;
- //x =NULL ;
-
-
- if(genePeak==1)
- {
- if(numPeaks==0)
- return 1;
- else
- return 2;
- }
- else
- {
- return 0;
- }
-
-
-
- } // end of member function.
-
- //end Power spectrum code
-
-
- //begin Windowanalysis code
-
- struct structCR
- {
- long start;
- long end;
- char type;
-
- };
-
-
- class WindowAnalysis //__declspec(dllexport)
- {
- public:
-
-
- long *s;
- double *x;
- int pcity;
-
- vector<structCR> vecCRegion;
- WindowAnalysis();
- WindowAnalysis(int len ,double thold,long intron , long exon);
- virtual ~WindowAnalysis();
- void setWindowLen(int len);
- void settherHold(double thold);
- void setIntronExon(long intron , long exon );
- int doFt(const char * strSeq , char *logfile);
- void setpcity(int pc);
-
-
- private :
-
- int lenWindow;
- double thersHold;
- long exonCutOff;
- long intronCutOff;
-
- int findexp(int WindowSize);
- long framecount;
- complex<double> * expval;
- WindowAnalysis& operator =(const WindowAnalysis &cwa);
- WindowAnalysis(const WindowAnalysis &cwa);
-
- };
-
- WindowAnalysis::WindowAnalysis()
- {
- expval = NULL;
- s = NULL;
- x = NULL;
- framecount=0;
-
-
- }
-
- WindowAnalysis::~WindowAnalysis()
- {
-
- delete []expval;
- delete []s ;
- delete []x ;
- printf("Destructor of WindowAnalysis ");
- fflush(stdout);
- //delete []CRegion;
- //this->vecCRegion.clear();
-
-
- }
-
- WindowAnalysis::WindowAnalysis(int len , double thold ,long intron , long exon)
- {
-
- lenWindow=len;
- thersHold=thold;
- exonCutOff=exon;
- intronCutOff =intron;
-
- s = NULL;
- x = NULL;
-
- framecount=0;
- // cout<<"ThresHold Value "<<thersHold;
- // cout<<"Intron :"<<intronCutOff<<" Exon :"<<exonCutOff<<endl;
-
- expval=new complex<double>[lenWindow];
-
- }
-
-
- void WindowAnalysis::settherHold(double thold)
- {
- thersHold =thold;
- // cout<< "ThresHold Value "<<thersHold;
-
- }
- void WindowAnalysis::setpcity(int pc)
- {
- pcity=pc;
- }
-
- void WindowAnalysis::setWindowLen(int len)
- {
- lenWindow=len;
- delete [] expval;
- expval=new complex<double>[lenWindow];
-
- }
- void WindowAnalysis::setIntronExon(long intron , long exon )
- {
- exonCutOff=exon;
- intronCutOff =intron;
-
-
- }
- int WindowAnalysis::doFt(const char * strSeq , char * logfile)
- {
-
-
- const double F3 = 1.000/pcity;
- const double PI = 3.14159274;
-
-
- //ofstream out("output" );
- //ofstream out(logfile ,ios::app);
-
- long lenStr = strlen(strSeq);
-
- if(lenStr<lenWindow)
- {
- //out<<"Sequence length is less than windowlength."<<endl;
- return 1;
- }
-
- //cout<<"\n Sequence Length :"<<lenStr<<endl;
-
- int result= findexp(lenWindow);
-
- //long *CRegion = new long[lenStr-lenWindow+ 1];
- //long *s = new long[lenStr-lenWindow+ 1];
- //double *x = new double[lenStr-lenWindow+1];
-
- //delete [] s;
- //delete [] x;
- s = new long[lenStr-lenWindow+ 1];
- x = new double[lenStr-lenWindow+1];
-
-
- char nuclt[5]= "ATGC";
-
- double li=(1.0/lenWindow);
-
-
- double winsq = li+ (li*li);
-
- double winsq1 = (1.0/lenWindow)+(1.0/(lenWindow*lenWindow));
- double wincube = lenWindow*lenWindow*lenWindow;
-
-
- int countA =0;
- int countC =0;
- int countG =0;
- int countT= 0;
-
- complex<double> s1(0.0,0.0);
-
- double sq=0.0;
- double sq1=0.0;
- double s2=0.0;
- double sbar =0.0;
- double p=0.0;
-
- framecount=0;
-
-
-
- for(long k = 0 ; k < lenStr-lenWindow+1 ; k++ )
-
- {
- sq =0.0;
- s2 =0.0;
- sbar =0.0;
- p =0.0;
-
- // Frame wise count of A C G T seperately
-
-
- if(k==0)
- {
- for(long i =k; i < k+lenWindow; i++ )
- {
-
- if( strSeq[i] == 'A')
- { countA++; continue;}
- if( strSeq[i] == 'G')
- { countG++; continue;}
- if( strSeq[i] == 'C')
- { countC++; continue;}
- if( strSeq[i] == 'T')
- { countT++; continue;}
-
- } //end of count for look.
-
- }else
- {
-
- switch(strSeq[k-1])
- {
- case 'A' :
- { countA--; break; }
- case 'T' :
- { countT--; break; }
- case 'G' :
- { countG--; break; }
- case 'C' :
- { countC--; break; }
- }
-
- switch(strSeq[k+lenWindow-1])
- {
- case 'A' :
- { countA++; break; }
- case 'T' :
- { countT++; break; }
- case 'G' :
- { countG++; break; }
- case 'C' :
- { countC++; break; }
- }
-
-
- }
-
- //out<<"Count A :"<<countA <<" Frame count :"<<k<<endl;
- //out<<"Count T :"<<countT <<" Frame count :"<<k<<endl;
- //out<<"Count G :"<<countG <<" Frame count :"<<k<<endl;
- //out<<"Count C :"<<countC <<" Frame count :"<<k<<endl;
-
- sq1 = 0.0+ pow(countA,2)+pow(countG,2) + pow(countT,2) + pow(countC,2);
-
- // for each Nucleotide. A T G C
-
-
- for(int i = 0; i <4 ; i ++ )
- {
- s1= complex<double>(0.0,0.0);
-
-
- for (int bi=0; bi<lenWindow; bi++)
- {
- if(strSeq[k+bi] == nuclt[i] )
- s1= s1+expval[bi];
- }
- //cout<<s1<<endl;
-
- s2 = s2+ (abs(s1)*abs(s1));
-
- }
-
-
- // to calculater power peak
-
-
- sq = sq1*li*li*li;
- sbar =(winsq-sq);
- sq = sbar * (lenWindow/2.0);
-
- // to calcuater signal to noise ratio
-
- s2 = s2/(sq*lenWindow*lenWindow);
-
- p= s2/sbar;
-
-
- s[framecount]=k+(lenWindow/2);
- x[framecount]=p;
-
- //out<<s[framecount]<<" "<<x[framecount]<<"\n";
- framecount++;
-
-
- } // End of Outer for loop(Frame wise)
-
- //return(0);
- //}
- // end of Ft function..
- //void WindowAnalysis::findCodingRegion(long seqfileId , const char * seqId)
- //{
-
- long *CRegion = new long[framecount+1];
- int flag=0;
-
- long i =0 , m =0;
-
- for (m =0; m<framecount; m++)
- {
-
- if(x[m]>=thersHold)
- {
- if(flag == 0)
- {
-
- CRegion[i] =s[m];
- //out<< "\n START :" <<s[m]<<" : "<<x[m];
- //i++;
- flag =1;
- }
-
- }
-
- else
- {
- if(flag==1)
- {
- if(x[m]<thersHold)
- {
-
- //out<< "\n START :" <<CRegion[i];
- //out<<"\t\tEND: "<<s[m-1]<<" : "<<x[m-1];
- i++;
- CRegion[i]=s[m-1];
- i++;
- }
- flag=0;
-
- }
-
- } // end of if.
- }// End Coding region loop.
-
-
-
- if (flag==1)
- {
- //out<< "\n START :" <<CRegion[i];
- //out<<"\t\tEND: "<<s[m-1]<<" : "<<x[m-1];
- i++;
- CRegion[i]=s[m-1];
- i++;
- }
-
- //delete []CRegion;
- // that data is deleted 07/02/02 To inprove performance.
-
- //delete []s ;
- //delete []x ;
- //s=NULL;
- //x=NULL;
-
- //out<<"\n Final Coding Regions :"<<endl;
-
- flag =0;
- int code =0;
- long sta=0 , stp =0;
- structCR obj;
- obj.type=' ';
-
- for(int j=0 ; j<i ; j +=2)
- {
- if(flag ==0)
- {
- sta = CRegion[j];
- stp = CRegion[j+1];
- if ( (stp-sta)>= exonCutOff ) code =1; else code =0;
- flag = 1 ;
- }
- else
- {
- if( (CRegion[j]-CRegion[j-1]) >=intronCutOff )
- {
- if(code ==1)
- {
- //out <<"\nStart :"<<sta <<" End :" << stp;
- // inset in database start new set.
- obj.start=sta;
- obj.end=stp;
- vecCRegion.push_back(obj);
- //insetcodes( sta , stp,seqfileId,seqId);
- sta = CRegion[j];
- stp = CRegion[j+1];
-
- if ( (stp-sta)>= exonCutOff ) code =1; else code =0;
- flag = 1 ;
- }
- else
- {
- //cout << forget the old set.
- if ((stp-sta)>= exonCutOff )
- {
- //out <<"\nStart :"<<sta <<" End :" << stp;
- obj.start=sta;
- obj.end=stp;
- vecCRegion.push_back(obj);
- //insetcodes( sta , stp,seqfileId,seqId);
- }
- // may need to be commented later.
- sta = CRegion[j];
- stp = CRegion[j+1];
- if ( (stp-sta)>= exonCutOff ) code =1; else code =0;
- flag = 1 ;
- }
- }
- else
- {
-
- if( ( CRegion[j+1]-CRegion[j] ) >= exonCutOff ) code =1;
- stp = CRegion[j+1];
- flag =1;
-
- }
- }
- }
-
- if ((code == 1) || ((stp-sta) >exonCutOff) )
- {
-
- //out <<"\nStart :"<<sta <<" End :" << stp;
- obj.start=sta;
- obj.end=stp;
- vecCRegion.push_back(obj);
- //mapCRegion[sta] =stp;
- //insetcodes( sta , stp,seqfileId,seqId);
- }
-
-
- //delete []CRegion;
- //CRegion=NULL;
-
-
- vector<structCR>::iterator it;
-
- PowerSpectrum powerObj;
-
- char * strtemp=NULL;
-
-
-
- for(it =vecCRegion.begin(); it !=vecCRegion.end(); it++)
- {
- //long start =it->start;
- //long end =it->end;
-
- strtemp = new char[( (it->end - it->start) +1)];
-
- memset(strtemp , '\0' ,(it->end - it->start) +1 );
-
- strncpy( strtemp , &strSeq[it->start] ,(it->end - it->start) );
-
- //strncpy( strtemp , &strSeq[start] , end -start);
-
- //printf("Length : %d \n ", strlen(strtemp));
-
- //printf("start %ld , End %d length %d\n " , it->start ,it->end , (it->end - it->start));
- //fflush(stdout);
-
- powerObj.setSequence(strtemp);
- int peaks =powerObj.doPowerSpectrumAnalysis(pcity);
- if(peaks==0)
- {
- it=vecCRegion.erase(it);
- if( it !=vecCRegion.end() )
- --it;
- else
- break;
- }
- else if(peaks==1)
- {
- it->type ='S';
- }
- else if(peaks==2)
- {
- it->type ='M';
- }
- delete []strtemp;
- strtemp = NULL;
- }
-
-
-
- //store the coding regions in an array
-
-
- //int elsize;
- //elsize=vecCRegion.size();
-
-
-
- delete []strtemp;
- strtemp = NULL;
-
-
- //delete []CRegion;
- //CRegion=NULL;
-
- //delete []s ;
- //delete []x ;
-
- return(0);
-
- } // end of Coding region finding.
-
-
- // to find exponential positional values to window size .
-
- int WindowAnalysis::findexp(int WindowSize)
- {
-
-
- complex<double> i1(0.0,1.0);
- const double PI = 3.14159274;
- const double F3 = 1.000/3;
-
- for(int k=0; k<WindowSize; k++)
- {
-
- expval[k] = exp(-i1*2.0*PI*F3*(k+1.0));
-
- //cout<<cos(-1*2.0*PI*F3*(k+1.0))<<endl;
- }
-
- return(0);
-
- }
-
-
-
- // end Window Analysis code
-
-
-
-
-
- class ftcds
- {
-
-
- WindowAnalysis *winobj, *winobj1;
- PowerSpectrum pobj;
-
-
- FILE *fp1,*fp2,*fp3,*fpl,*fpc;
- char *seq,*seqs;//store the sequence to find the PNR;
- long begpos,endpos;
- //ifstream begptr,endptr;
-
-
-
- public:
-
- void doGenscan(char *,char*,int,int,int,int,int);
-
- ftcds()
- {
- //seq = NULL;
- winobj = new WindowAnalysis();
- winobj1 = new WindowAnalysis();
-
-
- }
-
- ~ftcds()
- {
- delete [] seq;
- seq = NULL;
- cout<<"Destructor of ftcds"<<endl;
- delete winobj;
- delete winobj1;
-
- }
-
-
- };
-
-
-
- void ftcds::doGenscan(char *inputfile,char *outfile,int winlen,int thold,int pcity,int exoncut,int intcut)
- {
-
- //variable where the sequence will be stored
-
- int seqno=1;
-
- //char fname[50];
- char ch,ch1;
- char *seqname;
- int snin;
-
- vector<structCR>::iterator it1;
-
-
- fp1=fopen(inputfile,"r");
-
- if(fp1==NULL)
- {
- cout<<"File doesn't exist"<<endl;
- exit(1);
- }
-
- //create a temporary file to store the sequence without the newline characters
-
- fpc=fopen(outfile,"w");
- if(fpc==NULL)
- {
- cout<<"Could not open output file"<<endl;
- exit(1);
- }
-
- ch=fgetc(fp1);
-
- while(ch!=EOF)
- {
-
-
- fp2=fopen("tmpseq.tmp","w+");
-
- if(fp2==NULL)
- {
- cout<<"Could not create temporary file"<<endl;
- exit(1);
- }
-
- do
- {
-
- //check for the initial '>'symbol put the characters after the next line
-
- if(ch=='>')
- {
-
- begpos=ftell(fp1);
-
- while(ch!='\n')
- {
- ch=fgetc(fp1);
- }
-
- endpos=ftell(fp1);
- seqname=new char[endpos -begpos + 2];
- memset(seqname,'\0',endpos-begpos +2);
-
- fseek(fp1,begpos,SEEK_SET);
- snin=0;
- while(true)
- {
- ch=fgetc(fp1);
- if(!(snin==11 || ch=='\n'))
- {
- seqname[snin]=ch;
- snin++;
- }
-
- if(ch=='\n')
- {
- ch=fgetc(fp1);
- break;
- }
- }//end ofwhile
-
- }//end of if
-
- if(ch!='\n')
- fputc(ch,fp2);
-
- ch=fgetc(fp1);
-
- if(ch==EOF)
- {
- break;
- }
-
- }while(ch!='>');//end of while
-
-
- endpos=ftell(fp2);
-
- //int endpos=begpos;
-
-
- seq=new char[endpos + 2];
- memset(seq,'\0',endpos + 2);
-
-
- begpos=0;
- fseek(fp2,begpos,SEEK_SET);
-
- ch1=fgetc(fp2);
- int seqin=0;
-
- while(ch1!=EOF)
- {
- seq[seqin]=ch1;
- seqin++;
- ch1=fgetc(fp2);
- }
-
-
- winobj->setWindowLen(winlen);
- winobj->setpcity(pcity);
- winobj->settherHold(thold*1.0);
- winobj->setIntronExon(exoncut,intcut);
-
-
- cout<<"Length:"<<strlen(seq)<<endl;
-
- winobj->doFt(seq,"log");
-
-
- //put the coding regions in a separate file
-
-
- for(it1=winobj->vecCRegion.begin();it1!=winobj->vecCRegion.end();it1++)
- {
-
- unsigned long actstartpos=it1->start-((winlen/2)-1);
- unsigned long actendpos=it1->end +(winlen/2);
-
- if(actendpos>strlen(seq))
- actendpos=strlen(seq);
-
-
- seqs=new char[actendpos -actstartpos +2];
- memset(seqs,'\0',actendpos -actstartpos +2);
-
- int j=0;
-
- for(int i=actstartpos -1;i<actendpos;i++,j++)
- seqs[j]=seq[i];
-
-
-
- int len=strlen(seqs);
-
-
- cout<<"Length:"<<len<<endl;
- winobj1->setWindowLen(strlen(seqs));
-
- winobj1->setpcity(pcity);
- winobj1->settherHold(thold*1.0);
- winobj1->doFt(seqs,"log");
-
- fprintf(fpc,">fourier cds%d %s [%d - %d] %d %0.4f\n",seqno,seqname,actstartpos,actendpos,(actendpos-actstartpos)+1,winobj1->x[0]);
- fprintf(fpc,"%s\n",seqs);
-
-
- }//end of for
-
- fclose(fp2);
- remove("tmpseq.tmp");
- winobj->vecCRegion.clear();
- seqno++;
-
- delete []seq;
- seq = NULL;
-
- }//end of while
-
-
- fclose(fpc);
-
-
- }//end of main function
-
-
- int main(int argc, char* argv[])
- {
-
- ftcds *Ch = new ftcds();
-
- int pcity;
- int excut,incut;
-
- cout<<argc<<endl;
- //cout<<"Usage:[filename] -w [length] -t [threshold] -p [periodicity] -i [inputfile] -o [outputfile]"<<endl;
-
- int winlen,thold;
- char *infile,*outfile;
-
- if(argc!=14)
- {
- cout<<"Usage:[filename] -w [length] -t [threshold] -p [periodicity] -ei [exoncut] [introncut] -i [inputfile] -o [outputfile]"<<endl;
- exit(1);
- }
-
- for(int s=1;s<argc;s++)
- {
-
- if(strcmp(argv[s],"-w")==0)
- winlen=atoi(argv[++s]);
-
- if(strcmp(argv[s],"-t")==0)
- thold=atoi(argv[++s]);
-
- if(strcmp(argv[s],"-i")==0)
- {
- infile=new char[strlen(argv[s +1])+1];
- strcpy(infile,argv[++s]);
- }
-
- if(strcmp(argv[s],"-o")==0)
- {
- outfile=new char[strlen(argv[s +1])+1];
- strcpy(outfile,argv[++s]);
- }
-
- if(strcmp(argv[s],"-p")==0)
- pcity=atoi(argv[++s]);
-
- if(strcmp(argv[s],"-ei")==0)
- {
- excut=atoi(argv[s+1]);
- incut=atoi(argv[s +2]);
-
- }
-
- }//end of for
-
- cout<<"lenwindow:"<<winlen<<endl;
- cout<<"Pcity:"<<pcity<<endl;
- cout<<"Exon intron"<<excut<<" "<<incut<<endl;
- cout<<"Thold:"<<thold<<endl;
- cout<<"Infile:"<<infile<<endl;
- cout<<"Outfile:"<<outfile<<endl;
-
- Ch->doGenscan(infile,outfile,winlen,thold,pcity,excut,incut);
-
- //Ch->doGenscan("c:\\anift\\amrna.txt","c:\\anift\\amrnaout.txt",300,4,3,50,50);
-
-
- delete Ch;
-
- return 0;
-
-
- }//end main