PageRenderTime 51ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/code_comm/ft1.cpp

http://research-code-base-animesh.googlecode.com/
C++ | 835 lines | 450 code | 285 blank | 100 comment | 54 complexity | 393a2823d9cfe45ab17f47739aedc20d MD5 | raw file
  1. /***************************************************************************
  2. main.cpp - description
  3. -------------------
  4. begin : Tue Nov 19 15:31:09 IST 2002
  5. copyright : (C) 2002 by guest of ani
  6. email : guest@mgldel-incorp.com
  7. ***************************************************************************/
  8. /***************************************************************************
  9. * *
  10. * This program is free software; you can redistribute it and/or modify *
  11. * it under the terms of the GNU General Public License as published by *
  12. * the Free Software Foundation; either version 2 of the License, or *
  13. * (at your option) any later version. *
  14. * *
  15. ***************************************************************************/
  16. //#include<stdio.h>
  17. #include<iostream.h>
  18. #include<stdio.h>
  19. #include<string.h>
  20. #include<stdlib.h>
  21. #include<ctype.h>
  22. #include<math.h>
  23. //Window Analysis code
  24. #include <vector>
  25. #include<complex>
  26. using namespace::std;
  27. //begin Power spectrum code
  28. struct structCR
  29. {
  30. long start;
  31. long end;
  32. char type;
  33. };
  34. class WindowAnalysis //__declspec(dllexport)
  35. {
  36. public:
  37. long *s;
  38. double *x;
  39. int pcity;
  40. vector<structCR> vecCRegion;
  41. WindowAnalysis();
  42. WindowAnalysis(int len ,double thold,long intron , long exon);
  43. virtual ~WindowAnalysis();
  44. void setWindowLen(int len);
  45. void settherHold(double thold);
  46. void setIntronExon(long intron , long exon );
  47. int doFt(const char * strSeq , char *logfile);
  48. void setpcity(int pc);
  49. private :
  50. int lenWindow;
  51. double thersHold;
  52. long exonCutOff;
  53. long intronCutOff;
  54. int findexp(int WindowSize);
  55. long framecount;
  56. complex<double> * expval;
  57. WindowAnalysis& operator =(const WindowAnalysis &cwa);
  58. WindowAnalysis(const WindowAnalysis &cwa);
  59. };
  60. WindowAnalysis::WindowAnalysis()
  61. {
  62. expval = NULL;
  63. s = NULL;
  64. x = NULL;
  65. framecount=0;
  66. }
  67. WindowAnalysis::~WindowAnalysis()
  68. {
  69. delete []expval;
  70. delete []s ;
  71. delete []x ;
  72. //printf("Destructor of WindowAnalysis ");
  73. fflush(stdout);
  74. //delete []CRegion;
  75. //this->vecCRegion.clear();
  76. }
  77. WindowAnalysis::WindowAnalysis(int len , double thold ,long intron , long exon)
  78. {
  79. lenWindow=len;
  80. thersHold=thold;
  81. exonCutOff=exon;
  82. intronCutOff =intron;
  83. s = NULL;
  84. x = NULL;
  85. framecount=0;
  86. // cout<<"ThresHold Value "<<thersHold;
  87. // cout<<"Intron :"<<intronCutOff<<" Exon :"<<exonCutOff<<endl;
  88. expval=new complex<double>[lenWindow];
  89. }
  90. void WindowAnalysis::settherHold(double thold)
  91. {
  92. thersHold =thold;
  93. // cout<< "ThresHold Value "<<thersHold;
  94. }
  95. void WindowAnalysis::setpcity(int pc)
  96. {
  97. pcity=pc;
  98. }
  99. void WindowAnalysis::setWindowLen(int len)
  100. {
  101. lenWindow=len;
  102. delete [] expval;
  103. expval=new complex<double>[lenWindow];
  104. }
  105. void WindowAnalysis::setIntronExon(long intron , long exon )
  106. {
  107. exonCutOff=exon;
  108. intronCutOff =intron;
  109. }
  110. int WindowAnalysis::doFt(const char * strSeq , char * logfile)
  111. {
  112. const double F3 = 1.000/pcity;
  113. const double PI = 3.14159274;
  114. //ofstream out("output" );
  115. //ofstream out(logfile ,ios::app);
  116. long lenStr = (strlen(strSeq)/pcity) * pcity;
  117. //lenWindow=(lenStr/pc)*pc;
  118. if(lenStr<lenWindow)
  119. {
  120. //out<<"Sequence length is less than windowlength."<<endl;
  121. return 1;
  122. }
  123. //cout<<"\n Sequence Length :"<<lenStr<<endl;
  124. int result= findexp(lenWindow);
  125. //long *CRegion = new long[lenStr-lenWindow+ 1];
  126. //long *s = new long[lenStr-lenWindow+ 1];
  127. //double *x = new double[lenStr-lenWindow+1];
  128. //delete [] s;
  129. //delete [] x;
  130. s = new long[lenStr-lenWindow+ 1];
  131. x = new double[lenStr-lenWindow+1];
  132. char nuclt[5]= "ATGC";
  133. double li=(1.0/lenWindow);
  134. double winsq = li+ (li*li);
  135. double winsq1 = (1.0/lenWindow)+(1.0/(lenWindow*lenWindow));
  136. int countA =0;
  137. int countC =0;
  138. int countG =0;
  139. int countT= 0;
  140. complex<double> s1(0.0,0.0);
  141. double sq=0.0;
  142. double sq1=0.0;
  143. double s2=0.0;
  144. double sbar =0.0;
  145. double p=0.0;
  146. framecount=0;
  147. for(long k = 0 ; k < lenStr-lenWindow+1 ; k++ )
  148. {
  149. sq =0.0;
  150. s2 =0.0;
  151. sbar =0.0;
  152. p =0.0;
  153. // Frame wise count of A C G T seperately
  154. if(k==0)
  155. {
  156. for(long i =k; i < k+lenWindow; i++ )
  157. {
  158. if( strSeq[i] == 'A')
  159. { countA++; continue;}
  160. if( strSeq[i] == 'G')
  161. { countG++; continue;}
  162. if( strSeq[i] == 'C')
  163. { countC++; continue;}
  164. if( strSeq[i] == 'T')
  165. { countT++; continue;}
  166. } //end of count for look.
  167. }else
  168. {
  169. switch(strSeq[k-1])
  170. {
  171. case 'A' :
  172. { countA--; break; }
  173. case 'T' :
  174. { countT--; break; }
  175. case 'G' :
  176. { countG--; break; }
  177. case 'C' :
  178. { countC--; break; }
  179. }
  180. switch(strSeq[k+lenWindow-1])
  181. {
  182. case 'A' :
  183. { countA++; break; }
  184. case 'T' :
  185. { countT++; break; }
  186. case 'G' :
  187. { countG++; break; }
  188. case 'C' :
  189. { countC++; break; }
  190. }
  191. }
  192. //out<<"Count A :"<<countA <<" Frame count :"<<k<<endl;
  193. //out<<"Count T :"<<countT <<" Frame count :"<<k<<endl;
  194. //out<<"Count G :"<<countG <<" Frame count :"<<k<<endl;
  195. //out<<"Count C :"<<countC <<" Frame count :"<<k<<endl;
  196. sq1 = 0.0+ pow(countA,2)+pow(countG,2) + pow(countT,2) + pow(countC,2);
  197. // for each Nucleotide. A T G C
  198. for(int i = 0; i <4 ; i ++ )
  199. {
  200. s1= complex<double>(0.0,0.0);
  201. for (int bi=0; bi<lenWindow; bi++)
  202. {
  203. if(strSeq[k+bi] == nuclt[i] )
  204. s1= s1+expval[bi];
  205. }
  206. //cout<<s1<<endl;
  207. s2 = s2+ (abs(s1)*abs(s1));
  208. }
  209. // to calculater power peak
  210. sq = sq1*li*li*li;
  211. sbar =(winsq-sq);
  212. sq = sbar * (lenWindow/2.0);
  213. // to calcuater signal to noise ratio
  214. s2 = s2/(sq*lenWindow*lenWindow);
  215. p= s2/sbar;
  216. s[framecount]=k+(lenWindow/2);
  217. x[framecount]=p;
  218. //out<<s[framecount]<<" "<<x[framecount]<<"\n";
  219. framecount++;
  220. } // End of Outer for loop(Frame wise)
  221. //return(0);
  222. //}
  223. // end of Ft function..
  224. //void WindowAnalysis::findCodingRegion(long seqfileId , const char * seqId)
  225. //{
  226. long *CRegion = new long[framecount+1];
  227. int flag=0;
  228. long i =0 , m =0;
  229. for (m =0; m<framecount; m++)
  230. {
  231. if(x[m]>=thersHold)
  232. {
  233. if(flag == 0)
  234. {
  235. CRegion[i] =s[m];
  236. //out<< "\n START :" <<s[m]<<" : "<<x[m];
  237. //i++;
  238. flag =1;
  239. }
  240. }
  241. else
  242. {
  243. if(flag==1)
  244. {
  245. if(x[m]<thersHold)
  246. {
  247. //out<< "\n START :" <<CRegion[i];
  248. //out<<"\t\tEND: "<<s[m-1]<<" : "<<x[m-1];
  249. i++;
  250. CRegion[i]=s[m-1];
  251. i++;
  252. }
  253. flag=0;
  254. }
  255. } // end of if.
  256. }// End Coding region loop.
  257. if (flag==1)
  258. {
  259. //out<< "\n START :" <<CRegion[i];
  260. //out<<"\t\tEND: "<<s[m-1]<<" : "<<x[m-1];
  261. i++;
  262. CRegion[i]=s[m-1];
  263. i++;
  264. }
  265. //delete []CRegion;
  266. // that data is deleted 07/02/02 To inprove performance.
  267. //out<<"\n Final Coding Regions :"<<endl;
  268. flag =0;
  269. int code =0;
  270. long sta=0 , stp =0;
  271. structCR obj;
  272. obj.type=' ';
  273. for(int j=0 ; j<i ; j +=2)
  274. {
  275. if(flag ==0)
  276. {
  277. sta = CRegion[j];
  278. stp = CRegion[j+1];
  279. if ( (stp-sta)>= exonCutOff ) code =1; else code =0;
  280. flag = 1 ;
  281. }
  282. else
  283. {
  284. if( (CRegion[j]-CRegion[j-1]) >=intronCutOff )
  285. {
  286. if(code ==1)
  287. {
  288. //out <<"\nStart :"<<sta <<" End :" << stp;
  289. // inset in database start new set.
  290. obj.start=sta;
  291. obj.end=stp;
  292. vecCRegion.push_back(obj);
  293. //insetcodes( sta , stp,seqfileId,seqId);
  294. sta = CRegion[j];
  295. stp = CRegion[j+1];
  296. if ( (stp-sta)>= exonCutOff ) code =1; else code =0; flag = 1 ;
  297. }
  298. else
  299. {
  300. //cout << forget the old set.
  301. if ((stp-sta)>= exonCutOff )
  302. {
  303. //out <<"\nStart :"<<sta <<" End :" << stp;
  304. obj.start=sta;
  305. obj.end=stp;
  306. vecCRegion.push_back(obj);
  307. //insetcodes( sta , stp,seqfileId,seqId);
  308. }
  309. // may need to be commented later.
  310. sta = CRegion[j];
  311. stp = CRegion[j+1];
  312. if ( (stp-sta)>= exonCutOff ) code =1; else code =0;
  313. flag = 1 ;
  314. }
  315. }
  316. else
  317. {
  318. if( ( CRegion[j+1]-CRegion[j] ) >= exonCutOff ) code =1;
  319. stp = CRegion[j+1];
  320. flag =1;
  321. }
  322. }
  323. }
  324. if ((code == 1) || ((stp-sta) >exonCutOff) )
  325. {
  326. //out <<"\nStart :"<<sta <<" End :" << stp;
  327. obj.start=sta;
  328. obj.end=stp;
  329. vecCRegion.push_back(obj);
  330. }
  331. //delete []CRegion;
  332. //CRegion=NULL;
  333. vector<structCR>::iterator it;
  334. char * strtemp=NULL;
  335. for(it =vecCRegion.begin(); it !=vecCRegion.end(); it++)
  336. {
  337. //long start =it->start;
  338. //long end =it->end;
  339. strtemp = new char[( (it->end - it->start) +1)];
  340. memset(strtemp , '\0' ,(it->end - it->start) +1 );
  341. strncpy( strtemp , &strSeq[it->start] ,(it->end - it->start) );
  342. //strncpy( strtemp , &strSeq[start] , end -start);
  343. //printf("Length : %d \n ", strlen(strtemp));
  344. //printf("start %ld , End %d length %d\n " , it->start ,it->end , (it->end - it->start));
  345. //fflush(stdout);
  346. delete []strtemp;
  347. strtemp = NULL;
  348. }
  349. //store the coding regions in an array
  350. //int elsize;
  351. //elsize=vecCRegion.size();
  352. delete []strtemp;
  353. strtemp = NULL;
  354. //delete []CRegion;
  355. //CRegion=NULL;
  356. //delete []s ;
  357. //delete []x ;
  358. return(0);
  359. } // end of Coding region finding.
  360. // to find exponential positional values to window size .
  361. int WindowAnalysis::findexp(int WindowSize)
  362. {
  363. complex<double> i1(0.0,1.0);
  364. const double PI = 3.14159274;
  365. const double F3 = 1.000/3;
  366. for(int k=0; k<WindowSize; k++)
  367. {
  368. expval[k] = exp(-i1*2.0*PI*F3*(k+1.0));
  369. //cout<<cos(-1*2.0*PI*F3*(k+1.0))<<endl;
  370. }
  371. return(0);
  372. }
  373. // end Window Analysis code
  374. class ftcds
  375. {
  376. WindowAnalysis *winobj, *winobj1;
  377. FILE *fp1,*fp2,*fp3,*fpl,*fpc;
  378. char *seq ,*seqs;
  379. long begpos,endpos;
  380. public:
  381. void doGenscan(char *,char*,int,int,int,int,int);
  382. ftcds()
  383. {
  384. //seq = NULL;
  385. winobj = new WindowAnalysis();
  386. winobj1 = new WindowAnalysis();
  387. }
  388. ~ftcds()
  389. {
  390. delete [] seq;
  391. seq = NULL;
  392. //cout<<"Destructor of ftcds"<<endl;
  393. delete winobj;
  394. delete winobj1;
  395. }
  396. };
  397. void ftcds::doGenscan(char *inputfile,char *outfile,int winlen,int thold,int pcity,int exoncut,int intcut)
  398. {
  399. //variable where the sequence will be stored
  400. int seqno=1;
  401. //char fname[50];
  402. char ch,ch1;
  403. char *seqname;
  404. int snin;
  405. vector<structCR>::iterator it1;
  406. fp1=fopen(inputfile,"r");
  407. if(fp1==NULL)
  408. {
  409. cout<<"File doesn't exist"<<endl;
  410. exit(1);
  411. }
  412. //create a temporary file to store the sequence without the newline characters
  413. fpc=fopen(outfile,"w");
  414. if(fpc==NULL)
  415. {
  416. cout<<"Could not open output file"<<endl;
  417. exit(1);
  418. }
  419. ch=fgetc(fp1);
  420. while(ch!=EOF)
  421. {
  422. fp2=fopen("tmpseq.tmp","w+");
  423. if(fp2==NULL)
  424. {
  425. cout<<"Could not create temporary file"<<endl;
  426. exit(1);
  427. }
  428. do
  429. {
  430. //check for the initial '>'symbol put the characters after the next line
  431. if(ch=='>')
  432. {
  433. begpos=ftell(fp1);
  434. while(ch!='\n')
  435. {
  436. ch=fgetc(fp1);
  437. }
  438. endpos= ftell(fp1);
  439. seqname=new char[endpos -begpos + 2];
  440. memset(seqname,'\0',endpos-begpos +2);
  441. fseek(fp1,begpos,SEEK_SET);
  442. snin=0;
  443. while(true)
  444. {
  445. ch=fgetc(fp1);
  446. if(ch=='\n')
  447. {
  448. ch=fgetc(fp1);
  449. break;
  450. }
  451. seqname[snin]=ch;
  452. snin++;
  453. }//end ofwhile
  454. }//end of if
  455. if(ch!='\n')
  456. fputc(ch,fp2);
  457. ch=fgetc(fp1);
  458. if(ch==EOF)
  459. {
  460. break;
  461. }
  462. }while(ch!='>');//end of while
  463. endpos=ftell(fp2);
  464. //int endpos=begpos;
  465. seq=new char[endpos + 2];
  466. memset(seq,'\0',endpos + 2);
  467. begpos=0;
  468. fseek(fp2,begpos,SEEK_SET);
  469. ch1=fgetc(fp2);
  470. int seqin=0;
  471. while(ch1!=EOF)
  472. {
  473. seq[seqin]=ch1;
  474. seqin++;
  475. ch1=fgetc(fp2);
  476. }
  477. winobj->setWindowLen(winlen);
  478. winobj->setpcity(pcity);
  479. winobj->settherHold(thold*1.0);
  480. winobj->setIntronExon(exoncut,intcut);
  481. //cout<<"Length:"<<strlen(seq)<<endl;
  482. winobj->doFt(seq,"log");
  483. /*for(it1=winobj->vecCRegion.begin();it1!=winobj->vecCRegion.end();it1++)
  484. {
  485. // it1=winobj->vecCRegion.begin();
  486. cout<<it1->start<<" "<<it1->end<<endl;
  487. }*/
  488. // cout<<"\n"<<endl;
  489. for(it1=winobj->vecCRegion.begin();it1!=winobj->vecCRegion.end();it1++)
  490. {
  491. unsigned long actstartpos=it1->start - ((winlen/2)-1);
  492. unsigned long actendpos=it1->end;
  493. if(actendpos>strlen(seq))
  494. actendpos=strlen(seq);
  495. seqs=new char[actendpos -actstartpos +2];
  496. memset(seqs,'\0',actendpos -actstartpos +2);
  497. int j=0;
  498. for(int i=actstartpos -1;i<actendpos;i++,j++)
  499. seqs[j]=seq[i];
  500. int len=(strlen(seqs)/3)*3;
  501. //cout<<"Length:"<<len<<endl;
  502. winobj1->setWindowLen(len);
  503. winobj1->setpcity(pcity);
  504. winobj1->settherHold(thold*1.0);
  505. winobj1->doFt(seqs,"log");
  506. fprintf(fpc,">%s [%d - %d] fourier-CDS%d Length-%d P2NR-%0.4f\n",seqname,actstartpos,actendpos,seqno,(actendpos-actstartpos)+1,winobj1->x[0]);
  507. fprintf(fpc,"%s\n",seqs);
  508. printf(".");
  509. seqno++;
  510. }//end of for
  511. printf("\n");
  512. fclose(fp2);
  513. remove("tmpseq.tmp");
  514. winobj->vecCRegion.clear();
  515. delete []seq;
  516. seq = NULL;
  517. }//end of while
  518. cout<<"\nFourier transform for file: "<<inputfile<<"\nResults written to file : "<<outfile<<" (Multiple Sequence Fasta Format)\n\n"<<endl;
  519. fclose(fpc);
  520. }//end of main function
  521. int main(int argc, char* argv[])
  522. {
  523. ftcds *Ch = new ftcds();
  524. int pcity;
  525. int excut,incut;
  526. int winlen,thold;
  527. char *infile,*outfile;
  528. if(argc!=5)
  529. {
  530. cout<<"Usage:[filename] -i [inputfile] -o [outputfile]"<<endl;
  531. exit(1);
  532. }
  533. for(int s=1;s<argc;s++)
  534. {
  535. if(strcmp(argv[s],"-i")==0)
  536. {
  537. infile=new char[strlen(argv[s +1])+1];
  538. strcpy(infile,argv[++s]);
  539. }
  540. if(strcmp(argv[s],"-o")==0)
  541. {
  542. outfile=new char[strlen(argv[s +1])+1];
  543. strcpy(outfile,argv[++s]);
  544. }
  545. }//end of for
  546. Ch->doGenscan(infile,outfile,100,4,3,100,100);
  547. //Ch->doGenscan("amrna.txt","amrnaout.txt",300,4,3,50,50);
  548. delete Ch;
  549. return 0;
  550. }//end main