PageRenderTime 60ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/GeneR/GeneR/src/makeIndex.cc

#
C++ | 512 lines | 356 code | 92 blank | 64 comment | 157 complexity | 3ea9ad7e66e841860193a4b845e03eba MD5 | raw file
Possible License(s): LGPL-2.0
  1. /*! \file makeIndex.cc
  2. *
  3. * \date Created : 08/07/04
  4. * \date Last Modified : Time-stamp: <2006-03-08 17:31:07 lucas>
  5. *
  6. * For all sequence files we build index file for fast access to a
  7. * specific sequence. Index files are a table with 4 columns
  8. * defined as follow:
  9. * accno deb_feature deb_sequence length_sequence
  10. *
  11. * \brief make index for Fasta embl and Genbank file
  12. * \version 1
  13. * \author A. Lucas
  14. * \note Licence: CeCIll
  15. */
  16. #include <string>
  17. #include <stdio.h>
  18. #include <stdlib.h>
  19. #include "makeIndex.h"
  20. #include "GeneR_globals.h"
  21. #include <Rinternals.h>
  22. #include <Rdefines.h>
  23. extern "C"
  24. {
  25. void ixfasta(char **path,int *res) ;
  26. void ixgbk(char **path,int *res) ;
  27. void ixembl(char **path,char **index,int *res) ;
  28. void delete_CR_infile (char ** filein , char ** fileout, int * err);
  29. }
  30. /** @brief Make an index for a Fasta file
  31. *
  32. *
  33. * @param path File path
  34. * @param res Integer return -1 if error; 1 if ok; -2 if "\r" found;
  35. * -3 if accno too long -4 file not found;
  36. */
  37. void ixfasta(char **path,int *res)
  38. {
  39. FILE *fichierin;
  40. FILE *fichierout;
  41. char * pathout;
  42. char c;
  43. int i,compteur_accno,ecriture,len,header;
  44. int deb_sequence,len_feature;
  45. len = strlen(*path);
  46. *res = -1;
  47. pathout = (char * ) malloc ( (len + 4)* sizeof(char) );
  48. strcpy(pathout,*path);
  49. pathout[len]='.';
  50. pathout[len+1]='i';
  51. pathout[len+2]='x';
  52. pathout[len+3]='\0';
  53. fichierin=fopen(*path,"r");
  54. fichierout=fopen(pathout,"w");
  55. if ((fichierin == NULL ) || (fichierout == NULL))
  56. {
  57. fprintf(stdout, "GeneR.so: error while opening file\n");
  58. *res = -4;
  59. return ;
  60. }
  61. /* On se place au debut du fichier */
  62. /* rewind(fichierin);*/
  63. i=0;
  64. ecriture=0;
  65. deb_sequence=0;
  66. header=0;
  67. len_feature = 0;
  68. compteur_accno=0;
  69. while((c = fgetc(fichierin)) != EOF)
  70. {
  71. i++;
  72. if(c == '\r')
  73. *res = -2 ;
  74. /* On traite l'ent?te (header ou feature) */
  75. if(header)
  76. {
  77. len_feature++;
  78. if(c == '\n')
  79. {
  80. header=0;
  81. ecriture=0;
  82. }
  83. if(c == ' ')
  84. ecriture=0;
  85. if(c == '\r')
  86. {
  87. header=0;
  88. ecriture=0;
  89. }
  90. if(c == '\t')
  91. ecriture=0;
  92. if(c == 10)
  93. ecriture=0;
  94. if(( (compteur_accno) > MAX_LEN_ACCNO) && (ecriture))
  95. {
  96. *res = -3;
  97. ecriture=0;
  98. }
  99. if(ecriture)
  100. {
  101. compteur_accno++;
  102. fputc(c,fichierout);
  103. }
  104. }
  105. if(c == '>' && (!header))
  106. {
  107. if(i>1)
  108. makeIndex::ixecritureligne(deb_sequence,compteur_accno,len_feature,i-deb_sequence,fichierout);
  109. deb_sequence = i;
  110. compteur_accno = 0;
  111. ecriture=1;
  112. header = 1;
  113. len_feature = 0;
  114. }
  115. }
  116. makeIndex::ixecritureligne(deb_sequence,compteur_accno,len_feature,i-deb_sequence+1,fichierout);
  117. fclose(fichierin);
  118. fclose(fichierout);
  119. if(*res == -1)
  120. *res = 1;
  121. }
  122. /** @brief Make an index for a GenBank file
  123. *
  124. *
  125. * @param path File path
  126. * @param res Integer return -1 if error; 1 if ok; -2 if "\r" found;
  127. * -3 if accno too long, -4 file not found;
  128. */
  129. void ixgbk(char **path,int *res)
  130. {
  131. FILE *fichierin;
  132. FILE *fichierout;
  133. char * pathout;
  134. char c,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9;
  135. int i,compteur_accno,ecriture,len,header;
  136. int deb_sequence,len_feature;
  137. *res = -1;
  138. len = strlen(*path);
  139. pathout =(char * ) malloc ( (len + 4)* sizeof(char) );
  140. strcpy(pathout,*path);
  141. pathout[len]='.';
  142. pathout[len+1]='i';
  143. pathout[len+2]='x';
  144. pathout[len+3]='\0';
  145. c=c0=c1=c2=c3=c4=c5=c6=c7=c8=c9=0;
  146. fichierin=fopen(*path,"r");
  147. fichierout=fopen(pathout,"w");
  148. if ((fichierin == NULL ) || (fichierout == NULL))
  149. {
  150. fprintf(stdout, "GeneR.so: error while opening file\n");
  151. *res = -4;
  152. return ;
  153. }
  154. /* On se place au debut du fichier */
  155. /* rewind(fichierin);*/
  156. i=0;
  157. ecriture=0;
  158. deb_sequence=0;
  159. header=1;
  160. len_feature = 0;
  161. compteur_accno=0;
  162. while((c = fgetc(fichierin)) != EOF)
  163. {
  164. if(c == '\r')
  165. *res = -2;
  166. c9=c8;
  167. c8=c7;
  168. c7=c6;
  169. c6=c5;
  170. c5=c4;
  171. c4=c3;
  172. c3=c2;
  173. c2=c1;
  174. c1=c0;
  175. c0=c;
  176. i++;
  177. /* On traite l'ent?te (header ou feature) */
  178. if(header)
  179. {
  180. len_feature++;
  181. if(((c6 == '\n')|| (c6 == '\r') ) && (c5 == 'O')&& (c4 == 'R')&& (c3 == 'I')&& (c2 == 'G')&& (c1 == 'I')&& (c == 'N') )
  182. {
  183. while((((c = fgetc(fichierin))!='\n') && (c != '\r') ) && c != EOF)
  184. {
  185. i++;
  186. len_feature++;
  187. }
  188. header=0;
  189. i++;
  190. len_feature++;
  191. }
  192. if(((c9 == '\n') || (i < 10) || (c9 == '\r') ) && (c8 == 'A')&& (c7 == 'C')&& (c6 == 'C')&& (c5 == 'E')&& (c4 == 'S')&& (c3 == 'S') && (c2 == 'I')&& (c1 == 'O')&& (c0 == 'N') )
  193. {
  194. ecriture=1;
  195. while((c = fgetc(fichierin))==' ')
  196. {
  197. i++;
  198. len_feature++;
  199. }
  200. i++;
  201. len_feature++;
  202. compteur_accno=0;
  203. }
  204. if(ecriture)
  205. {
  206. if((c == ':') || (c == '\n') || (c == '\r') || (c == ' '))
  207. ecriture=0;
  208. else
  209. {
  210. fputc(c,fichierout);
  211. compteur_accno++;
  212. }
  213. if(compteur_accno > MAX_LEN_ACCNO)
  214. {
  215. ecriture=0;
  216. *res= -3;
  217. }
  218. }
  219. }
  220. if(((c2 == '\n') || (c2 == '\r')) && (c1 == '/') && (c0 == '/') )
  221. {
  222. while((((c = fgetc(fichierin))!='\n') && (c != '\r') )&& c != EOF)
  223. {
  224. i++;
  225. }
  226. i++;
  227. if(i>2)
  228. makeIndex::ixecritureligne(deb_sequence+1,compteur_accno,len_feature-1,i-deb_sequence,fichierout);
  229. deb_sequence = i;
  230. compteur_accno = 0;
  231. ecriture=0;
  232. header = 1;
  233. len_feature = 0;
  234. }
  235. }
  236. if(!header)
  237. makeIndex::ixecritureligne(deb_sequence+1,compteur_accno,len_feature-1,i-deb_sequence,fichierout);
  238. fclose(fichierin);
  239. fclose(fichierout);
  240. if(*res == -1)
  241. *res = 1;
  242. }
  243. /** @brief Make an index for a EMBL file
  244. * @param path File path
  245. * @param index "d" or "x" if index file is ".ix" or ".id"
  246. * @param res Integer return -1 if error; 1 if ok; -2 if "\r" found;
  247. * -3 if accno too long, -4 file not found;
  248. */
  249. void ixembl(char **path,char **index,int *res)
  250. {
  251. FILE *fichierin;
  252. FILE *fichierout;
  253. char * pathout;
  254. char c,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9;
  255. int i,compteur_accno,ecriture,len,header;
  256. int deb_sequence,len_feature;
  257. *res = -1;
  258. len = strlen(*path);
  259. pathout =(char * ) malloc ( (len + 4)* sizeof(char) );
  260. strcpy(pathout,*path);
  261. pathout[len]='.';
  262. pathout[len+1]='i';
  263. pathout[len+2]= **index;
  264. pathout[len+3]='\0';
  265. c=c0=c1=c2=c3=c4=c5=c6=c7=c8=c9=0;
  266. fichierin=fopen(*path,"r");
  267. fichierout=fopen(pathout,"w");
  268. if ((fichierin == NULL ) || (fichierout == NULL))
  269. {
  270. fprintf(stdout, "GeneR.so: error while opening file\n");
  271. *res = -4;
  272. return ;
  273. }
  274. /* On se place au debut du fichier */
  275. /* rewind(fichierin);*/
  276. i=0;
  277. ecriture=0;
  278. deb_sequence=0;
  279. header=1;
  280. len_feature = 0;
  281. compteur_accno=0;
  282. while((c = fgetc(fichierin)) != EOF)
  283. {
  284. if(c == '\r')
  285. *res = -2;
  286. c9=c8;
  287. c8=c7;
  288. c7=c6;
  289. c6=c5;
  290. c5=c4;
  291. c4=c3;
  292. c3=c2;
  293. c2=c1;
  294. c1=c0;
  295. c0=c;
  296. i++;
  297. /* On traite l'ent?te (header ou feature) */
  298. if(header)
  299. {
  300. len_feature++;
  301. if(((c5 == '\n') || (c5 == '\r'))&& (c4 == 'S')&&(c3 == 'Q')&& (c2 == ' ')&& (c1 == ' ')&& (c0 == ' ') )
  302. {
  303. while((((c = fgetc(fichierin))!='\n') && (c != '\r') ) && c != EOF)
  304. {
  305. i++;
  306. len_feature++;
  307. }
  308. header=0;
  309. i++;
  310. len_feature++;
  311. }
  312. if(((c6 == '\n')||(c6 == '\r')|| (i<7) )&& (c5 == 'A')&&(c4 == 'C')&& (c3 == ' ')&& (c2 == ' ')&& (c1 == ' ') )
  313. {
  314. ecriture=1;
  315. compteur_accno=0;
  316. }
  317. if(ecriture)
  318. {
  319. if((c == ':') || (c == ';') || (c == '\n') || (c == '\r') || (c == ' '))
  320. ecriture=0;
  321. else
  322. {
  323. fputc(c,fichierout);
  324. compteur_accno++;
  325. }
  326. if(compteur_accno > MAX_LEN_ACCNO)
  327. {
  328. *res = -3;
  329. ecriture=0;
  330. }
  331. }
  332. }
  333. if(((c2 == '\n') ||(c2 == '\r')) && (c1 == '/') && (c0 == '/') )
  334. {
  335. while((((c = fgetc(fichierin))!='\n') && (c != '\r'))&& c != EOF)
  336. {
  337. i++;
  338. }
  339. i++;
  340. if(i>2)
  341. makeIndex::ixecritureligne(deb_sequence+1,compteur_accno,len_feature-1,i-deb_sequence,fichierout);
  342. deb_sequence = i;
  343. compteur_accno = 0;
  344. ecriture=0;
  345. header = 1;
  346. len_feature = 0;
  347. }
  348. }
  349. if(!header)
  350. makeIndex::ixecritureligne(deb_sequence+1,compteur_accno,len_feature-1,i-deb_sequence,fichierout);
  351. fclose(fichierin);
  352. fclose(fichierout);
  353. if(*res == -1)
  354. *res = 1;
  355. }
  /**
   * Delete carriage returns in a file.
   * \brief Copy filein to fileout, normalising line endings to "\n".
   *
   * Every "\r\n" pair and every "\r" not followed by "\n" is written as a
   * single "\n"; all other bytes are copied unchanged. Both files are opened
   * in binary mode. The input is opened first, so fileout is neither created
   * nor truncated when filein cannot be opened.
   *
   * \param filein,fileout paths to the files (pointers to NUL-terminated
   *        strings)
   * \param err set to -1 if a file could not be opened or the output could
   *        not be written, else 1
   */
  void delete_CR_infile (char ** filein , char ** fileout, int * err)
  {
    FILE *fichierin = fopen(*filein,"rb");
    if(fichierin == NULL)
    {
      *err = -1;
      return ;
    }
    FILE *fichierout = fopen(*fileout,"wb");
    if(fichierout == NULL)
    {
      fclose(fichierin);
      *err = -1;
      return ;
    }

    /* getc returns an int: keeping it in an int lets a 0xFF data byte be
       told apart from EOF. */
    int c;
    while((c = getc(fichierin)) != EOF)
    {
      if(c == '\r')
      {
        /* A "\r" always ends a line, whatever follows it. */
        fputc('\n',fichierout);
        /* Swallow the "\n" of a "\r\n" pair; any other byte (including
           another "\r") is pushed back to be handled by the next turn. */
        const int suivant = getc(fichierin);
        if((suivant != '\n') && (suivant != EOF))
          ungetc(suivant,fichierin);
        continue;
      }
      fputc(c,fichierout);
    }

    const bool erreur_ecriture = (ferror(fichierout) != 0);
    fclose(fichierin);
    /* fclose flushes the buffered output, so its result matters too. */
    const bool erreur_fermeture = (fclose(fichierout) != 0);
    *err = (erreur_ecriture || erreur_fermeture) ? -1 : 1;
  }
  398. namespace makeIndex{
  399. /** @brief Internal C function to write the end of a line in the index
  400. * \note The accno should be written, function add spaces and all numbers
  401. * to the current line of the file
  402. *
  403. *
  404. * @param deb_sequence Begining of Sequence
  405. * @param compteur_accno Lenght of Accno
  406. * @param len_feature Length of features
  407. * @param len_sequence Length of sequence
  408. * @param fichierout File out
  409. */
  410. int ixecritureligne(int deb_sequence,int compteur_accno,
  411. int len_feature,int len_sequence,
  412. FILE * fichierout)
  413. {
  414. int i;
  415. for(i=compteur_accno; i<= MAX_LEN_ACCNO ; i++)
  416. fputc(' ',fichierout);
  417. return(fprintf(fichierout,"%10d %10d %8d\n",deb_sequence-1,
  418. len_feature+deb_sequence,len_sequence));
  419. }
  420. }