/GeneR/GeneR/src/makeIndex.cc
C++ | 512 lines | 356 code | 92 blank | 64 comment | 157 complexity | 3ea9ad7e66e841860193a4b845e03eba MD5 | raw file
Possible License(s): LGPL-2.0
- /*! \file makeIndex.cc
- *
- * \date Created : 08/07/04
- * \date Last Modified : Time-stamp: <2006-03-08 17:31:07 lucas>
- *
- * For every sequence file we build an index file for fast access to a
- * specific sequence. An index file is a table with 4 columns
- * defined as follows:
- * accno deb_feature deb_sequence length_sequence
- *
- * \brief make index for Fasta embl and Genbank file
- * \version 1
- * \author A. Lucas
- * \note Licence: CeCILL
- */
- #include <string>
- #include <stdio.h>
- #include <stdlib.h>
- #include "makeIndex.h"
- #include "GeneR_globals.h"
- #include <Rinternals.h>
- #include <Rdefines.h>
- extern "C"
- {
- void ixfasta(char **path,int *res) ;
- void ixgbk(char **path,int *res) ;
- void ixembl(char **path,char **index,int *res) ;
- void delete_CR_infile (char ** filein , char ** fileout, int * err);
- }
- /** @brief Make an index for a Fasta file
- *
- *
- * @param path File path
- * @param res Integer return -1 if error; 1 if ok; -2 if "\r" found;
- * -3 if accno too long -4 file not found;
- */
- void ixfasta(char **path,int *res)
- {
- FILE *fichierin;
- FILE *fichierout;
-
- char * pathout;
- char c;
- int i,compteur_accno,ecriture,len,header;
- int deb_sequence,len_feature;
- len = strlen(*path);
- *res = -1;
- pathout = (char * ) malloc ( (len + 4)* sizeof(char) );
- strcpy(pathout,*path);
- pathout[len]='.';
- pathout[len+1]='i';
- pathout[len+2]='x';
- pathout[len+3]='\0';
- fichierin=fopen(*path,"r");
- fichierout=fopen(pathout,"w");
- if ((fichierin == NULL ) || (fichierout == NULL))
- {
- fprintf(stdout, "GeneR.so: error while opening file\n");
- *res = -4;
- return ;
- }
- /* On se place au debut du fichier */
- /* rewind(fichierin);*/
- i=0;
- ecriture=0;
- deb_sequence=0;
- header=0;
- len_feature = 0;
- compteur_accno=0;
- while((c = fgetc(fichierin)) != EOF)
- {
- i++;
- if(c == '\r')
- *res = -2 ;
- /* On traite l'ent?te (header ou feature) */
- if(header)
- {
- len_feature++;
- if(c == '\n')
- {
- header=0;
- ecriture=0;
- }
- if(c == ' ')
- ecriture=0;
- if(c == '\r')
- {
- header=0;
- ecriture=0;
- }
- if(c == '\t')
- ecriture=0;
- if(c == 10)
- ecriture=0;
- if(( (compteur_accno) > MAX_LEN_ACCNO) && (ecriture))
- {
- *res = -3;
- ecriture=0;
- }
- if(ecriture)
- {
- compteur_accno++;
- fputc(c,fichierout);
- }
- }
- if(c == '>' && (!header))
- {
- if(i>1)
- makeIndex::ixecritureligne(deb_sequence,compteur_accno,len_feature,i-deb_sequence,fichierout);
- deb_sequence = i;
- compteur_accno = 0;
- ecriture=1;
- header = 1;
- len_feature = 0;
- }
- }
- makeIndex::ixecritureligne(deb_sequence,compteur_accno,len_feature,i-deb_sequence+1,fichierout);
- fclose(fichierin);
- fclose(fichierout);
- if(*res == -1)
- *res = 1;
- }
- /** @brief Make an index for a GenBank file
- *
- *
- * @param path File path
- * @param res Integer return -1 if error; 1 if ok; -2 if "\r" found;
- * -3 if accno too long, -4 file not found;
- */
- void ixgbk(char **path,int *res)
- {
- FILE *fichierin;
- FILE *fichierout;
-
- char * pathout;
- char c,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9;
- int i,compteur_accno,ecriture,len,header;
- int deb_sequence,len_feature;
- *res = -1;
- len = strlen(*path);
- pathout =(char * ) malloc ( (len + 4)* sizeof(char) );
- strcpy(pathout,*path);
- pathout[len]='.';
- pathout[len+1]='i';
- pathout[len+2]='x';
- pathout[len+3]='\0';
- c=c0=c1=c2=c3=c4=c5=c6=c7=c8=c9=0;
- fichierin=fopen(*path,"r");
- fichierout=fopen(pathout,"w");
- if ((fichierin == NULL ) || (fichierout == NULL))
- {
- fprintf(stdout, "GeneR.so: error while opening file\n");
- *res = -4;
- return ;
- }
- /* On se place au debut du fichier */
- /* rewind(fichierin);*/
- i=0;
- ecriture=0;
- deb_sequence=0;
- header=1;
- len_feature = 0;
- compteur_accno=0;
- while((c = fgetc(fichierin)) != EOF)
- {
- if(c == '\r')
- *res = -2;
- c9=c8;
- c8=c7;
- c7=c6;
- c6=c5;
- c5=c4;
- c4=c3;
- c3=c2;
- c2=c1;
- c1=c0;
- c0=c;
- i++;
- /* On traite l'ent?te (header ou feature) */
- if(header)
- {
- len_feature++;
- if(((c6 == '\n')|| (c6 == '\r') ) && (c5 == 'O')&& (c4 == 'R')&& (c3 == 'I')&& (c2 == 'G')&& (c1 == 'I')&& (c == 'N') )
- {
- while((((c = fgetc(fichierin))!='\n') && (c != '\r') ) && c != EOF)
- {
- i++;
- len_feature++;
- }
- header=0;
- i++;
- len_feature++;
- }
- if(((c9 == '\n') || (i < 10) || (c9 == '\r') ) && (c8 == 'A')&& (c7 == 'C')&& (c6 == 'C')&& (c5 == 'E')&& (c4 == 'S')&& (c3 == 'S') && (c2 == 'I')&& (c1 == 'O')&& (c0 == 'N') )
- {
- ecriture=1;
- while((c = fgetc(fichierin))==' ')
- {
- i++;
- len_feature++;
- }
- i++;
- len_feature++;
- compteur_accno=0;
-
- }
- if(ecriture)
- {
- if((c == ':') || (c == '\n') || (c == '\r') || (c == ' '))
- ecriture=0;
- else
- {
- fputc(c,fichierout);
- compteur_accno++;
- }
- if(compteur_accno > MAX_LEN_ACCNO)
- {
- ecriture=0;
- *res= -3;
- }
- }
-
- }
- if(((c2 == '\n') || (c2 == '\r')) && (c1 == '/') && (c0 == '/') )
- {
-
- while((((c = fgetc(fichierin))!='\n') && (c != '\r') )&& c != EOF)
- {
- i++;
- }
- i++;
- if(i>2)
- makeIndex::ixecritureligne(deb_sequence+1,compteur_accno,len_feature-1,i-deb_sequence,fichierout);
- deb_sequence = i;
- compteur_accno = 0;
- ecriture=0;
- header = 1;
- len_feature = 0;
- }
- }
- if(!header)
- makeIndex::ixecritureligne(deb_sequence+1,compteur_accno,len_feature-1,i-deb_sequence,fichierout);
- fclose(fichierin);
- fclose(fichierout);
- if(*res == -1)
- *res = 1;
- }
- /** @brief Make an index for a EMBL file
- * @param path File path
- * @param index "d" or "x" if index file is ".ix" or ".id"
- * @param res Integer return -1 if error; 1 if ok; -2 if "\r" found;
- * -3 if accno too long, -4 file not found;
- */
- void ixembl(char **path,char **index,int *res)
- {
- FILE *fichierin;
- FILE *fichierout;
-
- char * pathout;
- char c,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9;
- int i,compteur_accno,ecriture,len,header;
- int deb_sequence,len_feature;
- *res = -1;
- len = strlen(*path);
- pathout =(char * ) malloc ( (len + 4)* sizeof(char) );
- strcpy(pathout,*path);
- pathout[len]='.';
- pathout[len+1]='i';
- pathout[len+2]= **index;
- pathout[len+3]='\0';
- c=c0=c1=c2=c3=c4=c5=c6=c7=c8=c9=0;
- fichierin=fopen(*path,"r");
- fichierout=fopen(pathout,"w");
- if ((fichierin == NULL ) || (fichierout == NULL))
- {
- fprintf(stdout, "GeneR.so: error while opening file\n");
- *res = -4;
- return ;
- }
- /* On se place au debut du fichier */
- /* rewind(fichierin);*/
- i=0;
- ecriture=0;
- deb_sequence=0;
- header=1;
- len_feature = 0;
- compteur_accno=0;
- while((c = fgetc(fichierin)) != EOF)
- {
- if(c == '\r')
- *res = -2;
- c9=c8;
- c8=c7;
- c7=c6;
- c6=c5;
- c5=c4;
- c4=c3;
- c3=c2;
- c2=c1;
- c1=c0;
- c0=c;
- i++;
- /* On traite l'ent?te (header ou feature) */
- if(header)
- {
- len_feature++;
- if(((c5 == '\n') || (c5 == '\r'))&& (c4 == 'S')&&(c3 == 'Q')&& (c2 == ' ')&& (c1 == ' ')&& (c0 == ' ') )
- {
- while((((c = fgetc(fichierin))!='\n') && (c != '\r') ) && c != EOF)
- {
- i++;
- len_feature++;
- }
- header=0;
- i++;
- len_feature++;
- }
- if(((c6 == '\n')||(c6 == '\r')|| (i<7) )&& (c5 == 'A')&&(c4 == 'C')&& (c3 == ' ')&& (c2 == ' ')&& (c1 == ' ') )
- {
- ecriture=1;
- compteur_accno=0;
- }
- if(ecriture)
- {
- if((c == ':') || (c == ';') || (c == '\n') || (c == '\r') || (c == ' '))
- ecriture=0;
- else
- {
- fputc(c,fichierout);
- compteur_accno++;
- }
- if(compteur_accno > MAX_LEN_ACCNO)
- {
- *res = -3;
- ecriture=0;
- }
- }
-
- }
- if(((c2 == '\n') ||(c2 == '\r')) && (c1 == '/') && (c0 == '/') )
- {
-
- while((((c = fgetc(fichierin))!='\n') && (c != '\r'))&& c != EOF)
- {
- i++;
- }
- i++;
- if(i>2)
- makeIndex::ixecritureligne(deb_sequence+1,compteur_accno,len_feature-1,i-deb_sequence,fichierout);
- deb_sequence = i;
- compteur_accno = 0;
- ecriture=0;
- header = 1;
- len_feature = 0;
- }
- }
- if(!header)
- makeIndex::ixecritureligne(deb_sequence+1,compteur_accno,len_feature-1,i-deb_sequence,fichierout);
- fclose(fichierin);
- fclose(fichierout);
- if(*res == -1)
- *res = 1;
- }
/**
 * Delete carriage returns in a file.
 * \brief Copy filein to fileout, converting every line ending to "\n":
 * a "\r\n" pair becomes a single "\n" (the "\r" is dropped) and a "\r"
 * that is not followed by "\n" becomes "\n". All other bytes are copied
 * unchanged.
 * \param filein,fileout paths to the input and output files
 * \param err -1 if a file could not be opened, else 1
 */
void delete_CR_infile (char ** filein , char ** fileout, int * err)
{
    /* Open the input first: a missing input must not create or truncate
       the output file. */
    FILE *fichierin = fopen(*filein,"rb");
    if (fichierin == NULL)
    {
        *err = -1;
        return ;
    }
    FILE *fichierout = fopen(*fileout,"wb");
    if (fichierout == NULL)
    {
        fclose(fichierin);
        *err = -1;
        return ;
    }

    int c;   /* int, not char: EOF must stay distinct from byte 0xFF */
    while ((c = getc(fichierin)) != EOF)
    {
        if (c == '\r')
        {
            /* "\r" alone and "\r\n" both become one "\n" */
            fputc('\n',fichierout);

            /* Swallow the "\n" of a "\r\n" pair. Any other byte is pushed
               back so that it goes through the loop again; this is what
               makes the second "\r" of "\r\r" be converted too. */
            const int next = getc(fichierin);
            if (next != '\n' && next != EOF)
                ungetc(next,fichierin);
            continue;
        }

        fputc(c,fichierout);
    }

    fclose(fichierin);
    fclose(fichierout);
    *err = 1;
}
- namespace makeIndex{
- /** @brief Internal C function to write the end of a line in the index
- * \note The accno should be written, function add spaces and all numbers
- * to the current line of the file
- *
- *
- * @param deb_sequence Begining of Sequence
- * @param compteur_accno Lenght of Accno
- * @param len_feature Length of features
- * @param len_sequence Length of sequence
- * @param fichierout File out
- */
- int ixecritureligne(int deb_sequence,int compteur_accno,
- int len_feature,int len_sequence,
- FILE * fichierout)
- {
- int i;
- for(i=compteur_accno; i<= MAX_LEN_ACCNO ; i++)
- fputc(' ',fichierout);
-
- return(fprintf(fichierout,"%10d %10d %8d\n",deb_sequence-1,
- len_feature+deb_sequence,len_sequence));
- }
- }