PageRenderTime 55ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/gbfp.c

http://gbfp.googlecode.com/
C | 781 lines | 588 code | 151 blank | 42 comment | 124 complexity | 0e8b33a8256926f51342d5c48af5b53d MD5 | raw file
Possible License(s): GPL-2.0
  1. #define __EXTENSIONS__
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <regex.h>
  5. #include <unistd.h>
  6. #include <string.h>
  7. #include <ctype.h>
  8. #include <limits.h>
  9. #include <sys/types.h>
  10. #include "gbfp.h"
  11. const char sVer[] = "0.5.0";
  12. const char sNorBase[] = "ACGTRYMKWSBDHVNacgtrymkwsbdhvn";
  13. const char sComBase[] = "TGCAYRKMWSVHDBNtgcayrkmwsvhdbn";
  14. const unsigned int iBaseLen = 30;
  15. char sTempLine[LINELEN] = {'\0',};
  16. regex_t ptRegExLocus;
  17. regex_t ptRegExOneLine;
  18. regex_t ptRegExAccession;
  19. regex_t ptRegExVersion;
  20. regex_t ptRegExRegion;
  21. regex_t ptRegExGI;
  22. #define skipSpace( x ) for (; isspace(*x); x++)
  23. #define putLine( x ) strcpy(sTempLine, x)
  24. #define getLine_w_rtrim( x, y ) \
  25. getLine(x, y); \
  26. rtrim(x)
  27. static gb_string getLine(gb_string sLine, FILE *FSeqFile) {
  28. gb_string sReturn;
  29. if (*sTempLine != '\0') {
  30. sReturn = strcpy(sLine, sTempLine);
  31. *sTempLine = '\0';
  32. } else {
  33. sReturn = fgets(sLine, LINELEN, FSeqFile);
  34. }
  35. return sReturn;
  36. }
  37. static void rtrim(gb_string sLine) {
  38. register int i;
  39. for (i = (strlen(sLine) - 1); i >= 0; i--) {
  40. if (! isspace(sLine[i])) {
  41. sLine[++i] = '\0';
  42. break;
  43. }
  44. }
  45. }
  46. static void removeRChar(gb_string sLine, char cRemove) {
  47. register int i;
  48. for (i = (strlen(sLine) - 1); i >= 0; i--) {
  49. if (sLine[i] == cRemove) {
  50. sLine[i] = '\0';
  51. break;
  52. }
  53. }
  54. }
  55. static gb_string joinLines(FILE *FSeqFile, unsigned int iSpaceLen) {
  56. char sLine[LINELEN];
  57. gb_string sTemp, sJoinedLine;
  58. sJoinedLine = malloc(sizeof(char) * LINELEN);
  59. getLine_w_rtrim(sLine, FSeqFile);
  60. strcpy(sJoinedLine, sLine + iSpaceLen);
  61. while (fgets(sLine, LINELEN, FSeqFile)) {
  62. sTemp = sLine;
  63. skipSpace(sTemp);
  64. if ((sTemp - sLine) < iSpaceLen) break;
  65. rtrim(sTemp);
  66. sJoinedLine = strcat(sJoinedLine, sTemp - 1); /* '- 1' in order to insert a space character at the juncation */
  67. }
  68. putLine(sLine);
  69. return realloc(sJoinedLine, sizeof(char) * (strlen(sJoinedLine) + 1));
  70. }
  71. void initRegEx(void) {
  72. const char sLocus[] = "^LOCUS +([a-z|A-Z|0-9|_]+) +([0-9]+) bp +([a-z|A-Z|-]+) +([a-z]+) +([A-Z]{3}) (.+)";
  73. const char sOneLine[] = "^ *([A-Z]+) +(.+)";
  74. const char sAccession[] = "^ACCESSION +([a-z|A-Z|0-9|_]+) ?";
  75. const char sRegion[] = " +REGION: ?([0-9]+)\\.\\.([0-9]+)";
  76. const char sVersion[] = "^VERSION +([a-z|A-Z|0-9|_.]+) ?";
  77. const char sGI[] = " +GI: ?([0-9]+)";
  78. regcomp(&ptRegExLocus, sLocus, REG_EXTENDED | REG_ICASE);
  79. regcomp(&ptRegExOneLine, sOneLine, REG_EXTENDED | REG_ICASE);
  80. regcomp(&ptRegExAccession, sAccession, REG_EXTENDED | REG_ICASE);
  81. regcomp(&ptRegExVersion, sVersion, REG_EXTENDED | REG_ICASE);
  82. regcomp(&ptRegExRegion, sRegion, REG_EXTENDED | REG_ICASE);
  83. regcomp(&ptRegExGI, sGI, REG_EXTENDED | REG_ICASE);
  84. }
  85. static int Pos2Num(gb_string sPositions, int aiPositions[]) {
  86. register int i;
  87. int iNum = 0;
  88. for (i = strlen(sPositions); i >= 0; i--) {
  89. if (isdigit(*(sPositions + i))) aiPositions[(aiPositions[iNum] - 1 == i) ? iNum : ++iNum] = i;
  90. else *(sPositions + i) = '\0';
  91. }
  92. return iNum;
  93. }
  94. static int Positions2Numbers(gb_string sPositions, unsigned long *lStart, unsigned long *lEnd) {
  95. int aiPositions[16] = {-2,};
  96. int iNum;
  97. iNum = Pos2Num(sPositions, aiPositions);
  98. if (iNum == 2) {
  99. *lStart = atol(sPositions + aiPositions[2]);
  100. *lEnd = atol(sPositions + aiPositions[1]);
  101. return 1;
  102. } else if (iNum == 1) {
  103. *lStart = *lEnd = atol(sPositions + aiPositions[1]);
  104. return 1;
  105. } else {
  106. fprintf(stderr, "Warning: cannot parse '%s'\n", sPositions);
  107. return 0;
  108. }
  109. }
  110. static void parseLocus(gb_string sLocusStr, gb_data *ptGBData) {
  111. /*
  112. 01-05 'LOCUS'
  113. 06-12 spaces
  114. 13-28 Locus name
  115. 29-29 space
  116. 30-40 Length of sequence, right-justified
  117. 41-41 space
  118. 42-43 bp
  119. 44-44 space
  120. 45-47 spaces, ss- (single-stranded), ds- (double-stranded), or
  121. ms- (mixed-stranded)
  122. 48-53 NA, DNA, RNA, tRNA (transfer RNA), rRNA (ribosomal RNA),
  123. mRNA (messenger RNA), uRNA (small nuclear RNA), snRNA,
  124. snoRNA. Left justified.
  125. 54-55 space
  126. 56-63 'linear' followed by two spaces, or 'circular'
  127. 64-64 space
  128. 65-67 The division code (see Section 3.3)
  129. 68-68 space
  130. 69-79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
  131. */
  132. char sTemp[LINELEN];
  133. unsigned int i, iErr, iLen;
  134. regmatch_t ptRegMatch[7];
  135. struct tData {
  136. char cType;
  137. void *Pointer;
  138. } tDatas[] = {
  139. {STRING, NULL},
  140. {LONG, NULL},
  141. {STRING, NULL},
  142. {STRING, NULL},
  143. {STRING, NULL},
  144. {STRING, NULL}};
  145. tDatas[0].Pointer = ptGBData->sLocusName;
  146. tDatas[1].Pointer = &(ptGBData->lLength);
  147. tDatas[2].Pointer = ptGBData->sType;
  148. tDatas[3].Pointer = ptGBData->sTopology;
  149. tDatas[4].Pointer = ptGBData->sDivisionCode;
  150. tDatas[5].Pointer = ptGBData->sDate;
  151. rtrim(sLocusStr);
  152. if ((iErr = regexec(&ptRegExLocus, sLocusStr, 7, ptRegMatch, 0)) == 0) {
  153. for (i = 0; i < 6; i++) {
  154. iLen = ptRegMatch[i + 1].rm_eo - ptRegMatch[i + 1].rm_so;
  155. switch (tDatas[i].cType) {
  156. case STRING:
  157. memcpy(tDatas[i].Pointer, (sLocusStr + ptRegMatch[i + 1].rm_so), iLen);
  158. *((gb_string ) tDatas[i].Pointer + iLen) = '\0';
  159. break;
  160. case LONG:
  161. memcpy(sTemp, (sLocusStr + ptRegMatch[i + 1].rm_so), iLen);
  162. sTemp[iLen] = '\0';
  163. *((unsigned long *) tDatas[i].Pointer) = atol(sTemp);
  164. break;
  165. default:
  166. perror("Unknown Data Type!!!");
  167. }
  168. }
  169. } else {
  170. regerror(iErr, &ptRegExLocus, sTemp, LINELEN);
  171. fprintf(stderr, "%s\n", sTemp);
  172. exit(1);
  173. }
  174. }
  175. static gb_string checkComplement(gb_string sLocation) {
  176. gb_string sPosition;
  177. skipSpace(sLocation);
  178. for (sPosition = sLocation; *sPosition; sPosition++) {
  179. /* Check the 1st and the 2nd characters of 'complement' */
  180. if (*sPosition == 'c' && *(sPosition + 1) == 'o') {
  181. removeRChar(sLocation, ')');
  182. return sPosition + 11;
  183. }
  184. }
  185. return sLocation;
  186. }
  187. static gb_string checkJoin(gb_string sLocation) {
  188. gb_string sPosition;
  189. skipSpace(sLocation);
  190. for (sPosition = sLocation; *sPosition; sPosition++) {
  191. /* Check the 1st and the 2nd characters of 'complement' */
  192. if (*sPosition == 'j' && *(sPosition + 1) == 'o') {
  193. removeRChar(sLocation, ')');
  194. return sPosition + 5;
  195. }
  196. }
  197. return sLocation;
  198. }
  199. /* Parsing a gb_string that contains gb_location information */
  200. static void LocationParser(gb_string sLocation, gb_feature *pFeature) {
  201. gb_string sTemp;
  202. gb_string sString = NULL;
  203. unsigned int iLocationNum = 1;
  204. /* Evalue sequence direction
  205. sString has gb_location and join informations
  206. */
  207. sString = checkComplement(sLocation);
  208. if (sLocation == sString) pFeature->cDirection = NORMAL;
  209. else pFeature->cDirection = REVCOM;
  210. /* Remove 'join' gb_string
  211. sString has gb_location informations
  212. */
  213. sString = checkJoin(sString);
  214. sTemp = sString - 1;
  215. while((sTemp = strchr((sTemp + 1), ','))) iLocationNum++;
  216. pFeature->ptLocation = malloc(iLocationNum * sizeof(*(pFeature->ptLocation)));
  217. iLocationNum = 0;
  218. sLocation = strtok_r(sString, ",", &sTemp);
  219. if (Positions2Numbers(sLocation,
  220. &(((pFeature->ptLocation)+iLocationNum)->lStart),
  221. &(((pFeature->ptLocation)+iLocationNum)->lEnd)) == 1) iLocationNum++;
  222. while((sLocation = strtok_r(NULL, ",", &sTemp))) {
  223. if (Positions2Numbers(sLocation,
  224. &(((pFeature->ptLocation)+iLocationNum)->lStart),
  225. &(((pFeature->ptLocation)+iLocationNum)->lEnd))) iLocationNum++;
  226. }
  227. pFeature->lStart = (pFeature->ptLocation)->lStart;
  228. pFeature->lEnd = ((pFeature->ptLocation)+(iLocationNum - 1))->lEnd;
  229. pFeature->iLocationNum = iLocationNum;
  230. }
  231. static gb_string parseQualifier(gb_string sQualifier, gb_string *psValue) {
  232. gb_string sPosition;
  233. skipSpace(sQualifier);
  234. if ((sPosition = strchr(sQualifier, '=')) == NULL) {
  235. *psValue = sQualifier + strlen(sQualifier);
  236. return sQualifier;
  237. }
  238. *sPosition = '\0';
  239. sPosition++;
  240. skipSpace(sQualifier);
  241. if (*sPosition == '"') {
  242. sPosition++;
  243. removeRChar(sPosition, '"');
  244. }
  245. *psValue = sPosition;
  246. return sQualifier;
  247. }
  248. static void QualifierParser(gb_string sQualifier, gb_feature *pFeature) {
  249. gb_string sValue;
  250. gb_string sTemp = NULL;
  251. gb_string sString = NULL;
  252. gb_qualifier *ptQualifier;
  253. pFeature->ptQualifier = malloc(INITQUALIFIERNUM * sizeof(gb_qualifier));
  254. ptQualifier = pFeature->ptQualifier;
  255. /* Parse the 1st gb_qualifier gb_string */
  256. sString = strtok_r(sQualifier, "\n", &sTemp);
  257. sQualifier = parseQualifier(sString, &sValue);
  258. ptQualifier->sQualifier = sQualifier;
  259. ptQualifier->sValue = sValue;
  260. ptQualifier++;
  261. /* Parse the rest gb_qualifier gb_string */
  262. while((sString = strtok_r(NULL, "\n", &sTemp)) != NULL) {
  263. sQualifier = parseQualifier(sString, &sValue);
  264. ptQualifier->sQualifier = sQualifier;
  265. ptQualifier->sValue = sValue;
  266. ptQualifier++;
  267. }
  268. pFeature->iQualifierNum = ptQualifier - pFeature->ptQualifier;
  269. pFeature->ptQualifier = realloc(pFeature->ptQualifier, pFeature->iQualifierNum * sizeof(gb_qualifier));
  270. }
  271. static unsigned int SequenceParser(gb_string sSequence, gb_string sSequence2) {
  272. register unsigned int i = 0;
  273. register unsigned int j = 0;
  274. register char c;
  275. while((c = *(sSequence + i++)) != '\0')
  276. if (isalpha(c) != 0) *(sSequence2 + j++) = c;
  277. *(sSequence2 + j) = '\0';
  278. return j;
  279. }
  280. static void RevCom(gb_string sSequence) {
  281. char c;
  282. unsigned int k;
  283. unsigned long i, j;
  284. for (i = 0, j = strlen(sSequence) - 1; i < j; i++, j--) {
  285. c = *(sSequence + i);
  286. *(sSequence + i) = 'X';
  287. for (k = 0; k < iBaseLen; k++)
  288. if (*(sNorBase + k) == *(sSequence + j)) {
  289. *(sSequence + i) = *(sComBase + k);
  290. break;
  291. }
  292. *(sSequence + j) = 'X';
  293. for (k = 0; k < iBaseLen; k++)
  294. if (*(sNorBase + k) == c) {
  295. *(sSequence + j) = *(sComBase + k);
  296. break;
  297. }
  298. }
  299. }
  300. gb_string getSequence(gb_string sSequence, gb_feature *ptFeature) {
  301. unsigned long lSeqLen = 1; /* For the '\0' characher */
  302. unsigned long lStart, lEnd;
  303. unsigned int i;
  304. gb_string sSequenceTemp;
  305. for (i = 0; i < ptFeature->iLocationNum; i++)
  306. lSeqLen += (((ptFeature->ptLocation) + i)->lEnd - ((ptFeature->ptLocation) + i)->lStart + 1);
  307. sSequenceTemp = malloc(lSeqLen * sizeof(char));
  308. lSeqLen = 0;
  309. for (i = 0; i < ptFeature->iLocationNum; i++) {
  310. lStart = ((ptFeature->ptLocation) + i)->lStart;
  311. lEnd = ((ptFeature->ptLocation) + i)->lEnd;
  312. strncpy(sSequenceTemp + lSeqLen, sSequence + lStart - 1, lEnd - lStart + 1);
  313. lSeqLen += (lEnd - lStart + 1);
  314. }
  315. *(sSequenceTemp + lSeqLen) = '\0';
  316. if (ptFeature->cDirection == REVCOM) RevCom(sSequenceTemp);
  317. return sSequenceTemp;
  318. }
  319. static void parseFeature(FILE *FSeqFile, gb_data *ptGBData) {
  320. char sLine[LINELEN] = {'\0',};
  321. char sLocation[LINELEN] = {'\0',};
  322. gb_string sQualifier = NULL;
  323. gb_string sQualifierTemp = NULL;
  324. unsigned int iReadPos = INELSE;
  325. unsigned int iFeatureNum = 0;
  326. unsigned int iFeatureMem = INITFEATURENUM;
  327. unsigned int i = 0;
  328. gb_feature *pFeatures = NULL;
  329. gb_feature *pFeature = NULL;
  330. pFeatures = (gb_feature *) malloc(iFeatureMem * sizeof(gb_feature));
  331. /* Parse FEATURES */
  332. while(fgets(sLine, LINELEN, FSeqFile)) {
  333. if (! isspace(*sLine)) {
  334. putLine(sLine);
  335. break;
  336. }
  337. rtrim(sLine);
  338. if (memcmp(sLine + 5, " ", 15) != 0) {
  339. if (iFeatureNum == iFeatureMem) {
  340. iFeatureMem += INITFEATURENUM;
  341. pFeatures = realloc(pFeatures, sizeof(gb_feature) * iFeatureMem);
  342. }
  343. if (strlen(sLocation) != 0) LocationParser(sLocation, (pFeatures + iFeatureNum - 1));
  344. if (sQualifier < sQualifierTemp) {
  345. *sQualifierTemp++ = '\n';
  346. *sQualifierTemp = '\0';
  347. /* printf("=====\n%s=====", sQualifier); */
  348. sQualifier = realloc(sQualifier, (sQualifierTemp - sQualifier + 1) * sizeof(*sQualifier));
  349. QualifierParser(sQualifier, (pFeatures + iFeatureNum - 1));
  350. }
  351. *sLocation = '\0';
  352. sQualifier = malloc(sizeof(*sQualifier) * LINELEN);
  353. sQualifierTemp = sQualifier;
  354. iReadPos = INFEATURE;
  355. memcpy((pFeatures + iFeatureNum)->sFeature, (sLine + 5), 15);
  356. *(((pFeatures + iFeatureNum)->sFeature) + 15) = '\0';
  357. rtrim((pFeatures + iFeatureNum)->sFeature);
  358. strcpy(sLocation, (sLine + 21));
  359. /* Feature Initalize */
  360. pFeature = pFeatures + iFeatureNum;
  361. pFeature->iNum = iFeatureNum;
  362. pFeature->cDirection = NORMAL;
  363. pFeature->iLocationNum = 0;
  364. pFeature->lStart = 0;
  365. pFeature->lEnd = 0;
  366. pFeature->iQualifierNum = 0;
  367. pFeature->ptLocation = NULL;
  368. pFeature->ptQualifier = NULL;
  369. iFeatureNum++;
  370. } else if (*(sLine + QUALIFIERSTART) == '/') {
  371. iReadPos = INQUALIFIER;
  372. if (sQualifier < sQualifierTemp) *sQualifierTemp++ = '\n';
  373. i = strlen(sLine) - (QUALIFIERSTART + 1);
  374. memcpy(sQualifierTemp, sLine + (QUALIFIERSTART + 1), i);
  375. sQualifierTemp += i;
  376. } else {
  377. if (iReadPos == INFEATURE) {
  378. strcpy((sLocation + strlen(sLocation)), (sLine + QUALIFIERSTART));
  379. } else if (iReadPos == INQUALIFIER) {
  380. i = strlen(sLine) - QUALIFIERSTART;
  381. memcpy(sQualifierTemp, sLine + QUALIFIERSTART, i);
  382. sQualifierTemp += i;
  383. }
  384. }
  385. }
  386. /* Finishing of the parsing */
  387. if (iFeatureNum == iFeatureMem) {
  388. iFeatureMem += INITFEATURENUM;
  389. pFeatures = realloc(pFeatures, sizeof(gb_feature) * iFeatureMem);
  390. }
  391. if (strlen(sLocation) != 0) LocationParser(sLocation, (pFeatures + iFeatureNum - 1));
  392. if (sQualifier < sQualifierTemp) {
  393. *sQualifierTemp++ = '\n';
  394. *sQualifierTemp = '\0';
  395. sQualifier = realloc(sQualifier, (sQualifierTemp - sQualifier + 1) * sizeof(*sQualifier));
  396. QualifierParser(sQualifier, (pFeatures + iFeatureNum - 1));
  397. }
  398. ptGBData->iFeatureNum = iFeatureNum;
  399. ptGBData->ptFeatures = pFeatures;
  400. }
  401. /* Parse sequences */
  402. static void parseSequence(FILE *FSeqFile, gb_data *ptGBData) {
  403. char sLine[LINELEN] = {'\0',};
  404. unsigned long lSeqLen;
  405. lSeqLen = 0;
  406. ptGBData->sSequence = malloc((ptGBData->lLength + 1) * sizeof(char));
  407. while(fgets(sLine, LINELEN, FSeqFile)) {
  408. if (*sLine == '/' && *(sLine + 1) == '/') {
  409. putLine(sLine);
  410. break;
  411. }
  412. lSeqLen += SequenceParser(sLine, ptGBData->sSequence + lSeqLen);
  413. }
  414. }
  415. static void parseDef(FILE *FSeqFile, gb_data *ptGBData) {
  416. char sLine[LINELEN];
  417. regmatch_t ptRegMatch[3];
  418. getLine_w_rtrim(sLine, FSeqFile);
  419. regexec(&ptRegExOneLine, sLine, 3, ptRegMatch, 0);
  420. ptGBData->sDef = strdup(sLine + ptRegMatch[2].rm_so);
  421. }
  422. static void parseKeywords(FILE *FSeqFile, gb_data *ptGBData) {
  423. char sLine[LINELEN];
  424. regmatch_t ptRegMatch[3];
  425. getLine_w_rtrim(sLine, FSeqFile);
  426. regexec(&ptRegExOneLine, sLine, 3, ptRegMatch, 0);
  427. ptGBData->sKeywords = strdup(sLine + ptRegMatch[2].rm_so);
  428. }
  429. static void parseAccession(FILE *FSeqFile, gb_data *ptGBData) {
  430. char sLine[LINELEN];
  431. regmatch_t ptRegMatch[3];
  432. getLine_w_rtrim(sLine, FSeqFile);
  433. if (regexec(&ptRegExAccession, sLine, 2, ptRegMatch, 0) == 0) {
  434. *(sLine + ptRegMatch[1].rm_eo) = '\0';
  435. ptGBData->sAccession = strdup(sLine + ptRegMatch[1].rm_so);
  436. }
  437. if (regexec(&ptRegExRegion, sLine + ptRegMatch[1].rm_eo + 1, 3, ptRegMatch, 0) == 0) {
  438. *(sLine + ptRegMatch[1].rm_eo) = '\0';
  439. (ptGBData->lRegion)[0] = atol(sLine + ptRegMatch[1].rm_so);
  440. *(sLine + ptRegMatch[2].rm_eo) = '\0';
  441. (ptGBData->lRegion)[1] = atol(sLine + ptRegMatch[2].rm_so);
  442. }
  443. }
  444. static void parseVersion(FILE *FSeqFile, gb_data *ptGBData) {
  445. char sLine[LINELEN];
  446. regmatch_t ptRegMatch[2];
  447. getLine_w_rtrim(sLine, FSeqFile);
  448. if (regexec(&ptRegExVersion, sLine, 2, ptRegMatch, 0) == 0) {
  449. *(sLine + ptRegMatch[1].rm_eo) = '\0';
  450. ptGBData->sVersion = strdup(sLine + ptRegMatch[1].rm_so);
  451. }
  452. if (regexec(&ptRegExGI, sLine + ptRegMatch[1].rm_eo + 1, 2, ptRegMatch, 0) == 0) {
  453. *(sLine + ptRegMatch[1].rm_eo) = '\0';
  454. ptGBData->sGI = strdup(sLine + ptRegMatch[1].rm_so);
  455. }
  456. }
  457. static void parseComment(FILE *FSeqFile, gb_data *ptGBData) {
  458. ptGBData->sComment = joinLines(FSeqFile, 12);
  459. }
  460. static void parseSource(FILE *FSeqFile, gb_data *ptGBData) {
  461. char sLine[LINELEN];
  462. regmatch_t ptRegMatch[3];
  463. getLine_w_rtrim(sLine, FSeqFile);
  464. regexec(&ptRegExOneLine, sLine, 3, ptRegMatch, 0);
  465. ptGBData->sSource = strdup(sLine + ptRegMatch[2].rm_so);
  466. getLine_w_rtrim(sLine, FSeqFile);
  467. regexec(&ptRegExOneLine, sLine, 3, ptRegMatch, 0);
  468. ptGBData->sOrganism = strdup(sLine + ptRegMatch[2].rm_so);
  469. ptGBData->sLineage = joinLines(FSeqFile, 12);
  470. }
  471. #define processRef( x, y ) \
  472. y = NULL; \
  473. getLine_w_rtrim(sLine, FSeqFile); \
  474. putLine(sLine); \
  475. if (strstr(sLine, x) != NULL) y = joinLines(FSeqFile, 12)
  476. static void parseReference(FILE *FSeqFile, gb_data *ptGBData) {
  477. char sLine[LINELEN];
  478. regmatch_t ptRegMatch[3];
  479. gb_reference *ptReferences = NULL;
  480. gb_reference *ptReference = NULL;
  481. unsigned int iReferenceNum = 0;
  482. ptReferences = ptGBData->ptReferences;
  483. iReferenceNum = ptGBData->iReferenceNum;
  484. ptReferences = realloc(ptReferences, sizeof(gb_reference) * (iReferenceNum + 1));
  485. ptReference = ptReferences + iReferenceNum;
  486. getLine_w_rtrim(sLine, FSeqFile);
  487. regexec(&ptRegExOneLine, sLine, 3, ptRegMatch, 0);
  488. ptReference->iNum = atoi(sLine + ptRegMatch[2].rm_so);
  489. processRef("AUTHORS", ptReference->sAuthors);
  490. processRef("TITLE", ptReference->sTitle);
  491. processRef("CONSTRM", ptReference->sConsrtm);
  492. processRef("JOURNAL", ptReference->sJournal);
  493. processRef("PUBMED", ptReference->sPubMed);
  494. ptGBData->ptReferences = ptReferences;
  495. ptGBData->iReferenceNum = iReferenceNum + 1;
  496. }
  497. static void initGBData(gb_data *ptGBData) {
  498. ptGBData->sAccession = NULL;
  499. ptGBData->sComment = NULL;
  500. ptGBData->sDef = NULL;
  501. ptGBData->sGI = NULL;
  502. ptGBData->sKeywords = NULL;
  503. ptGBData->sLineage = NULL;
  504. ptGBData->sOrganism = NULL;
  505. ptGBData->sSequence = NULL;
  506. ptGBData->sSource = NULL;
  507. ptGBData->sVersion = NULL;
  508. ptGBData->ptReferences = NULL;
  509. ptGBData->ptFeatures = NULL;
  510. ptGBData->iFeatureNum = 0;
  511. ptGBData->iReferenceNum = 0;
  512. ptGBData->lLength = 0;
  513. ptGBData->lRegion[0] = 0;
  514. ptGBData->lRegion[1] = 0;
  515. ptGBData->sLocusName[0] = '\0';
  516. ptGBData->sType[0] = '\0';
  517. ptGBData->sTopology[0] = '\0';
  518. ptGBData->sDivisionCode[0] = '\0';
  519. ptGBData->sDate[0] = '\0';
  520. }
  521. static gb_data *_parseGBFF(FILE *FSeqFile) {
  522. int i;
  523. char sLine[LINELEN] = {'\0',};
  524. gb_data *ptGBData = NULL;
  525. struct tField {
  526. char sField[FIELDLEN + 1];
  527. void (*vFunction)(FILE *FSeqFile, gb_data *ptGBData);
  528. } atFields[] = {
  529. {"DEFINITION", parseDef},
  530. {"ACCESSION", parseAccession},
  531. {"VERSION", parseVersion},
  532. {"KEYWORDS", parseKeywords},
  533. {"SOURCE", parseSource},
  534. {"REFERENCE", parseReference},
  535. {"COMMENT", parseComment},
  536. {"FEATURE", parseFeature},
  537. {"ORIGIN", parseSequence},
  538. {"", NULL} /* To terminate seeking */
  539. };
  540. /* Confirming GBFF File with LOCUS line */
  541. while(fgets(sLine, LINELEN, FSeqFile)) {
  542. if (strstr(sLine, "LOCUS") == sLine) {
  543. ptGBData = malloc(sizeof(gb_data));
  544. initGBData(ptGBData);
  545. break;
  546. }
  547. }
  548. /* If there is a no LOCUS line, next statement return NULL value to end parsing */
  549. if (ptGBData == NULL) return NULL;
  550. /* Parse LOCUS line */
  551. parseLocus(sLine, ptGBData);
  552. while(getLine(sLine, FSeqFile)) {
  553. if (strstr(sLine, "//") == sLine) break;
  554. for(i = 0; *((atFields + i)->sField); i++) {
  555. if (strstr(sLine, (atFields + i)->sField) == sLine) {
  556. putLine(sLine);
  557. ((atFields + i)->vFunction)(FSeqFile, ptGBData);
  558. break;
  559. }
  560. }
  561. }
  562. return ptGBData;
  563. }
  564. #define freeString( x ) if (x != NULL) free(x)
  565. void freeGBData(gb_data **pptGBData) {
  566. int i;
  567. gb_data *ptGBData = NULL;
  568. gb_feature *ptFeatures = NULL;
  569. gb_reference *ptReferences = NULL;
  570. unsigned int iFeatureNum = 0;
  571. unsigned int iReferenceNum = 0;
  572. unsigned int iSeqPos = 0;
  573. for (iSeqPos = 0; *(pptGBData + iSeqPos) != NULL; iSeqPos++) {
  574. ptGBData = *(pptGBData + iSeqPos);
  575. ptFeatures = ptGBData->ptFeatures;
  576. iFeatureNum = ptGBData->iFeatureNum;
  577. for (i = 0; i < iFeatureNum; i++) {
  578. /* printf("%i, %i\n", iFeatureNum, (ptFeatures+iFeatureNum)->iQualifierNum); */
  579. free((ptFeatures + i)->ptLocation);
  580. free(((ptFeatures + i)->ptQualifier)->sQualifier);
  581. }
  582. free(ptFeatures);
  583. ptReferences = ptGBData->ptReferences;
  584. iReferenceNum = ptGBData->iReferenceNum;
  585. for (i = 0; i < iReferenceNum; i++) {
  586. freeString((ptReferences + i)->sAuthors);
  587. freeString((ptReferences + i)->sConsrtm);
  588. freeString((ptReferences + i)->sTitle);
  589. freeString((ptReferences + i)->sJournal);
  590. freeString((ptReferences + i)->sPubMed);
  591. }
  592. freeString(ptGBData->sDef);
  593. freeString(ptGBData->sAccession);
  594. freeString(ptGBData->sComment);
  595. freeString(ptGBData->sGI);
  596. freeString(ptGBData->sKeywords);
  597. freeString(ptGBData->sLineage);
  598. freeString(ptGBData->sOrganism);
  599. freeString(ptGBData->sSequence);
  600. freeString(ptGBData->sSource);
  601. freeString(ptGBData->sVersion);
  602. }
  603. }
  604. gb_data **parseGBFF(gb_string spFileName) {
  605. int iGBFSeqPos = 0;
  606. unsigned int iGBFSeqNum = INITGBFSEQNUM;
  607. gb_data **pptGBDatas;
  608. FILE *FSeqFile;
  609. if (spFileName == NULL) {
  610. FSeqFile = stdin;
  611. } else {
  612. if (access(spFileName, F_OK) != 0) {
  613. /* perror(spFileName); */
  614. return NULL;
  615. } else {
  616. FSeqFile = fopen(spFileName, "r");
  617. }
  618. }
  619. initRegEx(); /* Initalize for regular expression */
  620. pptGBDatas = malloc(iGBFSeqNum * sizeof(gb_data *));
  621. do {
  622. if (iGBFSeqNum == iGBFSeqPos) {
  623. iGBFSeqNum += INITGBFSEQNUM;
  624. pptGBDatas = realloc(pptGBDatas, iGBFSeqNum * sizeof(gb_data *));
  625. }
  626. *(pptGBDatas + iGBFSeqPos) = _parseGBFF(FSeqFile);
  627. } while (*(pptGBDatas + iGBFSeqPos++) != NULL);
  628. if (spFileName) fclose(FSeqFile);
  629. return pptGBDatas;
  630. }