/src/ftk_xml_parser.c

http://ftk.googlecode.com/ · C · 563 lines · 462 code · 71 blank · 30 comment · 116 complexity · 1d8d14ec6199398a417758016a78ff18 MD5 · raw file

  1. /*
  2. * File: ftk_xml_parser.c
  3. * Author: Li XianJing <xianjimli@hotmail.com>
  4. * Brief: xml parser
  5. *
  6. * Copyright (c) Li XianJing
  7. *
  8. * Licensed under the Academic Free License version 2.1
  9. *
  10. * This program is free software; you can redistribute it and/or modify
  11. * it under the terms of the GNU General Public License as published by
  12. * the Free Software Foundation; either version 2 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * This program is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU General Public License
  21. * along with this program; if not, write to the Free Software
  22. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  23. */
  24. /*
  25. * History:
  26. * ================================================================
  27. * 2009-05-30 Li XianJing <xianjimli@hotmail.com> created.
  28. *
  29. */
#include "ftk_allocator.h"
#include "ftk_xml_parser.h"

#include <ctype.h>
#include <string.h>
  32. struct _FtkXmlParser
  33. {
  34. const char* read_ptr;
  35. int attrs_nr;
  36. char* attrs[2*MAX_ATTR_NR+1];
  37. char* buffer;
  38. int buffer_used;
  39. int buffer_total;
  40. FtkXmlBuilder* builder;
  41. };
  42. static const char* strtrim(char* str);
  43. static void ftk_xml_parser_parse_entity(FtkXmlParser* thiz);
  44. static void ftk_xml_parser_parse_start_tag(FtkXmlParser* thiz);
  45. static void ftk_xml_parser_parse_end_tag(FtkXmlParser* thiz);
  46. static void ftk_xml_parser_parse_comment(FtkXmlParser* thiz);
  47. static void ftk_xml_parser_parse_pi(FtkXmlParser* thiz);
  48. static void ftk_xml_parser_parse_text(FtkXmlParser* thiz);
  49. static void ftk_xml_parser_reset_buffer(FtkXmlParser* thiz);
  50. FtkXmlParser* ftk_xml_parser_create(void)
  51. {
  52. return (FtkXmlParser*)FTK_ZALLOC(sizeof(FtkXmlParser));
  53. }
  54. void ftk_xml_parser_set_builder(FtkXmlParser* thiz, FtkXmlBuilder* builder)
  55. {
  56. thiz->builder = builder;
  57. return;
  58. }
  59. void ftk_xml_parser_parse(FtkXmlParser* thiz, const char* xml, int length)
  60. {
  61. int i = 0;
  62. enum _State
  63. {
  64. STAT_NONE,
  65. STAT_AFTER_LT,
  66. STAT_START_TAG,
  67. STAT_END_TAG,
  68. STAT_TEXT,
  69. STAT_PRE_COMMENT1,
  70. STAT_PRE_COMMENT2,
  71. STAT_COMMENT,
  72. STAT_PROCESS_INSTRUCTION,
  73. }state = STAT_NONE;
  74. thiz->read_ptr = xml;
  75. for(; *thiz->read_ptr != '\0' && (thiz->read_ptr - xml) < length; thiz->read_ptr++, i++)
  76. {
  77. char c = thiz->read_ptr[0];
  78. switch(state)
  79. {
  80. case STAT_NONE:
  81. {
  82. if(c == '<')
  83. {
  84. ftk_xml_parser_reset_buffer(thiz);
  85. state = STAT_AFTER_LT;
  86. }
  87. else if(!isspace(c))
  88. {
  89. state = STAT_TEXT;
  90. }
  91. break;
  92. }
  93. case STAT_AFTER_LT:
  94. {
  95. if(c == '?')
  96. {
  97. state = STAT_PROCESS_INSTRUCTION;
  98. }
  99. else if(c == '/')
  100. {
  101. state = STAT_END_TAG;
  102. }
  103. else if(c == '!')
  104. {
  105. state = STAT_PRE_COMMENT1;
  106. }
  107. else if(isalpha(c) || c == '_')
  108. {
  109. state = STAT_START_TAG;
  110. }
  111. else
  112. {
  113. ftk_xml_builder_on_error(thiz->builder, 0, 0, "unexpected char");
  114. }
  115. break;
  116. }
  117. case STAT_START_TAG:
  118. {
  119. ftk_xml_parser_parse_start_tag(thiz);
  120. state = STAT_NONE;
  121. break;
  122. }
  123. case STAT_END_TAG:
  124. {
  125. ftk_xml_parser_parse_end_tag(thiz);
  126. state = STAT_NONE;
  127. break;
  128. }
  129. case STAT_PROCESS_INSTRUCTION:
  130. {
  131. ftk_xml_parser_parse_pi(thiz);
  132. state = STAT_NONE;
  133. break;
  134. }
  135. case STAT_TEXT:
  136. {
  137. ftk_xml_parser_parse_text(thiz);
  138. state = STAT_NONE;
  139. break;
  140. }
  141. case STAT_PRE_COMMENT1:
  142. {
  143. if(c == '-')
  144. {
  145. state = STAT_PRE_COMMENT2;
  146. }
  147. else
  148. {
  149. ftk_xml_builder_on_error(thiz->builder, 0, 0, "expected \'-\'");
  150. }
  151. break;
  152. }
  153. case STAT_PRE_COMMENT2:
  154. {
  155. if(c == '-')
  156. {
  157. state = STAT_COMMENT;
  158. }
  159. else
  160. {
  161. ftk_xml_builder_on_error(thiz->builder, 0, 0, "expected \'-\'");
  162. }
  163. }
  164. case STAT_COMMENT:
  165. {
  166. ftk_xml_parser_parse_comment(thiz);
  167. state = STAT_NONE;
  168. break;
  169. }
  170. default:break;
  171. }
  172. if(*thiz->read_ptr == '\0')
  173. {
  174. break;
  175. }
  176. }
  177. return;
  178. }
  179. static void ftk_xml_parser_reset_buffer(FtkXmlParser* thiz)
  180. {
  181. thiz->buffer_used = 0;
  182. thiz->attrs_nr = 0;
  183. thiz->attrs[0] = NULL;
  184. return;
  185. }
  186. static int ftk_xml_parser_strdup(FtkXmlParser* thiz, const char* start, int length)
  187. {
  188. int offset = -1;
  189. if((thiz->buffer_used + length) >= thiz->buffer_total)
  190. {
  191. int length = thiz->buffer_total+(thiz->buffer_total>>1) + 128;
  192. char* buffer = (char*)FTK_REALLOC(thiz->buffer, length);
  193. if(buffer != NULL)
  194. {
  195. thiz->buffer = buffer;
  196. thiz->buffer_total = length;
  197. }
  198. }
  199. if((thiz->buffer_used + length) >= thiz->buffer_total)
  200. {
  201. return offset;
  202. }
  203. offset = thiz->buffer_used;
  204. ftk_strncpy(thiz->buffer + offset, start, length);
  205. thiz->buffer[offset + length] = '\0';
  206. strtrim(thiz->buffer+offset);
  207. thiz->buffer_used += length + 1;
  208. return offset;
  209. }
  210. static void ftk_xml_parser_parse_attrs(FtkXmlParser* thiz, char end_char)
  211. {
  212. int i = 0;
  213. enum _State
  214. {
  215. STAT_PRE_KEY,
  216. STAT_KEY,
  217. STAT_PRE_VALUE,
  218. STAT_VALUE,
  219. STAT_END,
  220. }state = STAT_PRE_KEY;
  221. char value_end = '\"';
  222. const char* start = thiz->read_ptr;
  223. thiz->attrs_nr = 0;
  224. for(; *thiz->read_ptr != '\0' && thiz->attrs_nr < MAX_ATTR_NR; thiz->read_ptr++)
  225. {
  226. char c = *thiz->read_ptr;
  227. switch(state)
  228. {
  229. case STAT_PRE_KEY:
  230. {
  231. if(c == end_char || c == '>')
  232. {
  233. state = STAT_END;
  234. }
  235. else if(!isspace(c))
  236. {
  237. state = STAT_KEY;
  238. start = thiz->read_ptr;
  239. }
  240. }
  241. case STAT_KEY:
  242. {
  243. if(c == '=')
  244. {
  245. thiz->attrs[thiz->attrs_nr++] = (char*)ftk_xml_parser_strdup(thiz, start, thiz->read_ptr - start);
  246. state = STAT_PRE_VALUE;
  247. }
  248. break;
  249. }
  250. case STAT_PRE_VALUE:
  251. {
  252. if(c == '\"' || c == '\'')
  253. {
  254. state = STAT_VALUE;
  255. value_end = c;
  256. start = thiz->read_ptr + 1;
  257. }
  258. break;
  259. }
  260. case STAT_VALUE:
  261. {
  262. if(c == value_end)
  263. {
  264. thiz->attrs[thiz->attrs_nr++] = (char*)ftk_xml_parser_strdup(thiz, start, thiz->read_ptr - start);
  265. state = STAT_PRE_KEY;
  266. }
  267. }
  268. default:break;
  269. }
  270. if(state == STAT_END)
  271. {
  272. break;
  273. }
  274. }
  275. for(i = 0; i < thiz->attrs_nr; i++)
  276. {
  277. thiz->attrs[i] = thiz->buffer + (int)(thiz->attrs[i]);
  278. }
  279. thiz->attrs[thiz->attrs_nr] = NULL;
  280. return;
  281. }
  282. static void ftk_xml_parser_parse_start_tag(FtkXmlParser* thiz)
  283. {
  284. enum _State
  285. {
  286. STAT_NAME,
  287. STAT_ATTR,
  288. STAT_END,
  289. }state = STAT_NAME;
  290. char* tag_name = NULL;
  291. const char* start = thiz->read_ptr - 1;
  292. for(; *thiz->read_ptr != '\0'; thiz->read_ptr++)
  293. {
  294. char c = *thiz->read_ptr;
  295. switch(state)
  296. {
  297. case STAT_NAME:
  298. {
  299. if(isspace(c) || c == '>' || c == '/')
  300. {
  301. tag_name = (char*)ftk_xml_parser_strdup(thiz, start, thiz->read_ptr - start);
  302. state = (c != '>' && c != '/') ? STAT_ATTR : STAT_END;
  303. }
  304. break;
  305. }
  306. case STAT_ATTR:
  307. {
  308. ftk_xml_parser_parse_attrs(thiz, '/');
  309. state = STAT_END;
  310. break;
  311. }
  312. default:break;
  313. }
  314. if(state == STAT_END)
  315. {
  316. break;
  317. }
  318. }
  319. tag_name = thiz->buffer + (int)tag_name;
  320. ftk_xml_builder_on_start_element(thiz->builder, tag_name, (const char**)thiz->attrs);
  321. if(thiz->read_ptr[0] == '/')
  322. {
  323. ftk_xml_builder_on_end_element(thiz->builder, tag_name);
  324. }
  325. for(; *thiz->read_ptr != '>' && *thiz->read_ptr != '\0'; thiz->read_ptr++);
  326. return;
  327. }
  328. static void ftk_xml_parser_parse_end_tag(FtkXmlParser* thiz)
  329. {
  330. char* tag_name = NULL;
  331. const char* start = thiz->read_ptr;
  332. for(; *thiz->read_ptr != '\0'; thiz->read_ptr++)
  333. {
  334. if(*thiz->read_ptr == '>')
  335. {
  336. tag_name = thiz->buffer + ftk_xml_parser_strdup(thiz, start, thiz->read_ptr-start);
  337. ftk_xml_builder_on_end_element(thiz->builder, tag_name);
  338. break;
  339. }
  340. }
  341. return;
  342. }
  343. static void ftk_xml_parser_parse_comment(FtkXmlParser* thiz)
  344. {
  345. enum _State
  346. {
  347. STAT_COMMENT,
  348. STAT_MINUS1,
  349. STAT_MINUS2,
  350. }state = STAT_COMMENT;
  351. const char* start = ++thiz->read_ptr;
  352. for(; *thiz->read_ptr != '\0'; thiz->read_ptr++)
  353. {
  354. char c = *thiz->read_ptr;
  355. switch(state)
  356. {
  357. case STAT_COMMENT:
  358. {
  359. if(c == '-')
  360. {
  361. state = STAT_MINUS1;
  362. }
  363. break;
  364. }
  365. case STAT_MINUS1:
  366. {
  367. if(c == '-')
  368. {
  369. state = STAT_MINUS2;
  370. }
  371. else
  372. {
  373. state = STAT_COMMENT;
  374. }
  375. break;
  376. }
  377. case STAT_MINUS2:
  378. {
  379. if(c == '>')
  380. {
  381. ftk_xml_builder_on_comment(thiz->builder, start, thiz->read_ptr-start-2);
  382. return;
  383. }
  384. }
  385. default:break;
  386. }
  387. }
  388. return;
  389. }
  390. static void ftk_xml_parser_parse_pi(FtkXmlParser* thiz)
  391. {
  392. enum _State
  393. {
  394. STAT_NAME,
  395. STAT_ATTR,
  396. STAT_END
  397. }state = STAT_NAME;
  398. char* tag_name = NULL;
  399. const char* start = thiz->read_ptr;
  400. for(; *thiz->read_ptr != '\0'; thiz->read_ptr++)
  401. {
  402. char c = *thiz->read_ptr;
  403. switch(state)
  404. {
  405. case STAT_NAME:
  406. {
  407. if(isspace(c) || c == '>')
  408. {
  409. tag_name = (char*)ftk_xml_parser_strdup(thiz, start, thiz->read_ptr - start);
  410. state = c != '>' ? STAT_ATTR : STAT_END;
  411. }
  412. break;
  413. }
  414. case STAT_ATTR:
  415. {
  416. ftk_xml_parser_parse_attrs(thiz, '?');
  417. state = STAT_END;
  418. break;
  419. }
  420. default:break;
  421. }
  422. if(state == STAT_END)
  423. {
  424. break;
  425. }
  426. }
  427. tag_name = thiz->buffer + (int)tag_name;
  428. ftk_xml_builder_on_pi_element(thiz->builder, tag_name, (const char**)thiz->attrs);
  429. for(; *thiz->read_ptr != '>' && *thiz->read_ptr != '\0'; thiz->read_ptr++);
  430. return;
  431. }
  432. static void ftk_xml_parser_parse_text(FtkXmlParser* thiz)
  433. {
  434. const char* start = thiz->read_ptr - 1;
  435. for(; *thiz->read_ptr != '\0'; thiz->read_ptr++)
  436. {
  437. char c = *thiz->read_ptr;
  438. if(c == '<')
  439. {
  440. if(thiz->read_ptr > start)
  441. {
  442. ftk_xml_builder_on_text(thiz->builder, start, thiz->read_ptr-start);
  443. }
  444. thiz->read_ptr--;
  445. return;
  446. }
  447. else if(c == '&')
  448. {
  449. ftk_xml_parser_parse_entity(thiz);
  450. }
  451. }
  452. return;
  453. }
  454. static void ftk_xml_parser_parse_entity(FtkXmlParser* thiz)
  455. {
  456. /*TODO*/
  457. return;
  458. }
  459. void ftk_xml_parser_destroy(FtkXmlParser* thiz)
  460. {
  461. if(thiz != NULL)
  462. {
  463. FTK_FREE(thiz->buffer);
  464. FTK_FREE(thiz);
  465. }
  466. return;
  467. }
  468. static const char* strtrim(char* str)
  469. {
  470. char* p = NULL;
  471. p = str + strlen(str) - 1;
  472. while(p != str && isspace(*p))
  473. {
  474. *p = '\0';
  475. p--;
  476. }
  477. p = str;
  478. while(*p != '\0' && isspace(*p)) p++;
  479. if(p != str)
  480. {
  481. char* s = p;
  482. char* d = str;
  483. while(*s != '\0')
  484. {
  485. *d = *s;
  486. d++;
  487. s++;
  488. }
  489. *d = '\0';
  490. }
  491. return str;
  492. }