PageRenderTime 1265ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/corelib/xml.cpp

#
C++ | 557 lines | 500 code | 33 blank | 24 comment | 261 complexity | 2ea9d3d22d48329aba433f3a1e40345c MD5 | raw file
Possible License(s): GPL-3.0, LGPL-3.0
  1. // Copyright (C) 2006-2010 David Sugar, Tycho Softworks.
  2. //
  3. // This file is part of GNU uCommon C++.
  4. //
  5. // GNU uCommon C++ is free software: you can redistribute it and/or modify
  6. // it under the terms of the GNU Lesser General Public License as published
  7. // by the Free Software Foundation, either version 3 of the License, or
  8. // (at your option) any later version.
  9. //
  10. // GNU uCommon C++ is distributed in the hope that it will be useful,
  11. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. // GNU Lesser General Public License for more details.
  14. //
  15. // You should have received a copy of the GNU Lesser General Public License
  16. // along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>.
  17. #include <config.h>
  18. #include <ucommon/export.h>
  19. #include <ucommon/string.h>
  20. #include <ucommon/xml.h>
  21. #include <ctype.h>
  22. using namespace UCOMMON_NAMESPACE;
  23. #if !defined(_MSC_VER)
  24. using namespace std;
  25. #endif
  /**
   * Test whether a character is accepted by this parser as part of an
   * element, attribute, or entity name: an alphanumeric (per isalnum) or
   * one of ':', '-', '.', '_'.
   * @param c character to classify.
   * @return true if c may appear in a name.
   */
  static bool isElement(char c)
  {
      // The <ctype.h> classifiers require a value representable as
      // unsigned char (or EOF); a plain char is negative for bytes >= 0x80
      // on platforms where char is signed.
      const unsigned char uc = static_cast<unsigned char>(c);
      return isalnum(uc) || c == ':' || c == '-' || c == '.' || c == '_';
  }
  30. XMLParser::XMLParser(unsigned size)
  31. {
  32. state = NONE;
  33. bufpos = 0;
  34. bufsize = size;
  35. buffer = new char[size];
  36. ecount = dcount = 0;
  37. }
  38. XMLParser::~XMLParser()
  39. {
  40. if(buffer) {
  41. delete[] buffer;
  42. buffer = NULL;
  43. }
  44. }
  45. void XMLParser::putBuffer(char c)
  46. {
  47. buffer[bufpos++] = c;
  48. if(bufpos >= bufsize) {
  49. if(ecount)
  50. characters((caddr_t)buffer, bufpos);
  51. bufpos = 0;
  52. }
  53. }
  54. void XMLParser::clearBuffer(void)
  55. {
  56. if(bufpos && ecount)
  57. characters((caddr_t)buffer, bufpos);
  58. bufpos = 0;
  59. }
  60. bool XMLParser::parse(FILE *fp)
  61. {
  62. state = NONE;
  63. bufpos = 0;
  64. ecount = dcount = 0;
  65. int ch;
  66. unsigned char cp;
  67. while((ch = fgetc(fp)) != EOF) {
  68. switch(state) {
  69. case AMP:
  70. if((!bufpos && ch == '#') || isElement(ch)) {
  71. buffer[bufpos++] = ch;
  72. break;
  73. }
  74. if(ch != ';')
  75. return false;
  76. buffer[bufpos] = 0;
  77. if(buffer[0] == '#')
  78. cp = atoi(buffer + 1);
  79. else if(eq(buffer, "amp"))
  80. cp = '&';
  81. else if(eq(buffer, "lt"))
  82. cp = '<';
  83. else if(eq(buffer, "gt"))
  84. cp = '>';
  85. else if(eq(buffer, "apos"))
  86. cp = '`';
  87. else if(eq(buffer, "quot"))
  88. cp = '\"';
  89. else
  90. return false;
  91. characters((caddr_t)&cp, 1);
  92. bufpos = 0;
  93. state = NONE;
  94. break;
  95. case TAG:
  96. if(ch == '>') {
  97. state = NONE;
  98. if(!parseTag())
  99. return false;
  100. }
  101. else if(ch == '[' && bufpos == 7 && !strncmp(buffer, "![CDATA", 7)) {
  102. state = CDATA;
  103. }
  104. else if(ch == '-' && bufpos == 2 && !strncmp(buffer, "!-", 2)) {
  105. state = COMMENT;
  106. bufpos = 0;
  107. }
  108. else if(ch == '[' && !strncmp(buffer, "!DOCTYPE ", 9)) {
  109. state = DTD;
  110. bufpos = 0;
  111. }
  112. else
  113. putBuffer(ch);
  114. break;
  115. case COMMENT:
  116. if(ch == '>' && bufpos >= 2 && !strncmp(&buffer[bufpos - 2], "--", 2)) {
  117. bufpos -= 2;
  118. if(bufpos)
  119. comment((caddr_t)buffer, bufpos);
  120. bufpos = 0;
  121. state = NONE;
  122. }
  123. else {
  124. buffer[bufpos++] = ch;
  125. if(bufpos == bufsize) {
  126. comment((caddr_t)buffer, bufpos);
  127. bufpos = 0;
  128. }
  129. }
  130. break;
  131. case CDATA:
  132. putBuffer(ch);
  133. if(bufpos > 2)
  134. if(eq(&buffer[bufpos - 3], "]]>")) {
  135. bufpos -= 3;
  136. state = NONE;
  137. clearBuffer();
  138. }
  139. break;
  140. case DTD:
  141. if(ch == '<')
  142. ++dcount;
  143. else if(ch == '>' && dcount)
  144. --dcount;
  145. else if(ch == '>')
  146. state = NONE;
  147. break;
  148. case NONE:
  149. if(ch == '<') {
  150. clearBuffer();
  151. state = TAG;
  152. }
  153. else if(ecount && ch == '&') {
  154. clearBuffer();
  155. state = AMP;
  156. }
  157. else if(ecount)
  158. putBuffer(ch);
  159. break;
  160. case END:
  161. return true;
  162. }
  163. if(state == END)
  164. return true;
  165. }
  166. // eof before end of ducument...
  167. return false;
  168. }
  169. bool XMLParser::parse(CharacterProtocol& io)
  170. {
  171. state = NONE;
  172. bufpos = 0;
  173. ecount = dcount = 0;
  174. int ch;
  175. unsigned char cp;
  176. while((ch = io.get()) != EOF) {
  177. switch(state) {
  178. case AMP:
  179. if((!bufpos && ch == '#') || isElement(ch)) {
  180. buffer[bufpos++] = ch;
  181. break;
  182. }
  183. if(ch != ';')
  184. return false;
  185. buffer[bufpos] = 0;
  186. if(buffer[0] == '#')
  187. cp = atoi(buffer + 1);
  188. else if(eq(buffer, "amp"))
  189. cp = '&';
  190. else if(eq(buffer, "lt"))
  191. cp = '<';
  192. else if(eq(buffer, "gt"))
  193. cp = '>';
  194. else if(eq(buffer, "apos"))
  195. cp = '`';
  196. else if(eq(buffer, "quot"))
  197. cp = '\"';
  198. else
  199. return false;
  200. characters((caddr_t)&cp, 1);
  201. bufpos = 0;
  202. state = NONE;
  203. break;
  204. case TAG:
  205. if(ch == '>') {
  206. state = NONE;
  207. if(!parseTag())
  208. return false;
  209. }
  210. else if(ch == '[' && bufpos == 7 && !strncmp(buffer, "![CDATA", 7)) {
  211. state = CDATA;
  212. }
  213. else if(ch == '-' && bufpos == 2 && !strncmp(buffer, "!-", 2)) {
  214. state = COMMENT;
  215. bufpos = 0;
  216. }
  217. else if(ch == '[' && !strncmp(buffer, "!DOCTYPE ", 9)) {
  218. state = DTD;
  219. bufpos = 0;
  220. }
  221. else
  222. putBuffer(ch);
  223. break;
  224. case COMMENT:
  225. if(ch == '>' && bufpos >= 2 && !strncmp(&buffer[bufpos - 2], "--", 2)) {
  226. bufpos -= 2;
  227. if(bufpos)
  228. comment((caddr_t)buffer, bufpos);
  229. bufpos = 0;
  230. state = NONE;
  231. }
  232. else {
  233. buffer[bufpos++] = ch;
  234. if(bufpos == bufsize) {
  235. comment((caddr_t)buffer, bufpos);
  236. bufpos = 0;
  237. }
  238. }
  239. break;
  240. case CDATA:
  241. putBuffer(ch);
  242. if(bufpos > 2)
  243. if(eq(&buffer[bufpos - 3], "]]>")) {
  244. bufpos -= 3;
  245. state = NONE;
  246. clearBuffer();
  247. }
  248. break;
  249. case DTD:
  250. if(ch == '<')
  251. ++dcount;
  252. else if(ch == '>' && dcount)
  253. --dcount;
  254. else if(ch == '>')
  255. state = NONE;
  256. break;
  257. case NONE:
  258. if(ch == '<') {
  259. clearBuffer();
  260. state = TAG;
  261. }
  262. else if(ecount && ch == '&') {
  263. clearBuffer();
  264. state = AMP;
  265. }
  266. else if(ecount)
  267. putBuffer(ch);
  268. break;
  269. case END:
  270. return true;
  271. }
  272. if(state == END)
  273. return true;
  274. }
  275. // eof before end of ducument...
  276. return false;
  277. }
  278. bool XMLParser::partial(const char *data, size_t len)
  279. {
  280. if(state == END)
  281. state = NONE;
  282. unsigned char cp;
  283. while(len--) {
  284. switch(state) {
  285. case AMP:
  286. if((!bufpos && *data == '#') || isElement(*data)) {
  287. buffer[bufpos++] = *data;
  288. break;
  289. }
  290. if(*data != ';')
  291. return false;
  292. buffer[bufpos] = 0;
  293. if(buffer[0] == '#')
  294. cp = atoi(buffer + 1);
  295. else if(eq(buffer, "amp"))
  296. cp = '&';
  297. else if(eq(buffer, "lt"))
  298. cp = '<';
  299. else if(eq(buffer, "gt"))
  300. cp = '>';
  301. else if(eq(buffer, "apos"))
  302. cp = '`';
  303. else if(eq(buffer, "quot"))
  304. cp = '\"';
  305. else
  306. return false;
  307. characters((caddr_t)&cp, 1);
  308. bufpos = 0;
  309. state = NONE;
  310. break;
  311. case TAG:
  312. if(*data == '>') {
  313. state = NONE;
  314. if(!parseTag())
  315. return false;
  316. }
  317. else if(*data == '[' && bufpos == 7 && !strncmp(buffer, "![CDATA", 7)) {
  318. state = CDATA;
  319. }
  320. else if(*data == '-' && bufpos == 2 && !strncmp(buffer, "!-", 2)) {
  321. state = COMMENT;
  322. bufpos = 0;
  323. }
  324. else if(*data == '[' && !strncmp(buffer, "!DOCTYPE ", 9)) {
  325. state = DTD;
  326. bufpos = 0;
  327. }
  328. else
  329. putBuffer(*data);
  330. break;
  331. case COMMENT:
  332. if(*data == '>' && bufpos >= 2 && !strncmp(&buffer[bufpos - 2], "--", 2)) {
  333. bufpos -= 2;
  334. if(bufpos)
  335. comment((caddr_t)buffer, bufpos);
  336. bufpos = 0;
  337. state = NONE;
  338. }
  339. else {
  340. buffer[bufpos++] = *data;
  341. if(bufpos == bufsize) {
  342. comment((caddr_t)buffer, bufpos);
  343. bufpos = 0;
  344. }
  345. }
  346. break;
  347. case CDATA:
  348. putBuffer(*data);
  349. if(bufpos > 2)
  350. if(eq(&buffer[bufpos - 3], "]]>")) {
  351. bufpos -= 3;
  352. state = NONE;
  353. clearBuffer();
  354. }
  355. break;
  356. case DTD:
  357. if(*data == '<')
  358. ++dcount;
  359. else if(*data == '>' && dcount)
  360. --dcount;
  361. else if(*data == '>')
  362. state = NONE;
  363. break;
  364. case NONE:
  365. case END:
  366. if(*data == '<') {
  367. clearBuffer();
  368. state = TAG;
  369. }
  370. else if(ecount && *data == '&') {
  371. clearBuffer();
  372. state = AMP;
  373. }
  374. else if(ecount)
  375. putBuffer(*data);
  376. }
  377. ++data;
  378. }
  379. return true;
  380. }
  381. bool XMLParser::parseTag(void)
  382. {
  383. size_t len = bufpos;
  384. const char *data = buffer;
  385. bool end = false;
  386. caddr_t attrib[128];
  387. unsigned attr = 0;
  388. char *ep;
  389. if(*data == '/') {
  390. while(--len) {
  391. if(!isElement(*(++data)))
  392. break;
  393. }
  394. if(len)
  395. return false;
  396. buffer[bufpos] = 0;
  397. endElement((caddr_t)(buffer + 1));
  398. bufpos = 0;
  399. --ecount;
  400. if(ecount < 0)
  401. return false;
  402. if(!ecount) {
  403. state = END;
  404. endDocument();
  405. }
  406. }
  407. else if(*data == '!') {
  408. bufpos = 0;
  409. return true; // dtd
  410. }
  411. else if(*data == '?') {
  412. if(!strnicmp(data, "?xml version=\"", 14)) {
  413. // version info
  414. }
  415. bufpos = 0;
  416. }
  417. else if(!isElement(*data))
  418. return false;
  419. else {
  420. end = false;
  421. if(buffer[bufpos - 1] == '/') {
  422. --bufpos;
  423. end = true;
  424. }
  425. len = 0;
  426. data = buffer;
  427. while(len < bufpos) {
  428. if(!isElement(*data))
  429. break;
  430. ++len;
  431. ++data;
  432. }
  433. if(len == bufpos) {
  434. if(!ecount)
  435. startDocument();
  436. ++ecount;
  437. attrib[0] = attrib[1] = NULL;
  438. buffer[bufpos] = 0;
  439. startElement((caddr_t)buffer, attrib);
  440. if(end) {
  441. ending:
  442. --ecount;
  443. endElement((caddr_t)buffer);
  444. if(!ecount) {
  445. state = END;
  446. endDocument();
  447. }
  448. }
  449. bufpos = 0;
  450. return true;
  451. }
  452. if(!ecount)
  453. startDocument();
  454. ++ecount;
  455. // attributes, name is between data and len
  456. for(;;) {
  457. while(!isElement(buffer[len]) && len < bufpos) {
  458. if(!isspace(buffer[len]))
  459. return false;
  460. buffer[len++] = 0;
  461. }
  462. if(len == bufpos)
  463. break;
  464. attrib[attr++] = (caddr_t)(buffer + len);
  465. while(len < bufpos && isElement(buffer[len]))
  466. ++len;
  467. if(len == bufpos)
  468. return false;
  469. if(buffer[len] != '=')
  470. return false;
  471. buffer[len++] = 0;
  472. if(len == bufpos) {
  473. attrib[attr++] = (caddr_t)"";
  474. break;
  475. }
  476. if(isspace(buffer[len])) {
  477. attrib[attr++] = (caddr_t)"";
  478. continue;
  479. }
  480. if(buffer[len] == '\'' || buffer[len] == '\"') {
  481. ep = strchr(buffer + len + 1, buffer[len]);
  482. if(!ep)
  483. return false;
  484. attrib[attr++] = (caddr_t)buffer + len + 1;
  485. *(ep++) = 0;
  486. len = ep - buffer;
  487. continue;
  488. }
  489. if(!isElement(buffer[len]))
  490. return false;
  491. attrib[attr++] = (caddr_t)buffer;
  492. while(isElement(buffer[len]) && len < bufpos)
  493. ++len;
  494. if(len == bufpos) {
  495. buffer[len] = 0;
  496. break;
  497. }
  498. }
  499. attrib[attr++] = NULL;
  500. attrib[attr++] = NULL;
  501. startElement((caddr_t)buffer, attrib);
  502. if(end)
  503. goto ending;
  504. bufpos = 0;
  505. return true;
  506. }
  507. return true;
  508. }
  509. // all our lovely base virtuals stubbed out so if we are lazy and forget to
  510. // implement something we want to ignore anyway (say comments...) we don't
  511. // bring whatever it is crashing down one day when we choose to add a
  512. // comment into an xml stream...
  513. void XMLParser::startDocument()
  514. {
  515. }
  516. void XMLParser::endDocument()
  517. {
  518. }
  519. void XMLParser::comment(caddr_t text, size_t len)
  520. {
  521. }
  522. void XMLParser::characters(caddr_t text, size_t len)
  523. {
  524. }