/Modules/expat/xmltok_impl.c

http://unladen-swallow.googlecode.com/ · C · 1779 lines · 1684 code · 54 blank · 41 comment · 417 complexity · 2d135f01d4064285f754cb587b3c24cf MD5 · raw file

  1. /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
  2. See the file COPYING for copying permission.
  3. */
  4. #ifndef IS_INVALID_CHAR
  5. #define IS_INVALID_CHAR(enc, ptr, n) (0)
  6. #endif
  7. #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  8. case BT_LEAD ## n: \
  9. if (end - ptr < n) \
  10. return XML_TOK_PARTIAL_CHAR; \
  11. if (IS_INVALID_CHAR(enc, ptr, n)) { \
  12. *(nextTokPtr) = (ptr); \
  13. return XML_TOK_INVALID; \
  14. } \
  15. ptr += n; \
  16. break;
  17. #define INVALID_CASES(ptr, nextTokPtr) \
  18. INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  19. INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  20. INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  21. case BT_NONXML: \
  22. case BT_MALFORM: \
  23. case BT_TRAIL: \
  24. *(nextTokPtr) = (ptr); \
  25. return XML_TOK_INVALID;
  26. #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  27. case BT_LEAD ## n: \
  28. if (end - ptr < n) \
  29. return XML_TOK_PARTIAL_CHAR; \
  30. if (!IS_NAME_CHAR(enc, ptr, n)) { \
  31. *nextTokPtr = ptr; \
  32. return XML_TOK_INVALID; \
  33. } \
  34. ptr += n; \
  35. break;
  36. #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  37. case BT_NONASCII: \
  38. if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
  39. *nextTokPtr = ptr; \
  40. return XML_TOK_INVALID; \
  41. } \
  42. case BT_NMSTRT: \
  43. case BT_HEX: \
  44. case BT_DIGIT: \
  45. case BT_NAME: \
  46. case BT_MINUS: \
  47. ptr += MINBPC(enc); \
  48. break; \
  49. CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  50. CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  51. CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
  52. #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  53. case BT_LEAD ## n: \
  54. if (end - ptr < n) \
  55. return XML_TOK_PARTIAL_CHAR; \
  56. if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
  57. *nextTokPtr = ptr; \
  58. return XML_TOK_INVALID; \
  59. } \
  60. ptr += n; \
  61. break;
  62. #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  63. case BT_NONASCII: \
  64. if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
  65. *nextTokPtr = ptr; \
  66. return XML_TOK_INVALID; \
  67. } \
  68. case BT_NMSTRT: \
  69. case BT_HEX: \
  70. ptr += MINBPC(enc); \
  71. break; \
  72. CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  73. CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  74. CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
  75. #ifndef PREFIX
  76. #define PREFIX(ident) ident
  77. #endif
  78. /* ptr points to character following "<!-" */
  79. static int PTRCALL
  80. PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
  81. const char *end, const char **nextTokPtr)
  82. {
  83. if (ptr != end) {
  84. if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  85. *nextTokPtr = ptr;
  86. return XML_TOK_INVALID;
  87. }
  88. ptr += MINBPC(enc);
  89. while (ptr != end) {
  90. switch (BYTE_TYPE(enc, ptr)) {
  91. INVALID_CASES(ptr, nextTokPtr)
  92. case BT_MINUS:
  93. if ((ptr += MINBPC(enc)) == end)
  94. return XML_TOK_PARTIAL;
  95. if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  96. if ((ptr += MINBPC(enc)) == end)
  97. return XML_TOK_PARTIAL;
  98. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  99. *nextTokPtr = ptr;
  100. return XML_TOK_INVALID;
  101. }
  102. *nextTokPtr = ptr + MINBPC(enc);
  103. return XML_TOK_COMMENT;
  104. }
  105. break;
  106. default:
  107. ptr += MINBPC(enc);
  108. break;
  109. }
  110. }
  111. }
  112. return XML_TOK_PARTIAL;
  113. }
  114. /* ptr points to character following "<!" */
  115. static int PTRCALL
  116. PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
  117. const char *end, const char **nextTokPtr)
  118. {
  119. if (ptr == end)
  120. return XML_TOK_PARTIAL;
  121. switch (BYTE_TYPE(enc, ptr)) {
  122. case BT_MINUS:
  123. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  124. case BT_LSQB:
  125. *nextTokPtr = ptr + MINBPC(enc);
  126. return XML_TOK_COND_SECT_OPEN;
  127. case BT_NMSTRT:
  128. case BT_HEX:
  129. ptr += MINBPC(enc);
  130. break;
  131. default:
  132. *nextTokPtr = ptr;
  133. return XML_TOK_INVALID;
  134. }
  135. while (ptr != end) {
  136. switch (BYTE_TYPE(enc, ptr)) {
  137. case BT_PERCNT:
  138. if (ptr + MINBPC(enc) == end)
  139. return XML_TOK_PARTIAL;
  140. /* don't allow <!ENTITY% foo "whatever"> */
  141. switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
  142. case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
  143. *nextTokPtr = ptr;
  144. return XML_TOK_INVALID;
  145. }
  146. /* fall through */
  147. case BT_S: case BT_CR: case BT_LF:
  148. *nextTokPtr = ptr;
  149. return XML_TOK_DECL_OPEN;
  150. case BT_NMSTRT:
  151. case BT_HEX:
  152. ptr += MINBPC(enc);
  153. break;
  154. default:
  155. *nextTokPtr = ptr;
  156. return XML_TOK_INVALID;
  157. }
  158. }
  159. return XML_TOK_PARTIAL;
  160. }
  161. static int PTRCALL
  162. PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
  163. const char *end, int *tokPtr)
  164. {
  165. int upper = 0;
  166. *tokPtr = XML_TOK_PI;
  167. if (end - ptr != MINBPC(enc)*3)
  168. return 1;
  169. switch (BYTE_TO_ASCII(enc, ptr)) {
  170. case ASCII_x:
  171. break;
  172. case ASCII_X:
  173. upper = 1;
  174. break;
  175. default:
  176. return 1;
  177. }
  178. ptr += MINBPC(enc);
  179. switch (BYTE_TO_ASCII(enc, ptr)) {
  180. case ASCII_m:
  181. break;
  182. case ASCII_M:
  183. upper = 1;
  184. break;
  185. default:
  186. return 1;
  187. }
  188. ptr += MINBPC(enc);
  189. switch (BYTE_TO_ASCII(enc, ptr)) {
  190. case ASCII_l:
  191. break;
  192. case ASCII_L:
  193. upper = 1;
  194. break;
  195. default:
  196. return 1;
  197. }
  198. if (upper)
  199. return 0;
  200. *tokPtr = XML_TOK_XML_DECL;
  201. return 1;
  202. }
  203. /* ptr points to character following "<?" */
  204. static int PTRCALL
  205. PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
  206. const char *end, const char **nextTokPtr)
  207. {
  208. int tok;
  209. const char *target = ptr;
  210. if (ptr == end)
  211. return XML_TOK_PARTIAL;
  212. switch (BYTE_TYPE(enc, ptr)) {
  213. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  214. default:
  215. *nextTokPtr = ptr;
  216. return XML_TOK_INVALID;
  217. }
  218. while (ptr != end) {
  219. switch (BYTE_TYPE(enc, ptr)) {
  220. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  221. case BT_S: case BT_CR: case BT_LF:
  222. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  223. *nextTokPtr = ptr;
  224. return XML_TOK_INVALID;
  225. }
  226. ptr += MINBPC(enc);
  227. while (ptr != end) {
  228. switch (BYTE_TYPE(enc, ptr)) {
  229. INVALID_CASES(ptr, nextTokPtr)
  230. case BT_QUEST:
  231. ptr += MINBPC(enc);
  232. if (ptr == end)
  233. return XML_TOK_PARTIAL;
  234. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  235. *nextTokPtr = ptr + MINBPC(enc);
  236. return tok;
  237. }
  238. break;
  239. default:
  240. ptr += MINBPC(enc);
  241. break;
  242. }
  243. }
  244. return XML_TOK_PARTIAL;
  245. case BT_QUEST:
  246. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  247. *nextTokPtr = ptr;
  248. return XML_TOK_INVALID;
  249. }
  250. ptr += MINBPC(enc);
  251. if (ptr == end)
  252. return XML_TOK_PARTIAL;
  253. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  254. *nextTokPtr = ptr + MINBPC(enc);
  255. return tok;
  256. }
  257. /* fall through */
  258. default:
  259. *nextTokPtr = ptr;
  260. return XML_TOK_INVALID;
  261. }
  262. }
  263. return XML_TOK_PARTIAL;
  264. }
  265. static int PTRCALL
  266. PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
  267. const char *end, const char **nextTokPtr)
  268. {
  269. static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
  270. ASCII_T, ASCII_A, ASCII_LSQB };
  271. int i;
  272. /* CDATA[ */
  273. if (end - ptr < 6 * MINBPC(enc))
  274. return XML_TOK_PARTIAL;
  275. for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
  276. if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
  277. *nextTokPtr = ptr;
  278. return XML_TOK_INVALID;
  279. }
  280. }
  281. *nextTokPtr = ptr;
  282. return XML_TOK_CDATA_SECT_OPEN;
  283. }
  284. static int PTRCALL
  285. PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
  286. const char *end, const char **nextTokPtr)
  287. {
  288. if (ptr == end)
  289. return XML_TOK_NONE;
  290. if (MINBPC(enc) > 1) {
  291. size_t n = end - ptr;
  292. if (n & (MINBPC(enc) - 1)) {
  293. n &= ~(MINBPC(enc) - 1);
  294. if (n == 0)
  295. return XML_TOK_PARTIAL;
  296. end = ptr + n;
  297. }
  298. }
  299. switch (BYTE_TYPE(enc, ptr)) {
  300. case BT_RSQB:
  301. ptr += MINBPC(enc);
  302. if (ptr == end)
  303. return XML_TOK_PARTIAL;
  304. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  305. break;
  306. ptr += MINBPC(enc);
  307. if (ptr == end)
  308. return XML_TOK_PARTIAL;
  309. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  310. ptr -= MINBPC(enc);
  311. break;
  312. }
  313. *nextTokPtr = ptr + MINBPC(enc);
  314. return XML_TOK_CDATA_SECT_CLOSE;
  315. case BT_CR:
  316. ptr += MINBPC(enc);
  317. if (ptr == end)
  318. return XML_TOK_PARTIAL;
  319. if (BYTE_TYPE(enc, ptr) == BT_LF)
  320. ptr += MINBPC(enc);
  321. *nextTokPtr = ptr;
  322. return XML_TOK_DATA_NEWLINE;
  323. case BT_LF:
  324. *nextTokPtr = ptr + MINBPC(enc);
  325. return XML_TOK_DATA_NEWLINE;
  326. INVALID_CASES(ptr, nextTokPtr)
  327. default:
  328. ptr += MINBPC(enc);
  329. break;
  330. }
  331. while (ptr != end) {
  332. switch (BYTE_TYPE(enc, ptr)) {
  333. #define LEAD_CASE(n) \
  334. case BT_LEAD ## n: \
  335. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  336. *nextTokPtr = ptr; \
  337. return XML_TOK_DATA_CHARS; \
  338. } \
  339. ptr += n; \
  340. break;
  341. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  342. #undef LEAD_CASE
  343. case BT_NONXML:
  344. case BT_MALFORM:
  345. case BT_TRAIL:
  346. case BT_CR:
  347. case BT_LF:
  348. case BT_RSQB:
  349. *nextTokPtr = ptr;
  350. return XML_TOK_DATA_CHARS;
  351. default:
  352. ptr += MINBPC(enc);
  353. break;
  354. }
  355. }
  356. *nextTokPtr = ptr;
  357. return XML_TOK_DATA_CHARS;
  358. }
  359. /* ptr points to character following "</" */
  360. static int PTRCALL
  361. PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
  362. const char *end, const char **nextTokPtr)
  363. {
  364. if (ptr == end)
  365. return XML_TOK_PARTIAL;
  366. switch (BYTE_TYPE(enc, ptr)) {
  367. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  368. default:
  369. *nextTokPtr = ptr;
  370. return XML_TOK_INVALID;
  371. }
  372. while (ptr != end) {
  373. switch (BYTE_TYPE(enc, ptr)) {
  374. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  375. case BT_S: case BT_CR: case BT_LF:
  376. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  377. switch (BYTE_TYPE(enc, ptr)) {
  378. case BT_S: case BT_CR: case BT_LF:
  379. break;
  380. case BT_GT:
  381. *nextTokPtr = ptr + MINBPC(enc);
  382. return XML_TOK_END_TAG;
  383. default:
  384. *nextTokPtr = ptr;
  385. return XML_TOK_INVALID;
  386. }
  387. }
  388. return XML_TOK_PARTIAL;
  389. #ifdef XML_NS
  390. case BT_COLON:
  391. /* no need to check qname syntax here,
  392. since end-tag must match exactly */
  393. ptr += MINBPC(enc);
  394. break;
  395. #endif
  396. case BT_GT:
  397. *nextTokPtr = ptr + MINBPC(enc);
  398. return XML_TOK_END_TAG;
  399. default:
  400. *nextTokPtr = ptr;
  401. return XML_TOK_INVALID;
  402. }
  403. }
  404. return XML_TOK_PARTIAL;
  405. }
  406. /* ptr points to character following "&#X" */
  407. static int PTRCALL
  408. PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
  409. const char *end, const char **nextTokPtr)
  410. {
  411. if (ptr != end) {
  412. switch (BYTE_TYPE(enc, ptr)) {
  413. case BT_DIGIT:
  414. case BT_HEX:
  415. break;
  416. default:
  417. *nextTokPtr = ptr;
  418. return XML_TOK_INVALID;
  419. }
  420. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  421. switch (BYTE_TYPE(enc, ptr)) {
  422. case BT_DIGIT:
  423. case BT_HEX:
  424. break;
  425. case BT_SEMI:
  426. *nextTokPtr = ptr + MINBPC(enc);
  427. return XML_TOK_CHAR_REF;
  428. default:
  429. *nextTokPtr = ptr;
  430. return XML_TOK_INVALID;
  431. }
  432. }
  433. }
  434. return XML_TOK_PARTIAL;
  435. }
  436. /* ptr points to character following "&#" */
  437. static int PTRCALL
  438. PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
  439. const char *end, const char **nextTokPtr)
  440. {
  441. if (ptr != end) {
  442. if (CHAR_MATCHES(enc, ptr, ASCII_x))
  443. return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  444. switch (BYTE_TYPE(enc, ptr)) {
  445. case BT_DIGIT:
  446. break;
  447. default:
  448. *nextTokPtr = ptr;
  449. return XML_TOK_INVALID;
  450. }
  451. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  452. switch (BYTE_TYPE(enc, ptr)) {
  453. case BT_DIGIT:
  454. break;
  455. case BT_SEMI:
  456. *nextTokPtr = ptr + MINBPC(enc);
  457. return XML_TOK_CHAR_REF;
  458. default:
  459. *nextTokPtr = ptr;
  460. return XML_TOK_INVALID;
  461. }
  462. }
  463. }
  464. return XML_TOK_PARTIAL;
  465. }
  466. /* ptr points to character following "&" */
  467. static int PTRCALL
  468. PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
  469. const char **nextTokPtr)
  470. {
  471. if (ptr == end)
  472. return XML_TOK_PARTIAL;
  473. switch (BYTE_TYPE(enc, ptr)) {
  474. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  475. case BT_NUM:
  476. return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  477. default:
  478. *nextTokPtr = ptr;
  479. return XML_TOK_INVALID;
  480. }
  481. while (ptr != end) {
  482. switch (BYTE_TYPE(enc, ptr)) {
  483. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  484. case BT_SEMI:
  485. *nextTokPtr = ptr + MINBPC(enc);
  486. return XML_TOK_ENTITY_REF;
  487. default:
  488. *nextTokPtr = ptr;
  489. return XML_TOK_INVALID;
  490. }
  491. }
  492. return XML_TOK_PARTIAL;
  493. }
  494. /* ptr points to character following first character of attribute name */
  495. static int PTRCALL
  496. PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
  497. const char **nextTokPtr)
  498. {
  499. #ifdef XML_NS
  500. int hadColon = 0;
  501. #endif
  502. while (ptr != end) {
  503. switch (BYTE_TYPE(enc, ptr)) {
  504. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  505. #ifdef XML_NS
  506. case BT_COLON:
  507. if (hadColon) {
  508. *nextTokPtr = ptr;
  509. return XML_TOK_INVALID;
  510. }
  511. hadColon = 1;
  512. ptr += MINBPC(enc);
  513. if (ptr == end)
  514. return XML_TOK_PARTIAL;
  515. switch (BYTE_TYPE(enc, ptr)) {
  516. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  517. default:
  518. *nextTokPtr = ptr;
  519. return XML_TOK_INVALID;
  520. }
  521. break;
  522. #endif
  523. case BT_S: case BT_CR: case BT_LF:
  524. for (;;) {
  525. int t;
  526. ptr += MINBPC(enc);
  527. if (ptr == end)
  528. return XML_TOK_PARTIAL;
  529. t = BYTE_TYPE(enc, ptr);
  530. if (t == BT_EQUALS)
  531. break;
  532. switch (t) {
  533. case BT_S:
  534. case BT_LF:
  535. case BT_CR:
  536. break;
  537. default:
  538. *nextTokPtr = ptr;
  539. return XML_TOK_INVALID;
  540. }
  541. }
  542. /* fall through */
  543. case BT_EQUALS:
  544. {
  545. int open;
  546. #ifdef XML_NS
  547. hadColon = 0;
  548. #endif
  549. for (;;) {
  550. ptr += MINBPC(enc);
  551. if (ptr == end)
  552. return XML_TOK_PARTIAL;
  553. open = BYTE_TYPE(enc, ptr);
  554. if (open == BT_QUOT || open == BT_APOS)
  555. break;
  556. switch (open) {
  557. case BT_S:
  558. case BT_LF:
  559. case BT_CR:
  560. break;
  561. default:
  562. *nextTokPtr = ptr;
  563. return XML_TOK_INVALID;
  564. }
  565. }
  566. ptr += MINBPC(enc);
  567. /* in attribute value */
  568. for (;;) {
  569. int t;
  570. if (ptr == end)
  571. return XML_TOK_PARTIAL;
  572. t = BYTE_TYPE(enc, ptr);
  573. if (t == open)
  574. break;
  575. switch (t) {
  576. INVALID_CASES(ptr, nextTokPtr)
  577. case BT_AMP:
  578. {
  579. int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
  580. if (tok <= 0) {
  581. if (tok == XML_TOK_INVALID)
  582. *nextTokPtr = ptr;
  583. return tok;
  584. }
  585. break;
  586. }
  587. case BT_LT:
  588. *nextTokPtr = ptr;
  589. return XML_TOK_INVALID;
  590. default:
  591. ptr += MINBPC(enc);
  592. break;
  593. }
  594. }
  595. ptr += MINBPC(enc);
  596. if (ptr == end)
  597. return XML_TOK_PARTIAL;
  598. switch (BYTE_TYPE(enc, ptr)) {
  599. case BT_S:
  600. case BT_CR:
  601. case BT_LF:
  602. break;
  603. case BT_SOL:
  604. goto sol;
  605. case BT_GT:
  606. goto gt;
  607. default:
  608. *nextTokPtr = ptr;
  609. return XML_TOK_INVALID;
  610. }
  611. /* ptr points to closing quote */
  612. for (;;) {
  613. ptr += MINBPC(enc);
  614. if (ptr == end)
  615. return XML_TOK_PARTIAL;
  616. switch (BYTE_TYPE(enc, ptr)) {
  617. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  618. case BT_S: case BT_CR: case BT_LF:
  619. continue;
  620. case BT_GT:
  621. gt:
  622. *nextTokPtr = ptr + MINBPC(enc);
  623. return XML_TOK_START_TAG_WITH_ATTS;
  624. case BT_SOL:
  625. sol:
  626. ptr += MINBPC(enc);
  627. if (ptr == end)
  628. return XML_TOK_PARTIAL;
  629. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  630. *nextTokPtr = ptr;
  631. return XML_TOK_INVALID;
  632. }
  633. *nextTokPtr = ptr + MINBPC(enc);
  634. return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
  635. default:
  636. *nextTokPtr = ptr;
  637. return XML_TOK_INVALID;
  638. }
  639. break;
  640. }
  641. break;
  642. }
  643. default:
  644. *nextTokPtr = ptr;
  645. return XML_TOK_INVALID;
  646. }
  647. }
  648. return XML_TOK_PARTIAL;
  649. }
  650. /* ptr points to character following "<" */
  651. static int PTRCALL
  652. PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
  653. const char **nextTokPtr)
  654. {
  655. #ifdef XML_NS
  656. int hadColon;
  657. #endif
  658. if (ptr == end)
  659. return XML_TOK_PARTIAL;
  660. switch (BYTE_TYPE(enc, ptr)) {
  661. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  662. case BT_EXCL:
  663. if ((ptr += MINBPC(enc)) == end)
  664. return XML_TOK_PARTIAL;
  665. switch (BYTE_TYPE(enc, ptr)) {
  666. case BT_MINUS:
  667. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  668. case BT_LSQB:
  669. return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
  670. end, nextTokPtr);
  671. }
  672. *nextTokPtr = ptr;
  673. return XML_TOK_INVALID;
  674. case BT_QUEST:
  675. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  676. case BT_SOL:
  677. return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  678. default:
  679. *nextTokPtr = ptr;
  680. return XML_TOK_INVALID;
  681. }
  682. #ifdef XML_NS
  683. hadColon = 0;
  684. #endif
  685. /* we have a start-tag */
  686. while (ptr != end) {
  687. switch (BYTE_TYPE(enc, ptr)) {
  688. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  689. #ifdef XML_NS
  690. case BT_COLON:
  691. if (hadColon) {
  692. *nextTokPtr = ptr;
  693. return XML_TOK_INVALID;
  694. }
  695. hadColon = 1;
  696. ptr += MINBPC(enc);
  697. if (ptr == end)
  698. return XML_TOK_PARTIAL;
  699. switch (BYTE_TYPE(enc, ptr)) {
  700. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  701. default:
  702. *nextTokPtr = ptr;
  703. return XML_TOK_INVALID;
  704. }
  705. break;
  706. #endif
  707. case BT_S: case BT_CR: case BT_LF:
  708. {
  709. ptr += MINBPC(enc);
  710. while (ptr != end) {
  711. switch (BYTE_TYPE(enc, ptr)) {
  712. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  713. case BT_GT:
  714. goto gt;
  715. case BT_SOL:
  716. goto sol;
  717. case BT_S: case BT_CR: case BT_LF:
  718. ptr += MINBPC(enc);
  719. continue;
  720. default:
  721. *nextTokPtr = ptr;
  722. return XML_TOK_INVALID;
  723. }
  724. return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
  725. }
  726. return XML_TOK_PARTIAL;
  727. }
  728. case BT_GT:
  729. gt:
  730. *nextTokPtr = ptr + MINBPC(enc);
  731. return XML_TOK_START_TAG_NO_ATTS;
  732. case BT_SOL:
  733. sol:
  734. ptr += MINBPC(enc);
  735. if (ptr == end)
  736. return XML_TOK_PARTIAL;
  737. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  738. *nextTokPtr = ptr;
  739. return XML_TOK_INVALID;
  740. }
  741. *nextTokPtr = ptr + MINBPC(enc);
  742. return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
  743. default:
  744. *nextTokPtr = ptr;
  745. return XML_TOK_INVALID;
  746. }
  747. }
  748. return XML_TOK_PARTIAL;
  749. }
  750. static int PTRCALL
  751. PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
  752. const char **nextTokPtr)
  753. {
  754. if (ptr == end)
  755. return XML_TOK_NONE;
  756. if (MINBPC(enc) > 1) {
  757. size_t n = end - ptr;
  758. if (n & (MINBPC(enc) - 1)) {
  759. n &= ~(MINBPC(enc) - 1);
  760. if (n == 0)
  761. return XML_TOK_PARTIAL;
  762. end = ptr + n;
  763. }
  764. }
  765. switch (BYTE_TYPE(enc, ptr)) {
  766. case BT_LT:
  767. return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  768. case BT_AMP:
  769. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  770. case BT_CR:
  771. ptr += MINBPC(enc);
  772. if (ptr == end)
  773. return XML_TOK_TRAILING_CR;
  774. if (BYTE_TYPE(enc, ptr) == BT_LF)
  775. ptr += MINBPC(enc);
  776. *nextTokPtr = ptr;
  777. return XML_TOK_DATA_NEWLINE;
  778. case BT_LF:
  779. *nextTokPtr = ptr + MINBPC(enc);
  780. return XML_TOK_DATA_NEWLINE;
  781. case BT_RSQB:
  782. ptr += MINBPC(enc);
  783. if (ptr == end)
  784. return XML_TOK_TRAILING_RSQB;
  785. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  786. break;
  787. ptr += MINBPC(enc);
  788. if (ptr == end)
  789. return XML_TOK_TRAILING_RSQB;
  790. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  791. ptr -= MINBPC(enc);
  792. break;
  793. }
  794. *nextTokPtr = ptr;
  795. return XML_TOK_INVALID;
  796. INVALID_CASES(ptr, nextTokPtr)
  797. default:
  798. ptr += MINBPC(enc);
  799. break;
  800. }
  801. while (ptr != end) {
  802. switch (BYTE_TYPE(enc, ptr)) {
  803. #define LEAD_CASE(n) \
  804. case BT_LEAD ## n: \
  805. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  806. *nextTokPtr = ptr; \
  807. return XML_TOK_DATA_CHARS; \
  808. } \
  809. ptr += n; \
  810. break;
  811. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  812. #undef LEAD_CASE
  813. case BT_RSQB:
  814. if (ptr + MINBPC(enc) != end) {
  815. if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
  816. ptr += MINBPC(enc);
  817. break;
  818. }
  819. if (ptr + 2*MINBPC(enc) != end) {
  820. if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
  821. ptr += MINBPC(enc);
  822. break;
  823. }
  824. *nextTokPtr = ptr + 2*MINBPC(enc);
  825. return XML_TOK_INVALID;
  826. }
  827. }
  828. /* fall through */
  829. case BT_AMP:
  830. case BT_LT:
  831. case BT_NONXML:
  832. case BT_MALFORM:
  833. case BT_TRAIL:
  834. case BT_CR:
  835. case BT_LF:
  836. *nextTokPtr = ptr;
  837. return XML_TOK_DATA_CHARS;
  838. default:
  839. ptr += MINBPC(enc);
  840. break;
  841. }
  842. }
  843. *nextTokPtr = ptr;
  844. return XML_TOK_DATA_CHARS;
  845. }
  846. /* ptr points to character following "%" */
  847. static int PTRCALL
  848. PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
  849. const char **nextTokPtr)
  850. {
  851. if (ptr == end)
  852. return -XML_TOK_PERCENT;
  853. switch (BYTE_TYPE(enc, ptr)) {
  854. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  855. case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
  856. *nextTokPtr = ptr;
  857. return XML_TOK_PERCENT;
  858. default:
  859. *nextTokPtr = ptr;
  860. return XML_TOK_INVALID;
  861. }
  862. while (ptr != end) {
  863. switch (BYTE_TYPE(enc, ptr)) {
  864. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  865. case BT_SEMI:
  866. *nextTokPtr = ptr + MINBPC(enc);
  867. return XML_TOK_PARAM_ENTITY_REF;
  868. default:
  869. *nextTokPtr = ptr;
  870. return XML_TOK_INVALID;
  871. }
  872. }
  873. return XML_TOK_PARTIAL;
  874. }
  875. static int PTRCALL
  876. PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
  877. const char **nextTokPtr)
  878. {
  879. if (ptr == end)
  880. return XML_TOK_PARTIAL;
  881. switch (BYTE_TYPE(enc, ptr)) {
  882. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  883. default:
  884. *nextTokPtr = ptr;
  885. return XML_TOK_INVALID;
  886. }
  887. while (ptr != end) {
  888. switch (BYTE_TYPE(enc, ptr)) {
  889. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  890. case BT_CR: case BT_LF: case BT_S:
  891. case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
  892. *nextTokPtr = ptr;
  893. return XML_TOK_POUND_NAME;
  894. default:
  895. *nextTokPtr = ptr;
  896. return XML_TOK_INVALID;
  897. }
  898. }
  899. return -XML_TOK_POUND_NAME;
  900. }
  901. static int PTRCALL
  902. PREFIX(scanLit)(int open, const ENCODING *enc,
  903. const char *ptr, const char *end,
  904. const char **nextTokPtr)
  905. {
  906. while (ptr != end) {
  907. int t = BYTE_TYPE(enc, ptr);
  908. switch (t) {
  909. INVALID_CASES(ptr, nextTokPtr)
  910. case BT_QUOT:
  911. case BT_APOS:
  912. ptr += MINBPC(enc);
  913. if (t != open)
  914. break;
  915. if (ptr == end)
  916. return -XML_TOK_LITERAL;
  917. *nextTokPtr = ptr;
  918. switch (BYTE_TYPE(enc, ptr)) {
  919. case BT_S: case BT_CR: case BT_LF:
  920. case BT_GT: case BT_PERCNT: case BT_LSQB:
  921. return XML_TOK_LITERAL;
  922. default:
  923. return XML_TOK_INVALID;
  924. }
  925. default:
  926. ptr += MINBPC(enc);
  927. break;
  928. }
  929. }
  930. return XML_TOK_PARTIAL;
  931. }
  932. static int PTRCALL
  933. PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
  934. const char **nextTokPtr)
  935. {
  936. int tok;
  937. if (ptr == end)
  938. return XML_TOK_NONE;
  939. if (MINBPC(enc) > 1) {
  940. size_t n = end - ptr;
  941. if (n & (MINBPC(enc) - 1)) {
  942. n &= ~(MINBPC(enc) - 1);
  943. if (n == 0)
  944. return XML_TOK_PARTIAL;
  945. end = ptr + n;
  946. }
  947. }
  948. switch (BYTE_TYPE(enc, ptr)) {
  949. case BT_QUOT:
  950. return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  951. case BT_APOS:
  952. return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  953. case BT_LT:
  954. {
  955. ptr += MINBPC(enc);
  956. if (ptr == end)
  957. return XML_TOK_PARTIAL;
  958. switch (BYTE_TYPE(enc, ptr)) {
  959. case BT_EXCL:
  960. return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  961. case BT_QUEST:
  962. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  963. case BT_NMSTRT:
  964. case BT_HEX:
  965. case BT_NONASCII:
  966. case BT_LEAD2:
  967. case BT_LEAD3:
  968. case BT_LEAD4:
  969. *nextTokPtr = ptr - MINBPC(enc);
  970. return XML_TOK_INSTANCE_START;
  971. }
  972. *nextTokPtr = ptr;
  973. return XML_TOK_INVALID;
  974. }
  975. case BT_CR:
  976. if (ptr + MINBPC(enc) == end) {
  977. *nextTokPtr = end;
  978. /* indicate that this might be part of a CR/LF pair */
  979. return -XML_TOK_PROLOG_S;
  980. }
  981. /* fall through */
  982. case BT_S: case BT_LF:
  983. for (;;) {
  984. ptr += MINBPC(enc);
  985. if (ptr == end)
  986. break;
  987. switch (BYTE_TYPE(enc, ptr)) {
  988. case BT_S: case BT_LF:
  989. break;
  990. case BT_CR:
  991. /* don't split CR/LF pair */
  992. if (ptr + MINBPC(enc) != end)
  993. break;
  994. /* fall through */
  995. default:
  996. *nextTokPtr = ptr;
  997. return XML_TOK_PROLOG_S;
  998. }
  999. }
  1000. *nextTokPtr = ptr;
  1001. return XML_TOK_PROLOG_S;
  1002. case BT_PERCNT:
  1003. return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1004. case BT_COMMA:
  1005. *nextTokPtr = ptr + MINBPC(enc);
  1006. return XML_TOK_COMMA;
  1007. case BT_LSQB:
  1008. *nextTokPtr = ptr + MINBPC(enc);
  1009. return XML_TOK_OPEN_BRACKET;
  1010. case BT_RSQB:
  1011. ptr += MINBPC(enc);
  1012. if (ptr == end)
  1013. return -XML_TOK_CLOSE_BRACKET;
  1014. if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1015. if (ptr + MINBPC(enc) == end)
  1016. return XML_TOK_PARTIAL;
  1017. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
  1018. *nextTokPtr = ptr + 2*MINBPC(enc);
  1019. return XML_TOK_COND_SECT_CLOSE;
  1020. }
  1021. }
  1022. *nextTokPtr = ptr;
  1023. return XML_TOK_CLOSE_BRACKET;
  1024. case BT_LPAR:
  1025. *nextTokPtr = ptr + MINBPC(enc);
  1026. return XML_TOK_OPEN_PAREN;
  1027. case BT_RPAR:
  1028. ptr += MINBPC(enc);
  1029. if (ptr == end)
  1030. return -XML_TOK_CLOSE_PAREN;
  1031. switch (BYTE_TYPE(enc, ptr)) {
  1032. case BT_AST:
  1033. *nextTokPtr = ptr + MINBPC(enc);
  1034. return XML_TOK_CLOSE_PAREN_ASTERISK;
  1035. case BT_QUEST:
  1036. *nextTokPtr = ptr + MINBPC(enc);
  1037. return XML_TOK_CLOSE_PAREN_QUESTION;
  1038. case BT_PLUS:
  1039. *nextTokPtr = ptr + MINBPC(enc);
  1040. return XML_TOK_CLOSE_PAREN_PLUS;
  1041. case BT_CR: case BT_LF: case BT_S:
  1042. case BT_GT: case BT_COMMA: case BT_VERBAR:
  1043. case BT_RPAR:
  1044. *nextTokPtr = ptr;
  1045. return XML_TOK_CLOSE_PAREN;
  1046. }
  1047. *nextTokPtr = ptr;
  1048. return XML_TOK_INVALID;
  1049. case BT_VERBAR:
  1050. *nextTokPtr = ptr + MINBPC(enc);
  1051. return XML_TOK_OR;
  1052. case BT_GT:
  1053. *nextTokPtr = ptr + MINBPC(enc);
  1054. return XML_TOK_DECL_CLOSE;
  1055. case BT_NUM:
  1056. return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1057. #define LEAD_CASE(n) \
  1058. case BT_LEAD ## n: \
  1059. if (end - ptr < n) \
  1060. return XML_TOK_PARTIAL_CHAR; \
  1061. if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
  1062. ptr += n; \
  1063. tok = XML_TOK_NAME; \
  1064. break; \
  1065. } \
  1066. if (IS_NAME_CHAR(enc, ptr, n)) { \
  1067. ptr += n; \
  1068. tok = XML_TOK_NMTOKEN; \
  1069. break; \
  1070. } \
  1071. *nextTokPtr = ptr; \
  1072. return XML_TOK_INVALID;
  1073. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1074. #undef LEAD_CASE
  1075. case BT_NMSTRT:
  1076. case BT_HEX:
  1077. tok = XML_TOK_NAME;
  1078. ptr += MINBPC(enc);
  1079. break;
  1080. case BT_DIGIT:
  1081. case BT_NAME:
  1082. case BT_MINUS:
  1083. #ifdef XML_NS
  1084. case BT_COLON:
  1085. #endif
  1086. tok = XML_TOK_NMTOKEN;
  1087. ptr += MINBPC(enc);
  1088. break;
  1089. case BT_NONASCII:
  1090. if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
  1091. ptr += MINBPC(enc);
  1092. tok = XML_TOK_NAME;
  1093. break;
  1094. }
  1095. if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
  1096. ptr += MINBPC(enc);
  1097. tok = XML_TOK_NMTOKEN;
  1098. break;
  1099. }
  1100. /* fall through */
  1101. default:
  1102. *nextTokPtr = ptr;
  1103. return XML_TOK_INVALID;
  1104. }
  1105. while (ptr != end) {
  1106. switch (BYTE_TYPE(enc, ptr)) {
  1107. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1108. case BT_GT: case BT_RPAR: case BT_COMMA:
  1109. case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
  1110. case BT_S: case BT_CR: case BT_LF:
  1111. *nextTokPtr = ptr;
  1112. return tok;
  1113. #ifdef XML_NS
  1114. case BT_COLON:
  1115. ptr += MINBPC(enc);
  1116. switch (tok) {
  1117. case XML_TOK_NAME:
  1118. if (ptr == end)
  1119. return XML_TOK_PARTIAL;
  1120. tok = XML_TOK_PREFIXED_NAME;
  1121. switch (BYTE_TYPE(enc, ptr)) {
  1122. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1123. default:
  1124. tok = XML_TOK_NMTOKEN;
  1125. break;
  1126. }
  1127. break;
  1128. case XML_TOK_PREFIXED_NAME:
  1129. tok = XML_TOK_NMTOKEN;
  1130. break;
  1131. }
  1132. break;
  1133. #endif
  1134. case BT_PLUS:
  1135. if (tok == XML_TOK_NMTOKEN) {
  1136. *nextTokPtr = ptr;
  1137. return XML_TOK_INVALID;
  1138. }
  1139. *nextTokPtr = ptr + MINBPC(enc);
  1140. return XML_TOK_NAME_PLUS;
  1141. case BT_AST:
  1142. if (tok == XML_TOK_NMTOKEN) {
  1143. *nextTokPtr = ptr;
  1144. return XML_TOK_INVALID;
  1145. }
  1146. *nextTokPtr = ptr + MINBPC(enc);
  1147. return XML_TOK_NAME_ASTERISK;
  1148. case BT_QUEST:
  1149. if (tok == XML_TOK_NMTOKEN) {
  1150. *nextTokPtr = ptr;
  1151. return XML_TOK_INVALID;
  1152. }
  1153. *nextTokPtr = ptr + MINBPC(enc);
  1154. return XML_TOK_NAME_QUESTION;
  1155. default:
  1156. *nextTokPtr = ptr;
  1157. return XML_TOK_INVALID;
  1158. }
  1159. }
  1160. return -tok;
  1161. }
  1162. static int PTRCALL
  1163. PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
  1164. const char *end, const char **nextTokPtr)
  1165. {
  1166. const char *start;
  1167. if (ptr == end)
  1168. return XML_TOK_NONE;
  1169. start = ptr;
  1170. while (ptr != end) {
  1171. switch (BYTE_TYPE(enc, ptr)) {
  1172. #define LEAD_CASE(n) \
  1173. case BT_LEAD ## n: ptr += n; break;
  1174. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1175. #undef LEAD_CASE
  1176. case BT_AMP:
  1177. if (ptr == start)
  1178. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1179. *nextTokPtr = ptr;
  1180. return XML_TOK_DATA_CHARS;
  1181. case BT_LT:
  1182. /* this is for inside entity references */
  1183. *nextTokPtr = ptr;
  1184. return XML_TOK_INVALID;
  1185. case BT_LF:
  1186. if (ptr == start) {
  1187. *nextTokPtr = ptr + MINBPC(enc);
  1188. return XML_TOK_DATA_NEWLINE;
  1189. }
  1190. *nextTokPtr = ptr;
  1191. return XML_TOK_DATA_CHARS;
  1192. case BT_CR:
  1193. if (ptr == start) {
  1194. ptr += MINBPC(enc);
  1195. if (ptr == end)
  1196. return XML_TOK_TRAILING_CR;
  1197. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1198. ptr += MINBPC(enc);
  1199. *nextTokPtr = ptr;
  1200. return XML_TOK_DATA_NEWLINE;
  1201. }
  1202. *nextTokPtr = ptr;
  1203. return XML_TOK_DATA_CHARS;
  1204. case BT_S:
  1205. if (ptr == start) {
  1206. *nextTokPtr = ptr + MINBPC(enc);
  1207. return XML_TOK_ATTRIBUTE_VALUE_S;
  1208. }
  1209. *nextTokPtr = ptr;
  1210. return XML_TOK_DATA_CHARS;
  1211. default:
  1212. ptr += MINBPC(enc);
  1213. break;
  1214. }
  1215. }
  1216. *nextTokPtr = ptr;
  1217. return XML_TOK_DATA_CHARS;
  1218. }
  1219. static int PTRCALL
  1220. PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
  1221. const char *end, const char **nextTokPtr)
  1222. {
  1223. const char *start;
  1224. if (ptr == end)
  1225. return XML_TOK_NONE;
  1226. start = ptr;
  1227. while (ptr != end) {
  1228. switch (BYTE_TYPE(enc, ptr)) {
  1229. #define LEAD_CASE(n) \
  1230. case BT_LEAD ## n: ptr += n; break;
  1231. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1232. #undef LEAD_CASE
  1233. case BT_AMP:
  1234. if (ptr == start)
  1235. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1236. *nextTokPtr = ptr;
  1237. return XML_TOK_DATA_CHARS;
  1238. case BT_PERCNT:
  1239. if (ptr == start) {
  1240. int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
  1241. end, nextTokPtr);
  1242. return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
  1243. }
  1244. *nextTokPtr = ptr;
  1245. return XML_TOK_DATA_CHARS;
  1246. case BT_LF:
  1247. if (ptr == start) {
  1248. *nextTokPtr = ptr + MINBPC(enc);
  1249. return XML_TOK_DATA_NEWLINE;
  1250. }
  1251. *nextTokPtr = ptr;
  1252. return XML_TOK_DATA_CHARS;
  1253. case BT_CR:
  1254. if (ptr == start) {
  1255. ptr += MINBPC(enc);
  1256. if (ptr == end)
  1257. return XML_TOK_TRAILING_CR;
  1258. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1259. ptr += MINBPC(enc);
  1260. *nextTokPtr = ptr;
  1261. return XML_TOK_DATA_NEWLINE;
  1262. }
  1263. *nextTokPtr = ptr;
  1264. return XML_TOK_DATA_CHARS;
  1265. default:
  1266. ptr += MINBPC(enc);
  1267. break;
  1268. }
  1269. }
  1270. *nextTokPtr = ptr;
  1271. return XML_TOK_DATA_CHARS;
  1272. }
  1273. #ifdef XML_DTD
  1274. static int PTRCALL
  1275. PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
  1276. const char *end, const char **nextTokPtr)
  1277. {
  1278. int level = 0;
  1279. if (MINBPC(enc) > 1) {
  1280. size_t n = end - ptr;
  1281. if (n & (MINBPC(enc) - 1)) {
  1282. n &= ~(MINBPC(enc) - 1);
  1283. end = ptr + n;
  1284. }
  1285. }
  1286. while (ptr != end) {
  1287. switch (BYTE_TYPE(enc, ptr)) {
  1288. INVALID_CASES(ptr, nextTokPtr)
  1289. case BT_LT:
  1290. if ((ptr += MINBPC(enc)) == end)
  1291. return XML_TOK_PARTIAL;
  1292. if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
  1293. if ((ptr += MINBPC(enc)) == end)
  1294. return XML_TOK_PARTIAL;
  1295. if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
  1296. ++level;
  1297. ptr += MINBPC(enc);
  1298. }
  1299. }
  1300. break;
  1301. case BT_RSQB:
  1302. if ((ptr += MINBPC(enc)) == end)
  1303. return XML_TOK_PARTIAL;
  1304. if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1305. if ((ptr += MINBPC(enc)) == end)
  1306. return XML_TOK_PARTIAL;
  1307. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  1308. ptr += MINBPC(enc);
  1309. if (level == 0) {
  1310. *nextTokPtr = ptr;
  1311. return XML_TOK_IGNORE_SECT;
  1312. }
  1313. --level;
  1314. }
  1315. }
  1316. break;
  1317. default:
  1318. ptr += MINBPC(enc);
  1319. break;
  1320. }
  1321. }
  1322. return XML_TOK_PARTIAL;
  1323. }
  1324. #endif /* XML_DTD */
  1325. static int PTRCALL
  1326. PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
  1327. const char **badPtr)
  1328. {
  1329. ptr += MINBPC(enc);
  1330. end -= MINBPC(enc);
  1331. for (; ptr != end; ptr += MINBPC(enc)) {
  1332. switch (BYTE_TYPE(enc, ptr)) {
  1333. case BT_DIGIT:
  1334. case BT_HEX:
  1335. case BT_MINUS:
  1336. case BT_APOS:
  1337. case BT_LPAR:
  1338. case BT_RPAR:
  1339. case BT_PLUS:
  1340. case BT_COMMA:
  1341. case BT_SOL:
  1342. case BT_EQUALS:
  1343. case BT_QUEST:
  1344. case BT_CR:
  1345. case BT_LF:
  1346. case BT_SEMI:
  1347. case BT_EXCL:
  1348. case BT_AST:
  1349. case BT_PERCNT:
  1350. case BT_NUM:
  1351. #ifdef XML_NS
  1352. case BT_COLON:
  1353. #endif
  1354. break;
  1355. case BT_S:
  1356. if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
  1357. *badPtr = ptr;
  1358. return 0;
  1359. }
  1360. break;
  1361. case BT_NAME:
  1362. case BT_NMSTRT:
  1363. if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
  1364. break;
  1365. default:
  1366. switch (BYTE_TO_ASCII(enc, ptr)) {
  1367. case 0x24: /* $ */
  1368. case 0x40: /* @ */
  1369. break;
  1370. default:
  1371. *badPtr = ptr;
  1372. return 0;
  1373. }
  1374. break;
  1375. }
  1376. }
  1377. return 1;
  1378. }
  1379. /* This must only be called for a well-formed start-tag or empty
  1380. element tag. Returns the number of attributes. Pointers to the
  1381. first attsMax attributes are stored in atts.
  1382. */
  1383. static int PTRCALL
  1384. PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
  1385. int attsMax, ATTRIBUTE *atts)
  1386. {
  1387. enum { other, inName, inValue } state = inName;
  1388. int nAtts = 0;
  1389. int open = 0; /* defined when state == inValue;
  1390. initialization just to shut up compilers */
  1391. for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
  1392. switch (BYTE_TYPE(enc, ptr)) {
  1393. #define START_NAME \
  1394. if (state == other) { \
  1395. if (nAtts < attsMax) { \
  1396. atts[nAtts].name = ptr; \
  1397. atts[nAtts].normalized = 1; \
  1398. } \
  1399. state = inName; \
  1400. }
  1401. #define LEAD_CASE(n) \
  1402. case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
  1403. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1404. #undef LEAD_CASE
  1405. case BT_NONASCII:
  1406. case BT_NMSTRT:
  1407. case BT_HEX:
  1408. START_NAME
  1409. break;
  1410. #undef START_NAME
  1411. case BT_QUOT:
  1412. if (state != inValue) {
  1413. if (nAtts < attsMax)
  1414. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1415. state = inValue;
  1416. open = BT_QUOT;
  1417. }
  1418. else if (open == BT_QUOT) {
  1419. state = other;
  1420. if (nAtts < attsMax)
  1421. atts[nAtts].valueEnd = ptr;
  1422. nAtts++;
  1423. }
  1424. break;
  1425. case BT_APOS:
  1426. if (state != inValue) {
  1427. if (nAtts < attsMax)
  1428. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1429. state = inValue;
  1430. open = BT_APOS;
  1431. }
  1432. else if (open == BT_APOS) {
  1433. state = other;
  1434. if (nAtts < attsMax)
  1435. atts[nAtts].valueEnd = ptr;
  1436. nAtts++;
  1437. }
  1438. break;
  1439. case BT_AMP:
  1440. if (nAtts < attsMax)
  1441. atts[nAtts].normalized = 0;
  1442. break;
  1443. case BT_S:
  1444. if (state == inName)
  1445. state = other;
  1446. else if (state == inValue
  1447. && nAtts < attsMax
  1448. && atts[nAtts].normalized
  1449. && (ptr == atts[nAtts].valuePtr
  1450. || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
  1451. || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
  1452. || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
  1453. atts[nAtts].normalized = 0;
  1454. break;
  1455. case BT_CR: case BT_LF:
  1456. /* This case ensures that the first attribute name is counted
  1457. Apart from that we could just change state on the quote. */
  1458. if (state == inName)
  1459. state = other;
  1460. else if (state == inValue && nAtts < attsMax)
  1461. atts[nAtts].normalized = 0;
  1462. break;
  1463. case BT_GT:
  1464. case BT_SOL:
  1465. if (state != inValue)
  1466. return nAtts;
  1467. break;
  1468. default:
  1469. break;
  1470. }
  1471. }
  1472. /* not reached */
  1473. }
  1474. static int PTRFASTCALL
  1475. PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
  1476. {
  1477. int result = 0;
  1478. /* skip &# */
  1479. ptr += 2*MINBPC(enc);
  1480. if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
  1481. for (ptr += MINBPC(enc);
  1482. !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
  1483. ptr += MINBPC(enc)) {
  1484. int c = BYTE_TO_ASCII(enc, ptr);
  1485. switch (c) {
  1486. case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
  1487. case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
  1488. result <<= 4;
  1489. result |= (c - ASCII_0);
  1490. break;
  1491. case ASCII_A: case ASCII_B: case ASCII_C:
  1492. case ASCII_D: case ASCII_E: case ASCII_F:
  1493. result <<= 4;
  1494. result += 10 + (c - ASCII_A);
  1495. break;
  1496. case ASCII_a: case ASCII_b: case ASCII_c:
  1497. case ASCII_d: case ASCII_e: case ASCII_f:
  1498. result <<= 4;
  1499. result += 10 + (c - ASCII_a);
  1500. break;
  1501. }
  1502. if (result >= 0x110000)
  1503. return -1;
  1504. }
  1505. }
  1506. else {
  1507. for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
  1508. int c = BYTE_TO_ASCII(enc, ptr);
  1509. result *= 10;
  1510. result += (c - ASCII_0);
  1511. if (result >= 0x110000)
  1512. return -1;
  1513. }
  1514. }
  1515. return checkCharRefNumber(result);
  1516. }
  1517. static int PTRCALL
  1518. PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
  1519. const char *end)
  1520. {
  1521. switch ((end - ptr)/MINBPC(enc)) {
  1522. case 2:
  1523. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
  1524. switch (BYTE_TO_ASCII(enc, ptr)) {
  1525. case ASCII_l:
  1526. return ASCII_LT;
  1527. case ASCII_g:
  1528. return ASCII_GT;
  1529. }
  1530. }
  1531. break;
  1532. case 3:
  1533. if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
  1534. ptr += MINBPC(enc);
  1535. if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
  1536. ptr += MINBPC(enc);
  1537. if (CHAR_MATCHES(enc, ptr, ASCII_p))
  1538. return ASCII_AMP;
  1539. }
  1540. }
  1541. break;
  1542. case 4:
  1543. switch (BYTE_TO_ASCII(enc, ptr)) {
  1544. case ASCII_q:
  1545. ptr += MINBPC(enc);
  1546. if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
  1547. ptr += MINBPC(enc);
  1548. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1549. ptr += MINBPC(enc);
  1550. if (CHAR_MATCHES(enc, ptr, ASCII_t))
  1551. return ASCII_QUOT;
  1552. }
  1553. }
  1554. break;
  1555. case ASCII_a:
  1556. ptr += MINBPC(enc);
  1557. if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
  1558. ptr += MINBPC(enc);
  1559. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1560. ptr += MINBPC(enc);
  1561. if (CHAR_MATCHES(enc, ptr, ASCII_s))
  1562. return ASCII_APOS;
  1563. }
  1564. }
  1565. break;
  1566. }
  1567. }
  1568. return 0;
  1569. }
  1570. static int PTRCALL
  1571. PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
  1572. {
  1573. for (;;) {
  1574. switch (BYTE_TYPE(enc, ptr1)) {
  1575. #define LEAD_CASE(n) \
  1576. case BT_LEAD ## n: \
  1577. if (*ptr1++ != *ptr2++) \
  1578. return 0;
  1579. LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
  1580. #undef LEAD_CASE
  1581. /* fall through */
  1582. if (*ptr1++ != *ptr2++)
  1583. return 0;
  1584. break;
  1585. case BT_NONASCII:
  1586. case BT_NMSTRT:
  1587. #ifdef XML_NS
  1588. case BT_COLON:
  1589. #endif
  1590. case BT_HEX:
  1591. case BT_DIGIT:
  1592. case BT_NAME:
  1593. case BT_MINUS:
  1594. if (*ptr2++ != *ptr1++)
  1595. return 0;
  1596. if (MINBPC(enc) > 1) {
  1597. if (*ptr2++ != *ptr1++)
  1598. return 0;
  1599. if (MINBPC(enc) > 2) {
  1600. if (*ptr2++ != *ptr1++)
  1601. return 0;
  1602. if (MINBPC(enc) > 3) {
  1603. if (*ptr2++ != *ptr1++)
  1604. return 0;
  1605. }
  1606. }
  1607. }
  1608. break;
  1609. default:
  1610. if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
  1611. return 1;
  1612. switch (BYTE_TYPE(enc, ptr2)) {
  1613. case BT_LEAD2:
  1614. case BT_LEAD3:
  1615. case BT_LEAD4:
  1616. case BT_NONASCII:
  1617. case BT_NMSTRT:
  1618. #ifdef XML_NS
  1619. case BT_COLON:
  1620. #endif
  1621. case BT_HEX:
  1622. case BT_DIGIT:
  1623. case BT_NAME:
  1624. case BT_MINUS:
  1625. return 0;
  1626. default:
  1627. return 1;
  1628. }
  1629. }
  1630. }
  1631. /* not reached */
  1632. }
  1633. static int PTRCALL
  1634. PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
  1635. const char *end1, const char *ptr2)
  1636. {
  1637. for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
  1638. if (ptr1 == end1)
  1639. return 0;
  1640. if (!CHAR_MATCHES(enc, ptr1, *ptr2))
  1641. return 0;
  1642. }
  1643. return ptr1 == end1;
  1644. }
  1645. static int PTRFASTCALL
  1646. PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
  1647. {
  1648. const char *start = ptr;
  1649. for (;;) {
  1650. switch (BYTE_TYPE(enc, ptr)) {
  1651. #define LEAD_CASE(n) \
  1652. case BT_LEAD ## n: ptr += n; break;
  1653. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1654. #undef LEAD_CASE
  1655. case BT_NONASCII:
  1656. case BT_NMSTRT:
  1657. #ifdef XML_NS
  1658. case BT_COLON:
  1659. #endif
  1660. case BT_HEX:
  1661. case BT_DIGIT:
  1662. case BT_NAME:
  1663. case BT_MINUS:
  1664. ptr += MINBPC(enc);
  1665. break;
  1666. default:
  1667. return (int)(ptr - start);
  1668. }
  1669. }
  1670. }
  1671. static const char * PTRFASTCALL
  1672. PREFIX(skipS)(const ENCODING *enc, const char *ptr)
  1673. {
  1674. for (;;) {
  1675. switch (BYTE_TYPE(enc, ptr)) {
  1676. case BT_LF:
  1677. case BT_CR:
  1678. case BT_S:
  1679. ptr += MINBPC(enc);
  1680. break;
  1681. default:
  1682. return ptr;
  1683. }
  1684. }
  1685. }
  1686. static void PTRCALL
  1687. PREFIX(updatePosition)(const ENCODING *enc,
  1688. const char *ptr,
  1689. const char *end,
  1690. POSITION *pos)
  1691. {
  1692. while (ptr < end) {
  1693. switch (BYTE_TYPE(enc, ptr)) {
  1694. #define LEAD_CASE(n) \
  1695. case BT_LEAD ## n: \
  1696. ptr += n; \
  1697. break;
  1698. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1699. #undef LEAD_CASE
  1700. case BT_LF:
  1701. pos->columnNumber = (XML_Size)-1;
  1702. pos->lineNumber++;
  1703. ptr += MINBPC(enc);
  1704. break;
  1705. case BT_CR:
  1706. pos->lineNumber++;
  1707. ptr += MINBPC(enc);
  1708. if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
  1709. ptr += MINBPC(enc);
  1710. pos->columnNumber = (XML_Size)-1;
  1711. break;
  1712. default:
  1713. ptr += MINBPC(enc);
  1714. break;
  1715. }
  1716. pos->columnNumber++;
  1717. }
  1718. }
  1719. #undef DO_LEAD_CASE
  1720. #undef MULTIBYTE_CASES
  1721. #undef INVALID_CASES
  1722. #undef CHECK_NAME_CASE
  1723. #undef CHECK_NAME_CASES
  1724. #undef CHECK_NMSTRT_CASE
  1725. #undef CHECK_NMSTRT_CASES