/Modules/expat/xmltok.c

http://unladen-swallow.googlecode.com/ · C · 1639 lines · 1402 code · 162 blank · 75 comment · 301 complexity · 4ad8bf7e12ff6f2a59fdc4b80bd3ed0a MD5 · raw file

  1. /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
  2. See the file COPYING for copying permission.
  3. */
  4. #ifdef COMPILED_FROM_DSP
  5. #include "winconfig.h"
  6. #elif defined(MACOS_CLASSIC)
  7. #include "macconfig.h"
  8. #elif defined(__amigaos4__)
  9. #include "amigaconfig.h"
  10. #else
  11. #ifdef HAVE_EXPAT_CONFIG_H
  12. #include <expat_config.h>
  13. #endif
  14. #endif /* ndef COMPILED_FROM_DSP */
  15. #include <stddef.h>
  16. #include "expat_external.h"
  17. #include "internal.h"
  18. #include "xmltok.h"
  19. #include "nametab.h"
  /* Optional extra scanner entry.  When XML_DTD is defined it expands to a
     leading comma plus the PREFIX-named ignoreSectionTok function, so it can
     be appended inside the first brace group of VTABLE1; otherwise it
     expands to nothing. */
  20. #ifdef XML_DTD
  21. #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
  22. #else
  23. #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
  24. #endif
  /* Initializer fragment listing the PREFIX-named tokenizer functions.  It is
     used below as the leading part of the `ENCODING enc` initializer of each
     struct normal_encoding; the order must therefore match the member order
     of ENCODING (declared in xmltok.h, not visible here).  PREFIX must be
     defined at the point of use. */
  25. #define VTABLE1 \
  26. { PREFIX(prologTok), PREFIX(contentTok), \
  27. PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  28. { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  29. PREFIX(sameName), \
  30. PREFIX(nameMatchesAscii), \
  31. PREFIX(nameLength), \
  32. PREFIX(skipS), \
  33. PREFIX(getAtts), \
  34. PREFIX(charRefNumber), \
  35. PREFIX(predefinedEntityName), \
  36. PREFIX(updatePosition), \
  37. PREFIX(isPublicId)
  /* VTABLE1 followed by the PREFIX-named UTF-8 and UTF-16 converters. */
  38. #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  /* Test whether the 16-bit character with high byte `hi` and low byte `lo`
     has its bit set in namingBitmap.  pages[hi] selects a group of eight
     bitmap words, the top three bits of `lo` select the word within the
     group and the low five bits of `lo` select the bit.  The mask is built
     from an unsigned 1 so that selecting bit 31 never shifts into the sign
     bit of an int (undefined behaviour).  The result is zero or non-zero,
     not normalized to 0/1. */
  #define UCS2_GET_NAMING(pages, hi, lo) \
    (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
  /* A 2 byte UTF-8 representation splits the character's 11 bits between
     the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
     pages, 3 bits to add to that index and 5 bits to generate the mask.
     The mask is built from an unsigned 1 so that bit 31 can be selected
     without shifting into the sign bit of an int.
  */
  #define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1u << (((byte)[1]) & 0x1F)))
  /* A 3 byte UTF-8 representation splits the character's 16 bits between
     the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
     into pages, 3 bits to add to that index and 5 bits to generate the
     mask (again an unsigned mask, see above).
  */
  #define UTF8_GET_NAMING3(pages, byte) \
    (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                           + ((((byte)[1]) >> 2) & 0xF)] \
                   << 3) \
                  + ((((byte)[1]) & 3) << 1) \
                  + ((((byte)[2]) >> 5) & 1)] \
     & (1u << (((byte)[2]) & 0x1F)))
  /* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
     in the bitmap, any other length yields 0.  p may be any byte pointer; it
     is viewed as unsigned bytes. */
  #define UTF8_GET_NAMING(pages, p, n) \
    ((n) == 2 \
     ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
     : ((n) == 3 \
        ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
        : 0))
  /* Detection of invalid UTF-8 sequences is based on Table 3.1B
     of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
     with the additional restriction of not allowing the Unicode
     code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
     Implementation details:
       (A & 0x80) == 0     means A < 0x80
     and
       (A & 0xC0) == 0xC0  means A > 0xBF
     p must point to unsigned bytes.  The argument is parenthesized before
     it is dereferenced so that pointer expressions such as `buf + 1` are
     handled correctly.
  */
  /* Non-zero if the 2 byte sequence at p is invalid: lead byte below 0xC2
     (overlong) or second byte outside 0x80..0xBF. */
  #define UTF8_INVALID2(p) \
    (*(p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
  /* Non-zero if the 3 byte sequence at p (unsigned bytes) is invalid.
     Third byte: must be 0x80..0xBF, and for EF,BF,xx must not exceed 0xBD
     (rejects U+FFFE and U+FFFF).
     Second byte: 0xA0..0xBF after E0 (no overlong forms), 0x80..0x9F after
     ED (no surrogates), otherwise 0x80..0xBF.
     The argument is parenthesized before it is dereferenced so pointer
     expressions such as `buf + 1` are handled correctly. */
  #define UTF8_INVALID3(p) \
    (((p)[2] & 0x80) == 0 \
     || (*(p) == 0xEF && (p)[1] == 0xBF \
         ? (p)[2] > 0xBD \
         : ((p)[2] & 0xC0) == 0xC0) \
     || (*(p) == 0xE0 \
         ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
         : ((p)[1] & 0x80) == 0 \
           || (*(p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
  /* Non-zero if the 4 byte sequence at p (unsigned bytes) is invalid.
     Third and fourth byte: must be 0x80..0xBF.
     Second byte: 0x90..0xBF after F0 (no overlong forms), 0x80..0x8F after
     F4 (nothing above U+10FFFF), otherwise 0x80..0xBF.
     The argument is parenthesized before it is dereferenced so pointer
     expressions such as `buf + 1` are handled correctly. */
  #define UTF8_INVALID4(p) \
    (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
     || ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
     || (*(p) == 0xF0 \
         ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
         : ((p)[1] & 0x80) == 0 \
           || (*(p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
  107. static int PTRFASTCALL
  108. isNever(const ENCODING *enc, const char *p)
  109. {
  110. return 0;
  111. }
  112. static int PTRFASTCALL
  113. utf8_isName2(const ENCODING *enc, const char *p)
  114. {
  115. return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
  116. }
  117. static int PTRFASTCALL
  118. utf8_isName3(const ENCODING *enc, const char *p)
  119. {
  120. return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
  121. }
  122. #define utf8_isName4 isNever
  123. static int PTRFASTCALL
  124. utf8_isNmstrt2(const ENCODING *enc, const char *p)
  125. {
  126. return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
  127. }
  128. static int PTRFASTCALL
  129. utf8_isNmstrt3(const ENCODING *enc, const char *p)
  130. {
  131. return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
  132. }
  133. #define utf8_isNmstrt4 isNever
  134. static int PTRFASTCALL
  135. utf8_isInvalid2(const ENCODING *enc, const char *p)
  136. {
  137. return UTF8_INVALID2((const unsigned char *)p);
  138. }
  139. static int PTRFASTCALL
  140. utf8_isInvalid3(const ENCODING *enc, const char *p)
  141. {
  142. return UTF8_INVALID3((const unsigned char *)p);
  143. }
  144. static int PTRFASTCALL
  145. utf8_isInvalid4(const ENCODING *enc, const char *p)
  146. {
  147. return UTF8_INVALID4((const unsigned char *)p);
  148. }
  /* Encoding object shared by all built-in encodings in this file.  The
     generic ENCODING comes first, so the `const ENCODING *` handed to the
     tokenizer functions can be cast back to this struct (see
     AS_NORMAL_ENCODING below).  Members are filled positionally by the
     table initializers further down, so their order must not change. */
  149. struct normal_encoding {
  150. ENCODING enc;
  /* Byte-type code for each possible byte value; indexed with the byte cast
     to unsigned char (see SB_BYTE_TYPE). */
  151. unsigned char type[256];
  /* Only with XML_MIN_SIZE: per-encoding callbacks reached through the
     BYTE_TYPE, IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and
     CHAR_MATCHES macros; initialized through STANDARD_VTABLE. */
  152. #ifdef XML_MIN_SIZE
  153. int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  154. int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  155. int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  156. int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  157. int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
  158. #endif /* XML_MIN_SIZE */
  /* Predicates for multi-byte sequences; the digit is the sequence length
     in bytes.  Reached through IS_NAME_CHAR, IS_NMSTRT_CHAR and
     IS_INVALID_CHAR, initialized through NORMAL_VTABLE. */
  159. int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  160. int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  161. int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  162. int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  163. int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  164. int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  165. int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  166. int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  167. int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
  168. };
  /* View a generic encoding pointer as the enclosing struct normal_encoding
     (valid because `enc` is that struct's first member). */
  169. #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
  /* Initializer fragment for the XML_MIN_SIZE-only callbacks of struct
     normal_encoding, built by pasting the prefix E onto each member name.
     Note the trailing comma: it is meant to be followed directly by further
     initializers.  Expands to nothing without XML_MIN_SIZE. */
  170. #ifdef XML_MIN_SIZE
  171. #define STANDARD_VTABLE(E) \
  172. E ## byteType, \
  173. E ## isNameMin, \
  174. E ## isNmstrtMin, \
  175. E ## byteToAscii, \
  176. E ## charMatches,
  177. #else
  178. #define STANDARD_VTABLE(E) /* as nothing */
  179. #endif
  /* Initializer fragment for the nine multi-byte predicates of struct
     normal_encoding, in declaration order, named by pasting the prefix E. */
  180. #define NORMAL_VTABLE(E) \
  181. E ## isName2, \
  182. E ## isName3, \
  183. E ## isName4, \
  184. E ## isNmstrt2, \
  185. E ## isNmstrt3, \
  186. E ## isNmstrt4, \
  187. E ## isInvalid2, \
  188. E ## isInvalid3, \
  189. E ## isInvalid4
  /* Forward declaration; the definition appears later in this file. */
  190. static int FASTCALL checkCharRefNumber(int);
  191. #include "xmltok_impl.h"
  192. #include "ascii.h"
  /* Configuration for the first inclusion of xmltok_impl.c (PREFIX normal_),
     which serves the encodings whose smallest character is one byte. */
  /* With XML_MIN_SIZE the single-byte ("sb_") encodings use isNever for the
     minimum-size name predicates. */
  193. #ifdef XML_MIN_SIZE
  194. #define sb_isNameMin isNever
  195. #define sb_isNmstrtMin isNever
  196. #endif
  /* MINBPC: read from the encoding object with XML_MIN_SIZE, otherwise the
     compile-time constant 1. */
  197. #ifdef XML_MIN_SIZE
  198. #define MINBPC(enc) ((enc)->minBytesPerChar)
  199. #else
  200. /* minimum bytes per character */
  201. #define MINBPC(enc) 1
  202. #endif
  /* Byte type of the byte at p, looked up in the encoding's 256-entry table.
     NOTE(review): the cast drops the const qualifier of enc, unlike
     AS_NORMAL_ENCODING; the access is read-only. */
  203. #define SB_BYTE_TYPE(enc, p) \
  204. (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
  /* BYTE_TYPE: indirect call through the encoding's byteType member with
     XML_MIN_SIZE (sb_byteType is the single-byte implementation), direct
     table lookup otherwise. */
  205. #ifdef XML_MIN_SIZE
  206. static int PTRFASTCALL
  207. sb_byteType(const ENCODING *enc, const char *p)
  208. {
  209. return SB_BYTE_TYPE(enc, p);
  210. }
  211. #define BYTE_TYPE(enc, p) \
  212. (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
  213. #else
  214. #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
  215. #endif
  /* BYTE_TO_ASCII: indirect call through the encoding's byteToAscii member
     with XML_MIN_SIZE (sb_byteToAscii simply returns the byte at p),
     otherwise a plain dereference of p. */
  216. #ifdef XML_MIN_SIZE
  217. #define BYTE_TO_ASCII(enc, p) \
  218. (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
  219. static int PTRFASTCALL
  220. sb_byteToAscii(const ENCODING *enc, const char *p)
  221. {
  222. return *p;
  223. }
  224. #else
  225. #define BYTE_TO_ASCII(enc, p) (*(p))
  226. #endif
  /* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member name, so
     e.g. IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2 callback. */
  227. #define IS_NAME_CHAR(enc, p, n) \
  228. (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
  229. #define IS_NMSTRT_CHAR(enc, p, n) \
  230. (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
  231. #define IS_INVALID_CHAR(enc, p, n) \
  232. (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
  /* Minimum-size-character name predicates: indirect calls with
     XML_MIN_SIZE, constant 0 otherwise for this instantiation. */
  233. #ifdef XML_MIN_SIZE
  234. #define IS_NAME_CHAR_MINBPC(enc, p) \
  235. (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
  236. #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  237. (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
  238. #else
  239. #define IS_NAME_CHAR_MINBPC(enc, p) (0)
  240. #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
  241. #endif
  /* CHAR_MATCHES(enc, p, c): non-zero if the character at p equals the ASCII
     character c.  With XML_MIN_SIZE this is an indirect call through the
     encoding's charMatches member (sb_charMatches is the single-byte
     implementation); otherwise it is a direct comparison.  The argument c is
     parenthesized so that an expression argument (for example a conditional
     expression) is compared as a whole. */
  #ifdef XML_MIN_SIZE
  #define CHAR_MATCHES(enc, p, c) \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
  static int PTRCALL
  sb_charMatches(const ENCODING *enc, const char *p, int c)
  {
    return *p == c;
  }
  #else
  /* c is an ASCII character */
  #define CHAR_MATCHES(enc, p, c) (*(p) == (c))
  #endif
  /* Instantiate the tokenizer template: every PREFIX(name) in
     xmltok_impl.c becomes normal_name, using the macros configured above. */
  254. #define PREFIX(ident) normal_ ## ident
  255. #include "xmltok_impl.c"
  /* Drop the per-instantiation macros so that they can be redefined before
     xmltok_impl.c is included again further down. */
  256. #undef MINBPC
  257. #undef BYTE_TYPE
  258. #undef BYTE_TO_ASCII
  259. #undef CHAR_MATCHES
  260. #undef IS_NAME_CHAR
  261. #undef IS_NAME_CHAR_MINBPC
  262. #undef IS_NMSTRT_CHAR
  263. #undef IS_NMSTRT_CHAR_MINBPC
  264. #undef IS_INVALID_CHAR
  /* Marker bits OR-ed into the first byte when encoding an N byte UTF-8
     sequence (used by latin1_toUtf8, the UTF-16 converters and
     XmlUtf8Encode in this file). */
  265. enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
  266. UTF8_cval1 = 0x00,
  267. UTF8_cval2 = 0xc0,
  268. UTF8_cval3 = 0xe0,
  269. UTF8_cval4 = 0xf0
  270. };
  271. static void PTRCALL
  272. utf8_toUtf8(const ENCODING *enc,
  273. const char **fromP, const char *fromLim,
  274. char **toP, const char *toLim)
  275. {
  276. char *to;
  277. const char *from;
  278. if (fromLim - *fromP > toLim - *toP) {
  279. /* Avoid copying partial characters. */
  280. for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
  281. if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
  282. break;
  283. }
  284. for (to = *toP, from = *fromP; from != fromLim; from++, to++)
  285. *to = *from;
  286. *fromP = from;
  287. *toP = to;
  288. }
  289. static void PTRCALL
  290. utf8_toUtf16(const ENCODING *enc,
  291. const char **fromP, const char *fromLim,
  292. unsigned short **toP, const unsigned short *toLim)
  293. {
  294. unsigned short *to = *toP;
  295. const char *from = *fromP;
  296. while (from != fromLim && to != toLim) {
  297. switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
  298. case BT_LEAD2:
  299. *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
  300. from += 2;
  301. break;
  302. case BT_LEAD3:
  303. *to++ = (unsigned short)(((from[0] & 0xf) << 12)
  304. | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
  305. from += 3;
  306. break;
  307. case BT_LEAD4:
  308. {
  309. unsigned long n;
  310. if (to + 1 == toLim)
  311. goto after;
  312. n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
  313. | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
  314. n -= 0x10000;
  315. to[0] = (unsigned short)((n >> 10) | 0xD800);
  316. to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
  317. to += 2;
  318. from += 4;
  319. }
  320. break;
  321. default:
  322. *to++ = *from++;
  323. break;
  324. }
  325. }
  326. after:
  327. *fromP = from;
  328. *toP = to;
  329. }
  /* UTF-8 encoding objects.  Each initializer has three parts: the ENCODING
     (normal_ tokenizer functions via VTABLE1, the UTF-8 converters and three
     integers filling the remaining ENCODING members -- presumably
     minBytesPerChar and two flags, confirm against xmltok.h), the 256-entry
     byte-type table assembled from two included headers, and the callback
     slots.  The *_ns variants exist only with XML_NS.  The plain variants
     include the ASCII table with BT_COLON temporarily redefined as
     BT_NMSTRT. */
  330. #ifdef XML_NS
  331. static const struct normal_encoding utf8_encoding_ns = {
  332. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  333. {
  334. #include "asciitab.h"
  335. #include "utf8tab.h"
  336. },
  337. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  338. };
  339. #endif
  340. static const struct normal_encoding utf8_encoding = {
  341. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  342. {
  343. #define BT_COLON BT_NMSTRT
  344. #include "asciitab.h"
  345. #undef BT_COLON
  346. #include "utf8tab.h"
  347. },
  348. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  349. };
  /* "internal" UTF-8 variants: identical except that the first half of the
     byte-type table comes from iasciitab.h instead of asciitab.h. */
  350. #ifdef XML_NS
  351. static const struct normal_encoding internal_utf8_encoding_ns = {
  352. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  353. {
  354. #include "iasciitab.h"
  355. #include "utf8tab.h"
  356. },
  357. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  358. };
  359. #endif
  360. static const struct normal_encoding internal_utf8_encoding = {
  361. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  362. {
  363. #define BT_COLON BT_NMSTRT
  364. #include "iasciitab.h"
  365. #undef BT_COLON
  366. #include "utf8tab.h"
  367. },
  368. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  369. };
  370. static void PTRCALL
  371. latin1_toUtf8(const ENCODING *enc,
  372. const char **fromP, const char *fromLim,
  373. char **toP, const char *toLim)
  374. {
  375. for (;;) {
  376. unsigned char c;
  377. if (*fromP == fromLim)
  378. break;
  379. c = (unsigned char)**fromP;
  380. if (c & 0x80) {
  381. if (toLim - *toP < 2)
  382. break;
  383. *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
  384. *(*toP)++ = (char)((c & 0x3f) | 0x80);
  385. (*fromP)++;
  386. }
  387. else {
  388. if (*toP == toLim)
  389. break;
  390. *(*toP)++ = *(*fromP)++;
  391. }
  392. }
  393. }
  394. static void PTRCALL
  395. latin1_toUtf16(const ENCODING *enc,
  396. const char **fromP, const char *fromLim,
  397. unsigned short **toP, const unsigned short *toLim)
  398. {
  399. while (*fromP != fromLim && *toP != toLim)
  400. *(*toP)++ = (unsigned char)*(*fromP)++;
  401. }
  /* Latin-1 encoding objects: normal_ tokenizer functions, the Latin-1
     converters, and a byte-type table assembled from asciitab.h and
     latin1tab.h.  No NORMAL_VTABLE entries are given, so the multi-byte
     predicate members are left zero-initialized.  The _ns variant exists
     only with XML_NS; the plain variant includes the ASCII table with
     BT_COLON temporarily redefined as BT_NMSTRT.  The three trailing
     integers fill the remaining ENCODING members (declared in xmltok.h;
     confirm their meaning there). */
  402. #ifdef XML_NS
  403. static const struct normal_encoding latin1_encoding_ns = {
  404. { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  405. {
  406. #include "asciitab.h"
  407. #include "latin1tab.h"
  408. },
  409. STANDARD_VTABLE(sb_)
  410. };
  411. #endif
  412. static const struct normal_encoding latin1_encoding = {
  413. { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  414. {
  415. #define BT_COLON BT_NMSTRT
  416. #include "asciitab.h"
  417. #undef BT_COLON
  418. #include "latin1tab.h"
  419. },
  420. STANDARD_VTABLE(sb_)
  421. };
  422. static void PTRCALL
  423. ascii_toUtf8(const ENCODING *enc,
  424. const char **fromP, const char *fromLim,
  425. char **toP, const char *toLim)
  426. {
  427. while (*fromP != fromLim && *toP != toLim)
  428. *(*toP)++ = *(*fromP)++;
  429. }
  /* ASCII encoding objects: normal_ tokenizer functions, ascii_toUtf8 and
     the Latin-1 UTF-16 converter.  Only the entries supplied by asciitab.h
     are given for the byte-type table; the rest is zero-initialized, which
     per the in-line comment equals BT_NONXML.  The _ns variant exists only
     with XML_NS; the plain variant includes the table with BT_COLON
     temporarily redefined as BT_NMSTRT. */
  430. #ifdef XML_NS
  431. static const struct normal_encoding ascii_encoding_ns = {
  432. { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  433. {
  434. #include "asciitab.h"
  435. /* BT_NONXML == 0 */
  436. },
  437. STANDARD_VTABLE(sb_)
  438. };
  439. #endif
  440. static const struct normal_encoding ascii_encoding = {
  441. { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  442. {
  443. #define BT_COLON BT_NMSTRT
  444. #include "asciitab.h"
  445. #undef BT_COLON
  446. /* BT_NONXML == 0 */
  447. },
  448. STANDARD_VTABLE(sb_)
  449. };
  450. static int PTRFASTCALL
  451. unicode_byte_type(char hi, char lo)
  452. {
  453. switch ((unsigned char)hi) {
  454. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  455. return BT_LEAD4;
  456. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  457. return BT_TRAIL;
  458. case 0xFF:
  459. switch ((unsigned char)lo) {
  460. case 0xFF:
  461. case 0xFE:
  462. return BT_NONXML;
  463. }
  464. break;
  465. }
  466. return BT_NONASCII;
  467. }
  /* Generates the function E##toUtf8, which converts UTF-16 input in
     [*fromP, fromLim) to UTF-8 in [*toP, toLim).  GET_LO/GET_HI must be
     defined at the point of expansion; they select the byte order.
     Input is consumed two bytes at a time and dispatched on the high byte:
       0 with low byte < 0x80 -> 1 output byte
       0..7 otherwise         -> 2 output bytes
       0xD8..0xDB             -> 4 output bytes, built from this unit and
                                 the following one (surrogate pair)
       anything else          -> 3 output bytes
     Before each character the remaining output space is checked; when it is
     too small, *fromP is set to the current input position and the function
     returns.  *toP is advanced as bytes are written.
     NOTE(review): the loop ends only when `from == fromLim` and the
     surrogate case reads the second unit without comparing against fromLim,
     so the input is assumed to consist of whole code units and complete
     pairs -- confirm that callers guarantee this. */
  468. #define DEFINE_UTF16_TO_UTF8(E) \
  469. static void PTRCALL \
  470. E ## toUtf8(const ENCODING *enc, \
  471. const char **fromP, const char *fromLim, \
  472. char **toP, const char *toLim) \
  473. { \
  474. const char *from; \
  475. for (from = *fromP; from != fromLim; from += 2) { \
  476. int plane; \
  477. unsigned char lo2; \
  478. unsigned char lo = GET_LO(from); \
  479. unsigned char hi = GET_HI(from); \
  480. switch (hi) { \
  481. case 0: \
  482. if (lo < 0x80) { \
  483. if (*toP == toLim) { \
  484. *fromP = from; \
  485. return; \
  486. } \
  487. *(*toP)++ = lo; \
  488. break; \
  489. } \
  490. /* fall through */ \
  491. case 0x1: case 0x2: case 0x3: \
  492. case 0x4: case 0x5: case 0x6: case 0x7: \
  493. if (toLim - *toP < 2) { \
  494. *fromP = from; \
  495. return; \
  496. } \
  497. *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
  498. *(*toP)++ = ((lo & 0x3f) | 0x80); \
  499. break; \
  500. default: \
  501. if (toLim - *toP < 3) { \
  502. *fromP = from; \
  503. return; \
  504. } \
  505. /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
  506. *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
  507. *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
  508. *(*toP)++ = ((lo & 0x3f) | 0x80); \
  509. break; \
  510. case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
  511. if (toLim - *toP < 4) { \
  512. *fromP = from; \
  513. return; \
  514. } \
  515. plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
  516. *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
  517. *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
  518. from += 2; \
  519. lo2 = GET_LO(from); \
  520. *(*toP)++ = (((lo & 0x3) << 4) \
  521. | ((GET_HI(from) & 0x3) << 2) \
  522. | (lo2 >> 6) \
  523. | 0x80); \
  524. *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
  525. break; \
  526. } \
  527. } \
  528. *fromP = from; \
  529. }
  /* Generates the function E##toUtf16, which copies UTF-16 input in
     [*fromP, fromLim) to native unsigned short code units in [*toP, toLim),
     combining the two bytes of each unit with GET_HI/GET_LO.  When the input
     holds more units than the output can take and the last input unit has a
     high byte in 0xD8..0xDF, that unit is excluded from this call (see the
     in-line comment).  Both pointers are advanced in place. */
  530. #define DEFINE_UTF16_TO_UTF16(E) \
  531. static void PTRCALL \
  532. E ## toUtf16(const ENCODING *enc, \
  533. const char **fromP, const char *fromLim, \
  534. unsigned short **toP, const unsigned short *toLim) \
  535. { \
  536. /* Avoid copying first half only of surrogate */ \
  537. if (fromLim - *fromP > ((toLim - *toP) << 1) \
  538. && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
  539. fromLim -= 2; \
  540. for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
  541. *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  542. }
  /* Little-endian instantiation: byte 0 is the low byte, byte 1 the high
     byte.  SET2 stores a 16-bit value in that order.  Produces
     little2_toUtf8 and little2_toUtf16. */
  543. #define SET2(ptr, ch) \
  544. (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
  545. #define GET_LO(ptr) ((unsigned char)(ptr)[0])
  546. #define GET_HI(ptr) ((unsigned char)(ptr)[1])
  547. DEFINE_UTF16_TO_UTF8(little2_)
  548. DEFINE_UTF16_TO_UTF16(little2_)
  549. #undef SET2
  550. #undef GET_LO
  551. #undef GET_HI
  /* Big-endian instantiation: byte 0 is the high byte, byte 1 the low byte.
     Produces big2_toUtf8 and big2_toUtf16. */
  552. #define SET2(ptr, ch) \
  553. (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
  554. #define GET_LO(ptr) ((unsigned char)(ptr)[1])
  555. #define GET_HI(ptr) ((unsigned char)(ptr)[0])
  556. DEFINE_UTF16_TO_UTF8(big2_)
  557. DEFINE_UTF16_TO_UTF16(big2_)
  558. #undef SET2
  559. #undef GET_LO
  560. #undef GET_HI
  /* Helpers for little-endian 2-byte characters: p[0] is the low byte and
     p[1] the high byte.  All macro arguments are parenthesized so that
     expression arguments are evaluated as a whole, and the encoding is
     accessed through a const-qualified pointer. */
  /* Byte type: table lookup on the low byte when the high byte is zero,
     otherwise classification by unicode_byte_type(high, low). */
  #define LITTLE2_BYTE_TYPE(enc, p) \
    ((p)[1] == 0 \
     ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
     : unicode_byte_type((p)[1], (p)[0]))
  /* The low byte when the high byte is zero, otherwise -1. */
  #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
  /* Non-zero if the character equals the ASCII character c. */
  #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
  /* Naming-bitmap lookups with (high, low) = (p[1], p[0]). */
  #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
  #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
  /* Two ways of providing the little-endian tokenizer:
     - with XML_MIN_SIZE, only small callback functions wrapping the LITTLE2_*
       macros are defined here (they are installed through
       STANDARD_VTABLE(little2_)), and VTABLE is redefined to combine VTABLE1
       with the little2_ converters; xmltok_impl.c is not included again in
       this branch;
     - otherwise xmltok_impl.c is included a second time with PREFIX set to
       little2_ and the per-character macros bound directly to the LITTLE2_*
       macros, after which those macros are undefined again. */
  571. #ifdef XML_MIN_SIZE
  572. static int PTRFASTCALL
  573. little2_byteType(const ENCODING *enc, const char *p)
  574. {
  575. return LITTLE2_BYTE_TYPE(enc, p);
  576. }
  577. static int PTRFASTCALL
  578. little2_byteToAscii(const ENCODING *enc, const char *p)
  579. {
  580. return LITTLE2_BYTE_TO_ASCII(enc, p);
  581. }
  582. static int PTRCALL
  583. little2_charMatches(const ENCODING *enc, const char *p, int c)
  584. {
  585. return LITTLE2_CHAR_MATCHES(enc, p, c);
  586. }
  587. static int PTRFASTCALL
  588. little2_isNameMin(const ENCODING *enc, const char *p)
  589. {
  590. return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
  591. }
  592. static int PTRFASTCALL
  593. little2_isNmstrtMin(const ENCODING *enc, const char *p)
  594. {
  595. return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  596. }
  597. #undef VTABLE
  598. #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
  599. #else /* not XML_MIN_SIZE */
  600. #undef PREFIX
  601. #define PREFIX(ident) little2_ ## ident
  602. #define MINBPC(enc) 2
  603. /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  604. #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
  605. #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
  606. #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
  607. #define IS_NAME_CHAR(enc, p, n) 0
  608. #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
  609. #define IS_NMSTRT_CHAR(enc, p, n) (0)
  610. #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  611. #include "xmltok_impl.c"
  612. #undef MINBPC
  613. #undef BYTE_TYPE
  614. #undef BYTE_TO_ASCII
  615. #undef CHAR_MATCHES
  616. #undef IS_NAME_CHAR
  617. #undef IS_NAME_CHAR_MINBPC
  618. #undef IS_NMSTRT_CHAR
  619. #undef IS_NMSTRT_CHAR_MINBPC
  620. #undef IS_INVALID_CHAR
  621. #endif /* not XML_MIN_SIZE */
  /* Little-endian 2-byte encoding objects.  The ENCODING part uses VTABLE
     (which includes the toUtf8/toUtf16 converters) followed by three
     integers; the last one is 1 only when BYTEORDER == 1234 -- presumably
     the flag saying the input already is native UTF-16, confirm against
     xmltok.h.  The byte-type table (asciitab.h + latin1tab.h) is consulted
     for characters whose high byte is zero (see LITTLE2_BYTE_TYPE).  The _ns
     variants exist only with XML_NS; the plain variants include the ASCII
     table with BT_COLON temporarily redefined as BT_NMSTRT. */
  622. #ifdef XML_NS
  623. static const struct normal_encoding little2_encoding_ns = {
  624. { VTABLE, 2, 0,
  625. #if BYTEORDER == 1234
  626. 1
  627. #else
  628. 0
  629. #endif
  630. },
  631. {
  632. #include "asciitab.h"
  633. #include "latin1tab.h"
  634. },
  635. STANDARD_VTABLE(little2_)
  636. };
  637. #endif
  638. static const struct normal_encoding little2_encoding = {
  639. { VTABLE, 2, 0,
  640. #if BYTEORDER == 1234
  641. 1
  642. #else
  643. 0
  644. #endif
  645. },
  646. {
  647. #define BT_COLON BT_NMSTRT
  648. #include "asciitab.h"
  649. #undef BT_COLON
  650. #include "latin1tab.h"
  651. },
  652. STANDARD_VTABLE(little2_)
  653. };
  /* "internal" little-endian variants, compiled unless BYTEORDER is 4321.
     They use iasciitab.h for the ASCII half of the table and always set the
     last ENCODING integer to 1. */
  654. #if BYTEORDER != 4321
  655. #ifdef XML_NS
  656. static const struct normal_encoding internal_little2_encoding_ns = {
  657. { VTABLE, 2, 0, 1 },
  658. {
  659. #include "iasciitab.h"
  660. #include "latin1tab.h"
  661. },
  662. STANDARD_VTABLE(little2_)
  663. };
  664. #endif
  665. static const struct normal_encoding internal_little2_encoding = {
  666. { VTABLE, 2, 0, 1 },
  667. {
  668. #define BT_COLON BT_NMSTRT
  669. #include "iasciitab.h"
  670. #undef BT_COLON
  671. #include "latin1tab.h"
  672. },
  673. STANDARD_VTABLE(little2_)
  674. };
  675. #endif
  /* Helpers for big-endian 2-byte characters: p[0] is the high byte and
     p[1] the low byte.  All macro arguments are parenthesized so that
     expression arguments are evaluated as a whole, and the encoding is
     accessed through a const-qualified pointer. */
  /* Byte type: table lookup on the low byte when the high byte is zero,
     otherwise classification by unicode_byte_type(high, low). */
  #define BIG2_BYTE_TYPE(enc, p) \
    ((p)[0] == 0 \
     ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
     : unicode_byte_type((p)[0], (p)[1]))
  /* The low byte when the high byte is zero, otherwise -1. */
  #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
  /* Non-zero if the character equals the ASCII character c. */
  #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
  /* Naming-bitmap lookups with (high, low) = (p[0], p[1]). */
  #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
  #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
  /* Two ways of providing the big-endian tokenizer:
     - with XML_MIN_SIZE, only small callback functions wrapping the BIG2_*
       macros are defined here (they are installed through
       STANDARD_VTABLE(big2_)), and VTABLE is redefined to combine VTABLE1
       with the big2_ converters; xmltok_impl.c is not included again in this
       branch;
     - otherwise xmltok_impl.c is included once more with PREFIX set to big2_
       and the per-character macros bound directly to the BIG2_* macros,
       after which those macros are undefined again. */
  686. #ifdef XML_MIN_SIZE
  687. static int PTRFASTCALL
  688. big2_byteType(const ENCODING *enc, const char *p)
  689. {
  690. return BIG2_BYTE_TYPE(enc, p);
  691. }
  692. static int PTRFASTCALL
  693. big2_byteToAscii(const ENCODING *enc, const char *p)
  694. {
  695. return BIG2_BYTE_TO_ASCII(enc, p);
  696. }
  697. static int PTRCALL
  698. big2_charMatches(const ENCODING *enc, const char *p, int c)
  699. {
  700. return BIG2_CHAR_MATCHES(enc, p, c);
  701. }
  702. static int PTRFASTCALL
  703. big2_isNameMin(const ENCODING *enc, const char *p)
  704. {
  705. return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
  706. }
  707. static int PTRFASTCALL
  708. big2_isNmstrtMin(const ENCODING *enc, const char *p)
  709. {
  710. return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  711. }
  712. #undef VTABLE
  713. #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
  714. #else /* not XML_MIN_SIZE */
  715. #undef PREFIX
  716. #define PREFIX(ident) big2_ ## ident
  717. #define MINBPC(enc) 2
  718. /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  719. #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
  720. #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
  721. #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
  722. #define IS_NAME_CHAR(enc, p, n) 0
  723. #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
  724. #define IS_NMSTRT_CHAR(enc, p, n) (0)
  725. #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  726. #include "xmltok_impl.c"
  727. #undef MINBPC
  728. #undef BYTE_TYPE
  729. #undef BYTE_TO_ASCII
  730. #undef CHAR_MATCHES
  731. #undef IS_NAME_CHAR
  732. #undef IS_NAME_CHAR_MINBPC
  733. #undef IS_NMSTRT_CHAR
  734. #undef IS_NMSTRT_CHAR_MINBPC
  735. #undef IS_INVALID_CHAR
  736. #endif /* not XML_MIN_SIZE */
  /* Big-endian 2-byte encoding objects.  Same layout as the little-endian
     ones, except that the last ENCODING integer is 1 only when
     BYTEORDER == 4321 -- presumably the flag saying the input already is
     native UTF-16, confirm against xmltok.h.  The _ns variants exist only
     with XML_NS; the plain variants include the ASCII table with BT_COLON
     temporarily redefined as BT_NMSTRT. */
  737. #ifdef XML_NS
  738. static const struct normal_encoding big2_encoding_ns = {
  739. { VTABLE, 2, 0,
  740. #if BYTEORDER == 4321
  741. 1
  742. #else
  743. 0
  744. #endif
  745. },
  746. {
  747. #include "asciitab.h"
  748. #include "latin1tab.h"
  749. },
  750. STANDARD_VTABLE(big2_)
  751. };
  752. #endif
  753. static const struct normal_encoding big2_encoding = {
  754. { VTABLE, 2, 0,
  755. #if BYTEORDER == 4321
  756. 1
  757. #else
  758. 0
  759. #endif
  760. },
  761. {
  762. #define BT_COLON BT_NMSTRT
  763. #include "asciitab.h"
  764. #undef BT_COLON
  765. #include "latin1tab.h"
  766. },
  767. STANDARD_VTABLE(big2_)
  768. };
  /* "internal" big-endian variants, compiled unless BYTEORDER is 1234.  They
     use iasciitab.h for the ASCII half of the table and always set the last
     ENCODING integer to 1. */
  769. #if BYTEORDER != 1234
  770. #ifdef XML_NS
  771. static const struct normal_encoding internal_big2_encoding_ns = {
  772. { VTABLE, 2, 0, 1 },
  773. {
  774. #include "iasciitab.h"
  775. #include "latin1tab.h"
  776. },
  777. STANDARD_VTABLE(big2_)
  778. };
  779. #endif
  780. static const struct normal_encoding internal_big2_encoding = {
  781. { VTABLE, 2, 0, 1 },
  782. {
  783. #define BT_COLON BT_NMSTRT
  784. #include "iasciitab.h"
  785. #undef BT_COLON
  786. #include "latin1tab.h"
  787. },
  788. STANDARD_VTABLE(big2_)
  789. };
  790. #endif
  /* No further tokenizer instantiations follow; PREFIX is retired. */
  791. #undef PREFIX
  792. static int FASTCALL
  793. streqci(const char *s1, const char *s2)
  794. {
  795. for (;;) {
  796. char c1 = *s1++;
  797. char c2 = *s2++;
  798. if (ASCII_a <= c1 && c1 <= ASCII_z)
  799. c1 += ASCII_A - ASCII_a;
  800. if (ASCII_a <= c2 && c2 <= ASCII_z)
  801. c2 += ASCII_A - ASCII_a;
  802. if (c1 != c2)
  803. return 0;
  804. if (!c1)
  805. break;
  806. }
  807. return 1;
  808. }
  809. static void PTRCALL
  810. initUpdatePosition(const ENCODING *enc, const char *ptr,
  811. const char *end, POSITION *pos)
  812. {
  813. normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
  814. }
  815. static int
  816. toAscii(const ENCODING *enc, const char *ptr, const char *end)
  817. {
  818. char buf[1];
  819. char *p = buf;
  820. XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
  821. if (p == buf)
  822. return -1;
  823. else
  824. return buf[0];
  825. }
  826. static int FASTCALL
  827. isSpace(int c)
  828. {
  829. switch (c) {
  830. case 0x20:
  831. case 0xD:
  832. case 0xA:
  833. case 0x9:
  834. return 1;
  835. }
  836. return 0;
  837. }
  838. /* Return 1 if there's just optional white space or there's an S
  839. followed by name=val.
  840. */
  841. static int
  842. parsePseudoAttribute(const ENCODING *enc,
  843. const char *ptr,
  844. const char *end,
  845. const char **namePtr,
  846. const char **nameEndPtr,
  847. const char **valPtr,
  848. const char **nextTokPtr)
  849. {
  850. int c;
  851. char open;
  852. if (ptr == end) {
  853. *namePtr = NULL;
  854. return 1;
  855. }
  856. if (!isSpace(toAscii(enc, ptr, end))) {
  857. *nextTokPtr = ptr;
  858. return 0;
  859. }
  860. do {
  861. ptr += enc->minBytesPerChar;
  862. } while (isSpace(toAscii(enc, ptr, end)));
  863. if (ptr == end) {
  864. *namePtr = NULL;
  865. return 1;
  866. }
  867. *namePtr = ptr;
  868. for (;;) {
  869. c = toAscii(enc, ptr, end);
  870. if (c == -1) {
  871. *nextTokPtr = ptr;
  872. return 0;
  873. }
  874. if (c == ASCII_EQUALS) {
  875. *nameEndPtr = ptr;
  876. break;
  877. }
  878. if (isSpace(c)) {
  879. *nameEndPtr = ptr;
  880. do {
  881. ptr += enc->minBytesPerChar;
  882. } while (isSpace(c = toAscii(enc, ptr, end)));
  883. if (c != ASCII_EQUALS) {
  884. *nextTokPtr = ptr;
  885. return 0;
  886. }
  887. break;
  888. }
  889. ptr += enc->minBytesPerChar;
  890. }
  891. if (ptr == *namePtr) {
  892. *nextTokPtr = ptr;
  893. return 0;
  894. }
  895. ptr += enc->minBytesPerChar;
  896. c = toAscii(enc, ptr, end);
  897. while (isSpace(c)) {
  898. ptr += enc->minBytesPerChar;
  899. c = toAscii(enc, ptr, end);
  900. }
  901. if (c != ASCII_QUOT && c != ASCII_APOS) {
  902. *nextTokPtr = ptr;
  903. return 0;
  904. }
  905. open = (char)c;
  906. ptr += enc->minBytesPerChar;
  907. *valPtr = ptr;
  908. for (;; ptr += enc->minBytesPerChar) {
  909. c = toAscii(enc, ptr, end);
  910. if (c == open)
  911. break;
  912. if (!(ASCII_a <= c && c <= ASCII_z)
  913. && !(ASCII_A <= c && c <= ASCII_Z)
  914. && !(ASCII_0 <= c && c <= ASCII_9)
  915. && c != ASCII_PERIOD
  916. && c != ASCII_MINUS
  917. && c != ASCII_UNDERSCORE) {
  918. *nextTokPtr = ptr;
  919. return 0;
  920. }
  921. }
  922. *nextTokPtr = ptr + enc->minBytesPerChar;
  923. return 1;
  924. }
  /* Keyword strings used by doParseXmlDecl, spelled with the ASCII_*
     character constants rather than string literals, each explicitly
     NUL-terminated. */
  /* "version" */
  925. static const char KW_version[] = {
  926. ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
  927. };
  /* "encoding" */
  928. static const char KW_encoding[] = {
  929. ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
  930. };
  /* "standalone" */
  931. static const char KW_standalone[] = {
  932. ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  933. ASCII_n, ASCII_e, '\0'
  934. };
  /* "yes" */
  935. static const char KW_yes[] = {
  936. ASCII_y, ASCII_e, ASCII_s, '\0'
  937. };
  /* "no" */
  938. static const char KW_no[] = {
  939. ASCII_n, ASCII_o, '\0'
  940. };
  941. static int
  942. doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
  943. const char *,
  944. const char *),
  945. int isGeneralTextEntity,
  946. const ENCODING *enc,
  947. const char *ptr,
  948. const char *end,
  949. const char **badPtr,
  950. const char **versionPtr,
  951. const char **versionEndPtr,
  952. const char **encodingName,
  953. const ENCODING **encoding,
  954. int *standalone)
  955. {
  956. const char *val = NULL;
  957. const char *name = NULL;
  958. const char *nameEnd = NULL;
  959. ptr += 5 * enc->minBytesPerChar;
  960. end -= 2 * enc->minBytesPerChar;
  961. if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
  962. || !name) {
  963. *badPtr = ptr;
  964. return 0;
  965. }
  966. if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
  967. if (!isGeneralTextEntity) {
  968. *badPtr = name;
  969. return 0;
  970. }
  971. }
  972. else {
  973. if (versionPtr)
  974. *versionPtr = val;
  975. if (versionEndPtr)
  976. *versionEndPtr = ptr;
  977. if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  978. *badPtr = ptr;
  979. return 0;
  980. }
  981. if (!name) {
  982. if (isGeneralTextEntity) {
  983. /* a TextDecl must have an EncodingDecl */
  984. *badPtr = ptr;
  985. return 0;
  986. }
  987. return 1;
  988. }
  989. }
  990. if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
  991. int c = toAscii(enc, val, end);
  992. if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
  993. *badPtr = val;
  994. return 0;
  995. }
  996. if (encodingName)
  997. *encodingName = val;
  998. if (encoding)
  999. *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
  1000. if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  1001. *badPtr = ptr;
  1002. return 0;
  1003. }
  1004. if (!name)
  1005. return 1;
  1006. }
  1007. if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
  1008. || isGeneralTextEntity) {
  1009. *badPtr = name;
  1010. return 0;
  1011. }
  1012. if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
  1013. if (standalone)
  1014. *standalone = 1;
  1015. }
  1016. else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
  1017. if (standalone)
  1018. *standalone = 0;
  1019. }
  1020. else {
  1021. *badPtr = val;
  1022. return 0;
  1023. }
  1024. while (isSpace(toAscii(enc, ptr, end)))
  1025. ptr += enc->minBytesPerChar;
  1026. if (ptr != end) {
  1027. *badPtr = ptr;
  1028. return 0;
  1029. }
  1030. return 1;
  1031. }
  1032. static int FASTCALL
  1033. checkCharRefNumber(int result)
  1034. {
  1035. switch (result >> 8) {
  1036. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  1037. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  1038. return -1;
  1039. case 0:
  1040. if (latin1_encoding.type[result] == BT_NONXML)
  1041. return -1;
  1042. break;
  1043. case 0xFF:
  1044. if (result == 0xFFFE || result == 0xFFFF)
  1045. return -1;
  1046. break;
  1047. }
  1048. return result;
  1049. }
  1050. int FASTCALL
  1051. XmlUtf8Encode(int c, char *buf)
  1052. {
  1053. enum {
  1054. /* minN is minimum legal resulting value for N byte sequence */
  1055. min2 = 0x80,
  1056. min3 = 0x800,
  1057. min4 = 0x10000
  1058. };
  1059. if (c < 0)
  1060. return 0;
  1061. if (c < min2) {
  1062. buf[0] = (char)(c | UTF8_cval1);
  1063. return 1;
  1064. }
  1065. if (c < min3) {
  1066. buf[0] = (char)((c >> 6) | UTF8_cval2);
  1067. buf[1] = (char)((c & 0x3f) | 0x80);
  1068. return 2;
  1069. }
  1070. if (c < min4) {
  1071. buf[0] = (char)((c >> 12) | UTF8_cval3);
  1072. buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
  1073. buf[2] = (char)((c & 0x3f) | 0x80);
  1074. return 3;
  1075. }
  1076. if (c < 0x110000) {
  1077. buf[0] = (char)((c >> 18) | UTF8_cval4);
  1078. buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
  1079. buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
  1080. buf[3] = (char)((c & 0x3f) | 0x80);
  1081. return 4;
  1082. }
  1083. return 0;
  1084. }
  1085. int FASTCALL
  1086. XmlUtf16Encode(int charNum, unsigned short *buf)
  1087. {
  1088. if (charNum < 0)
  1089. return 0;
  1090. if (charNum < 0x10000) {
  1091. buf[0] = (unsigned short)charNum;
  1092. return 1;
  1093. }
  1094. if (charNum < 0x110000) {
  1095. charNum -= 0x10000;
  1096. buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
  1097. buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
  1098. return 2;
  1099. }
  1100. return 0;
  1101. }
  1102. struct unknown_encoding {
  1103. struct normal_encoding normal;
  1104. CONVERTER convert;
  1105. void *userData;
  1106. unsigned short utf16[256];
  1107. char utf8[256][4];
  1108. };
  1109. #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
  1110. int
  1111. XmlSizeOfUnknownEncoding(void)
  1112. {
  1113. return sizeof(struct unknown_encoding);
  1114. }
  1115. static int PTRFASTCALL
  1116. unknown_isName(const ENCODING *enc, const char *p)
  1117. {
  1118. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1119. int c = uenc->convert(uenc->userData, p);
  1120. if (c & ~0xFFFF)
  1121. return 0;
  1122. return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
  1123. }
  1124. static int PTRFASTCALL
  1125. unknown_isNmstrt(const ENCODING *enc, const char *p)
  1126. {
  1127. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1128. int c = uenc->convert(uenc->userData, p);
  1129. if (c & ~0xFFFF)
  1130. return 0;
  1131. return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
  1132. }
  1133. static int PTRFASTCALL
  1134. unknown_isInvalid(const ENCODING *enc, const char *p)
  1135. {
  1136. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1137. int c = uenc->convert(uenc->userData, p);
  1138. return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
  1139. }
  1140. static void PTRCALL
  1141. unknown_toUtf8(const ENCODING *enc,
  1142. const char **fromP, const char *fromLim,
  1143. char **toP, const char *toLim)
  1144. {
  1145. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1146. char buf[XML_UTF8_ENCODE_MAX];
  1147. for (;;) {
  1148. const char *utf8;
  1149. int n;
  1150. if (*fromP == fromLim)
  1151. break;
  1152. utf8 = uenc->utf8[(unsigned char)**fromP];
  1153. n = *utf8++;
  1154. if (n == 0) {
  1155. int c = uenc->convert(uenc->userData, *fromP);
  1156. n = XmlUtf8Encode(c, buf);
  1157. if (n > toLim - *toP)
  1158. break;
  1159. utf8 = buf;
  1160. *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1161. - (BT_LEAD2 - 2));
  1162. }
  1163. else {
  1164. if (n > toLim - *toP)
  1165. break;
  1166. (*fromP)++;
  1167. }
  1168. do {
  1169. *(*toP)++ = *utf8++;
  1170. } while (--n != 0);
  1171. }
  1172. }
  1173. static void PTRCALL
  1174. unknown_toUtf16(const ENCODING *enc,
  1175. const char **fromP, const char *fromLim,
  1176. unsigned short **toP, const unsigned short *toLim)
  1177. {
  1178. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1179. while (*fromP != fromLim && *toP != toLim) {
  1180. unsigned short c = uenc->utf16[(unsigned char)**fromP];
  1181. if (c == 0) {
  1182. c = (unsigned short)
  1183. uenc->convert(uenc->userData, *fromP);
  1184. *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1185. - (BT_LEAD2 - 2));
  1186. }
  1187. else
  1188. (*fromP)++;
  1189. *(*toP)++ = c;
  1190. }
  1191. }
  1192. ENCODING *
  1193. XmlInitUnknownEncoding(void *mem,
  1194. int *table,
  1195. CONVERTER convert,
  1196. void *userData)
  1197. {
  1198. int i;
  1199. struct unknown_encoding *e = (struct unknown_encoding *)mem;
  1200. for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
  1201. ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
  1202. for (i = 0; i < 128; i++)
  1203. if (latin1_encoding.type[i] != BT_OTHER
  1204. && latin1_encoding.type[i] != BT_NONXML
  1205. && table[i] != i)
  1206. return 0;
  1207. for (i = 0; i < 256; i++) {
  1208. int c = table[i];
  1209. if (c == -1) {
  1210. e->normal.type[i] = BT_MALFORM;
  1211. /* This shouldn't really get used. */
  1212. e->utf16[i] = 0xFFFF;
  1213. e->utf8[i][0] = 1;
  1214. e->utf8[i][1] = 0;
  1215. }
  1216. else if (c < 0) {
  1217. if (c < -4)
  1218. return 0;
  1219. e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
  1220. e->utf8[i][0] = 0;
  1221. e->utf16[i] = 0;
  1222. }
  1223. else if (c < 0x80) {
  1224. if (latin1_encoding.type[c] != BT_OTHER
  1225. && latin1_encoding.type[c] != BT_NONXML
  1226. && c != i)
  1227. return 0;
  1228. e->normal.type[i] = latin1_encoding.type[c];
  1229. e->utf8[i][0] = 1;
  1230. e->utf8[i][1] = (char)c;
  1231. e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
  1232. }
  1233. else if (checkCharRefNumber(c) < 0) {
  1234. e->normal.type[i] = BT_NONXML;
  1235. /* This shouldn't really get used. */
  1236. e->utf16[i] = 0xFFFF;
  1237. e->utf8[i][0] = 1;
  1238. e->utf8[i][1] = 0;
  1239. }
  1240. else {
  1241. if (c > 0xFFFF)
  1242. return 0;
  1243. if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
  1244. e->normal.type[i] = BT_NMSTRT;
  1245. else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
  1246. e->normal.type[i] = BT_NAME;
  1247. else
  1248. e->normal.type[i] = BT_OTHER;
  1249. e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
  1250. e->utf16[i] = (unsigned short)c;
  1251. }
  1252. }
  1253. e->userData = userData;
  1254. e->convert = convert;
  1255. if (convert) {
  1256. e->normal.isName2 = unknown_isName;
  1257. e->normal.isName3 = unknown_isName;
  1258. e->normal.isName4 = unknown_isName;
  1259. e->normal.isNmstrt2 = unknown_isNmstrt;
  1260. e->normal.isNmstrt3 = unknown_isNmstrt;
  1261. e->normal.isNmstrt4 = unknown_isNmstrt;
  1262. e->normal.isInvalid2 = unknown_isInvalid;
  1263. e->normal.isInvalid3 = unknown_isInvalid;
  1264. e->normal.isInvalid4 = unknown_isInvalid;
  1265. }
  1266. e->normal.enc.utf8Convert = unknown_toUtf8;
  1267. e->normal.enc.utf16Convert = unknown_toUtf16;
  1268. return &(e->normal.enc);
  1269. }
  /* Indices of the built-in encodings.  getEncodingIndex and the
     encodings tables depend on these values; change them together.
  */
  enum {
    UNKNOWN_ENC    = -1,
    /* 0..5 must line up with encodingNames in getEncodingIndex */
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC   = 1,
    UTF_8_ENC      = 2,
    UTF_16_ENC     = 3,
    UTF_16BE_ENC   = 4,
    UTF_16LE_ENC   = 5,
    /* no encoding name was given */
    NO_ENC         = 6
  };
  1283. static const char KW_ISO_8859_1[] = {
  1284. ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  1285. ASCII_MINUS, ASCII_1, '\0'
  1286. };
  1287. static const char KW_US_ASCII[] = {
  1288. ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  1289. '\0'
  1290. };
  1291. static const char KW_UTF_8[] = {
  1292. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
  1293. };
  1294. static const char KW_UTF_16[] = {
  1295. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
  1296. };
  1297. static const char KW_UTF_16BE[] = {
  1298. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  1299. '\0'
  1300. };
  1301. static const char KW_UTF_16LE[] = {
  1302. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  1303. '\0'
  1304. };
  1305. static int FASTCALL
  1306. getEncodingIndex(const char *name)
  1307. {
  1308. static const char * const encodingNames[] = {
  1309. KW_ISO_8859_1,
  1310. KW_US_ASCII,
  1311. KW_UTF_8,
  1312. KW_UTF_16,
  1313. KW_UTF_16BE,
  1314. KW_UTF_16LE,
  1315. };
  1316. int i;
  1317. if (name == NULL)
  1318. return NO_ENC;
  1319. for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
  1320. if (streqci(name, encodingNames[i]))
  1321. return i;
  1322. return UNKNOWN_ENC;
  1323. }
  /* For binary compatibility, we store the index of the encoding
     specified at initialization in the isUtf16 member.
     INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
     stores i converted to char.  Both arguments are parenthesized in the
     expansion so that any expression (for example a conditional) can be
     passed as the index.
  */
  #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
  #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
  1329. /* This is what detects the encoding. encodingTable maps from
  1330. encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
  1331. the external (protocol) specified encoding; state is
  1332. XML_CONTENT_STATE if we're parsing an external text entity, and
  1333. XML_PROLOG_STATE otherwise.
  1334. */
  1335. static int
  1336. initScan(const ENCODING * const *encodingTable,
  1337. const INIT_ENCODING *enc,
  1338. int state,
  1339. const char *ptr,
  1340. const char *end,
  1341. const char **nextTokPtr)
  1342. {
  1343. const ENCODING **encPtr;
  1344. if (ptr == end)
  1345. return XML_TOK_NONE;
  1346. encPtr = enc->encPtr;
  1347. if (ptr + 1 == end) {
  1348. /* only a single byte available for auto-detection */
  1349. #ifndef XML_DTD /* FIXME */
  1350. /* a well-formed document entity must have more than one byte */
  1351. if (state != XML_CONTENT_STATE)
  1352. return XML_TOK_PARTIAL;
  1353. #endif
  1354. /* so we're parsing an external text entity... */
  1355. /* if UTF-16 was externally specified, then we need at least 2 bytes */
  1356. switch (INIT_ENC_INDEX(enc)) {
  1357. case UTF_16_ENC:
  1358. case UTF_16LE_ENC:
  1359. case UTF_16BE_ENC:
  1360. return XML_TOK_PARTIAL;
  1361. }
  1362. switch ((unsigned char)*ptr) {
  1363. case 0xFE:
  1364. case 0xFF:
  1365. case 0xEF: /* possibly first byte of UTF-8 BOM */
  1366. if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1367. && state == XML_CONTENT_STATE)
  1368. break;
  1369. /* fall through */
  1370. case 0x00:
  1371. case 0x3C:
  1372. return XML_TOK_PARTIAL;
  1373. }
  1374. }
  1375. else {
  1376. switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
  1377. case 0xFEFF:
  1378. if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1379. && state == XML_CONTENT_STATE)
  1380. break;
  1381. *nextTokPtr = ptr + 2;
  1382. *encPtr = encodingTable[UTF_16BE_ENC];
  1383. return XML_TOK_BOM;
  1384. /* 00 3C is handled in the default case */
  1385. case 0x3C00:
  1386. if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
  1387. || INIT_ENC_INDEX(enc) == UTF_16_ENC)
  1388. && state == XML_CONTENT_STATE)
  1389. break;
  1390. *encPtr = encodingTable[UTF_16LE_ENC];
  1391. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1392. case 0xFFFE:
  1393. if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1394. && state == XML_CONTENT_STATE)
  1395. break;
  1396. *nextTokPtr = ptr + 2;
  1397. *encPtr = encodingTable[UTF_16LE_ENC];
  1398. return XML_TOK_BOM;
  1399. case 0xEFBB:
  1400. /* Maybe a UTF-8 BOM (EF BB BF) */
  1401. /* If there's an explicitly specified (external) encoding
  1402. of ISO-8859-1 or some flavour of UTF-16
  1403. and this is an external text entity,
  1404. don't look for the BOM,
  1405. because it might be a legal data.
  1406. */
  1407. if (state == XML_CONTENT_STATE) {
  1408. int e = INIT_ENC_INDEX(enc);
  1409. if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
  1410. || e == UTF_16LE_ENC || e == UTF_16_ENC)
  1411. break;
  1412. }
  1413. if (ptr + 2 == end)
  1414. return XML_TOK_PARTIAL;
  1415. if ((unsigned char)ptr[2] == 0xBF) {
  1416. *nextTokPtr = ptr + 3;
  1417. *encPtr = encodingTable[UTF_8_ENC];
  1418. return XML_TOK_BOM;
  1419. }
  1420. break;
  1421. default:
  1422. if (ptr[0] == '\0') {
  1423. /* 0 isn't a legal data character. Furthermore a document
  1424. entity can only start with ASCII characters. So the only
  1425. way this can fail to be big-endian UTF-16 if it it's an
  1426. external parsed general entity that's labelled as
  1427. UTF-16LE.
  1428. */
  1429. if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
  1430. break;
  1431. *encPtr = encodingTable[UTF_16BE_ENC];
  1432. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1433. }
  1434. else if (ptr[1] == '\0') {
  1435. /* We could recover here in the case:
  1436. - parsing an external entity
  1437. - second byte is 0
  1438. - no externally specified encoding
  1439. - no encoding declaration
  1440. by assuming UTF-16LE. But we don't, because this would mean when
  1441. presented just with a single byte, we couldn't reliably determine
  1442. whether we needed further bytes.
  1443. */
  1444. if (state == XML_CONTENT_STATE)
  1445. break;
  1446. *encPtr = encodingTable[UTF_16LE_ENC];
  1447. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1448. }
  1449. break;
  1450. }
  1451. }
  1452. *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  1453. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1454. }
  1455. #define NS(x) x
  1456. #define ns(x) x
  1457. #include "xmltok_ns.c"
  1458. #undef NS
  1459. #undef ns
  1460. #ifdef XML_NS
  1461. #define NS(x) x ## NS
  1462. #define ns(x) x ## _ns
  1463. #include "xmltok_ns.c"
  1464. #undef NS
  1465. #undef ns
  1466. ENCODING *
  1467. XmlInitUnknownEncodingNS(void *mem,
  1468. int *table,
  1469. CONVERTER convert,
  1470. void *userData)
  1471. {
  1472. ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  1473. if (enc)
  1474. ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  1475. return enc;
  1476. }
  1477. #endif /* XML_NS */