PageRenderTime 220ms CodeModel.GetById 103ms app.highlight 101ms RepoModel.GetById 1ms app.codeStats 1ms

/Modules/expat/xmltok.c

http://unladen-swallow.googlecode.com/
C | 1639 lines | 1402 code | 162 blank | 75 comment | 301 complexity | 4ad8bf7e12ff6f2a59fdc4b80bd3ed0a MD5 | raw file
   1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
   2   See the file COPYING for copying permission.
   3*/
   4
   5#ifdef COMPILED_FROM_DSP
   6#include "winconfig.h"
   7#elif defined(MACOS_CLASSIC)
   8#include "macconfig.h"
   9#elif defined(__amigaos4__)
  10#include "amigaconfig.h"
  11#else
  12#ifdef HAVE_EXPAT_CONFIG_H
  13#include <expat_config.h>
  14#endif
  15#endif /* ndef COMPILED_FROM_DSP */
  16
  17#include <stddef.h>
  18
  19#include "expat_external.h"
  20#include "internal.h"
  21#include "xmltok.h"
  22#include "nametab.h"
  23
#ifdef XML_DTD
/* With XML_DTD the first brace group of VTABLE1 gains one extra entry,
   PREFIX(ignoreSectionTok).  The leading comma lets the macro be appended
   directly after the last unconditional entry. */
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the functions generated under the current
   PREFIX: a brace group of three tokenizers (four with XML_DTD), a brace
   group of two literal-value tokenizers, then nine helper functions.
   NOTE(review): the order presumably mirrors the leading function-pointer
   members of ENCODING in xmltok.h -- confirm there before reordering. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the two conversion functions of the current PREFIX. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  45
  46#define UCS2_GET_NAMING(pages, hi, lo) \
  47   (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
  48
  49/* A 2 byte UTF-8 representation splits the characters 11 bits between
  50   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
  51   pages, 3 bits to add to that index and 5 bits to generate the mask.
  52*/
  53#define UTF8_GET_NAMING2(pages, byte) \
  54    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
  55                      + ((((byte)[0]) & 3) << 1) \
  56                      + ((((byte)[1]) >> 5) & 1)] \
  57         & (1 << (((byte)[1]) & 0x1F)))
  58
  59/* A 3 byte UTF-8 representation splits the characters 16 bits between
  60   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
  61   into pages, 3 bits to add to that index and 5 bits to generate the
  62   mask.
  63*/
  64#define UTF8_GET_NAMING3(pages, byte) \
  65  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
  66                             + ((((byte)[1]) >> 2) & 0xF)] \
  67                       << 3) \
  68                      + ((((byte)[1]) & 3) << 1) \
  69                      + ((((byte)[2]) >> 5) & 1)] \
  70         & (1 << (((byte)[2]) & 0x1F)))
  71
  72#define UTF8_GET_NAMING(pages, p, n) \
  73  ((n) == 2 \
  74  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  75  : ((n) == 3 \
  76     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
  77     : 0))
  78
  79/* Detection of invalid UTF-8 sequences is based on Table 3.1B
  80   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
  81   with the additional restriction of not allowing the Unicode
  82   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
  83   Implementation details:
  84     (A & 0x80) == 0     means A < 0x80
  85   and
  86     (A & 0xC0) == 0xC0  means A > 0xBF
  87*/
  88
  89#define UTF8_INVALID2(p) \
  90  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
  91
  92#define UTF8_INVALID3(p) \
  93  (((p)[2] & 0x80) == 0 \
  94  || \
  95  ((*p) == 0xEF && (p)[1] == 0xBF \
  96    ? \
  97    (p)[2] > 0xBD \
  98    : \
  99    ((p)[2] & 0xC0) == 0xC0) \
 100  || \
 101  ((*p) == 0xE0 \
 102    ? \
 103    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
 104    : \
 105    ((p)[1] & 0x80) == 0 \
 106    || \
 107    ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
 108
 109#define UTF8_INVALID4(p) \
 110  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
 111  || \
 112  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
 113  || \
 114  ((*p) == 0xF0 \
 115    ? \
 116    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
 117    : \
 118    ((p)[1] & 0x80) == 0 \
 119    || \
 120    ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
 121
 122static int PTRFASTCALL
 123isNever(const ENCODING *enc, const char *p)
 124{
 125  return 0;
 126}
 127
 128static int PTRFASTCALL
 129utf8_isName2(const ENCODING *enc, const char *p)
 130{
 131  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
 132}
 133
 134static int PTRFASTCALL
 135utf8_isName3(const ENCODING *enc, const char *p)
 136{
 137  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
 138}
 139
 140#define utf8_isName4 isNever
 141
 142static int PTRFASTCALL
 143utf8_isNmstrt2(const ENCODING *enc, const char *p)
 144{
 145  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
 146}
 147
 148static int PTRFASTCALL
 149utf8_isNmstrt3(const ENCODING *enc, const char *p)
 150{
 151  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
 152}
 153
 154#define utf8_isNmstrt4 isNever
 155
 156static int PTRFASTCALL
 157utf8_isInvalid2(const ENCODING *enc, const char *p)
 158{
 159  return UTF8_INVALID2((const unsigned char *)p);
 160}
 161
 162static int PTRFASTCALL
 163utf8_isInvalid3(const ENCODING *enc, const char *p)
 164{
 165  return UTF8_INVALID3((const unsigned char *)p);
 166}
 167
 168static int PTRFASTCALL
 169utf8_isInvalid4(const ENCODING *enc, const char *p)
 170{
 171  return UTF8_INVALID4((const unsigned char *)p);
 172}
 173
/* Encoding descriptor used by every table-driven encoding in this file.
   The generic ENCODING is the first member, which is what allows
   AS_NORMAL_ENCODING() to cast an ENCODING pointer back to this type.
   The initializers in this file are positional, so member order matters. */
struct normal_encoding {
  ENCODING enc;
  /* Byte-type (BT_*) code for each of the 256 byte values; indexed with
     the byte converted to unsigned char. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Indirect hooks that the BYTE_TYPE, IS_NAME_CHAR_MINBPC,
     IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES macros call
     through when XML_MIN_SIZE is defined; filled in by STANDARD_VTABLE. */
  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Predicates for 2-, 3- and 4-byte sequences, reached through the
     IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR macros; filled in by
     NORMAL_VTABLE.  Initializers that omit NORMAL_VTABLE leave them null. */
  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};

/* View an ENCODING pointer as the enclosing normal_encoding (read-only). */
#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
 196
#ifdef XML_MIN_SIZE

/* Initializer fragment for the five XML_MIN_SIZE hooks of struct
   normal_encoding, built by pasting the prefix E onto each hook name.
   The trailing comma allows NORMAL_VTABLE to follow directly. */
#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

/* Without XML_MIN_SIZE the hooks do not exist, so nothing is emitted. */
#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializer fragment for the nine multi-byte predicates of struct
   normal_encoding, built by pasting the prefix E onto each member name. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4
 222
 223static int FASTCALL checkCharRefNumber(int);
 224
 225#include "xmltok_impl.h"
 226#include "ascii.h"
 227
#ifdef XML_MIN_SIZE
/* Single-byte encodings use isNever for the two *Min hooks that
   STANDARD_VTABLE(sb_) expands to. */
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
/* The character width is read from the encoding at run time. */
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
 239
/* Byte type of the single byte at p, looked up in the encoding's type table
   after converting the byte to unsigned char. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE for the byteType hook. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
/* With XML_MIN_SIZE the lookup is dispatched through the encoding's hook. */
#define BYTE_TYPE(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
/* Otherwise the table lookup is expanded inline. */
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
 254
#ifdef XML_MIN_SIZE
/* With XML_MIN_SIZE the conversion is dispatched through the encoding's
   byteToAscii hook. */
#define BYTE_TO_ASCII(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
/* Single-byte implementation of the hook: returns the byte at p unchanged
   (as a plain char converted to int). */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
/* Single-byte encodings: the byte itself is the result. */
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
 266
/* Multi-byte predicates: n is pasted onto the member name, so n must be a
   literal 2, 3 or 4 to select isName2/isName3/isName4 (and likewise for
   the isNmstrt and isInvalid members) of the encoding. */
#define IS_NAME_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE
/* With XML_MIN_SIZE the minimum-width tests go through the encoding's
   hooks. */
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
/* For this single-byte instantiation the minimum-width tests are
   constant false. */
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
 283
 284#ifdef XML_MIN_SIZE
 285#define CHAR_MATCHES(enc, p, c) \
 286 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
 287static int PTRCALL
 288sb_charMatches(const ENCODING *enc, const char *p, int c)
 289{
 290  return *p == c;
 291}
 292#else
 293/* c is an ASCII character */
 294#define CHAR_MATCHES(enc, p, c) (*(p) == c)
 295#endif
 296
/* Instantiate the tokenizer for byte-oriented encodings.  PREFIX pastes
   normal_ onto each identifier, so including xmltok_impl.c here presumably
   emits the normal_* functions that VTABLE1 refers to (confirm in
   xmltok_impl.c). */
#define PREFIX(ident) normal_ ## ident
#include "xmltok_impl.c"

/* Drop the configuration macros so that a later instantiation can define
   its own versions. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
 309
 310enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
 311  UTF8_cval1 = 0x00,
 312  UTF8_cval2 = 0xc0,
 313  UTF8_cval3 = 0xe0,
 314  UTF8_cval4 = 0xf0
 315};
 316
 317static void PTRCALL
 318utf8_toUtf8(const ENCODING *enc,
 319            const char **fromP, const char *fromLim,
 320            char **toP, const char *toLim)
 321{
 322  char *to;
 323  const char *from;
 324  if (fromLim - *fromP > toLim - *toP) {
 325    /* Avoid copying partial characters. */
 326    for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
 327      if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
 328        break;
 329  }
 330  for (to = *toP, from = *fromP; from != fromLim; from++, to++)
 331    *to = *from;
 332  *fromP = from;
 333  *toP = to;
 334}
 335
 336static void PTRCALL
 337utf8_toUtf16(const ENCODING *enc,
 338             const char **fromP, const char *fromLim,
 339             unsigned short **toP, const unsigned short *toLim)
 340{
 341  unsigned short *to = *toP;
 342  const char *from = *fromP;
 343  while (from != fromLim && to != toLim) {
 344    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
 345    case BT_LEAD2:
 346      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
 347      from += 2;
 348      break;
 349    case BT_LEAD3:
 350      *to++ = (unsigned short)(((from[0] & 0xf) << 12)
 351                               | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
 352      from += 3;
 353      break;
 354    case BT_LEAD4:
 355      {
 356        unsigned long n;
 357        if (to + 1 == toLim)
 358          goto after;
 359        n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
 360            | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
 361        n -= 0x10000;
 362        to[0] = (unsigned short)((n >> 10) | 0xD800);
 363        to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
 364        to += 2;
 365        from += 4;
 366      }
 367      break;
 368    default:
 369      *to++ = *from++;
 370      break;
 371    }
 372  }
 373after:
 374  *fromP = from;
 375  *toP = to;
 376}
 377
/* UTF-8 encoding tables.  Each one combines the normal_* tokenizer
   functions (VTABLE1), the UTF-8 converters, a 256-entry byte-type table
   assembled from two included headers, and the UTF-8 multi-byte
   predicates.
   NOTE(review): the trailing "1, 1, 0" presumably initializes
   minBytesPerChar, isUtf8 and isUtf16 of ENCODING -- confirm in xmltok.h. */
#ifdef XML_NS
/* Namespace-aware variant: asciitab.h is included unmodified. */
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif

/* Non-namespace variant: while asciitab.h is included, BT_COLON is defined
   as BT_NMSTRT, so any entry spelled BT_COLON there becomes BT_NMSTRT. */
static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#ifdef XML_NS

/* As utf8_encoding_ns, but the first half of the table comes from
   iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif

/* As utf8_encoding, but the first half of the table comes from
   iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
 423
 424static void PTRCALL
 425latin1_toUtf8(const ENCODING *enc,
 426              const char **fromP, const char *fromLim,
 427              char **toP, const char *toLim)
 428{
 429  for (;;) {
 430    unsigned char c;
 431    if (*fromP == fromLim)
 432      break;
 433    c = (unsigned char)**fromP;
 434    if (c & 0x80) {
 435      if (toLim - *toP < 2)
 436        break;
 437      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
 438      *(*toP)++ = (char)((c & 0x3f) | 0x80);
 439      (*fromP)++;
 440    }
 441    else {
 442      if (*toP == toLim)
 443        break;
 444      *(*toP)++ = *(*fromP)++;
 445    }
 446  }
 447}
 448
 449static void PTRCALL
 450latin1_toUtf16(const ENCODING *enc,
 451               const char **fromP, const char *fromLim,
 452               unsigned short **toP, const unsigned short *toLim)
 453{
 454  while (*fromP != fromLim && *toP != toLim)
 455    *(*toP)++ = (unsigned char)*(*fromP)++;
 456}
 457
#ifdef XML_NS

/* Latin-1 encoding table, namespace-aware variant: normal_* tokenizer
   functions, Latin-1 converters and a byte-type table assembled from
   asciitab.h and latin1tab.h.  NORMAL_VTABLE is not used, so the
   multi-byte predicate members are left zero-initialized.
   NOTE(review): the trailing "1, 0, 0" presumably initializes
   minBytesPerChar, isUtf8 and isUtf16 -- confirm in xmltok.h. */
static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};

#endif

/* Non-namespace variant: while asciitab.h is included, BT_COLON is defined
   as BT_NMSTRT, so any entry spelled BT_COLON there becomes BT_NMSTRT.
   checkCharRefNumber() also reads this table's type[] array. */
static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};
 481
 482static void PTRCALL
 483ascii_toUtf8(const ENCODING *enc,
 484             const char **fromP, const char *fromLim,
 485             char **toP, const char *toLim)
 486{
 487  while (*fromP != fromLim && *toP != toLim)
 488    *(*toP)++ = *(*fromP)++;
 489}
 490
#ifdef XML_NS

/* US-ASCII encoding table, namespace-aware variant.  Only asciitab.h
   contributes to the byte-type table; the remaining entries are left
   zero-initialized, which the comment below equates with BT_NONXML.
   UTF-16 output reuses latin1_toUtf16.
   NOTE(review): the trailing "1, 1, 0" presumably initializes
   minBytesPerChar, isUtf8 and isUtf16 -- confirm in xmltok.h. */
static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};

#endif

/* Non-namespace variant: while asciitab.h is included, BT_COLON is defined
   as BT_NMSTRT, so any entry spelled BT_COLON there becomes BT_NMSTRT. */
static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};
 514
 515static int PTRFASTCALL
 516unicode_byte_type(char hi, char lo)
 517{
 518  switch ((unsigned char)hi) {
 519  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
 520    return BT_LEAD4;
 521  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
 522    return BT_TRAIL;
 523  case 0xFF:
 524    switch ((unsigned char)lo) {
 525    case 0xFF:
 526    case 0xFE:
 527      return BT_NONXML;
 528    }
 529    break;
 530  }
 531  return BT_NONASCII;
 532}
 533
/* Generate E##toUtf8, the UTF-16 to UTF-8 converter for one byte order.
   GET_LO and GET_HI must be defined where the macro is expanded; they pick
   the low and high byte of the 16-bit unit at a pointer.

   The generated function walks the input two bytes at a time.  For each
   unit it first checks that the whole UTF-8 sequence fits in the output.
   If it does not, the current input position is stored in *fromP and the
   function returns.  Otherwise:
     high byte 0 and low byte < 0x80    -> 1 byte
     high byte 0..7 otherwise           -> 2 bytes
     high byte 0xD8..0xDB               -> 4 bytes, consuming the next unit
                                           as the second half of the pair
     any other high byte                -> 3 bytes
   NOTE(review): the loop condition is from != fromLim and the surrogate
   case reads the following unit without a bounds check, so callers
   presumably guarantee an even-length range ending on a complete pair --
   confirm against the tokenizer. */
#define DEFINE_UTF16_TO_UTF8(E) \
static void  PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
 596
/* Generate E##toUtf16, which copies 16-bit units from a byte stream in one
   byte order into an array of unsigned short.  GET_LO and GET_HI must be
   defined where the macro is expanded.

   When the input holds more bytes than the output has room for (two bytes
   per unit) and the last input unit has a high byte in 0xD8..0xDF, that
   unit is excluded by moving fromLim back two bytes.  The loop then
   assembles each unit as (high << 8) | low until either range is
   exhausted.
   NOTE(review): the check looks at the last unit of the input, not at the
   unit where the output fills up -- confirm this matches the intent stated
   in the comment inside the macro. */
#define DEFINE_UTF16_TO_UTF16(E) \
static void  PTRCALL \
E ## toUtf16(const ENCODING *enc, \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
 610
/* Little-endian accessors: byte 0 is the low byte and byte 1 the high byte
   of a 16-bit unit.  SET2 stores ch in that layout. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Emit little2_toUtf8 and little2_toUtf16 using the accessors above. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian accessors: byte 0 is the high byte and byte 1 the low byte. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Emit big2_toUtf8 and big2_toUtf16 using the accessors above. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
 634
 635#define LITTLE2_BYTE_TYPE(enc, p) \
 636 ((p)[1] == 0 \
 637  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
 638  : unicode_byte_type((p)[1], (p)[0]))
 639#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
 640#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
 641#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
 642  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
 643#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
 644  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
 645
#ifdef XML_MIN_SIZE

/* With XML_MIN_SIZE the tokenizer is not instantiated again for
   little-endian input.  The LITTLE2_* macros are wrapped in functions that
   STANDARD_VTABLE(little2_) installs as the encoding's hooks. */

/* Hook: byte type of the 16-bit unit at p. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Hook: the unit's low byte when the high byte is zero, otherwise -1. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TO_ASCII(enc, p);
}

/* Hook: non-zero when the unit at p is the character c. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return LITTLE2_CHAR_MATCHES(enc, p, c);
}

/* Hook: non-zero when the unit at p is flagged as a name character. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Hook: non-zero when the unit at p is flagged as a name-start character. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* PREFIX is unchanged in this branch, so name the little2 converters
   explicitly in the tables that follow. */
#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Instantiate the tokenizer a second time with little2_ names, a fixed
   width of two bytes and the LITTLE2_* character tests.  The multi-byte
   predicates are constant false here. */
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

/* Clear the configuration for the next instantiation. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
 708
/* Little-endian 16-bit encoding tables.  The byte-type table (asciitab.h +
   latin1tab.h) is consulted for units whose high byte is zero.
   NOTE(review): the values after VTABLE presumably initialize
   minBytesPerChar (2), isUtf8 (0) and isUtf16 -- confirm in xmltok.h.  The
   last value is 1 only when BYTEORDER is 1234. */
#ifdef XML_NS

/* Namespace-aware variant: asciitab.h is included unmodified. */
static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

/* Non-namespace variant: while asciitab.h is included, BT_COLON is defined
   as BT_NMSTRT, so any entry spelled BT_COLON there becomes BT_NMSTRT. */
static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

/* The internal little-endian tables are compiled unless BYTEORDER is 4321.
   They use iasciitab.h and always set the last flag to 1. */
#if BYTEORDER != 4321

#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif
 772
 773
 774#define BIG2_BYTE_TYPE(enc, p) \
 775 ((p)[0] == 0 \
 776  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
 777  : unicode_byte_type((p)[0], (p)[1]))
 778#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
 779#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
 780#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
 781  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
 782#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
 783  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
 784
#ifdef XML_MIN_SIZE

/* With XML_MIN_SIZE the tokenizer is not instantiated again for big-endian
   input.  The BIG2_* macros are wrapped in functions that
   STANDARD_VTABLE(big2_) installs as the encoding's hooks. */

/* Hook: byte type of the 16-bit unit at p. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}

/* Hook: the unit's low byte when the high byte is zero, otherwise -1. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}

/* Hook: non-zero when the unit at p is the character c. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}

/* Hook: non-zero when the unit at p is flagged as a name character. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Hook: non-zero when the unit at p is flagged as a name-start character. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* PREFIX is unchanged in this branch, so name the big2 converters
   explicitly in the tables that follow. */
#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Instantiate the tokenizer again with big2_ names, a fixed width of two
   bytes and the BIG2_* character tests.  The multi-byte predicates are
   constant false here. */
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

/* Clear the configuration again. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
 847
/* Big-endian 16-bit encoding tables.  The byte-type table (asciitab.h +
   latin1tab.h) is consulted for units whose high byte is zero.
   NOTE(review): the values after VTABLE presumably initialize
   minBytesPerChar (2), isUtf8 (0) and isUtf16 -- confirm in xmltok.h.  The
   last value is 1 only when BYTEORDER is 4321. */
#ifdef XML_NS

/* Namespace-aware variant: asciitab.h is included unmodified. */
static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

/* Non-namespace variant: while asciitab.h is included, BT_COLON is defined
   as BT_NMSTRT, so any entry spelled BT_COLON there becomes BT_NMSTRT. */
static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

/* The internal big-endian tables are compiled unless BYTEORDER is 1234.
   They use iasciitab.h and always set the last flag to 1. */
#if BYTEORDER != 1234

#ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif
 911
 912#undef PREFIX
 913
 914static int FASTCALL
 915streqci(const char *s1, const char *s2)
 916{
 917  for (;;) {
 918    char c1 = *s1++;
 919    char c2 = *s2++;
 920    if (ASCII_a <= c1 && c1 <= ASCII_z)
 921      c1 += ASCII_A - ASCII_a;
 922    if (ASCII_a <= c2 && c2 <= ASCII_z)
 923      c2 += ASCII_A - ASCII_a;
 924    if (c1 != c2)
 925      return 0;
 926    if (!c1)
 927      break;
 928  }
 929  return 1;
 930}
 931
 932static void PTRCALL
 933initUpdatePosition(const ENCODING *enc, const char *ptr,
 934                   const char *end, POSITION *pos)
 935{
 936  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
 937}
 938
 939static int
 940toAscii(const ENCODING *enc, const char *ptr, const char *end)
 941{
 942  char buf[1];
 943  char *p = buf;
 944  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
 945  if (p == buf)
 946    return -1;
 947  else
 948    return buf[0];
 949}
 950
 951static int FASTCALL
 952isSpace(int c)
 953{
 954  switch (c) {
 955  case 0x20:
 956  case 0xD:
 957  case 0xA:
 958  case 0x9:
 959    return 1;
 960  }
 961  return 0;
 962}
 963
 964/* Return 1 if there's just optional white space or there's an S
 965   followed by name=val.
 966*/
 967static int
 968parsePseudoAttribute(const ENCODING *enc,
 969                     const char *ptr,
 970                     const char *end,
 971                     const char **namePtr,
 972                     const char **nameEndPtr,
 973                     const char **valPtr,
 974                     const char **nextTokPtr)
 975{
 976  int c;
 977  char open;
 978  if (ptr == end) {
 979    *namePtr = NULL;
 980    return 1;
 981  }
 982  if (!isSpace(toAscii(enc, ptr, end))) {
 983    *nextTokPtr = ptr;
 984    return 0;
 985  }
 986  do {
 987    ptr += enc->minBytesPerChar;
 988  } while (isSpace(toAscii(enc, ptr, end)));
 989  if (ptr == end) {
 990    *namePtr = NULL;
 991    return 1;
 992  }
 993  *namePtr = ptr;
 994  for (;;) {
 995    c = toAscii(enc, ptr, end);
 996    if (c == -1) {
 997      *nextTokPtr = ptr;
 998      return 0;
 999    }
1000    if (c == ASCII_EQUALS) {
1001      *nameEndPtr = ptr;
1002      break;
1003    }
1004    if (isSpace(c)) {
1005      *nameEndPtr = ptr;
1006      do {
1007        ptr += enc->minBytesPerChar;
1008      } while (isSpace(c = toAscii(enc, ptr, end)));
1009      if (c != ASCII_EQUALS) {
1010        *nextTokPtr = ptr;
1011        return 0;
1012      }
1013      break;
1014    }
1015    ptr += enc->minBytesPerChar;
1016  }
1017  if (ptr == *namePtr) {
1018    *nextTokPtr = ptr;
1019    return 0;
1020  }
1021  ptr += enc->minBytesPerChar;
1022  c = toAscii(enc, ptr, end);
1023  while (isSpace(c)) {
1024    ptr += enc->minBytesPerChar;
1025    c = toAscii(enc, ptr, end);
1026  }
1027  if (c != ASCII_QUOT && c != ASCII_APOS) {
1028    *nextTokPtr = ptr;
1029    return 0;
1030  }
1031  open = (char)c;
1032  ptr += enc->minBytesPerChar;
1033  *valPtr = ptr;
1034  for (;; ptr += enc->minBytesPerChar) {
1035    c = toAscii(enc, ptr, end);
1036    if (c == open)
1037      break;
1038    if (!(ASCII_a <= c && c <= ASCII_z)
1039        && !(ASCII_A <= c && c <= ASCII_Z)
1040        && !(ASCII_0 <= c && c <= ASCII_9)
1041        && c != ASCII_PERIOD
1042        && c != ASCII_MINUS
1043        && c != ASCII_UNDERSCORE) {
1044      *nextTokPtr = ptr;
1045      return 0;
1046    }
1047  }
1048  *nextTokPtr = ptr + enc->minBytesPerChar;
1049  return 1;
1050}
1051
/* Keywords of the XML declaration, spelled with the ASCII_* constants and
   NUL-terminated; doParseXmlDecl() passes them to XmlNameMatchesAscii(). */

/* "version" */
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

/* "encoding" */
static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

/* "standalone" */
static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};

/* "yes" */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s,  '\0'
};

/* "no" */
static const char KW_no[] = {
  ASCII_n, ASCII_o,  '\0'
};
1072
1073static int
1074doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1075                                                 const char *,
1076                                                 const char *),
1077               int isGeneralTextEntity,
1078               const ENCODING *enc,
1079               const char *ptr,
1080               const char *end,
1081               const char **badPtr,
1082               const char **versionPtr,
1083               const char **versionEndPtr,
1084               const char **encodingName,
1085               const ENCODING **encoding,
1086               int *standalone)
1087{
1088  const char *val = NULL;
1089  const char *name = NULL;
1090  const char *nameEnd = NULL;
1091  ptr += 5 * enc->minBytesPerChar;
1092  end -= 2 * enc->minBytesPerChar;
1093  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1094      || !name) {
1095    *badPtr = ptr;
1096    return 0;
1097  }
1098  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1099    if (!isGeneralTextEntity) {
1100      *badPtr = name;
1101      return 0;
1102    }
1103  }
1104  else {
1105    if (versionPtr)
1106      *versionPtr = val;
1107    if (versionEndPtr)
1108      *versionEndPtr = ptr;
1109    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1110      *badPtr = ptr;
1111      return 0;
1112    }
1113    if (!name) {
1114      if (isGeneralTextEntity) {
1115        /* a TextDecl must have an EncodingDecl */
1116        *badPtr = ptr;
1117        return 0;
1118      }
1119      return 1;
1120    }
1121  }
1122  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1123    int c = toAscii(enc, val, end);
1124    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1125      *badPtr = val;
1126      return 0;
1127    }
1128    if (encodingName)
1129      *encodingName = val;
1130    if (encoding)
1131      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1132    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1133      *badPtr = ptr;
1134      return 0;
1135    }
1136    if (!name)
1137      return 1;
1138  }
1139  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1140      || isGeneralTextEntity) {
1141    *badPtr = name;
1142    return 0;
1143  }
1144  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1145    if (standalone)
1146      *standalone = 1;
1147  }
1148  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1149    if (standalone)
1150      *standalone = 0;
1151  }
1152  else {
1153    *badPtr = val;
1154    return 0;
1155  }
1156  while (isSpace(toAscii(enc, ptr, end)))
1157    ptr += enc->minBytesPerChar;
1158  if (ptr != end) {
1159    *badPtr = ptr;
1160    return 0;
1161  }
1162  return 1;
1163}
1164
1165static int FASTCALL
1166checkCharRefNumber(int result)
1167{
1168  switch (result >> 8) {
1169  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1170  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1171    return -1;
1172  case 0:
1173    if (latin1_encoding.type[result] == BT_NONXML)
1174      return -1;
1175    break;
1176  case 0xFF:
1177    if (result == 0xFFFE || result == 0xFFFF)
1178      return -1;
1179    break;
1180  }
1181  return result;
1182}
1183
1184int FASTCALL
1185XmlUtf8Encode(int c, char *buf)
1186{
1187  enum {
1188    /* minN is minimum legal resulting value for N byte sequence */
1189    min2 = 0x80,
1190    min3 = 0x800,
1191    min4 = 0x10000
1192  };
1193
1194  if (c < 0)
1195    return 0;
1196  if (c < min2) {
1197    buf[0] = (char)(c | UTF8_cval1);
1198    return 1;
1199  }
1200  if (c < min3) {
1201    buf[0] = (char)((c >> 6) | UTF8_cval2);
1202    buf[1] = (char)((c & 0x3f) | 0x80);
1203    return 2;
1204  }
1205  if (c < min4) {
1206    buf[0] = (char)((c >> 12) | UTF8_cval3);
1207    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1208    buf[2] = (char)((c & 0x3f) | 0x80);
1209    return 3;
1210  }
1211  if (c < 0x110000) {
1212    buf[0] = (char)((c >> 18) | UTF8_cval4);
1213    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1214    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1215    buf[3] = (char)((c & 0x3f) | 0x80);
1216    return 4;
1217  }
1218  return 0;
1219}
1220
1221int FASTCALL
1222XmlUtf16Encode(int charNum, unsigned short *buf)
1223{
1224  if (charNum < 0)
1225    return 0;
1226  if (charNum < 0x10000) {
1227    buf[0] = (unsigned short)charNum;
1228    return 1;
1229  }
1230  if (charNum < 0x110000) {
1231    charNum -= 0x10000;
1232    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1233    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1234    return 2;
1235  }
1236  return 0;
1237}
1238
/* State for an encoding described by the application (filled in by
   XmlInitUnknownEncoding).  `normal` has to remain the first member:
   the initializer copies a struct normal_encoding image to the start of
   the object and the object is accessed through ENCODING pointers cast
   back with AS_UNKNOWN_ENCODING. */
struct unknown_encoding {
  struct normal_encoding normal;
  /* Decodes a multi-byte sequence; may be NULL (XmlInitUnknownEncoding
     only installs the lead-byte hooks when it is non-NULL). */
  CONVERTER convert;
  /* Passed back as the first argument of convert. */
  void *userData;
  /* Per first byte: the UTF-16 code unit it maps to; 0 means the value
     must be obtained from convert. */
  unsigned short utf16[256];
  /* Per first byte: [0] is the UTF-8 length and the bytes follow;
     length 0 means the value must be obtained from convert. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding it belongs to. */
#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
1248
1249int
1250XmlSizeOfUnknownEncoding(void)
1251{
1252  return sizeof(struct unknown_encoding);
1253}
1254
1255static int PTRFASTCALL
1256unknown_isName(const ENCODING *enc, const char *p)
1257{
1258  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1259  int c = uenc->convert(uenc->userData, p);
1260  if (c & ~0xFFFF)
1261    return 0;
1262  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1263}
1264
1265static int PTRFASTCALL
1266unknown_isNmstrt(const ENCODING *enc, const char *p)
1267{
1268  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1269  int c = uenc->convert(uenc->userData, p);
1270  if (c & ~0xFFFF)
1271    return 0;
1272  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1273}
1274
1275static int PTRFASTCALL
1276unknown_isInvalid(const ENCODING *enc, const char *p)
1277{
1278  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1279  int c = uenc->convert(uenc->userData, p);
1280  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1281}
1282
1283static void PTRCALL
1284unknown_toUtf8(const ENCODING *enc,
1285               const char **fromP, const char *fromLim,
1286               char **toP, const char *toLim)
1287{
1288  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1289  char buf[XML_UTF8_ENCODE_MAX];
1290  for (;;) {
1291    const char *utf8;
1292    int n;
1293    if (*fromP == fromLim)
1294      break;
1295    utf8 = uenc->utf8[(unsigned char)**fromP];
1296    n = *utf8++;
1297    if (n == 0) {
1298      int c = uenc->convert(uenc->userData, *fromP);
1299      n = XmlUtf8Encode(c, buf);
1300      if (n > toLim - *toP)
1301        break;
1302      utf8 = buf;
1303      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1304                 - (BT_LEAD2 - 2));
1305    }
1306    else {
1307      if (n > toLim - *toP)
1308        break;
1309      (*fromP)++;
1310    }
1311    do {
1312      *(*toP)++ = *utf8++;
1313    } while (--n != 0);
1314  }
1315}
1316
1317static void PTRCALL
1318unknown_toUtf16(const ENCODING *enc,
1319                const char **fromP, const char *fromLim,
1320                unsigned short **toP, const unsigned short *toLim)
1321{
1322  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1323  while (*fromP != fromLim && *toP != toLim) {
1324    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1325    if (c == 0) {
1326      c = (unsigned short)
1327          uenc->convert(uenc->userData, *fromP);
1328      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1329                 - (BT_LEAD2 - 2));
1330    }
1331    else
1332      (*fromP)++;
1333    *(*toP)++ = c;
1334  }
1335}
1336
1337ENCODING *
1338XmlInitUnknownEncoding(void *mem,
1339                       int *table,
1340                       CONVERTER convert, 
1341                       void *userData)
1342{
1343  int i;
1344  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1345  for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1346    ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1347  for (i = 0; i < 128; i++)
1348    if (latin1_encoding.type[i] != BT_OTHER
1349        && latin1_encoding.type[i] != BT_NONXML
1350        && table[i] != i)
1351      return 0;
1352  for (i = 0; i < 256; i++) {
1353    int c = table[i];
1354    if (c == -1) {
1355      e->normal.type[i] = BT_MALFORM;
1356      /* This shouldn't really get used. */
1357      e->utf16[i] = 0xFFFF;
1358      e->utf8[i][0] = 1;
1359      e->utf8[i][1] = 0;
1360    }
1361    else if (c < 0) {
1362      if (c < -4)
1363        return 0;
1364      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1365      e->utf8[i][0] = 0;
1366      e->utf16[i] = 0;
1367    }
1368    else if (c < 0x80) {
1369      if (latin1_encoding.type[c] != BT_OTHER
1370          && latin1_encoding.type[c] != BT_NONXML
1371          && c != i)
1372        return 0;
1373      e->normal.type[i] = latin1_encoding.type[c];
1374      e->utf8[i][0] = 1;
1375      e->utf8[i][1] = (char)c;
1376      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1377    }
1378    else if (checkCharRefNumber(c) < 0) {
1379      e->normal.type[i] = BT_NONXML;
1380      /* This shouldn't really get used. */
1381      e->utf16[i] = 0xFFFF;
1382      e->utf8[i][0] = 1;
1383      e->utf8[i][1] = 0;
1384    }
1385    else {
1386      if (c > 0xFFFF)
1387        return 0;
1388      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1389        e->normal.type[i] = BT_NMSTRT;
1390      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1391        e->normal.type[i] = BT_NAME;
1392      else
1393        e->normal.type[i] = BT_OTHER;
1394      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1395      e->utf16[i] = (unsigned short)c;
1396    }
1397  }
1398  e->userData = userData;
1399  e->convert = convert;
1400  if (convert) {
1401    e->normal.isName2 = unknown_isName;
1402    e->normal.isName3 = unknown_isName;
1403    e->normal.isName4 = unknown_isName;
1404    e->normal.isNmstrt2 = unknown_isNmstrt;
1405    e->normal.isNmstrt3 = unknown_isNmstrt;
1406    e->normal.isNmstrt4 = unknown_isNmstrt;
1407    e->normal.isInvalid2 = unknown_isInvalid;
1408    e->normal.isInvalid3 = unknown_isInvalid;
1409    e->normal.isInvalid4 = unknown_isInvalid;
1410  }
1411  e->normal.enc.utf8Convert = unknown_toUtf8;
1412  e->normal.enc.utf16Convert = unknown_toUtf16;
1413  return &(e->normal.enc);
1414}
1415
/* Indices of the built-in encodings.  The values ISO_8859_1_ENC through
   UTF_16LE_ENC are positions in the encodingNames array inside
   getEncodingIndex and are also used by initScan to index its encoding
   table.  getEncodingIndex returns NO_ENC when no name was given and
   UNKNOWN_ENC when the name is not one of the built-in ones.

   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1429
/* Names of the built-in encodings, spelled with the ASCII_* constants.
   getEncodingIndex compares a requested name against each of these
   with streqci. */
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
static const char KW_UTF_8[] =  {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
1452
1453static int FASTCALL
1454getEncodingIndex(const char *name)
1455{
1456  static const char * const encodingNames[] = {
1457    KW_ISO_8859_1,
1458    KW_US_ASCII,
1459    KW_UTF_8,
1460    KW_UTF_16,
1461    KW_UTF_16BE,
1462    KW_UTF_16LE,
1463  };
1464  int i;
1465  if (name == NULL)
1466    return NO_ENC;
1467  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1468    if (streqci(name, encodingNames[i]))
1469      return i;
1470  return UNKNOWN_ENC;
1471}
1472
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char; the argument
   is parenthesized so that the cast applies to the whole expression
   rather than to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1479
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   ptr/end delimit the bytes available so far.  Possible outcomes:
     - XML_TOK_NONE     no input at all;
     - XML_TOK_PARTIAL  more bytes are needed before a decision can be
                        made;
     - XML_TOK_BOM      a byte order mark was recognised; *enc->encPtr is
                        set to the detected encoding and *nextTokPtr
                        points just past the mark;
     - otherwise        *enc->encPtr is set to the chosen encoding and
                        the result of XmlTok for that encoding is
                        returned.
*/


static int
initScan(const ENCODING * const *encodingTable,
         const INIT_ENCODING *enc,
         int state,
         const char *ptr,
         const char *end,
         const char **nextTokPtr)
{
  const ENCODING **encPtr;

  if (ptr == end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    /* A byte that could begin a BOM, a UTF-16 character or "<" cannot
       be classified until the next byte arrives. */
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  }
  else {
    /* Dispatch on the first two bytes taken as a big-endian 16 bit
       value. */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* UTF-16 big-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* "<" as a little-endian UTF-16 character */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* UTF-16 little-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
            || e == UTF_16LE_ENC || e == UTF_16_ENC)
          break;
      }
      /* the third byte of the BOM has not arrived yet */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing was detected: use the encoding chosen at initialization. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1609
1610
1611#define NS(x) x
1612#define ns(x) x
1613#include "xmltok_ns.c"
1614#undef NS
1615#undef ns
1616
1617#ifdef XML_NS
1618
1619#define NS(x) x ## NS
1620#define ns(x) x ## _ns
1621
1622#include "xmltok_ns.c"
1623
1624#undef NS
1625#undef ns
1626
1627ENCODING *
1628XmlInitUnknownEncodingNS(void *mem,
1629                         int *table,
1630                         CONVERTER convert, 
1631                         void *userData)
1632{
1633  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1634  if (enc)
1635    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1636  return enc;
1637}
1638
1639#endif /* XML_NS */