PageRenderTime 101ms CodeModel.GetById 3ms app.highlight 89ms RepoModel.GetById 1ms app.codeStats 0ms

/Modules/expat/xmltok_impl.c

http://unladen-swallow.googlecode.com/
C | 1779 lines | 1684 code | 54 blank | 41 comment | 417 complexity | 2d135f01d4064285f754cb587b3c24cf MD5 | raw file
   1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
   2   See the file COPYING for copying permission.
   3*/
   4
   5#ifndef IS_INVALID_CHAR
   6#define IS_INVALID_CHAR(enc, ptr, n) (0)
   7#endif
   8
   9#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  10    case BT_LEAD ## n: \
  11      if (end - ptr < n) \
  12        return XML_TOK_PARTIAL_CHAR; \
  13      if (IS_INVALID_CHAR(enc, ptr, n)) { \
  14        *(nextTokPtr) = (ptr); \
  15        return XML_TOK_INVALID; \
  16      } \
  17      ptr += n; \
  18      break;
  19
  20#define INVALID_CASES(ptr, nextTokPtr) \
  21  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  22  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  23  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  24  case BT_NONXML: \
  25  case BT_MALFORM: \
  26  case BT_TRAIL: \
  27    *(nextTokPtr) = (ptr); \
  28    return XML_TOK_INVALID;
  29
  30#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  31   case BT_LEAD ## n: \
  32     if (end - ptr < n) \
  33       return XML_TOK_PARTIAL_CHAR; \
  34     if (!IS_NAME_CHAR(enc, ptr, n)) { \
  35       *nextTokPtr = ptr; \
  36       return XML_TOK_INVALID; \
  37     } \
  38     ptr += n; \
  39     break;
  40
  41#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  42  case BT_NONASCII: \
  43    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
  44      *nextTokPtr = ptr; \
  45      return XML_TOK_INVALID; \
  46    } \
  47  case BT_NMSTRT: \
  48  case BT_HEX: \
  49  case BT_DIGIT: \
  50  case BT_NAME: \
  51  case BT_MINUS: \
  52    ptr += MINBPC(enc); \
  53    break; \
  54  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  55  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  56  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
  57
  58#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  59   case BT_LEAD ## n: \
  60     if (end - ptr < n) \
  61       return XML_TOK_PARTIAL_CHAR; \
  62     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
  63       *nextTokPtr = ptr; \
  64       return XML_TOK_INVALID; \
  65     } \
  66     ptr += n; \
  67     break;
  68
  69#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  70  case BT_NONASCII: \
  71    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
  72      *nextTokPtr = ptr; \
  73      return XML_TOK_INVALID; \
  74    } \
  75  case BT_NMSTRT: \
  76  case BT_HEX: \
  77    ptr += MINBPC(enc); \
  78    break; \
  79  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  80  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  81  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
  82
  83#ifndef PREFIX
  84#define PREFIX(ident) ident
  85#endif
  86
  87/* ptr points to character following "<!-" */
  88
  89static int PTRCALL
  90PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
  91                    const char *end, const char **nextTokPtr)
  92{
  93  if (ptr != end) {
  94    if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  95      *nextTokPtr = ptr;
  96      return XML_TOK_INVALID;
  97    }
  98    ptr += MINBPC(enc);
  99    while (ptr != end) {
 100      switch (BYTE_TYPE(enc, ptr)) {
 101      INVALID_CASES(ptr, nextTokPtr)
 102      case BT_MINUS:
 103        if ((ptr += MINBPC(enc)) == end)
 104          return XML_TOK_PARTIAL;
 105        if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
 106          if ((ptr += MINBPC(enc)) == end)
 107            return XML_TOK_PARTIAL;
 108          if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
 109            *nextTokPtr = ptr;
 110            return XML_TOK_INVALID;
 111          }
 112          *nextTokPtr = ptr + MINBPC(enc);
 113          return XML_TOK_COMMENT;
 114        }
 115        break;
 116      default:
 117        ptr += MINBPC(enc);
 118        break;
 119      }
 120    }
 121  }
 122  return XML_TOK_PARTIAL;
 123}
 124
 125/* ptr points to character following "<!" */
 126
 127static int PTRCALL
 128PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
 129                 const char *end, const char **nextTokPtr)
 130{
 131  if (ptr == end)
 132    return XML_TOK_PARTIAL;
 133  switch (BYTE_TYPE(enc, ptr)) {
 134  case BT_MINUS:
 135    return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 136  case BT_LSQB:
 137    *nextTokPtr = ptr + MINBPC(enc);
 138    return XML_TOK_COND_SECT_OPEN;
 139  case BT_NMSTRT:
 140  case BT_HEX:
 141    ptr += MINBPC(enc);
 142    break;
 143  default:
 144    *nextTokPtr = ptr;
 145    return XML_TOK_INVALID;
 146  }
 147  while (ptr != end) {
 148    switch (BYTE_TYPE(enc, ptr)) {
 149    case BT_PERCNT:
 150      if (ptr + MINBPC(enc) == end)
 151        return XML_TOK_PARTIAL;
 152      /* don't allow <!ENTITY% foo "whatever"> */
 153      switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
 154      case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
 155        *nextTokPtr = ptr;
 156        return XML_TOK_INVALID;
 157      }
 158      /* fall through */
 159    case BT_S: case BT_CR: case BT_LF:
 160      *nextTokPtr = ptr;
 161      return XML_TOK_DECL_OPEN;
 162    case BT_NMSTRT:
 163    case BT_HEX:
 164      ptr += MINBPC(enc);
 165      break;
 166    default:
 167      *nextTokPtr = ptr;
 168      return XML_TOK_INVALID;
 169    }
 170  }
 171  return XML_TOK_PARTIAL;
 172}
 173
 174static int PTRCALL
 175PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
 176                      const char *end, int *tokPtr)
 177{
 178  int upper = 0;
 179  *tokPtr = XML_TOK_PI;
 180  if (end - ptr != MINBPC(enc)*3)
 181    return 1;
 182  switch (BYTE_TO_ASCII(enc, ptr)) {
 183  case ASCII_x:
 184    break;
 185  case ASCII_X:
 186    upper = 1;
 187    break;
 188  default:
 189    return 1;
 190  }
 191  ptr += MINBPC(enc);
 192  switch (BYTE_TO_ASCII(enc, ptr)) {
 193  case ASCII_m:
 194    break;
 195  case ASCII_M:
 196    upper = 1;
 197    break;
 198  default:
 199    return 1;
 200  }
 201  ptr += MINBPC(enc);
 202  switch (BYTE_TO_ASCII(enc, ptr)) {
 203  case ASCII_l:
 204    break;
 205  case ASCII_L:
 206    upper = 1;
 207    break;
 208  default:
 209    return 1;
 210  }
 211  if (upper)
 212    return 0;
 213  *tokPtr = XML_TOK_XML_DECL;
 214  return 1;
 215}
 216
 217/* ptr points to character following "<?" */
 218
 219static int PTRCALL
 220PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
 221               const char *end, const char **nextTokPtr)
 222{
 223  int tok;
 224  const char *target = ptr;
 225  if (ptr == end)
 226    return XML_TOK_PARTIAL;
 227  switch (BYTE_TYPE(enc, ptr)) {
 228  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 229  default:
 230    *nextTokPtr = ptr;
 231    return XML_TOK_INVALID;
 232  }
 233  while (ptr != end) {
 234    switch (BYTE_TYPE(enc, ptr)) {
 235    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 236    case BT_S: case BT_CR: case BT_LF:
 237      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
 238        *nextTokPtr = ptr;
 239        return XML_TOK_INVALID;
 240      }
 241      ptr += MINBPC(enc);
 242      while (ptr != end) {
 243        switch (BYTE_TYPE(enc, ptr)) {
 244        INVALID_CASES(ptr, nextTokPtr)
 245        case BT_QUEST:
 246          ptr += MINBPC(enc);
 247          if (ptr == end)
 248            return XML_TOK_PARTIAL;
 249          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
 250            *nextTokPtr = ptr + MINBPC(enc);
 251            return tok;
 252          }
 253          break;
 254        default:
 255          ptr += MINBPC(enc);
 256          break;
 257        }
 258      }
 259      return XML_TOK_PARTIAL;
 260    case BT_QUEST:
 261      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
 262        *nextTokPtr = ptr;
 263        return XML_TOK_INVALID;
 264      }
 265      ptr += MINBPC(enc);
 266      if (ptr == end)
 267        return XML_TOK_PARTIAL;
 268      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
 269        *nextTokPtr = ptr + MINBPC(enc);
 270        return tok;
 271      }
 272      /* fall through */
 273    default:
 274      *nextTokPtr = ptr;
 275      return XML_TOK_INVALID;
 276    }
 277  }
 278  return XML_TOK_PARTIAL;
 279}
 280
 281static int PTRCALL
 282PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
 283                         const char *end, const char **nextTokPtr)
 284{
 285  static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
 286                                     ASCII_T, ASCII_A, ASCII_LSQB };
 287  int i;
 288  /* CDATA[ */
 289  if (end - ptr < 6 * MINBPC(enc))
 290    return XML_TOK_PARTIAL;
 291  for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
 292    if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
 293      *nextTokPtr = ptr;
 294      return XML_TOK_INVALID;
 295    }
 296  }
 297  *nextTokPtr = ptr;
 298  return XML_TOK_CDATA_SECT_OPEN;
 299}
 300
 301static int PTRCALL
 302PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
 303                        const char *end, const char **nextTokPtr)
 304{
 305  if (ptr == end)
 306    return XML_TOK_NONE;
 307  if (MINBPC(enc) > 1) {
 308    size_t n = end - ptr;
 309    if (n & (MINBPC(enc) - 1)) {
 310      n &= ~(MINBPC(enc) - 1);
 311      if (n == 0)
 312        return XML_TOK_PARTIAL;
 313      end = ptr + n;
 314    }
 315  }
 316  switch (BYTE_TYPE(enc, ptr)) {
 317  case BT_RSQB:
 318    ptr += MINBPC(enc);
 319    if (ptr == end)
 320      return XML_TOK_PARTIAL;
 321    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
 322      break;
 323    ptr += MINBPC(enc);
 324    if (ptr == end)
 325      return XML_TOK_PARTIAL;
 326    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
 327      ptr -= MINBPC(enc);
 328      break;
 329    }
 330    *nextTokPtr = ptr + MINBPC(enc);
 331    return XML_TOK_CDATA_SECT_CLOSE;
 332  case BT_CR:
 333    ptr += MINBPC(enc);
 334    if (ptr == end)
 335      return XML_TOK_PARTIAL;
 336    if (BYTE_TYPE(enc, ptr) == BT_LF)
 337      ptr += MINBPC(enc);
 338    *nextTokPtr = ptr;
 339    return XML_TOK_DATA_NEWLINE;
 340  case BT_LF:
 341    *nextTokPtr = ptr + MINBPC(enc);
 342    return XML_TOK_DATA_NEWLINE;
 343  INVALID_CASES(ptr, nextTokPtr)
 344  default:
 345    ptr += MINBPC(enc);
 346    break;
 347  }
 348  while (ptr != end) {
 349    switch (BYTE_TYPE(enc, ptr)) {
 350#define LEAD_CASE(n) \
 351    case BT_LEAD ## n: \
 352      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
 353        *nextTokPtr = ptr; \
 354        return XML_TOK_DATA_CHARS; \
 355      } \
 356      ptr += n; \
 357      break;
 358    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 359#undef LEAD_CASE
 360    case BT_NONXML:
 361    case BT_MALFORM:
 362    case BT_TRAIL:
 363    case BT_CR:
 364    case BT_LF:
 365    case BT_RSQB:
 366      *nextTokPtr = ptr;
 367      return XML_TOK_DATA_CHARS;
 368    default:
 369      ptr += MINBPC(enc);
 370      break;
 371    }
 372  }
 373  *nextTokPtr = ptr;
 374  return XML_TOK_DATA_CHARS;
 375}
 376
 377/* ptr points to character following "</" */
 378
 379static int PTRCALL
 380PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
 381                   const char *end, const char **nextTokPtr)
 382{
 383  if (ptr == end)
 384    return XML_TOK_PARTIAL;
 385  switch (BYTE_TYPE(enc, ptr)) {
 386  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 387  default:
 388    *nextTokPtr = ptr;
 389    return XML_TOK_INVALID;
 390  }
 391  while (ptr != end) {
 392    switch (BYTE_TYPE(enc, ptr)) {
 393    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 394    case BT_S: case BT_CR: case BT_LF:
 395      for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
 396        switch (BYTE_TYPE(enc, ptr)) {
 397        case BT_S: case BT_CR: case BT_LF:
 398          break;
 399        case BT_GT:
 400          *nextTokPtr = ptr + MINBPC(enc);
 401          return XML_TOK_END_TAG;
 402        default:
 403          *nextTokPtr = ptr;
 404          return XML_TOK_INVALID;
 405        }
 406      }
 407      return XML_TOK_PARTIAL;
 408#ifdef XML_NS
 409    case BT_COLON:
 410      /* no need to check qname syntax here,
 411         since end-tag must match exactly */
 412      ptr += MINBPC(enc);
 413      break;
 414#endif
 415    case BT_GT:
 416      *nextTokPtr = ptr + MINBPC(enc);
 417      return XML_TOK_END_TAG;
 418    default:
 419      *nextTokPtr = ptr;
 420      return XML_TOK_INVALID;
 421    }
 422  }
 423  return XML_TOK_PARTIAL;
 424}
 425
 426/* ptr points to character following "&#X" */
 427
 428static int PTRCALL
 429PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
 430                       const char *end, const char **nextTokPtr)
 431{
 432  if (ptr != end) {
 433    switch (BYTE_TYPE(enc, ptr)) {
 434    case BT_DIGIT:
 435    case BT_HEX:
 436      break;
 437    default:
 438      *nextTokPtr = ptr;
 439      return XML_TOK_INVALID;
 440    }
 441    for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
 442      switch (BYTE_TYPE(enc, ptr)) {
 443      case BT_DIGIT:
 444      case BT_HEX:
 445        break;
 446      case BT_SEMI:
 447        *nextTokPtr = ptr + MINBPC(enc);
 448        return XML_TOK_CHAR_REF;
 449      default:
 450        *nextTokPtr = ptr;
 451        return XML_TOK_INVALID;
 452      }
 453    }
 454  }
 455  return XML_TOK_PARTIAL;
 456}
 457
 458/* ptr points to character following "&#" */
 459
 460static int PTRCALL
 461PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
 462                    const char *end, const char **nextTokPtr)
 463{
 464  if (ptr != end) {
 465    if (CHAR_MATCHES(enc, ptr, ASCII_x))
 466      return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 467    switch (BYTE_TYPE(enc, ptr)) {
 468    case BT_DIGIT:
 469      break;
 470    default:
 471      *nextTokPtr = ptr;
 472      return XML_TOK_INVALID;
 473    }
 474    for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
 475      switch (BYTE_TYPE(enc, ptr)) {
 476      case BT_DIGIT:
 477        break;
 478      case BT_SEMI:
 479        *nextTokPtr = ptr + MINBPC(enc);
 480        return XML_TOK_CHAR_REF;
 481      default:
 482        *nextTokPtr = ptr;
 483        return XML_TOK_INVALID;
 484      }
 485    }
 486  }
 487  return XML_TOK_PARTIAL;
 488}
 489
 490/* ptr points to character following "&" */
 491
 492static int PTRCALL
 493PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
 494                const char **nextTokPtr)
 495{
 496  if (ptr == end)
 497    return XML_TOK_PARTIAL;
 498  switch (BYTE_TYPE(enc, ptr)) {
 499  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 500  case BT_NUM:
 501    return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 502  default:
 503    *nextTokPtr = ptr;
 504    return XML_TOK_INVALID;
 505  }
 506  while (ptr != end) {
 507    switch (BYTE_TYPE(enc, ptr)) {
 508    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 509    case BT_SEMI:
 510      *nextTokPtr = ptr + MINBPC(enc);
 511      return XML_TOK_ENTITY_REF;
 512    default:
 513      *nextTokPtr = ptr;
 514      return XML_TOK_INVALID;
 515    }
 516  }
 517  return XML_TOK_PARTIAL;
 518}
 519
 520/* ptr points to character following first character of attribute name */
 521
 522static int PTRCALL
 523PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
 524                 const char **nextTokPtr)
 525{
 526#ifdef XML_NS
 527  int hadColon = 0;
 528#endif
 529  while (ptr != end) {
 530    switch (BYTE_TYPE(enc, ptr)) {
 531    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 532#ifdef XML_NS
 533    case BT_COLON:
 534      if (hadColon) {
 535        *nextTokPtr = ptr;
 536        return XML_TOK_INVALID;
 537      }
 538      hadColon = 1;
 539      ptr += MINBPC(enc);
 540      if (ptr == end)
 541        return XML_TOK_PARTIAL;
 542      switch (BYTE_TYPE(enc, ptr)) {
 543      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 544      default:
 545        *nextTokPtr = ptr;
 546        return XML_TOK_INVALID;
 547      }
 548      break;
 549#endif
 550    case BT_S: case BT_CR: case BT_LF:
 551      for (;;) {
 552        int t;
 553
 554        ptr += MINBPC(enc);
 555        if (ptr == end)
 556          return XML_TOK_PARTIAL;
 557        t = BYTE_TYPE(enc, ptr);
 558        if (t == BT_EQUALS)
 559          break;
 560        switch (t) {
 561        case BT_S:
 562        case BT_LF:
 563        case BT_CR:
 564          break;
 565        default:
 566          *nextTokPtr = ptr;
 567          return XML_TOK_INVALID;
 568        }
 569      }
 570    /* fall through */
 571    case BT_EQUALS:
 572      {
 573        int open;
 574#ifdef XML_NS
 575        hadColon = 0;
 576#endif
 577        for (;;) {
 578          ptr += MINBPC(enc);
 579          if (ptr == end)
 580            return XML_TOK_PARTIAL;
 581          open = BYTE_TYPE(enc, ptr);
 582          if (open == BT_QUOT || open == BT_APOS)
 583            break;
 584          switch (open) {
 585          case BT_S:
 586          case BT_LF:
 587          case BT_CR:
 588            break;
 589          default:
 590            *nextTokPtr = ptr;
 591            return XML_TOK_INVALID;
 592          }
 593        }
 594        ptr += MINBPC(enc);
 595        /* in attribute value */
 596        for (;;) {
 597          int t;
 598          if (ptr == end)
 599            return XML_TOK_PARTIAL;
 600          t = BYTE_TYPE(enc, ptr);
 601          if (t == open)
 602            break;
 603          switch (t) {
 604          INVALID_CASES(ptr, nextTokPtr)
 605          case BT_AMP:
 606            {
 607              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
 608              if (tok <= 0) {
 609                if (tok == XML_TOK_INVALID)
 610                  *nextTokPtr = ptr;
 611                return tok;
 612              }
 613              break;
 614            }
 615          case BT_LT:
 616            *nextTokPtr = ptr;
 617            return XML_TOK_INVALID;
 618          default:
 619            ptr += MINBPC(enc);
 620            break;
 621          }
 622        }
 623        ptr += MINBPC(enc);
 624        if (ptr == end)
 625          return XML_TOK_PARTIAL;
 626        switch (BYTE_TYPE(enc, ptr)) {
 627        case BT_S:
 628        case BT_CR:
 629        case BT_LF:
 630          break;
 631        case BT_SOL:
 632          goto sol;
 633        case BT_GT:
 634          goto gt;
 635        default:
 636          *nextTokPtr = ptr;
 637          return XML_TOK_INVALID;
 638        }
 639        /* ptr points to closing quote */
 640        for (;;) {
 641          ptr += MINBPC(enc);
 642          if (ptr == end)
 643            return XML_TOK_PARTIAL;
 644          switch (BYTE_TYPE(enc, ptr)) {
 645          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 646          case BT_S: case BT_CR: case BT_LF:
 647            continue;
 648          case BT_GT:
 649          gt:
 650            *nextTokPtr = ptr + MINBPC(enc);
 651            return XML_TOK_START_TAG_WITH_ATTS;
 652          case BT_SOL:
 653          sol:
 654            ptr += MINBPC(enc);
 655            if (ptr == end)
 656              return XML_TOK_PARTIAL;
 657            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
 658              *nextTokPtr = ptr;
 659              return XML_TOK_INVALID;
 660            }
 661            *nextTokPtr = ptr + MINBPC(enc);
 662            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
 663          default:
 664            *nextTokPtr = ptr;
 665            return XML_TOK_INVALID;
 666          }
 667          break;
 668        }
 669        break;
 670      }
 671    default:
 672      *nextTokPtr = ptr;
 673      return XML_TOK_INVALID;
 674    }
 675  }
 676  return XML_TOK_PARTIAL;
 677}
 678
 679/* ptr points to character following "<" */
 680
 681static int PTRCALL
 682PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
 683               const char **nextTokPtr)
 684{
 685#ifdef XML_NS
 686  int hadColon;
 687#endif
 688  if (ptr == end)
 689    return XML_TOK_PARTIAL;
 690  switch (BYTE_TYPE(enc, ptr)) {
 691  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 692  case BT_EXCL:
 693    if ((ptr += MINBPC(enc)) == end)
 694      return XML_TOK_PARTIAL;
 695    switch (BYTE_TYPE(enc, ptr)) {
 696    case BT_MINUS:
 697      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 698    case BT_LSQB:
 699      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
 700                                      end, nextTokPtr);
 701    }
 702    *nextTokPtr = ptr;
 703    return XML_TOK_INVALID;
 704  case BT_QUEST:
 705    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 706  case BT_SOL:
 707    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 708  default:
 709    *nextTokPtr = ptr;
 710    return XML_TOK_INVALID;
 711  }
 712#ifdef XML_NS
 713  hadColon = 0;
 714#endif
 715  /* we have a start-tag */
 716  while (ptr != end) {
 717    switch (BYTE_TYPE(enc, ptr)) {
 718    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 719#ifdef XML_NS
 720    case BT_COLON:
 721      if (hadColon) {
 722        *nextTokPtr = ptr;
 723        return XML_TOK_INVALID;
 724      }
 725      hadColon = 1;
 726      ptr += MINBPC(enc);
 727      if (ptr == end)
 728        return XML_TOK_PARTIAL;
 729      switch (BYTE_TYPE(enc, ptr)) {
 730      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 731      default:
 732        *nextTokPtr = ptr;
 733        return XML_TOK_INVALID;
 734      }
 735      break;
 736#endif
 737    case BT_S: case BT_CR: case BT_LF:
 738      {
 739        ptr += MINBPC(enc);
 740        while (ptr != end) {
 741          switch (BYTE_TYPE(enc, ptr)) {
 742          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 743          case BT_GT:
 744            goto gt;
 745          case BT_SOL:
 746            goto sol;
 747          case BT_S: case BT_CR: case BT_LF:
 748            ptr += MINBPC(enc);
 749            continue;
 750          default:
 751            *nextTokPtr = ptr;
 752            return XML_TOK_INVALID;
 753          }
 754          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
 755        }
 756        return XML_TOK_PARTIAL;
 757      }
 758    case BT_GT:
 759    gt:
 760      *nextTokPtr = ptr + MINBPC(enc);
 761      return XML_TOK_START_TAG_NO_ATTS;
 762    case BT_SOL:
 763    sol:
 764      ptr += MINBPC(enc);
 765      if (ptr == end)
 766        return XML_TOK_PARTIAL;
 767      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
 768        *nextTokPtr = ptr;
 769        return XML_TOK_INVALID;
 770      }
 771      *nextTokPtr = ptr + MINBPC(enc);
 772      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
 773    default:
 774      *nextTokPtr = ptr;
 775      return XML_TOK_INVALID;
 776    }
 777  }
 778  return XML_TOK_PARTIAL;
 779}
 780
 781static int PTRCALL
 782PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
 783                   const char **nextTokPtr)
 784{
 785  if (ptr == end)
 786    return XML_TOK_NONE;
 787  if (MINBPC(enc) > 1) {
 788    size_t n = end - ptr;
 789    if (n & (MINBPC(enc) - 1)) {
 790      n &= ~(MINBPC(enc) - 1);
 791      if (n == 0)
 792        return XML_TOK_PARTIAL;
 793      end = ptr + n;
 794    }
 795  }
 796  switch (BYTE_TYPE(enc, ptr)) {
 797  case BT_LT:
 798    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 799  case BT_AMP:
 800    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 801  case BT_CR:
 802    ptr += MINBPC(enc);
 803    if (ptr == end)
 804      return XML_TOK_TRAILING_CR;
 805    if (BYTE_TYPE(enc, ptr) == BT_LF)
 806      ptr += MINBPC(enc);
 807    *nextTokPtr = ptr;
 808    return XML_TOK_DATA_NEWLINE;
 809  case BT_LF:
 810    *nextTokPtr = ptr + MINBPC(enc);
 811    return XML_TOK_DATA_NEWLINE;
 812  case BT_RSQB:
 813    ptr += MINBPC(enc);
 814    if (ptr == end)
 815      return XML_TOK_TRAILING_RSQB;
 816    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
 817      break;
 818    ptr += MINBPC(enc);
 819    if (ptr == end)
 820      return XML_TOK_TRAILING_RSQB;
 821    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
 822      ptr -= MINBPC(enc);
 823      break;
 824    }
 825    *nextTokPtr = ptr;
 826    return XML_TOK_INVALID;
 827  INVALID_CASES(ptr, nextTokPtr)
 828  default:
 829    ptr += MINBPC(enc);
 830    break;
 831  }
 832  while (ptr != end) {
 833    switch (BYTE_TYPE(enc, ptr)) {
 834#define LEAD_CASE(n) \
 835    case BT_LEAD ## n: \
 836      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
 837        *nextTokPtr = ptr; \
 838        return XML_TOK_DATA_CHARS; \
 839      } \
 840      ptr += n; \
 841      break;
 842    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 843#undef LEAD_CASE
 844    case BT_RSQB:
 845      if (ptr + MINBPC(enc) != end) {
 846         if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
 847           ptr += MINBPC(enc);
 848           break;
 849         }
 850         if (ptr + 2*MINBPC(enc) != end) {
 851           if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
 852             ptr += MINBPC(enc);
 853             break;
 854           }
 855           *nextTokPtr = ptr + 2*MINBPC(enc);
 856           return XML_TOK_INVALID;
 857         }
 858      }
 859      /* fall through */
 860    case BT_AMP:
 861    case BT_LT:
 862    case BT_NONXML:
 863    case BT_MALFORM:
 864    case BT_TRAIL:
 865    case BT_CR:
 866    case BT_LF:
 867      *nextTokPtr = ptr;
 868      return XML_TOK_DATA_CHARS;
 869    default:
 870      ptr += MINBPC(enc);
 871      break;
 872    }
 873  }
 874  *nextTokPtr = ptr;
 875  return XML_TOK_DATA_CHARS;
 876}
 877
 878/* ptr points to character following "%" */
 879
 880static int PTRCALL
 881PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
 882                    const char **nextTokPtr)
 883{
 884  if (ptr == end)
 885    return -XML_TOK_PERCENT;
 886  switch (BYTE_TYPE(enc, ptr)) {
 887  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 888  case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
 889    *nextTokPtr = ptr;
 890    return XML_TOK_PERCENT;
 891  default:
 892    *nextTokPtr = ptr;
 893    return XML_TOK_INVALID;
 894  }
 895  while (ptr != end) {
 896    switch (BYTE_TYPE(enc, ptr)) {
 897    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 898    case BT_SEMI:
 899      *nextTokPtr = ptr + MINBPC(enc);
 900      return XML_TOK_PARAM_ENTITY_REF;
 901    default:
 902      *nextTokPtr = ptr;
 903      return XML_TOK_INVALID;
 904    }
 905  }
 906  return XML_TOK_PARTIAL;
 907}
 908
 909static int PTRCALL
 910PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
 911                      const char **nextTokPtr)
 912{
 913  if (ptr == end)
 914    return XML_TOK_PARTIAL;
 915  switch (BYTE_TYPE(enc, ptr)) {
 916  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 917  default:
 918    *nextTokPtr = ptr;
 919    return XML_TOK_INVALID;
 920  }
 921  while (ptr != end) {
 922    switch (BYTE_TYPE(enc, ptr)) {
 923    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 924    case BT_CR: case BT_LF: case BT_S:
 925    case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
 926      *nextTokPtr = ptr;
 927      return XML_TOK_POUND_NAME;
 928    default:
 929      *nextTokPtr = ptr;
 930      return XML_TOK_INVALID;
 931    }
 932  }
 933  return -XML_TOK_POUND_NAME;
 934}
 935
 936static int PTRCALL
 937PREFIX(scanLit)(int open, const ENCODING *enc,
 938                const char *ptr, const char *end,
 939                const char **nextTokPtr)
 940{
 941  while (ptr != end) {
 942    int t = BYTE_TYPE(enc, ptr);
 943    switch (t) {
 944    INVALID_CASES(ptr, nextTokPtr)
 945    case BT_QUOT:
 946    case BT_APOS:
 947      ptr += MINBPC(enc);
 948      if (t != open)
 949        break;
 950      if (ptr == end)
 951        return -XML_TOK_LITERAL;
 952      *nextTokPtr = ptr;
 953      switch (BYTE_TYPE(enc, ptr)) {
 954      case BT_S: case BT_CR: case BT_LF:
 955      case BT_GT: case BT_PERCNT: case BT_LSQB:
 956        return XML_TOK_LITERAL;
 957      default:
 958        return XML_TOK_INVALID;
 959      }
 960    default:
 961      ptr += MINBPC(enc);
 962      break;
 963    }
 964  }
 965  return XML_TOK_PARTIAL;
 966}
 967
 968static int PTRCALL
 969PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
 970                  const char **nextTokPtr)
 971{
 972  int tok;
 973  if (ptr == end)
 974    return XML_TOK_NONE;
 975  if (MINBPC(enc) > 1) {
 976    size_t n = end - ptr;
 977    if (n & (MINBPC(enc) - 1)) {
 978      n &= ~(MINBPC(enc) - 1);
 979      if (n == 0)
 980        return XML_TOK_PARTIAL;
 981      end = ptr + n;
 982    }
 983  }
 984  switch (BYTE_TYPE(enc, ptr)) {
 985  case BT_QUOT:
 986    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
 987  case BT_APOS:
 988    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
 989  case BT_LT:
 990    {
 991      ptr += MINBPC(enc);
 992      if (ptr == end)
 993        return XML_TOK_PARTIAL;
 994      switch (BYTE_TYPE(enc, ptr)) {
 995      case BT_EXCL:
 996        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 997      case BT_QUEST:
 998        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 999      case BT_NMSTRT:
1000      case BT_HEX:
1001      case BT_NONASCII:
1002      case BT_LEAD2:
1003      case BT_LEAD3:
1004      case BT_LEAD4:
1005        *nextTokPtr = ptr - MINBPC(enc);
1006        return XML_TOK_INSTANCE_START;
1007      }
1008      *nextTokPtr = ptr;
1009      return XML_TOK_INVALID;
1010    }
1011  case BT_CR:
1012    if (ptr + MINBPC(enc) == end) {
1013      *nextTokPtr = end;
1014      /* indicate that this might be part of a CR/LF pair */
1015      return -XML_TOK_PROLOG_S;
1016    }
1017    /* fall through */
1018  case BT_S: case BT_LF:
1019    for (;;) {
1020      ptr += MINBPC(enc);
1021      if (ptr == end)
1022        break;
1023      switch (BYTE_TYPE(enc, ptr)) {
1024      case BT_S: case BT_LF:
1025        break;
1026      case BT_CR:
1027        /* don't split CR/LF pair */
1028        if (ptr + MINBPC(enc) != end)
1029          break;
1030        /* fall through */
1031      default:
1032        *nextTokPtr = ptr;
1033        return XML_TOK_PROLOG_S;
1034      }
1035    }
1036    *nextTokPtr = ptr;
1037    return XML_TOK_PROLOG_S;
1038  case BT_PERCNT:
1039    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1040  case BT_COMMA:
1041    *nextTokPtr = ptr + MINBPC(enc);
1042    return XML_TOK_COMMA;
1043  case BT_LSQB:
1044    *nextTokPtr = ptr + MINBPC(enc);
1045    return XML_TOK_OPEN_BRACKET;
1046  case BT_RSQB:
1047    ptr += MINBPC(enc);
1048    if (ptr == end)
1049      return -XML_TOK_CLOSE_BRACKET;
1050    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1051      if (ptr + MINBPC(enc) == end)
1052        return XML_TOK_PARTIAL;
1053      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1054        *nextTokPtr = ptr + 2*MINBPC(enc);
1055        return XML_TOK_COND_SECT_CLOSE;
1056      }
1057    }
1058    *nextTokPtr = ptr;
1059    return XML_TOK_CLOSE_BRACKET;
1060  case BT_LPAR:
1061    *nextTokPtr = ptr + MINBPC(enc);
1062    return XML_TOK_OPEN_PAREN;
1063  case BT_RPAR:
1064    ptr += MINBPC(enc);
1065    if (ptr == end)
1066      return -XML_TOK_CLOSE_PAREN;
1067    switch (BYTE_TYPE(enc, ptr)) {
1068    case BT_AST:
1069      *nextTokPtr = ptr + MINBPC(enc);
1070      return XML_TOK_CLOSE_PAREN_ASTERISK;
1071    case BT_QUEST:
1072      *nextTokPtr = ptr + MINBPC(enc);
1073      return XML_TOK_CLOSE_PAREN_QUESTION;
1074    case BT_PLUS:
1075      *nextTokPtr = ptr + MINBPC(enc);
1076      return XML_TOK_CLOSE_PAREN_PLUS;
1077    case BT_CR: case BT_LF: case BT_S:
1078    case BT_GT: case BT_COMMA: case BT_VERBAR:
1079    case BT_RPAR:
1080      *nextTokPtr = ptr;
1081      return XML_TOK_CLOSE_PAREN;
1082    }
1083    *nextTokPtr = ptr;
1084    return XML_TOK_INVALID;
1085  case BT_VERBAR:
1086    *nextTokPtr = ptr + MINBPC(enc);
1087    return XML_TOK_OR;
1088  case BT_GT:
1089    *nextTokPtr = ptr + MINBPC(enc);
1090    return XML_TOK_DECL_CLOSE;
1091  case BT_NUM:
1092    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1093#define LEAD_CASE(n) \
1094  case BT_LEAD ## n: \
1095    if (end - ptr < n) \
1096      return XML_TOK_PARTIAL_CHAR; \
1097    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1098      ptr += n; \
1099      tok = XML_TOK_NAME; \
1100      break; \
1101    } \
1102    if (IS_NAME_CHAR(enc, ptr, n)) { \
1103      ptr += n; \
1104      tok = XML_TOK_NMTOKEN; \
1105      break; \
1106    } \
1107    *nextTokPtr = ptr; \
1108    return XML_TOK_INVALID;
1109    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1110#undef LEAD_CASE
1111  case BT_NMSTRT:
1112  case BT_HEX:
1113    tok = XML_TOK_NAME;
1114    ptr += MINBPC(enc);
1115    break;
1116  case BT_DIGIT:
1117  case BT_NAME:
1118  case BT_MINUS:
1119#ifdef XML_NS
1120  case BT_COLON:
1121#endif
1122    tok = XML_TOK_NMTOKEN;
1123    ptr += MINBPC(enc);
1124    break;
1125  case BT_NONASCII:
1126    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1127      ptr += MINBPC(enc);
1128      tok = XML_TOK_NAME;
1129      break;
1130    }
1131    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1132      ptr += MINBPC(enc);
1133      tok = XML_TOK_NMTOKEN;
1134      break;
1135    }
1136    /* fall through */
1137  default:
1138    *nextTokPtr = ptr;
1139    return XML_TOK_INVALID;
1140  }
1141  while (ptr != end) {
1142    switch (BYTE_TYPE(enc, ptr)) {
1143    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1144    case BT_GT: case BT_RPAR: case BT_COMMA:
1145    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1146    case BT_S: case BT_CR: case BT_LF:
1147      *nextTokPtr = ptr;
1148      return tok;
1149#ifdef XML_NS
1150    case BT_COLON:
1151      ptr += MINBPC(enc);
1152      switch (tok) {
1153      case XML_TOK_NAME:
1154        if (ptr == end)
1155          return XML_TOK_PARTIAL;
1156        tok = XML_TOK_PREFIXED_NAME;
1157        switch (BYTE_TYPE(enc, ptr)) {
1158        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1159        default:
1160          tok = XML_TOK_NMTOKEN;
1161          break;
1162        }
1163        break;
1164      case XML_TOK_PREFIXED_NAME:
1165        tok = XML_TOK_NMTOKEN;
1166        break;
1167      }
1168      break;
1169#endif
1170    case BT_PLUS:
1171      if (tok == XML_TOK_NMTOKEN)  {
1172        *nextTokPtr = ptr;
1173        return XML_TOK_INVALID;
1174      }
1175      *nextTokPtr = ptr + MINBPC(enc);
1176      return XML_TOK_NAME_PLUS;
1177    case BT_AST:
1178      if (tok == XML_TOK_NMTOKEN)  {
1179        *nextTokPtr = ptr;
1180        return XML_TOK_INVALID;
1181      }
1182      *nextTokPtr = ptr + MINBPC(enc);
1183      return XML_TOK_NAME_ASTERISK;
1184    case BT_QUEST:
1185      if (tok == XML_TOK_NMTOKEN)  {
1186        *nextTokPtr = ptr;
1187        return XML_TOK_INVALID;
1188      }
1189      *nextTokPtr = ptr + MINBPC(enc);
1190      return XML_TOK_NAME_QUESTION;
1191    default:
1192      *nextTokPtr = ptr;
1193      return XML_TOK_INVALID;
1194    }
1195  }
1196  return -tok;
1197}
1198
1199static int PTRCALL
1200PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1201                          const char *end, const char **nextTokPtr)
1202{
1203  const char *start;
1204  if (ptr == end)
1205    return XML_TOK_NONE;
1206  start = ptr;
1207  while (ptr != end) {
1208    switch (BYTE_TYPE(enc, ptr)) {
1209#define LEAD_CASE(n) \
1210    case BT_LEAD ## n: ptr += n; break;
1211    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1212#undef LEAD_CASE
1213    case BT_AMP:
1214      if (ptr == start)
1215        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1216      *nextTokPtr = ptr;
1217      return XML_TOK_DATA_CHARS;
1218    case BT_LT:
1219      /* this is for inside entity references */
1220      *nextTokPtr = ptr;
1221      return XML_TOK_INVALID;
1222    case BT_LF:
1223      if (ptr == start) {
1224        *nextTokPtr = ptr + MINBPC(enc);
1225        return XML_TOK_DATA_NEWLINE;
1226      }
1227      *nextTokPtr = ptr;
1228      return XML_TOK_DATA_CHARS;
1229    case BT_CR:
1230      if (ptr == start) {
1231        ptr += MINBPC(enc);
1232        if (ptr == end)
1233          return XML_TOK_TRAILING_CR;
1234        if (BYTE_TYPE(enc, ptr) == BT_LF)
1235          ptr += MINBPC(enc);
1236        *nextTokPtr = ptr;
1237        return XML_TOK_DATA_NEWLINE;
1238      }
1239      *nextTokPtr = ptr;
1240      return XML_TOK_DATA_CHARS;
1241    case BT_S:
1242      if (ptr == start) {
1243        *nextTokPtr = ptr + MINBPC(enc);
1244        return XML_TOK_ATTRIBUTE_VALUE_S;
1245      }
1246      *nextTokPtr = ptr;
1247      return XML_TOK_DATA_CHARS;
1248    default:
1249      ptr += MINBPC(enc);
1250      break;
1251    }
1252  }
1253  *nextTokPtr = ptr;
1254  return XML_TOK_DATA_CHARS;
1255}
1256
1257static int PTRCALL
1258PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1259                       const char *end, const char **nextTokPtr)
1260{
1261  const char *start;
1262  if (ptr == end)
1263    return XML_TOK_NONE;
1264  start = ptr;
1265  while (ptr != end) {
1266    switch (BYTE_TYPE(enc, ptr)) {
1267#define LEAD_CASE(n) \
1268    case BT_LEAD ## n: ptr += n; break;
1269    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1270#undef LEAD_CASE
1271    case BT_AMP:
1272      if (ptr == start)
1273        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1274      *nextTokPtr = ptr;
1275      return XML_TOK_DATA_CHARS;
1276    case BT_PERCNT:
1277      if (ptr == start) {
1278        int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1279                                       end, nextTokPtr);
1280        return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1281      }
1282      *nextTokPtr = ptr;
1283      return XML_TOK_DATA_CHARS;
1284    case BT_LF:
1285      if (ptr == start) {
1286        *nextTokPtr = ptr + MINBPC(enc);
1287        return XML_TOK_DATA_NEWLINE;
1288      }
1289      *nextTokPtr = ptr;
1290      return XML_TOK_DATA_CHARS;
1291    case BT_CR:
1292      if (ptr == start) {
1293        ptr += MINBPC(enc);
1294        if (ptr == end)
1295          return XML_TOK_TRAILING_CR;
1296        if (BYTE_TYPE(enc, ptr) == BT_LF)
1297          ptr += MINBPC(enc);
1298        *nextTokPtr = ptr;
1299        return XML_TOK_DATA_NEWLINE;
1300      }
1301      *nextTokPtr = ptr;
1302      return XML_TOK_DATA_CHARS;
1303    default:
1304      ptr += MINBPC(enc);
1305      break;
1306    }
1307  }
1308  *nextTokPtr = ptr;
1309  return XML_TOK_DATA_CHARS;
1310}
1311
1312#ifdef XML_DTD
1313
1314static int PTRCALL
1315PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1316                         const char *end, const char **nextTokPtr)
1317{
1318  int level = 0;
1319  if (MINBPC(enc) > 1) {
1320    size_t n = end - ptr;
1321    if (n & (MINBPC(enc) - 1)) {
1322      n &= ~(MINBPC(enc) - 1);
1323      end = ptr + n;
1324    }
1325  }
1326  while (ptr != end) {
1327    switch (BYTE_TYPE(enc, ptr)) {
1328    INVALID_CASES(ptr, nextTokPtr)
1329    case BT_LT:
1330      if ((ptr += MINBPC(enc)) == end)
1331        return XML_TOK_PARTIAL;
1332      if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1333        if ((ptr += MINBPC(enc)) == end)
1334          return XML_TOK_PARTIAL;
1335        if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1336          ++level;
1337          ptr += MINBPC(enc);
1338        }
1339      }
1340      break;
1341    case BT_RSQB:
1342      if ((ptr += MINBPC(enc)) == end)
1343        return XML_TOK_PARTIAL;
1344      if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1345        if ((ptr += MINBPC(enc)) == end)
1346          return XML_TOK_PARTIAL;
1347        if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1348          ptr += MINBPC(enc);
1349          if (level == 0) {
1350            *nextTokPtr = ptr;
1351            return XML_TOK_IGNORE_SECT;
1352          }
1353          --level;
1354        }
1355      }
1356      break;
1357    default:
1358      ptr += MINBPC(enc);
1359      break;
1360    }
1361  }
1362  return XML_TOK_PARTIAL;
1363}
1364
1365#endif /* XML_DTD */
1366
1367static int PTRCALL
1368PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1369                   const char **badPtr)
1370{
1371  ptr += MINBPC(enc);
1372  end -= MINBPC(enc);
1373  for (; ptr != end; ptr += MINBPC(enc)) {
1374    switch (BYTE_TYPE(enc, ptr)) {
1375    case BT_DIGIT:
1376    case BT_HEX:
1377    case BT_MINUS:
1378    case BT_APOS:
1379    case BT_LPAR:
1380    case BT_RPAR:
1381    case BT_PLUS:
1382    case BT_COMMA:
1383    case BT_SOL:
1384    case BT_EQUALS:
1385    case BT_QUEST:
1386    case BT_CR:
1387    case BT_LF:
1388    case BT_SEMI:
1389    case BT_EXCL:
1390    case BT_AST:
1391    case BT_PERCNT:
1392    case BT_NUM:
1393#ifdef XML_NS
1394    case BT_COLON:
1395#endif
1396      break;
1397    case BT_S:
1398      if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1399        *badPtr = ptr;
1400        return 0;
1401      }
1402      break;
1403    case BT_NAME:
1404    case BT_NMSTRT:
1405      if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1406        break;
1407    default:
1408      switch (BYTE_TO_ASCII(enc, ptr)) {
1409      case 0x24: /* $ */
1410      case 0x40: /* @ */
1411        break;
1412      default:
1413        *badPtr = ptr;
1414        return 0;
1415      }
1416      break;
1417    }
1418  }
1419  return 1;
1420}
1421
1422/* This must only be called for a well-formed start-tag or empty
1423   element tag.  Returns the number of attributes.  Pointers to the
1424   first attsMax attributes are stored in atts.
1425*/
1426
1427static int PTRCALL
1428PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1429                int attsMax, ATTRIBUTE *atts)
1430{
1431  enum { other, inName, inValue } state = inName;
1432  int nAtts = 0;
1433  int open = 0; /* defined when state == inValue;
1434                   initialization just to shut up compilers */
1435
1436  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1437    switch (BYTE_TYPE(enc, ptr)) {
1438#define START_NAME \
1439      if (state == other) { \
1440        if (nAtts < attsMax) { \
1441          atts[nAtts].name = ptr; \
1442          atts[nAtts].normalized = 1; \
1443        } \
1444        state = inName; \
1445      }
1446#define LEAD_CASE(n) \
1447    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1448    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1449#undef LEAD_CASE
1450    case BT_NONASCII:
1451    case BT_NMSTRT:
1452    case BT_HEX:
1453      START_NAME
1454      break;
1455#undef START_NAME
1456    case BT_QUOT:
1457      if (state != inValue) {
1458        if (nAtts < attsMax)
1459          atts[nAtts].valuePtr = ptr + MINBPC(enc);
1460        state = inValue;
1461        open = BT_QUOT;
1462      }
1463      else if (open == BT_QUOT) {
1464        state = other;
1465        if (nAtts < attsMax)
1466          atts[nAtts].valueEnd = ptr;
1467        nAtts++;
1468      }
1469      break;
1470    case BT_APOS:
1471      if (state != inValue) {
1472        if (nAtts < attsMax)
1473          atts[nAtts].valuePtr = ptr + MINBPC(enc);
1474        state = inValue;
1475        open = BT_APOS;
1476      }
1477      else if (open == BT_APOS) {
1478        state = other;
1479        if (nAtts < attsMax)
1480          atts[nAtts].valueEnd = ptr;
1481        nAtts++;
1482      }
1483      break;
1484    case BT_AMP:
1485      if (nAtts < attsMax)
1486        atts[nAtts].normalized = 0;
1487      break;
1488    case BT_S:
1489      if (state == inName)
1490        state = other;
1491      else if (state == inValue
1492               && nAtts < attsMax
1493               && atts[nAtts].normalized
1494               && (ptr == atts[nAtts].valuePtr
1495                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1496                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1497                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1498        atts[nAtts].normalized = 0;
1499      break;
1500    case BT_CR: case BT_LF:
1501      /* This case ensures that the first attribute name is counted
1502         Apart from that we could just change state on the quote. */
1503      if (state == inName)
1504        state = other;
1505      else if (state == inValue && nAtts < attsMax)
1506        atts[nAtts].normalized = 0;
1507      break;
1508    case BT_GT:
1509    case BT_SOL:
1510      if (state != inValue)
1511        return nAtts;
1512      break;
1513    default:
1514      break;
1515    }
1516  }
1517  /* not reached */
1518}
1519
1520static int PTRFASTCALL
1521PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1522{
1523  int result = 0;
1524  /* skip &# */
1525  ptr += 2*MINBPC(enc);
1526  if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1527    for (ptr += MINBPC(enc);
1528         !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1529         ptr += MINBPC(enc)) {
1530      int c = BYTE_TO_ASCII(enc, ptr);
1531      switch (c) {
1532      case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1533      case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1534        result <<= 4;
1535        result |= (c - ASCII_0);
1536        break;
1537      case ASCII_A: case ASCII_B: case ASCII_C:
1538      case ASCII_D: case ASCII_E: case ASCII_F:
1539        result <<= 4;
1540        result += 10 + (c - ASCII_A);
1541        break;
1542      case ASCII_a: case ASCII_b: case ASCII_c:
1543      case ASCII_d: case ASCII_e: case ASCII_f:
1544        result <<= 4;
1545        result += 10 + (c - ASCII_a);
1546        break;
1547      }
1548      if (result >= 0x110000)
1549        return -1;
1550    }
1551  }
1552  else {
1553    for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1554      int c = BYTE_TO_ASCII(enc, ptr);
1555      result *= 10;
1556      result += (c - ASCII_0);
1557      if (result >= 0x110000)
1558        return -1;
1559    }
1560  }
1561  return checkCharRefNumber(result);
1562}
1563
1564static int PTRCALL
1565PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1566                             const char *end)
1567{
1568  switch ((end - ptr)/MINBPC(enc)) {
1569  case 2:
1570    if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1571      switch (BYTE_TO_ASCII(enc, ptr)) {
1572      case ASCII_l:
1573        return ASCII_LT;
1574      case ASCII_g:
1575        return ASCII_GT;
1576      }
1577    }
1578    break;
1579  case 3:
1580    if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1581      ptr += MINBPC(enc);
1582      if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1583        ptr += MINBPC(enc);
1584        if (CHAR_MATCHES(enc, ptr, ASCII_p))
1585          return ASCII_AMP;
1586      }
1587    }
1588    break;
1589  case 4:
1590    switch (BYTE_TO_ASCII(enc, ptr)) {
1591    case ASCII_q:
1592      ptr += MINBPC(enc);
1593      if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1594        ptr += MINBPC(enc);
1595        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1596          ptr += MINBPC(enc);
1597          if (CHAR_MATCHES(enc, ptr, ASCII_t))
1598            return ASCII_QUOT;
1599        }
1600      }
1601      break;
1602    case ASCII_a:
1603      ptr += MINBPC(enc);
1604      if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1605        ptr += MINBPC(enc);
1606        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1607          ptr += MINBPC(enc);
1608          if (CHAR_MATCHES(enc, ptr, ASCII_s))
1609            return ASCII_APOS;
1610        }
1611      }
1612      break;
1613    }
1614  }
1615  return 0;
1616}
1617
1618static int PTRCALL
1619PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1620{
1621  for (;;) {
1622    switch (BYTE_TYPE(enc, ptr1)) {
1623#define LEAD_CASE(n) \
1624    case BT_LEAD ## n: \
1625      if (*ptr1++ != *ptr2++) \
1626        return 0;
1627    LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1628#undef LEAD_CASE
1629      /* fall through */
1630      if (*ptr1++ != *ptr2++)
1631        return 0;
1632      break;
1633    case BT_NONASCII:
1634    case BT_NMSTRT:
1635#ifdef XML_NS
1636    case BT_COLON:
1637#endif
1638    case BT_HEX:
1639    case BT_DIGIT:
1640    case BT_NAME:
1641    case BT_MINUS:
1642      if (*ptr2++ != *ptr1++)
1643        return 0;
1644      if (MINBPC(enc) > 1) {
1645        if (*ptr2++ != *ptr1++)
1646          return 0;
1647        if (MINBPC(enc) > 2) {
1648          if (*ptr2++ != *ptr1++)
1649            return 0;
1650          if (MINBPC(enc) > 3) {
1651            if (*ptr2++ != *ptr1++)
1652              return 0;
1653          }
1654        }
1655      }
1656      break;
1657    default:
1658      if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1659        return 1;
1660      switch (BYTE_TYPE(enc, ptr2)) {
1661      case BT_LEAD2:
1662      case BT_LEAD3:
1663      case BT_LEAD4:
1664      case BT_NONASCII:
1665      case BT_NMSTRT:
1666#ifdef XML_NS
1667      case BT_COLON:
1668#endif
1669      case BT_HEX:
1670      case BT_DIGIT:
1671      case BT_NAME:
1672      case BT_MINUS:
1673        return 0;
1674      default:
1675        return 1;
1676      }
1677    }
1678  }
1679  /* not reached */
1680}
1681
1682static int PTRCALL
1683PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1684                         const char *end1, const char *ptr2)
1685{
1686  for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1687    if (ptr1 == end1)
1688      return 0;
1689    if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1690      return 0;
1691  }
1692  return ptr1 == end1;
1693}
1694
1695static int PTRFASTCALL
1696PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1697{
1698  const char *start = ptr;
1699  for (;;) {
1700    switch (BYTE_TYPE(enc, ptr)) {
1701#define LEAD_CASE(n) \
1702    case BT_LEAD ## n: ptr += n; break;
1703    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1704#undef LEAD_CASE
1705    case BT_NONASCII:
1706    case BT_NMSTRT:
1707#ifdef XML_NS
1708    case BT_COLON:
1709#endif
1710    case BT_HEX:
1711    case BT_DIGIT:
1712    case BT_NAME:
1713    case BT_MINUS:
1714      ptr += MINBPC(enc);
1715      break;
1716    default:
1717      return (int)(ptr - start);
1718    }
1719  }
1720}
1721
1722static const char * PTRFASTCALL
1723PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1724{
1725  for (;;) {
1726    switch (BYTE_TYPE(enc, ptr)) {
1727    case BT_LF:
1728    case BT_CR:
1729    case BT_S:
1730      ptr += MINBPC(enc);
1731      break;
1732    default:
1733      return ptr;
1734    }
1735  }
1736}
1737
1738static void PTRCALL
1739PREFIX(updatePosition)(const ENCODING *enc,
1740                       const char *ptr,
1741                       const char *end,
1742                       POSITION *pos)
1743{
1744  while (ptr < end) {
1745    switch (BYTE_TYPE(enc, ptr)) {
1746#define LEAD_CASE(n) \
1747    case BT_LEAD ## n: \
1748      ptr += n; \
1749      break;
1750    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1751#undef LEAD_CASE
1752    case BT_LF:
1753      pos->columnNumber = (XML_Size)-1;
1754      pos->lineNumber++;
1755      ptr += MINBPC(enc);
1756      break;
1757    case BT_CR:
1758      pos->lineNumber++;
1759      ptr += MINBPC(enc);
1760      if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1761        ptr += MINBPC(enc);
1762      pos->columnNumber = (XML_Size)-1;
1763      break;
1764    default:
1765      ptr += MINBPC(enc);
1766      break;
1767    }
1768    pos->columnNumber++;
1769  }
1770}
1771
/* Remove the helper macros of this file (NOTE(review): those not defined
   at the top of the file are presumably defined further up - confirm). */

/* name checking */
#undef CHECK_NAME_CASE
#undef CHECK_NAME_CASES
#undef CHECK_NMSTRT_CASE
#undef CHECK_NMSTRT_CASES

/* multibyte and invalid character handling */
#undef DO_LEAD_CASE
#undef INVALID_CASES
#undef MULTIBYTE_CASES
1779