/lib/html5lib/library/HTML5/Tokenizer.php
PHP | 2307 lines | 1474 code | 193 blank | 640 comment | 146 complexity | 21910acc3e9c678900b754ddc815905e MD5 | raw file
Possible License(s): GPL-3.0
Large files are truncated, but you can click here to view the full file
- <?php
- /*
- Copyright 2007 Jeroen van der Meer <http://jero.net/>
- Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
- Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
- Permission is hereby granted, free of charge, to any person obtaining a
- copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
- // Some conventions:
- // /* */ indicates verbatim text from the HTML 5 specification
- // // indicates regular comments
- // all flags are in hyphenated form
- class HTML5_Tokenizer {
- /**
- * Points to an InputStream object. The constructor fills this with a
- * new HTML5_InputStream wrapping the data to parse.
- */
- protected $stream;
- /**
- * Tree builder that the tokenizer emits tokens to.
- */
- private $tree;
- /**
- * Current content model we are parsing as. The constructor starts it at
- * PCDATA; the tokenizer compares it against the content-model constants
- * declared below.
- */
- protected $content_model;
- /**
- * Current token that is being built, but not yet emitted. Also
- * is the last token emitted, if applicable.
- */
- protected $token;
- // These are constants describing the content model (the values held in $content_model).
- const PCDATA = 0;
- const RCDATA = 1;
- const CDATA = 2;
- const PLAINTEXT = 3;
- // These are constants describing tokens; they are used as the 'type' entry of token arrays.
- // XXX should probably be moved somewhere else, probably the
- // HTML5 class.
- const DOCTYPE = 0;
- const STARTTAG = 1;
- const ENDTAG = 2;
- const COMMENT = 3;
- const CHARACTER = 4;
- const SPACECHARACTER = 5;
- const EOF = 6;
- const PARSEERROR = 7;
- // These are constants representing bunches of characters (passed as character sets to the stream's charsWhile()/charsUntil()).
- const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
- const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
- const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
- const DIGIT = '0123456789';
- const HEX = '0123456789ABCDEFabcdef';
- const WHITESPACE = "\t\n\x0c ";
- /**
- * Builds a tokenizer over the given input.
- *
- * @param $data Data to parse; it is wrapped in a new HTML5_InputStream.
- * @param $builder Optional tree builder to emit tokens to. When omitted
- * (or otherwise falsy) a new HTML5_TreeBuilder is created.
- */
- public function __construct($data, $builder = null) {
- $this->stream = new HTML5_InputStream($data);
- // Keep a caller-supplied builder instead of discarding it: previously
- // $this->tree was only assigned when no builder was passed, so passing
- // one left the property unset and later calls on $this->tree failed.
- $this->tree = $builder ? $builder : new HTML5_TreeBuilder;
- // Tokenizing always starts in the PCDATA content model.
- $this->content_model = self::PCDATA;
- }
- /**
- * Parses the input as a fragment.
- *
- * Hands $context to the tree builder's setupContext(), adopts the content
- * model the builder exposes afterwards (if any), then runs parse().
- *
- * @param $context Passed through unchanged to the tree builder's
- * setupContext(); defaults to null.
- */
- public function parseFragment($context = null) {
- $builder = $this->tree;
- $builder->setupContext($context);
- // A falsy content_model (including self::PCDATA, which is 0) leaves the
- // tokenizer's current content model untouched. A truthy one is taken
- // over and then cleared on the builder.
- $inherited = $builder->content_model;
- if ($inherited) {
- $this->content_model = $inherited;
- $builder->content_model = null;
- }
- $this->parse();
- }
- // XXX maybe convert this into an iterator? regardless, this function
- // and the save function should go into a Parser facade of some sort
- /**
- * Performs the actual parsing of the document.
- */
- public function parse() {
- // Current state
- $state = 'data';
- // This is used to avoid having to have look-behind in the data state.
- $lastFourChars = '';
- /**
- * Escape flag as specified by the HTML5 specification: "used to
- * control the behavior of the tokeniser. It is either true or
- * false, and initially must be set to the false state."
- */
- $escape = false;
- //echo "\n\n";
- while($state !== null) {
-
- /*echo $state . ' ';
- switch ($this->content_model) {
- case self::PCDATA: echo 'PCDATA'; break;
- case self::RCDATA: echo 'RCDATA'; break;
- case self::CDATA: echo 'CDATA'; break;
- case self::PLAINTEXT: echo 'PLAINTEXT'; break;
- }
- if ($escape) echo " escape";
- echo "\n";*/
-
- switch($state) {
- case 'data':
- /* Consume the next input character */
- $char = $this->stream->char();
- $lastFourChars .= $char;
- if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
- // see below for meaning
- $hyp_cond =
- !$escape &&
- (
- $this->content_model === self::RCDATA ||
- $this->content_model === self::CDATA
- );
- $amp_cond =
- !$escape &&
- (
- $this->content_model === self::PCDATA ||
- $this->content_model === self::RCDATA
- );
- $lt_cond =
- $this->content_model === self::PCDATA ||
- (
- (
- $this->content_model === self::RCDATA ||
- $this->content_model === self::CDATA
- ) &&
- !$escape
- );
- $gt_cond =
- $escape &&
- (
- $this->content_model === self::RCDATA ||
- $this->content_model === self::CDATA
- );
- if($char === '&' && $amp_cond) {
- /* U+0026 AMPERSAND (&)
- When the content model flag is set to one of the PCDATA or RCDATA
- states and the escape flag is false: switch to the
- character reference data state. Otherwise: treat it as per
- the "anything else" entry below. */
- $state = 'characterReferenceData';
- } elseif(
- $char === '-' &&
- $hyp_cond &&
- $lastFourChars === '<!--'
- ) {
- /*
- U+002D HYPHEN-MINUS (-)
- If the content model flag is set to either the RCDATA state or
- the CDATA state, and the escape flag is false, and there are at
- least three characters before this one in the input stream, and the
- last four characters in the input stream, including this one, are
- U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
- and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
- $escape = true;
- /* In any case, emit the input character as a character token. Stay
- in the data state. */
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '-'
- ));
- // We do the "any case" part as part of "anything else".
- /* U+003C LESS-THAN SIGN (<) */
- } elseif($char === '<' && $lt_cond) {
- /* When the content model flag is set to the PCDATA state: switch
- to the tag open state.
- When the content model flag is set to either the RCDATA state or
- the CDATA state and the escape flag is false: switch to the tag
- open state.
- Otherwise: treat it as per the "anything else" entry below. */
- $state = 'tagOpen';
- /* U+003E GREATER-THAN SIGN (>) */
- } elseif(
- $char === '>' &&
- $gt_cond &&
- substr($lastFourChars, 1) === '-->'
- ) {
- /* If the content model flag is set to either the RCDATA state or
- the CDATA state, and the escape flag is true, and the last three
- characters in the input stream including this one are U+002D
- HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
- set the escape flag to false. */
- $escape = false;
- /* In any case, emit the input character as a character token.
- Stay in the data state. */
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '>'
- ));
- // We do the "any case" part as part of "anything else".
- } elseif($char === false) {
- /* EOF
- Emit an end-of-file token. */
- $state = null;
- $this->tree->emitToken(array(
- 'type' => self::EOF
- ));
-
- } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- // Directly after emitting a token you switch back to the "data
- // state". At that point spaceCharacters are important so they are
- // emitted separately.
- $chars = $this->stream->charsWhile(self::WHITESPACE);
- $this->emitToken(array(
- 'type' => self::SPACECHARACTER,
- 'data' => $char . $chars
- ));
- $lastFourChars .= $chars;
- if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
- } else {
- /* Anything else
- THIS IS AN OPTIMIZATION: Get as many character that
- otherwise would also be treated as a character token and emit it
- as a single character token. Stay in the data state. */
-
- $mask = '';
- if ($hyp_cond) $mask .= '-';
- if ($amp_cond) $mask .= '&';
- if ($lt_cond) $mask .= '<';
- if ($gt_cond) $mask .= '>';
- if ($mask === '') {
- $chars = $this->stream->remainingChars();
- } else {
- $chars = $this->stream->charsUntil($mask);
- }
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => $char . $chars
- ));
- $lastFourChars .= $chars;
- if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
- $state = 'data';
- }
- break;
- case 'characterReferenceData':
- /* (This cannot happen if the content model flag
- is set to the CDATA state.) */
- /* Attempt to consume a character reference, with no
- additional allowed character. */
- $entity = $this->consumeCharacterReference();
- /* If nothing is returned, emit a U+0026 AMPERSAND
- character token. Otherwise, emit the character token that
- was returned. */
- // This is all done when consuming the character reference.
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => $entity
- ));
- /* Finally, switch to the data state. */
- $state = 'data';
- break;
- case 'tagOpen':
- $char = $this->stream->char();
- switch($this->content_model) {
- case self::RCDATA:
- case self::CDATA:
- /* Consume the next input character. If it is a
- U+002F SOLIDUS (/) character, switch to the close
- tag open state. Otherwise, emit a U+003C LESS-THAN
- SIGN character token and reconsume the current input
- character in the data state. */
- // We consumed above.
- if($char === '/') {
- $state = 'closeTagOpen';
- } else {
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '<'
- ));
- $this->stream->unget();
- $state = 'data';
- }
- break;
- case self::PCDATA:
- /* If the content model flag is set to the PCDATA state
- Consume the next input character: */
- // We consumed above.
- if($char === '!') {
- /* U+0021 EXCLAMATION MARK (!)
- Switch to the markup declaration open state. */
- $state = 'markupDeclarationOpen';
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the close tag open state. */
- $state = 'closeTagOpen';
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
- Create a new start tag token, set its tag name to the lowercase
- version of the input character (add 0x0020 to the character's code
- point), then switch to the tag name state. (Don't emit the token
- yet; further details will be filled in before it is emitted.) */
- $this->token = array(
- 'name' => strtolower($char),
- 'type' => self::STARTTAG,
- 'attr' => array()
- );
- $state = 'tagName';
- } elseif('a' <= $char && $char <= 'z') {
- /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
- Create a new start tag token, set its tag name to the input
- character, then switch to the tag name state. (Don't emit
- the token yet; further details will be filled in before it
- is emitted.) */
- $this->token = array(
- 'name' => $char,
- 'type' => self::STARTTAG,
- 'attr' => array()
- );
- $state = 'tagName';
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Emit a U+003C LESS-THAN SIGN character token and a
- U+003E GREATER-THAN SIGN character token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-tag-name-but-got-right-bracket'
- ));
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '<>'
- ));
- $state = 'data';
- } elseif($char === '?') {
- /* U+003F QUESTION MARK (?)
- Parse error. Switch to the bogus comment state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-tag-name-but-got-question-mark'
- ));
- $this->token = array(
- 'data' => '?',
- 'type' => self::COMMENT
- );
- $state = 'bogusComment';
- } else {
- /* Anything else
- Parse error. Emit a U+003C LESS-THAN SIGN character token and
- reconsume the current input character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-tag-name'
- ));
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '<'
- ));
- $state = 'data';
- $this->stream->unget();
- }
- break;
- }
- break;
- case 'closeTagOpen':
- if (
- $this->content_model === self::RCDATA ||
- $this->content_model === self::CDATA
- ) {
- /* If the content model flag is set to the RCDATA or CDATA
- states... */
- $name = strtolower($this->stream->charsWhile(self::ALPHA));
- $following = $this->stream->char();
- $this->stream->unget();
- if (
- !$this->token ||
- $this->token['name'] !== $name ||
- $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
- ) {
- /* if no start tag token has ever been emitted by this instance
- of the tokenizer (fragment case), or, if the next few
- characters do not match the tag name of the last start tag
- token emitted (compared in an ASCII case-insensitive manner),
- or if they do but they are not immediately followed by one of
- the following characters:
- * U+0009 CHARACTER TABULATION
- * U+000A LINE FEED (LF)
- * U+000C FORM FEED (FF)
- * U+0020 SPACE
- * U+003E GREATER-THAN SIGN (>)
- * U+002F SOLIDUS (/)
- * EOF
- ...then emit a U+003C LESS-THAN SIGN character token, a
- U+002F SOLIDUS character token, and switch to the data
- state to process the next input character. */
- // XXX: Probably ought to replace in_array with $following === x ||...
- // We also need to emit $name now we've consumed that, as we
- // know it'll just be emitted as a character token.
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '</' . $name
- ));
- $state = 'data';
- } else {
- // This matches what would happen if we actually did the
- // otherwise below (but we can't because we've consumed too
- // much).
- // Start the end tag token with the name we already have.
- $this->token = array(
- 'name' => $name,
- 'type' => self::ENDTAG
- );
- // Change to tag name state.
- $state = 'tagName';
- }
- } elseif ($this->content_model === self::PCDATA) {
- /* Otherwise, if the content model flag is set to the PCDATA
- state [...]: */
- $char = $this->stream->char();
- if ('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
- Create a new end tag token, set its tag name to the lowercase version
- of the input character (add 0x0020 to the character's code point), then
- switch to the tag name state. (Don't emit the token yet; further details
- will be filled in before it is emitted.) */
- $this->token = array(
- 'name' => strtolower($char),
- 'type' => self::ENDTAG
- );
- $state = 'tagName';
- } elseif ('a' <= $char && $char <= 'z') {
- /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
- Create a new end tag token, set its tag name to the
- input character, then switch to the tag name state.
- (Don't emit the token yet; further details will be
- filled in before it is emitted.) */
- $this->token = array(
- 'name' => $char,
- 'type' => self::ENDTAG
- );
- $state = 'tagName';
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-closing-tag-but-got-right-bracket'
- ));
- $state = 'data';
- } elseif($char === false) {
- /* EOF
- Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
- SOLIDUS character token. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-closing-tag-but-got-eof'
- ));
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '</'
- ));
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Parse error. Switch to the bogus comment state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-closing-tag-but-got-char'
- ));
- $this->token = array(
- 'data' => $char,
- 'type' => self::COMMENT
- );
- $state = 'bogusComment';
- }
- }
- break;
- case 'tagName':
- /* Consume the next input character: */
- $char = $this->stream->char();
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Switch to the before attribute name state. */
- $state = 'beforeAttributeName';
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the self-closing start tag state. */
- $state = 'selfClosingStartTag';
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Append the lowercase version of the current input
- character (add 0x0020 to the character's code point) to
- the current tag token's tag name. Stay in the tag name state. */
- $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
- $this->token['name'] .= strtolower($char . $chars);
- $state = 'tagName';
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the current tag token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-tag-name'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Append the current input character to the current tag token's tag name.
- Stay in the tag name state. */
- $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
- $this->token['name'] .= $char . $chars;
- $state = 'tagName';
- }
- break;
- case 'beforeAttributeName':
- /* Consume the next input character: */
- $char = $this->stream->char();
- // this conditional is optimized, check bottom
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the before attribute name state. */
- $state = 'beforeAttributeName';
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the self-closing start tag state. */
- $state = 'selfClosingStartTag';
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Start a new attribute in the current tag token. Set that
- attribute's name to the lowercase version of the current
- input character (add 0x0020 to the character's code
- point), and its value to the empty string. Switch to the
- attribute name state.*/
- $this->token['attr'][] = array(
- 'name' => strtolower($char),
- 'value' => ''
- );
- $state = 'attributeName';
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the current tag token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-attribute-name-but-got-eof'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* U+0022 QUOTATION MARK (")
- U+0027 APOSTROPHE (')
- U+003D EQUALS SIGN (=)
- Parse error. Treat it as per the "anything else" entry
- below. */
- if($char === '"' || $char === "'" || $char === '=') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'invalid-character-in-attribute-name'
- ));
- }
- /* Anything else
- Start a new attribute in the current tag token. Set that attribute's
- name to the current input character, and its value to the empty string.
- Switch to the attribute name state. */
- $this->token['attr'][] = array(
- 'name' => $char,
- 'value' => ''
- );
- $state = 'attributeName';
- }
- break;
- case 'attributeName':
- // Consume the next input character:
- $char = $this->stream->char();
- // this conditional is optimized, check bottom
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Switch to the after attribute name state. */
- $state = 'afterAttributeName';
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the self-closing start tag state. */
- $state = 'selfClosingStartTag';
- } elseif($char === '=') {
- /* U+003D EQUALS SIGN (=)
- Switch to the before attribute value state. */
- $state = 'beforeAttributeValue';
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Append the lowercase version of the current input
- character (add 0x0020 to the character's code point) to
- the current attribute's name. Stay in the attribute name
- state. */
- $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
- $state = 'attributeName';
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the current tag token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-attribute-name'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* U+0022 QUOTATION MARK (")
- U+0027 APOSTROPHE (')
- Parse error. Treat it as per the "anything else"
- entry below. */
- if($char === '"' || $char === "'") {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'invalid-character-in-attribute-name'
- ));
- }
- /* Anything else
- Append the current input character to the current attribute's name.
- Stay in the attribute name state. */
- $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['name'] .= $char . $chars;
- $state = 'attributeName';
- }
- /* When the user agent leaves the attribute name state
- (and before emitting the tag token, if appropriate), the
- complete attribute's name must be compared to the other
- attributes on the same token; if there is already an
- attribute on the token with the exact same name, then this
- is a parse error and the new attribute must be dropped, along
- with the value that gets associated with it (if any). */
- // this might be implemented in the emitToken method
- break;
- case 'afterAttributeName':
- // Consume the next input character:
- $char = $this->stream->char();
- // this is an optimized conditional, check the bottom
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the after attribute name state. */
- $state = 'afterAttributeName';
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the self-closing start tag state. */
- $state = 'selfClosingStartTag';
- } elseif($char === '=') {
- /* U+003D EQUALS SIGN (=)
- Switch to the before attribute value state. */
- $state = 'beforeAttributeValue';
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Start a new attribute in the current tag token. Set that
- attribute's name to the lowercase version of the current
- input character (add 0x0020 to the character's code
- point), and its value to the empty string. Switch to the
- attribute name state. */
- $this->token['attr'][] = array(
- 'name' => strtolower($char),
- 'value' => ''
- );
- $state = 'attributeName';
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the current tag token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-end-of-tag-but-got-eof'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* U+0022 QUOTATION MARK (")
- U+0027 APOSTROPHE (')
- Parse error. Treat it as per the "anything else"
- entry below. */
- if($char === '"' || $char === "'") {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'invalid-character-after-attribute-name'
- ));
- }
- /* Anything else
- Start a new attribute in the current tag token. Set that attribute's
- name to the current input character, and its value to the empty string.
- Switch to the attribute name state. */
- $this->token['attr'][] = array(
- 'name' => $char,
- 'value' => ''
- );
- $state = 'attributeName';
- }
- break;
- case 'beforeAttributeValue':
- // Consume the next input character:
- $char = $this->stream->char();
- // this is an optimized conditional
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the before attribute value state. */
- $state = 'beforeAttributeValue';
- } elseif($char === '"') {
- /* U+0022 QUOTATION MARK (")
- Switch to the attribute value (double-quoted) state. */
- $state = 'attributeValueDoubleQuoted';
- } elseif($char === '&') {
- /* U+0026 AMPERSAND (&)
- Switch to the attribute value (unquoted) state and reconsume
- this input character. */
- $this->stream->unget();
- $state = 'attributeValueUnquoted';
- } elseif($char === '\'') {
- /* U+0027 APOSTROPHE (')
- Switch to the attribute value (single-quoted) state. */
- $state = 'attributeValueSingleQuoted';
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Emit the current tag token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-attribute-value-but-got-right-bracket'
- ));
- $this->emitToken($this->token);
- $state = 'data';
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the current tag token. Reconsume
- the character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-attribute-value-but-got-eof'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* U+003D EQUALS SIGN (=)
- Parse error. Treat it as per the "anything else" entry below. */
- if($char === '=') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'equals-in-unquoted-attribute-value'
- ));
- }
- /* Anything else
- Append the current input character to the current attribute's value.
- Switch to the attribute value (unquoted) state. */
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['value'] .= $char;
- $state = 'attributeValueUnquoted';
- }
- break;
- case 'attributeValueDoubleQuoted':
- // Consume the next input character:
- $char = $this->stream->char();
- if($char === '"') {
- /* U+0022 QUOTATION MARK (")
- Switch to the after attribute value (quoted) state. */
- $state = 'afterAttributeValueQuoted';
- } elseif($char === '&') {
- /* U+0026 AMPERSAND (&)
- Switch to the character reference in attribute value
- state, with the additional allowed character
- being U+0022 QUOTATION MARK ("). */
- $this->characterReferenceInAttributeValue('"');
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the current tag token. Reconsume the character
- in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-attribute-value-double-quote'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Append the current input character to the current attribute's value.
- Stay in the attribute value (double-quoted) state. */
- $chars = $this->stream->charsUntil('"&');
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['value'] .= $char . $chars;
- $state = 'attributeValueDoubleQuoted';
- }
- break;
- case 'attributeValueSingleQuoted':
- // Consume the next input character:
- $char = $this->stream->char();
- if($char === "'") {
- /* U+0022 QUOTATION MARK (')
- Switch to the after attribute value state. */
- $state = 'afterAttributeValueQuoted';
- } elseif($char === '&') {
- /* U+0026 AMPERSAND (&)
- Switch to the entity in attribute value state. */
- $this->characterReferenceInAttributeValue("'");
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the current tag token. Reconsume the character
- in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-attribute-value-single-quote'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Append the current input character to the current attribute's value.
- Stay in the attribute value (single-quoted) state. */
- $chars = $this->stream->charsUntil("'&");
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['value'] .= $char . $chars;
- $state = 'attributeValueSingleQuoted';
- }
- break;
- case 'attributeValueUnquoted':
- // Consume the next input character:
- $char = $this->stream->char();
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Switch to the before attribute name state. */
- $state = 'beforeAttributeName';
- } elseif($char === '&') {
- /* U+0026 AMPERSAND (&)
- Switch to the entity in attribute value state. */
- $this->characterReferenceInAttributeValue();
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* EOF
- Parse error. Emit the current tag token. Reconsume
- the character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-attribute-value-no-quotes'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* U+0022 QUOTATION MARK (")
- U+0027 APOSTROPHE (')
- U+003D EQUALS SIGN (=)
- Parse error. Treat it as per the "anything else"
- entry below. */
- if($char === '"' || $char === "'" || $char === '=') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-character-in-unquoted-attribute-value'
- ));
- }
- /* Anything else
- Append the current input character to the current attribute's value.
- Stay in the attribute value (unquoted) state. */
- $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['value'] .= $char . $chars;
- $state = 'attributeValueUnquoted';
- }
- break;
- case 'afterAttributeValueQuoted':
- /* Consume the next input character: */
- $char = $this->stream->char();
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- …
Large files are truncated, but you can click here to view the full file