/system/classes/htmltokenizer.php
PHP | 351 lines | 265 code | 48 blank | 38 comment | 50 complexity | 766e12edbdbb9a8385c77c729ec5757c MD5 | raw file
Possible License(s): Apache-2.0
- <?php
/**
 * @package Habari
 *
 */
/**
 * Tokenizer for HTML.
 * For use by HTMLParser.
 *
 * Walks the input string with a small state machine and appends one token
 * per text run, tag, comment, CDATA section, statement (<!...>) or
 * processing instruction (<?...>) to an HTMLTokenSet. Every token is an
 * array with the keys 'type', 'name', 'value' and 'attrs'.
 */
class HTMLTokenizer
{
    /* Token types, stored under the 'type' key of every token */
    const NODE_TYPE_TEXT = 1;
    const NODE_TYPE_ELEMENT_OPEN = 2;
    const NODE_TYPE_ELEMENT_CLOSE = 3;
    const NODE_TYPE_PI = 4;
    const NODE_TYPE_COMMENT = 5;
    const NODE_TYPE_CDATA_SECTION = 6;
    const NODE_TYPE_STATEMENT = 7;
    const NODE_TYPE_ELEMENT_EMPTY = 8;

    /* States of the Machine ;p */
    private static $STATE_FINISHED = -1;
    private static $STATE_START = 0;
    private static $STATE_TAG = 1;
    private static $STATE_ELEMENT_OPEN = 2;
    private static $STATE_ELEMENT_CLOSE = 3;
    private static $STATE_STATEMENT = 4;
    private static $STATE_PI = 5;

    /* Character Ranges */
    private static $CHR_TAG_BEGIN = '<';
    private static $CHR_TAG_END = '>';
    private static $CHR_TAG_END_TRIM = '/';
    private static $CHR_ATTRNAME_END = '=';
    private static $CHR_WHITESPACE = " \t\r\n"; // SP, TAB, CR, LF

    /** @var string the markup being tokenized */
    private $html;
    /** @var int byte offset of the next unread character, always within 0..$len */
    private $pos;
    /** @var int byte length of $html */
    private $len;
    /** @var int current state, one of the self::$STATE_* values */
    private $state;
    /** @var HTMLTokenSet tokens collected so far */
    private $nodes;

    /* Elements that are always emitted as NODE_TYPE_ELEMENT_EMPTY (lower case) */
    private static $empty_elements = array( 'img', 'br', 'hr', 'input', 'area', 'base', 'col', 'link', 'meta', 'param', 'command', 'keygen', 'source' );

    /**
     * Prepare a tokenizer for one piece of markup.
     *
     * @param string $html the markup to tokenize
     * @param boolean $escape passed through unchanged to the HTMLTokenSet constructor
     */
    public function __construct( $html, $escape = true )
    {
        $this->html = (string) $html;
        $this->len = strlen( $this->html );
        $this->pos = 0;
        $this->nodes = new HTMLTokenSet( $escape );
        $this->state = self::$STATE_START;
    }

    /**
     * Run the state machine until the input is exhausted.
     *
     * The loop stops as soon as no input is left, whatever the current
     * state is, so a tag that is cut off before any of its content (for
     * example a trailing '<') produces no token.
     *
     * @return HTMLTokenSet the collected tokens
     */
    public function parse()
    {
        while ( $this->has_more() && $this->state != self::$STATE_FINISHED ) {
            switch ( $this->state ) {
                case self::$STATE_START:
                    $this->state = $this->parse_start();
                    break;
                case self::$STATE_TAG:
                    $this->state = $this->parse_tag();
                    break;
                case self::$STATE_ELEMENT_OPEN:
                    $this->state = $this->parse_element_open();
                    break;
                case self::$STATE_ELEMENT_CLOSE:
                    $this->state = $this->parse_element_close();
                    break;
                case self::$STATE_STATEMENT:
                    $this->state = $this->parse_statement();
                    break;
                case self::$STATE_PI:
                    $this->state = $this->parse_pi();
                    break;
                default:
                    // NOTE(review): "Error" is presumably the project's own error
                    // helper; PHP 7+ ships a built-in \Error class with the same
                    // name -- confirm which one is resolved here.
                    Error::raise( sprintf( _t( 'Invalid state %d in %s->parse()' ), $this->state, __CLASS__ ) );
                    $this->state = self::$STATE_FINISHED;
                    break;
            }
        }
        return $this->nodes;
    }

    /**
     * @return boolean true while unread input remains
     */
    public function has_more()
    {
        return ( $this->pos < $this->len );
    }

    /**
     * Append one token to the token set.
     *
     * @param int $type one of the NODE_TYPE_* constants
     * @param string $name tag name, PI target, or a '#...' pseudo name
     * @param string|null $value text content, if the token carries any
     * @param array|null $attrs attribute name => value map, if any
     */
    private function node( $type, $name, $value, $attrs )
    {
        $this->nodes[] = array(
            'type' => $type,
            'name' => $name,
            'value' => $value,
            'attrs' => $attrs,
        );
    }

    /**
     * Step the read position back, never before the start of the input.
     *
     * @param int $n number of bytes to step back
     */
    private function dec( $n = 1 )
    {
        $this->pos = max( 0, $this->pos - $n );
    }

    /**
     * Advance the read position, never beyond the end of the input.
     *
     * Clamping keeps later strpos()/strcspn() calls within the string;
     * an offset past the end makes strpos() throw a ValueError on PHP 8.
     *
     * @param int $n number of bytes to skip
     */
    private function inc( $n = 1 )
    {
        $this->pos = min( $this->len, $this->pos + $n );
    }

    /**
     * Read the current character and advance past it.
     *
     * @return string|null the character, or null (position unchanged) at end of input
     */
    private function get()
    {
        if ( $this->has_more() ) {
            return $this->html[ $this->pos++ ];
        }
        return null;
    }

    /**
     * Look at the current character without consuming it.
     *
     * @return string|null the character, or null at end of input
     */
    private function peek()
    {
        return $this->has_more() ? $this->html[ $this->pos ] : null;
    }

    /**
     * Consume input up to, but not including, the next occurrence of $str.
     * When $str does not occur, the rest of the input is consumed.
     *
     * @param string $str the delimiter to stop in front of
     * @return string the consumed segment (may be empty)
     */
    private function up_to_str( $str )
    {
        $start = $this->pos;
        $found = $this->has_more() ? strpos( $this->html, $str, $start ) : false;
        $this->pos = ( $found === false ) ? $this->len : $found;
        return (string) substr( $this->html, $start, $this->pos - $start );
    }

    /**
     * Consume input up to, but not including, the first character that is
     * contained in $chr. When none occurs, the rest of the input is consumed.
     *
     * @param string $chr the set of stop characters
     * @return string the consumed segment (may be empty)
     */
    private function up_to_chr( $chr )
    {
        $start = $this->pos;
        $seg_len = $this->has_more() ? strcspn( $this->html, $chr, $start ) : 0;
        $this->pos += $seg_len;
        return (string) substr( $this->html, $start, $seg_len );
    }

    /**
     * Skip over any run of SP, TAB, CR and LF at the current position.
     */
    private function skip_whitespace()
    {
        if ( $this->has_more() ) {
            $this->pos += strspn( $this->html, self::$CHR_WHITESPACE, $this->pos );
        }
    }

    /**
     * START state: emit the text in front of the next '<' (if there is any)
     * and step over that '<'.
     *
     * @return int the next state
     */
    private function parse_start()
    {
        $data = $this->up_to_str( self::$CHR_TAG_BEGIN );
        $this->inc();
        if ( $data !== '' ) {
            $this->node( self::NODE_TYPE_TEXT, '#text', $data, null );
        }
        return self::$STATE_TAG;
    }

    /**
     * Read the attributes of an opening tag. On return the position is at
     * the character that ended the attribute list ('>' or a stray '='), or
     * at the end of the input.
     *
     * Names are lower-cased and stripped of trailing '/', which is how the
     * slash of a collapsed tag (<br />) is swallowed. Attributes without a
     * value are stored with a null value.
     *
     * @return array attribute name => value (string, or null if valueless)
     */
    private function parse_attributes()
    {
        $attr = array();
        $name_end = self::$CHR_ATTRNAME_END . self::$CHR_TAG_END . self::$CHR_WHITESPACE;

        $this->skip_whitespace();
        // Compare against '' explicitly: a name such as "0" is falsy in PHP
        // and must not end the loop.
        while ( ( $name = $this->up_to_chr( $name_end ) ) !== '' ) {
            $name = strtolower( rtrim( $name, self::$CHR_TAG_END_TRIM ) );
            $this->skip_whitespace();
            // first non-ws char after the name
            $char = $this->get();
            if ( $char === self::$CHR_ATTRNAME_END ) {
                // attribute value follows
                $this->skip_whitespace();
                $char = $this->get();
                if ( $char === '"' || $char === '\'' ) {
                    // quoted: read up to the matching quote and step over it
                    $value = $this->up_to_str( $char );
                    $this->inc();
                }
                else {
                    // unquoted: push the character back, but only if one was
                    // actually read -- at end of input get() does not advance
                    if ( $char !== null ) {
                        $this->dec();
                    }
                    $value = $this->up_to_chr( self::$CHR_WHITESPACE . self::$CHR_TAG_END );
                }
            }
            else {
                // TODO HTMLParser should handle #IMPLIED attrs
                $value = null;
                if ( $char !== null ) {
                    $this->dec();
                }
            }
            // a lone '/' trims down to an empty name and is dropped here
            if ( $name !== '' ) {
                $attr[$name] = $value;
            }
            $this->skip_whitespace();
        }
        return $attr;
    }

    /**
     * TAG state: inspect the character after '<' to pick the kind of tag.
     *
     * @return int the next state
     */
    private function parse_tag()
    {
        switch ( $this->get() ) {
            case '!':
                return self::$STATE_STATEMENT;
            case '?':
                // mmmh, PI
                return self::$STATE_PI;
            case '/':
                return self::$STATE_ELEMENT_CLOSE;
            default:
                // we just ate the first char of the tagName, oops
                $this->dec();
                return self::$STATE_ELEMENT_OPEN;
        }
    }

    /**
     * ELEMENT_OPEN state: read tag name and attributes and emit either an
     * ELEMENT_EMPTY token (name is in the void element list, compared
     * case-insensitively) or an ELEMENT_OPEN token. The tag name itself is
     * emitted as written. An empty tag name emits nothing and consumes
     * nothing.
     *
     * NOTE(review): the slash of a collapsed tag such as <div/> is consumed
     * while reading the name/attributes, so it never influences the token
     * type; only the void element list does. Confirm this is intended.
     *
     * @return int the next state
     */
    private function parse_element_open()
    {
        $tag = rtrim( $this->up_to_chr( self::$CHR_TAG_END . self::$CHR_WHITESPACE ), self::$CHR_TAG_END_TRIM );
        if ( $tag !== '' ) {
            $attr = $this->parse_attributes();
            // step over the character that ended the attribute list (normally '>')
            $this->get();
            $type = in_array( strtolower( $tag ), self::$empty_elements, true )
                ? self::NODE_TYPE_ELEMENT_EMPTY
                : self::NODE_TYPE_ELEMENT_OPEN;
            $this->node( $type, $tag, null, $attr );
        }
        return self::$STATE_START;
    }

    /**
     * ELEMENT_CLOSE state: read everything up to '>' as the tag name,
     * without trailing whitespace, and emit an ELEMENT_CLOSE token.
     *
     * @return int the next state
     */
    private function parse_element_close()
    {
        $raw = $this->up_to_chr( self::$CHR_TAG_END );
        if ( $raw !== '' ) {
            // step over '>'
            $this->get();
            // "</p >" must name the element "p", not "p "
            $tag = rtrim( $raw, self::$CHR_WHITESPACE );
            if ( $tag !== '' ) {
                $this->node( self::NODE_TYPE_ELEMENT_CLOSE, $tag, null, null );
            }
        }
        return self::$STATE_START;
    }

    /**
     * STATEMENT state: everything starting with '<!' -- a CDATA section, a
     * comment, or any other statement such as a doctype. A token is only
     * emitted when it carries non-empty data.
     *
     * @return int the next state
     */
    private function parse_statement()
    {
        $nodeName = '#statement';
        $nodeType = self::NODE_TYPE_STATEMENT;
        $data = '';
        $char = $this->get();

        if ( $char === '[' && strtoupper( (string) substr( $this->html, $this->pos, 6 ) ) === 'CDATA[' ) {
            // CDATA
            // <http://www.w3.org/TR/DOM-Level-2-Core/core.html>
            // Only taken when "CDATA[" really follows, so that other "<!["
            // constructs (e.g. "<![if !IE]>") are not swallowed as CDATA.
            $nodeName = '#cdata-section';
            $nodeType = self::NODE_TYPE_CDATA_SECTION;
            $this->inc( 6 ); // strlen( 'CDATA[' )
            $data = $this->up_to_str( ']]>' );
            $this->inc( 2 ); // strlen( ']]' )
        }
        elseif ( $char === '-' && $this->peek() === '-' ) {
            // comment
            $nodeName = '#comment';
            $nodeType = self::NODE_TYPE_COMMENT;
            // skip peeked -
            $this->inc();
            // consume text; trimming is left to the consumer
            $data = $this->up_to_str( '-->' );
            // skip over final --
            $this->inc( 2 );
        }
        else {
            // some other kind of statement: the character belongs to its name
            if ( $char !== null ) {
                $this->dec();
            }
            $nodeName = $this->up_to_chr( self::$CHR_TAG_END . self::$CHR_TAG_END_TRIM . self::$CHR_WHITESPACE );
            if ( $this->peek() !== '>' ) {
                // there be data or something
                $this->skip_whitespace();
                $data .= $this->up_to_chr( '[>' );
                if ( $this->peek() === '[' ) {
                    // internal subset: '[' ... ']' is copied verbatim
                    $data .= $this->get() . $this->up_to_str( ']' ) . $this->get();
                }
            }
            $data .= $this->up_to_str( '>' );
            // not like anyone uses them, eh?
        }

        // skip over final '>'
        $this->inc();
        if ( $data !== '' ) {
            $this->node( $nodeType, $nodeName, $data, null );
        }
        return self::$STATE_START;
    }

    /**
     * PI state: the text up to the first whitespace or '>' is the target,
     * the remainder up to the first '>' is the data (it is not trimmed, so
     * it keeps leading whitespace and a trailing '?').
     *
     * @return int the next state
     */
    private function parse_pi()
    {
        $target = $this->up_to_chr( self::$CHR_TAG_END . self::$CHR_WHITESPACE );
        $data = $this->up_to_chr( self::$CHR_TAG_END );
        // skip over closing '>'
        $this->inc( 1 );
        $this->node( self::NODE_TYPE_PI, $target, $data, array() );
        return self::$STATE_START;
    }
}