PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/system/classes/htmltokenizer.php

https://github.com/HabariMag/habarimag-old
PHP | 351 lines | 265 code | 48 blank | 38 comment | 50 complexity | 766e12edbdbb9a8385c77c729ec5757c MD5 | raw file
Possible License(s): Apache-2.0
  1. <?php
  2. /**
  3. * @package Habari
  4. *
  5. */
  /**
   * Tokenizer for HTML.
   * For use by HTMLParser.
   *
   * Walks the input string with a small state machine and appends one node per
   * token to an HTMLTokenSet. Each node is an array with the keys 'type' (one of
   * the NODE_TYPE_* constants), 'name', 'value' and 'attrs'.
   */
  class HTMLTokenizer
  {
      const NODE_TYPE_TEXT = 1;
      const NODE_TYPE_ELEMENT_OPEN = 2;
      const NODE_TYPE_ELEMENT_CLOSE = 3;
      const NODE_TYPE_PI = 4;
      const NODE_TYPE_COMMENT = 5;
      const NODE_TYPE_CDATA_SECTION = 6;
      const NODE_TYPE_STATEMENT = 7;
      const NODE_TYPE_ELEMENT_EMPTY = 8;

      /* States of the machine */
      private static $STATE_FINISHED = -1;
      private static $STATE_START = 0;
      private static $STATE_TAG = 1;
      private static $STATE_ELEMENT_OPEN = 2;
      private static $STATE_ELEMENT_CLOSE = 3;
      private static $STATE_STATEMENT = 4;
      private static $STATE_PI = 5;

      /* Character ranges */
      private static $CHR_TAG_BEGIN = '<';
      private static $CHR_TAG_END = '>';
      private static $CHR_TAG_END_TRIM = '/';
      private static $CHR_ATTRNAME_END = '=';
      private static $CHR_WHITESPACE = " \t\r\n"; // SP, TAB, CR, LF

      /* Input buffer, read cursor (byte offset), input length, current state, output token set */
      private $html;
      private $pos;
      private $len;
      private $state;
      private $nodes;

      /* Elements that are always emitted as NODE_TYPE_ELEMENT_EMPTY (lowercase names) */
      private static $empty_elements = array( 'img', 'br', 'hr', 'input', 'area', 'base', 'col', 'link', 'meta', 'param', 'command', 'keygen', 'source' );

      /**
       * Prepare a tokenizer for the given markup.
       *
       * @param string $html The markup to tokenize
       * @param boolean $escape Passed through unchanged to the HTMLTokenSet constructor
       */
      public function __construct( $html, $escape = true )
      {
          $this->html = $html;
          $this->len = strlen( $html );
          $this->pos = 0;
          $this->nodes = new HTMLTokenSet( $escape );
          $this->state = self::$STATE_START;
      }

      /**
       * Run the state machine until the input is exhausted or the machine
       * reaches the finished state.
       *
       * @return HTMLTokenSet The collected nodes
       */
      public function parse()
      {
          while ( $this->has_more() && $this->state != self::$STATE_FINISHED ) {
              switch ( $this->state ) {
                  case self::$STATE_START:
                      $this->state = $this->parse_start();
                      break;
                  case self::$STATE_TAG:
                      $this->state = $this->parse_tag();
                      break;
                  case self::$STATE_ELEMENT_OPEN:
                      $this->state = $this->parse_element_open();
                      break;
                  case self::$STATE_ELEMENT_CLOSE:
                      $this->state = $this->parse_element_close();
                      break;
                  case self::$STATE_STATEMENT:
                      $this->state = $this->parse_statement();
                      break;
                  case self::$STATE_PI:
                      $this->state = $this->parse_pi();
                      break;
                  default:
                      Error::raise( sprintf( _t( 'Invalid state %d in %s->parse()' ), $this->state, __CLASS__ ) );
                      $this->state = self::$STATE_FINISHED;
                      break;
              }
          }
          return $this->nodes;
      }

      /**
       * Whether the cursor is still inside the input.
       *
       * @return boolean true while there is at least one unread byte
       */
      public function has_more()
      {
          return ( $this->pos < $this->len );
      }

      /**
       * Append a node to the token set.
       *
       * @param integer $type One of the NODE_TYPE_* constants
       * @param string $name Node name (tag name, PI target, or a '#...' pseudo name)
       * @param string|null $value Node data, if any
       * @param array|null $attrs Attribute name => value map, if any
       */
      private function node( $type, $name, $value, $attrs )
      {
          $this->nodes[] = array(
              'type' => $type,
              'name' => $name,
              'value' => $value,
              'attrs' => $attrs,
          );
      }

      /**
       * Move the cursor back by $n bytes.
       */
      private function dec( $n = 1 )
      {
          $this->pos -= $n;
      }

      /**
       * Move the cursor forward by $n bytes.
       */
      private function inc( $n = 1 )
      {
          $this->pos += $n;
      }

      /**
       * Read the byte under the cursor and advance past it.
       *
       * @return string|null The byte read, or null (cursor untouched) at end of input
       */
      private function get()
      {
          if ( $this->has_more() ) {
              // Square brackets: the curly-brace offset syntax no longer parses on PHP 8.
              return $this->html[ $this->pos++ ];
          }
          return null;
      }

      /**
       * Look at the byte under the cursor without consuming it.
       *
       * @return string|null The byte, or null when the cursor is at or past the end
       */
      private function peek()
      {
          // Callers peek right after skipping ahead, so the cursor may already be
          // past the end on truncated input; never index outside the buffer.
          return $this->has_more() ? $this->html[ $this->pos ] : null;
      }

      /**
       * Consume input up to (not including) the next occurrence of $str.
       * If $str does not occur, everything up to the end of input is consumed.
       *
       * @param string $str The delimiter to stop in front of
       * @return string The consumed text
       */
      private function up_to_str( $str )
      {
          $start = $this->pos;
          if ( $start >= $this->len ) {
              // Nothing left to scan. An offset beyond the end would make strpos()
              // raise (a warning on older PHP, a ValueError on PHP 8).
              $this->pos = $this->len;
              return '';
          }
          $found = strpos( $this->html, $str, $start );
          $this->pos = ( $found === false ) ? $this->len : $found;
          return substr( $this->html, $start, $this->pos - $start );
      }

      /**
       * Consume input up to (not including) the first byte contained in $chr.
       *
       * @param string $chr The set of bytes to stop in front of
       * @return string The consumed text, '' if the cursor already sits on a stop byte
       */
      private function up_to_chr( $chr )
      {
          if ( !$this->has_more() ) {
              return '';
          }
          $start = $this->pos;
          $seg_len = strcspn( $this->html, $chr, $start );
          $this->pos += $seg_len;
          return substr( $this->html, $start, $seg_len );
      }

      /**
       * Advance the cursor over any run of SP, TAB, CR and LF.
       */
      private function skip_whitespace()
      {
          if ( $this->has_more() ) {
              $this->pos += strspn( $this->html, self::$CHR_WHITESPACE, $this->pos );
          }
      }

      /**
       * START state: emit the text in front of the next '<' as a text node and
       * step over that '<'.
       *
       * @return integer The next state (always the TAG state)
       */
      private function parse_start()
      {
          $data = $this->up_to_str( self::$CHR_TAG_BEGIN );
          $this->inc();
          if ( $data != '' ) {
              $this->node( self::NODE_TYPE_TEXT, '#text', $data, null );
          }
          return self::$STATE_TAG;
      }

      /**
       * Read the attribute list of an opening tag, leaving the cursor on the
       * byte that ended the list (normally '>').
       *
       * Names are lowercased and stripped of trailing '/'. Values may be
       * double-quoted, single-quoted or bare; attributes without '=' get null.
       *
       * @return array Attribute name => value (or null)
       */
      private function parse_attributes()
      {
          $attr = array();
          $name_stops = self::$CHR_ATTRNAME_END . self::$CHR_TAG_END . self::$CHR_WHITESPACE;

          $this->skip_whitespace();
          // Compare against '' instead of relying on truthiness, so that a name
          // such as "0" does not end the loop and spill the rest of the tag as text.
          while ( ( $raw = (string) $this->up_to_chr( $name_stops ) ) !== '' ) {
              $name = strtolower( rtrim( $raw, self::$CHR_TAG_END_TRIM ) );
              $value = null;

              $this->skip_whitespace();
              $char = $this->get();
              if ( $char === self::$CHR_ATTRNAME_END ) {
                  // attribute value follows
                  $this->skip_whitespace();
                  $quote = $this->get();
                  if ( $quote === '"' || $quote === '\'' ) {
                      // quoted: read up to the matching quote and step over it
                      $value = $this->up_to_str( $quote );
                      $this->inc();
                  }
                  else {
                      // unquoted: the byte just read belongs to the value. Only
                      // rewind if a byte was really consumed (get() yields null
                      // without moving at end of input).
                      if ( $quote !== null ) {
                          $this->dec();
                      }
                      $value = $this->up_to_chr( self::$CHR_WHITESPACE . '>' );
                  }
              }
              elseif ( $char !== null ) {
                  // TODO HTMLParser should handle #IMPLIED attrs
                  // valueless attribute: put back the byte that followed the name
                  $this->dec();
              }

              // a lone '/' trims down to an empty name and is not stored
              if ( $name !== '' ) {
                  $attr[$name] = $value;
              }
              $this->skip_whitespace();
          }
          return $attr;
      }

      /**
       * TAG state: decide from the byte after '<' what kind of tag follows.
       *
       * @return integer The next state
       */
      private function parse_tag()
      {
          switch ( $this->get() ) {
              case '!':
                  return self::$STATE_STATEMENT;
              case '?':
                  return self::$STATE_PI;
              case '/':
                  return self::$STATE_ELEMENT_CLOSE;
              default:
                  // that byte was the first character of the tag name; put it back
                  $this->dec();
                  return self::$STATE_ELEMENT_OPEN;
          }
      }

      /**
       * ELEMENT_OPEN state: read tag name and attributes and emit either an
       * open-element or an empty-element node.
       *
       * An element is empty when written in collapsed form or when its name,
       * compared case-insensitively, is listed in $empty_elements. The node
       * keeps the tag name exactly as written.
       *
       * @return integer The next state (always the START state)
       */
      private function parse_element_open()
      {
          $tag = rtrim( (string) $this->up_to_chr( self::$CHR_TAG_END . self::$CHR_WHITESPACE ), self::$CHR_TAG_END_TRIM );
          if ( $tag !== '' ) {
              $attr = $this->parse_attributes();
              $char = $this->get();
              $collapsed = ( $char === '/' && $this->peek() === '>' );
              if ( $collapsed ) {
                  $this->inc(); // skip peeked '>'
              }
              // Lowercase for the lookup only: tag names are case-insensitive in
              // HTML, so "<BR>" must be recognised just like "<br>".
              if ( $collapsed || in_array( strtolower( $tag ), self::$empty_elements, true ) ) {
                  $this->node( self::NODE_TYPE_ELEMENT_EMPTY, $tag, null, $attr );
              }
              else {
                  $this->node( self::NODE_TYPE_ELEMENT_OPEN, $tag, null, $attr );
              }
          }
          return self::$STATE_START;
      }

      /**
       * ELEMENT_CLOSE state: read everything up to '>' as the tag name and emit
       * a close-element node.
       *
       * @return integer The next state (always the START state)
       */
      private function parse_element_close()
      {
          $tag = $this->up_to_chr( self::$CHR_TAG_END );
          if ( $tag != '' ) {
              // The scan above can only stop on '>' or at end of input, so this
              // consumes the closing '>' (and does nothing at end of input).
              $this->get();
              $this->node( self::NODE_TYPE_ELEMENT_CLOSE, $tag, null, null );
          }
          return self::$STATE_START;
      }

      /**
       * STATEMENT state: handle everything that starts with '<!' - CDATA
       * sections, comments and other declarations such as a doctype.
       *
       * A node is only emitted when the collected data is not empty.
       *
       * @return integer The next state (always the START state)
       */
      private function parse_statement()
      {
          $char = $this->get();
          if ( $char === '[' ) {
              // CDATA
              // <http://www.w3.org/TR/DOM-Level-2-Core/core.html>
              $nodeName = '#cdata-section';
              $nodeType = self::NODE_TYPE_CDATA_SECTION;
              $this->inc( 6 ); // strlen( 'CDATA[' )
              $data = $this->up_to_str( ']]>' );
              $this->inc( 2 ); // strlen( ']]' )
          }
          elseif ( $char === '-' && $this->peek() === '-' ) {
              // comment
              $nodeName = '#comment';
              $nodeType = self::NODE_TYPE_COMMENT;
              $this->inc(); // skip peeked '-'
              $data = $this->up_to_str( '-->' ); // should trim() upstream
              $this->inc( 2 ); // skip over final '--'
          }
          else {
              // some other kind of statement; the byte read starts its name
              $this->dec();
              $nodeType = self::NODE_TYPE_STATEMENT;
              $nodeName = $this->up_to_chr( self::$CHR_TAG_END . self::$CHR_TAG_END_TRIM . self::$CHR_WHITESPACE );
              $data = '';
              if ( $this->peek() != '>' ) {
                  // there be data or something
                  $this->skip_whitespace();
                  $data .= $this->up_to_chr( '[>' );
                  if ( $this->peek() == '[' ) {
                      // internal subset: keep both brackets in the data
                      $data .= $this->get() . $this->up_to_str( ']' ) . $this->get();
                  }
              }
              $data .= $this->up_to_str( '>' );
          }
          // skip over final '>'
          $this->inc();
          if ( $data != '' ) {
              $this->node( $nodeType, $nodeName, $data, null );
          }
          return self::$STATE_START;
      }

      /**
       * PI state: read a processing instruction.
       *
       * The target is the text up to the first whitespace or '>'. The data is
       * everything after it up to the first '>', so it keeps any leading
       * whitespace and the trailing '?' of the closing delimiter.
       *
       * @return integer The next state (always the START state)
       */
      private function parse_pi()
      {
          $target = $this->up_to_chr( self::$CHR_TAG_END . self::$CHR_WHITESPACE );
          $data = $this->up_to_chr( self::$CHR_TAG_END );
          // skip over closing '>'
          $this->inc( 1 );
          $this->node( self::NODE_TYPE_PI, $target, $data, array() );
          return self::$STATE_START;
      }
  }
  303. ?>