PageRenderTime 39ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/common/libraries/plugin/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php

https://bitbucket.org/chamilo/chamilo-dev/
PHP | 156 lines | 87 code | 21 blank | 48 comment | 12 complexity | 4f10c91025db62ba1fdab48a6de31486 MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, LGPL-2.1, LGPL-3.0, GPL-3.0, MIT
  1. <?php
  2. /**
  3. * Proof-of-concept lexer that uses the PEAR package XML_HTMLSax3 to parse HTML.
  4. *
  5. * PEAR, not suprisingly, also has a SAX parser for HTML. I don't know
  6. * very much about implementation, but it's fairly well written. However, that
  7. * abstraction comes at a price: performance. You need to have it installed,
  8. * and if the API changes, it might break our adapter. Not sure whether or not
  9. * it's UTF-8 aware, but it has some entity parsing trouble (in all areas,
  10. * text and attributes).
  11. *
  12. * Quite personally, I don't recommend using the PEAR class, and the defaults
  13. * don't use it. The unit tests do perform the tests on the SAX parser too, but
  14. * whatever it does for poorly formed HTML is up to it.
  15. *
  16. * @todo Generalize so that XML_HTMLSax is also supported.
  17. *
  18. * @warning Entity-resolution inside attributes is broken.
  19. */
  20. class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
  21. {
  22. /**
  23. * Internal accumulator array for SAX parsers.
  24. */
  25. protected $tokens = array();
  26. protected $last_token_was_empty;
  27. private $parent_handler;
  28. private $stack = array();
  29. public function tokenizeHTML($string, $config, $context)
  30. {
  31. $this->tokens = array();
  32. $this->last_token_was_empty = false;
  33. $string = $this->normalize($string, $config, $context);
  34. $this->parent_handler = set_error_handler(array($this, 'muteStrictErrorHandler'));
  35. $parser = new XML_HTMLSax3();
  36. $parser->set_object($this);
  37. $parser->set_element_handler('openHandler', 'closeHandler');
  38. $parser->set_data_handler('dataHandler');
  39. $parser->set_escape_handler('escapeHandler');
  40. // doesn't seem to work correctly for attributes
  41. $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
  42. $parser->parse($string);
  43. restore_error_handler();
  44. return $this->tokens;
  45. }
  46. /**
  47. * Open tag event handler, interface is defined by PEAR package.
  48. */
  49. public function openHandler(&$parser, $name, $attrs, $closed)
  50. {
  51. // entities are not resolved in attrs
  52. foreach ($attrs as $key => $attr)
  53. {
  54. $attrs[$key] = $this->parseData($attr);
  55. }
  56. if ($closed)
  57. {
  58. $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
  59. $this->last_token_was_empty = true;
  60. }
  61. else
  62. {
  63. $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
  64. }
  65. $this->stack[] = $name;
  66. return true;
  67. }
  68. /**
  69. * Close tag event handler, interface is defined by PEAR package.
  70. */
  71. public function closeHandler(&$parser, $name)
  72. {
  73. // HTMLSax3 seems to always send empty tags an extra close tag
  74. // check and ignore if you see it:
  75. // [TESTME] to make sure it doesn't overreach
  76. if ($this->last_token_was_empty)
  77. {
  78. $this->last_token_was_empty = false;
  79. return true;
  80. }
  81. $this->tokens[] = new HTMLPurifier_Token_End($name);
  82. if (! empty($this->stack))
  83. array_pop($this->stack);
  84. return true;
  85. }
  86. /**
  87. * Data event handler, interface is defined by PEAR package.
  88. */
  89. public function dataHandler(&$parser, $data)
  90. {
  91. $this->last_token_was_empty = false;
  92. $this->tokens[] = new HTMLPurifier_Token_Text($data);
  93. return true;
  94. }
  95. /**
  96. * Escaped text handler, interface is defined by PEAR package.
  97. */
  98. public function escapeHandler(&$parser, $data)
  99. {
  100. if (strpos($data, '--') === 0)
  101. {
  102. // remove trailing and leading double-dashes
  103. $data = substr($data, 2);
  104. if (strlen($data) >= 2 && substr($data, - 2) == "--")
  105. {
  106. $data = substr($data, 0, - 2);
  107. }
  108. if (isset($this->stack[sizeof($this->stack) - 1]) && $this->stack[sizeof($this->stack) - 1] == "style")
  109. {
  110. $this->tokens[] = new HTMLPurifier_Token_Text($data);
  111. }
  112. else
  113. {
  114. $this->tokens[] = new HTMLPurifier_Token_Comment($data);
  115. }
  116. $this->last_token_was_empty = false;
  117. }
  118. // CDATA is handled elsewhere, but if it was handled here:
  119. //if (strpos($data, '[CDATA[') === 0) {
  120. // $this->tokens[] = new HTMLPurifier_Token_Text(
  121. // substr($data, 7, strlen($data) - 9) );
  122. //}
  123. return true;
  124. }
  125. /**
  126. * An error handler that mutes strict errors
  127. */
  128. public function muteStrictErrorHandler($errno, $errstr, $errfile = null, $errline = null, $errcontext = null)
  129. {
  130. if ($errno == E_STRICT)
  131. return;
  132. return call_user_func($this->parent_handler, $errno, $errstr, $errfile, $errline, $errcontext);
  133. }
  134. }
  135. // vim: et sw=4 sts=4