PageRenderTime 35ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/common/libraries/plugin/htmlpurifier/library/HTMLPurifier/Lexer.php

https://bitbucket.org/chamilo/chamilo-dev/
PHP | 344 lines | 160 code | 48 blank | 136 comment | 25 complexity | 5114e7a37c2af24248d6de50d814d53d MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, LGPL-2.1, LGPL-3.0, GPL-3.0, MIT
  1. <?php
  2. /**
  3. * Forgivingly lexes HTML (SGML-style) markup into tokens.
  4. *
  5. * A lexer parses a string of SGML-style markup and converts them into
  6. * corresponding tokens. It doesn't check for well-formedness, although its
  7. * internal mechanism may make this automatic (such as the case of
  8. * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
  9. * from.
  10. *
  11. * A lexer is HTML-oriented: it might work with XML, but it's not
  12. * recommended, as we adhere to a subset of the specification for optimization
  13. * reasons. This might change in the future. Also, most tokenizers are not
  14. * expected to handle DTDs or PIs.
  15. *
  16. * This class should not be directly instantiated, but you may use create() to
  17. * retrieve a default copy of the lexer. Being a supertype, this class
  18. * does not actually define any implementation, but offers commonly used
  19. * convenience functions for subclasses.
  20. *
  21. * @note The unit tests will instantiate this class for testing purposes, as
  22. * many of the utility functions require a class to be instantiated.
  23. * This means that, even though this class is not runnable, it will
  24. * not be declared abstract.
  25. *
  26. * @par
  27. *
  28. * @note
  29. * We use tokens rather than create a DOM representation because a DOM:
  30. *
  31. * @par
  32. * -# Requires more processing and memory to create,
  33. * -# Is not streamable, and
  34. * -# Contains the entire document structure (html and body not needed).
  35. *
  36. * @par
  37. * However, DOM is helpful in that it makes it easy to move around nodes
  38. * without a lot of lookaheads to see when a tag is closed. This is a
  39. * limitation of the token system and some workarounds would be nice.
  40. */
  41. class HTMLPurifier_Lexer
  42. {
  43. /**
  44. * Whether or not this lexer implements line-number/column-number tracking.
  45. * If it does, set to true.
  46. */
  47. public $tracksLineNumbers = false;
  48. // -- STATIC ----------------------------------------------------------
  49. /**
  50. * Retrieves or sets the default Lexer as a Prototype Factory.
  51. *
  52. * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
  53. * a few exceptions involving special features that only DirectLex
  54. * implements.
  55. *
  56. * @note The behavior of this class has changed, rather than accepting
  57. * a prototype object, it now accepts a configuration object.
  58. * To specify your own prototype, set %Core.LexerImpl to it.
  59. * This change in behavior de-singletonizes the lexer object.
  60. *
  61. * @param $config Instance of HTMLPurifier_Config
  62. * @return Concrete lexer.
  63. */
  64. public static function create($config)
  65. {
  66. if (! ($config instanceof HTMLPurifier_Config))
  67. {
  68. $lexer = $config;
  69. trigger_error("Passing a prototype to
  70. HTMLPurifier_Lexer::create() is deprecated, please instead
  71. use %Core.LexerImpl", E_USER_WARNING);
  72. }
  73. else
  74. {
  75. $lexer = $config->get('Core.LexerImpl');
  76. }
  77. $needs_tracking = $config->get('Core.MaintainLineNumbers') || $config->get('Core.CollectErrors');
  78. $inst = null;
  79. if (is_object($lexer))
  80. {
  81. $inst = $lexer;
  82. }
  83. else
  84. {
  85. if (is_null($lexer))
  86. {
  87. do
  88. {
  89. // auto-detection algorithm
  90. if ($needs_tracking)
  91. {
  92. $lexer = 'DirectLex';
  93. break;
  94. }
  95. if (class_exists('DOMDocument') && method_exists('DOMDocument', 'loadHTML') && ! extension_loaded('domxml'))
  96. {
  97. // check for DOM support, because while it's part of the
  98. // core, it can be disabled compile time. Also, the PECL
  99. // domxml extension overrides the default DOM, and is evil
  100. // and nasty and we shan't bother to support it
  101. $lexer = 'DOMLex';
  102. }
  103. else
  104. {
  105. $lexer = 'DirectLex';
  106. }
  107. }
  108. while (0);
  109. } // do..while so we can break
  110. // instantiate recognized string names
  111. switch ($lexer)
  112. {
  113. case 'DOMLex' :
  114. $inst = new HTMLPurifier_Lexer_DOMLex();
  115. break;
  116. case 'DirectLex' :
  117. $inst = new HTMLPurifier_Lexer_DirectLex();
  118. break;
  119. case 'PH5P' :
  120. $inst = new HTMLPurifier_Lexer_PH5P();
  121. break;
  122. default :
  123. throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
  124. }
  125. }
  126. if (! $inst)
  127. throw new HTMLPurifier_Exception('No lexer was instantiated');
  128. // once PHP DOM implements native line numbers, or we
  129. // hack out something using XSLT, remove this stipulation
  130. if ($needs_tracking && ! $inst->tracksLineNumbers)
  131. {
  132. throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
  133. }
  134. return $inst;
  135. }
  136. // -- CONVENIENCE MEMBERS ---------------------------------------------
  137. public function __construct()
  138. {
  139. $this->_entity_parser = new HTMLPurifier_EntityParser();
  140. }
  141. /**
  142. * Most common entity to raw value conversion table for special entities.
  143. */
  144. protected $_special_entity2str = array('&quot;' => '"', '&amp;' => '&', '&lt;' => '<', '&gt;' => '>',
  145. '&#39;' => "'", '&#039;' => "'", '&#x27;' => "'");
  146. /**
  147. * Parses special entities into the proper characters.
  148. *
  149. * This string will translate escaped versions of the special characters
  150. * into the correct ones.
  151. *
  152. * @warning
  153. * You should be able to treat the output of this function as
  154. * completely parsed, but that's only because all other entities should
  155. * have been handled previously in substituteNonSpecialEntities()
  156. *
  157. * @param $string String character data to be parsed.
  158. * @returns Parsed character data.
  159. */
  160. public function parseData($string)
  161. {
  162. // following functions require at least one character
  163. if ($string === '')
  164. return '';
  165. // subtracts amps that cannot possibly be escaped
  166. $num_amp = substr_count($string, '&') - substr_count($string, '& ') - ($string[strlen($string) - 1] === '&' ? 1 : 0);
  167. if (! $num_amp)
  168. return $string; // abort if no entities
  169. $num_esc_amp = substr_count($string, '&amp;');
  170. $string = strtr($string, $this->_special_entity2str);
  171. // code duplication for sake of optimization, see above
  172. $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') - ($string[strlen($string) - 1] === '&' ? 1 : 0);
  173. if ($num_amp_2 <= $num_esc_amp)
  174. return $string;
  175. // hmm... now we have some uncommon entities. Use the callback.
  176. $string = $this->_entity_parser->substituteSpecialEntities($string);
  177. return $string;
  178. }
  179. /**
  180. * Lexes an HTML string into tokens.
  181. *
  182. * @param $string String HTML.
  183. * @return HTMLPurifier_Token array representation of HTML.
  184. */
  185. public function tokenizeHTML($string, $config, $context)
  186. {
  187. trigger_error('Call to abstract class', E_USER_ERROR);
  188. }
  189. /**
  190. * Translates CDATA sections into regular sections (through escaping).
  191. *
  192. * @param $string HTML string to process.
  193. * @returns HTML with CDATA sections escaped.
  194. */
  195. protected static function escapeCDATA($string)
  196. {
  197. return preg_replace_callback('/<!\[CDATA\[(.+?)\]\]>/s', array('HTMLPurifier_Lexer', 'CDATACallback'), $string);
  198. }
  199. /**
  200. * Special CDATA case that is especially convoluted for <script>
  201. */
  202. protected static function escapeCommentedCDATA($string)
  203. {
  204. return preg_replace_callback('#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s', array('HTMLPurifier_Lexer',
  205. 'CDATACallback'), $string);
  206. }
  207. /**
  208. * Special Internet Explorer conditional comments should be removed.
  209. */
  210. protected static function removeIEConditional($string)
  211. {
  212. return preg_replace('#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
  213. '', $string);
  214. }
  215. /**
  216. * Callback function for escapeCDATA() that does the work.
  217. *
  218. * @warning Though this is public in order to let the callback happen,
  219. * calling it directly is not recommended.
  220. * @params $matches PCRE matches array, with index 0 the entire match
  221. * and 1 the inside of the CDATA section.
  222. * @returns Escaped internals of the CDATA section.
  223. */
  224. protected static function CDATACallback($matches)
  225. {
  226. // not exactly sure why the character set is needed, but whatever
  227. return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
  228. }
  229. /**
  230. * Takes a piece of HTML and normalizes it by converting entities, fixing
  231. * encoding, extracting bits, and other good stuff.
  232. * @todo Consider making protected
  233. */
  234. public function normalize($html, $config, $context)
  235. {
  236. // normalize newlines to \n
  237. if ($config->get('Core.NormalizeNewlines'))
  238. {
  239. $html = str_replace("\r\n", "\n", $html);
  240. $html = str_replace("\r", "\n", $html);
  241. }
  242. if ($config->get('HTML.Trusted'))
  243. {
  244. // escape convoluted CDATA
  245. $html = $this->escapeCommentedCDATA($html);
  246. }
  247. // escape CDATA
  248. $html = $this->escapeCDATA($html);
  249. $html = $this->removeIEConditional($html);
  250. // extract body from document if applicable
  251. if ($config->get('Core.ConvertDocumentToFragment'))
  252. {
  253. $e = false;
  254. if ($config->get('Core.CollectErrors'))
  255. {
  256. $e = & $context->get('ErrorCollector');
  257. }
  258. $new_html = $this->extractBody($html);
  259. if ($e && $new_html != $html)
  260. {
  261. $e->send(E_WARNING, 'Lexer: Extracted body');
  262. }
  263. $html = $new_html;
  264. }
  265. // expand entities that aren't the big five
  266. $html = $this->_entity_parser->substituteNonSpecialEntities($html);
  267. // clean into wellformed UTF-8 string for an SGML context: this has
  268. // to be done after entity expansion because the entities sometimes
  269. // represent non-SGML characters (horror, horror!)
  270. $html = HTMLPurifier_Encoder :: cleanUTF8($html);
  271. // if processing instructions are to removed, remove them now
  272. if ($config->get('Core.RemoveProcessingInstructions'))
  273. {
  274. $html = preg_replace('#<\?.+?\?>#s', '', $html);
  275. }
  276. return $html;
  277. }
  278. /**
  279. * Takes a string of HTML (fragment or document) and returns the content
  280. * @todo Consider making protected
  281. */
  282. public function extractBody($html)
  283. {
  284. $matches = array();
  285. $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
  286. if ($result)
  287. {
  288. return $matches[1];
  289. }
  290. else
  291. {
  292. return $html;
  293. }
  294. }
  295. }
  296. // vim: et sw=4 sts=4