PageRenderTime 53ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/moodle/lib/htmlpurifier/HTMLPurifier/Lexer.php

https://bitbucket.org/geek745/moodle-db2
PHP | 363 lines | 197 code | 41 blank | 125 comment | 18 complexity | 42aae0705d716384d5aff01572f344f3 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, BSD-3-Clause, LGPL-2.0
  1. <?php
  2. require_once 'HTMLPurifier/Token.php';
  3. require_once 'HTMLPurifier/Encoder.php';
  4. require_once 'HTMLPurifier/EntityParser.php';
  5. // implementations
  6. require_once 'HTMLPurifier/Lexer/DirectLex.php';
  7. if (version_compare(PHP_VERSION, "5", ">=")) {
  8. // You can remove the if statement if you are running PHP 5 only.
  9. // We ought to get the strict version to follow those rules.
  10. require_once 'HTMLPurifier/Lexer/DOMLex.php';
  11. }
// Register the configuration directives consulted by the lexer.
// NOTE(review): the arguments to define() appear to be
// (namespace, directive name, default value, type, description) -- confirm
// against HTMLPurifier_ConfigSchema, which is not visible from this file.

// %Core.ConvertDocumentToFragment (default true): read by normalize() to
// decide whether extractBody() is applied to the input.
  12. HTMLPurifier_ConfigSchema::define(
  13. 'Core', 'ConvertDocumentToFragment', true, 'bool', '
  14. This parameter determines whether or not the filter should convert
  15. input that is a full document with html and body tags to a fragment
  16. of just the contents of a body tag. This parameter is simply something
  17. HTML Purifier can do during an edge-case: for most inputs, this
  18. processing is not necessary.
  19. ');
// %Core.AcceptFullDocuments is registered as an alias of
// %Core.ConvertDocumentToFragment.
  20. HTMLPurifier_ConfigSchema::defineAlias('Core', 'AcceptFullDocuments', 'Core', 'ConvertDocumentToFragment');
// %Core.LexerImpl (default null): read by HTMLPurifier_Lexer::create(); null
// triggers auto-detection, a string names an implementation, an object is
// returned as-is.
  21. HTMLPurifier_ConfigSchema::define(
  22. 'Core', 'LexerImpl', null, 'mixed/null', '
  23. <p>
  24. This parameter determines what lexer implementation can be used. The
  25. valid values are:
  26. </p>
  27. <dl>
  28. <dt><em>null</em></dt>
  29. <dd>
  30. Recommended, the lexer implementation will be auto-detected based on
  31. your PHP-version and configuration.
  32. </dd>
  33. <dt><em>string</em> lexer identifier</dt>
  34. <dd>
  35. This is a slim way of manually overridding the implementation.
  36. Currently recognized values are: DOMLex (the default PHP5 implementation)
  37. and DirectLex (the default PHP4 implementation). Only use this if
  38. you know what you are doing: usually, the auto-detection will
  39. manage things for cases you aren\'t even aware of.
  40. </dd>
  41. <dt><em>object</em> lexer instance</dt>
  42. <dd>
  43. Super-advanced: you can specify your own, custom, implementation that
  44. implements the interface defined by <code>HTMLPurifier_Lexer</code>.
  45. I may remove this option simply because I don\'t expect anyone
  46. to use it.
  47. </dd>
  48. </dl>
  49. <p>
  50. This directive has been available since 2.0.0.
  51. </p>
  52. '
  53. );
// %Core.MaintainLineNumbers (default null): read by
// HTMLPurifier_Lexer::create(); a value of true forces the DirectLex lexer.
  54. HTMLPurifier_ConfigSchema::define(
  55. 'Core', 'MaintainLineNumbers', null, 'bool/null', '
  56. <p>
  57. If true, HTML Purifier will add line number information to all tokens.
  58. This is useful when error reporting is turned on, but can result in
  59. significant performance degradation and should not be used when
  60. unnecessary. This directive must be used with the DirectLex lexer,
  61. as the DOMLex lexer does not (yet) support this functionality.
  62. If the value is null, an appropriate value will be selected based
  63. on other configuration. This directive has been available since 2.0.0.
  64. </p>
  65. ');
// %Core.AggressivelyFixLt (default false): not read anywhere in this file.
// NOTE(review): presumably consumed by a concrete lexer -- confirm.
  66. HTMLPurifier_ConfigSchema::define(
  67. 'Core', 'AggressivelyFixLt', false, 'bool', '
  68. This directive enables aggressive pre-filter fixes HTML Purifier can
  69. perform in order to ensure that open angled-brackets do not get killed
  70. during parsing stage. Enabling this will result in two preg_replace_callback
  71. calls and one preg_replace call for every bit of HTML passed through here.
  72. It is not necessary and will have no effect for PHP 4.
  73. This directive has been available since 2.1.0.
  74. ');
  75. /**
  76. * Forgivingly lexes HTML (SGML-style) markup into tokens.
  77. *
  78. * A lexer parses a string of SGML-style markup and converts them into
  79. * corresponding tokens. It doesn't check for well-formedness, although its
  80. * internal mechanism may make this automatic (such as the case of
  81. * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
  82. * from.
  83. *
  84. * A lexer is HTML-oriented: it might work with XML, but it's not
  85. * recommended, as we adhere to a subset of the specification for optimization
  86. * reasons.
  87. *
  88. * This class should not be directly instantiated, but you may use create() to
  89. * retrieve a default copy of the lexer. Being a supertype, this class
  90. * does not actually define any implementation, but offers commonly used
  91. * convenience functions for subclasses.
  92. *
  93. * @note The unit tests will instantiate this class for testing purposes, as
  94. * many of the utility functions require a class to be instantiated.
  95. * Be careful when porting this class to PHP 5.
  96. *
  97. * @par
  98. *
  99. * @note
  100. * We use tokens rather than create a DOM representation because DOM would:
  101. *
  102. * @par
  103. * -# Require more processing power to create,
  104. * -# Require recursion to iterate,
  105. * -# Must be compatible with PHP 5's DOM (otherwise duplication),
  106. * -# Has the entire document structure (html and body not needed), and
  107. * -# Has unknown readability improvement.
  108. *
  109. * @par
  110. * What the last item means is that the functions for manipulating tokens are
  111. * already fairly compact, and when well-commented, more abstraction may not
  112. * be needed.
  113. *
  114. * @see HTMLPurifier_Token
  115. */
  116. class HTMLPurifier_Lexer
  117. {
  118. // -- STATIC ----------------------------------------------------------
  119. /**
  120. * Retrieves or sets the default Lexer as a Prototype Factory.
  121. *
  122. * Depending on what PHP version you are running, the abstract base
  123. * Lexer class will determine which concrete Lexer is best for you:
  124. * HTMLPurifier_Lexer_DirectLex for PHP 4, and HTMLPurifier_Lexer_DOMLex
  125. * for PHP 5 and beyond. This general rule has a few exceptions to it
  126. * involving special features that only DirectLex implements.
  127. *
  128. * @static
  129. *
  130. * @note The behavior of this class has changed, rather than accepting
  131. * a prototype object, it now accepts a configuration object.
  132. * To specify your own prototype, set %Core.LexerImpl to it.
  133. * This change in behavior de-singletonizes the lexer object.
  134. *
  135. * @note In PHP4, it is possible to call this factory method from
  136. * subclasses, such usage is not recommended and not
  137. * forwards-compatible.
  138. *
  139. * @param $prototype Optional prototype lexer or configuration object
  140. * @return Concrete lexer.
  141. */
  142. function create($config) {
  143. if (!is_a($config, 'HTMLPurifier_Config')) {
  144. $lexer = $config;
  145. trigger_error("Passing a prototype to
  146. HTMLPurifier_Lexer::create() is deprecated, please instead
  147. use %Core.LexerImpl", E_USER_WARNING);
  148. } else {
  149. $lexer = $config->get('Core', 'LexerImpl');
  150. }
  151. if (is_object($lexer)) {
  152. return $lexer;
  153. }
  154. if (is_null($lexer)) { do {
  155. // auto-detection algorithm
  156. // once PHP DOM implements native line numbers, or we
  157. // hack out something using XSLT, remove this stipulation
  158. $line_numbers = $config->get('Core', 'MaintainLineNumbers');
  159. if (
  160. $line_numbers === true ||
  161. ($line_numbers === null && $config->get('Core', 'CollectErrors'))
  162. ) {
  163. $lexer = 'DirectLex';
  164. break;
  165. }
  166. if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
  167. class_exists('DOMDocument')) { // check for DOM support
  168. $lexer = 'DOMLex';
  169. } else {
  170. $lexer = 'DirectLex';
  171. }
  172. } while(0); } // do..while so we can break
  173. // instantiate recognized string names
  174. switch ($lexer) {
  175. case 'DOMLex':
  176. return new HTMLPurifier_Lexer_DOMLex();
  177. case 'DirectLex':
  178. return new HTMLPurifier_Lexer_DirectLex();
  179. case 'PH5P':
  180. // experimental Lexer that must be manually included
  181. return new HTMLPurifier_Lexer_PH5P();
  182. default:
  183. trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
  184. }
  185. }
  186. // -- CONVENIENCE MEMBERS ---------------------------------------------
  187. function HTMLPurifier_Lexer() {
  188. $this->_entity_parser = new HTMLPurifier_EntityParser();
  189. }
  190. /**
  191. * Most common entity to raw value conversion table for special entities.
  192. * @protected
  193. */
  194. var $_special_entity2str =
  195. array(
  196. '&quot;' => '"',
  197. '&amp;' => '&',
  198. '&lt;' => '<',
  199. '&gt;' => '>',
  200. '&#39;' => "'",
  201. '&#039;' => "'",
  202. '&#x27;' => "'"
  203. );
  204. /**
  205. * Parses special entities into the proper characters.
  206. *
  207. * This string will translate escaped versions of the special characters
  208. * into the correct ones.
  209. *
  210. * @warning
  211. * You should be able to treat the output of this function as
  212. * completely parsed, but that's only because all other entities should
  213. * have been handled previously in substituteNonSpecialEntities()
  214. *
  215. * @param $string String character data to be parsed.
  216. * @returns Parsed character data.
  217. */
  218. function parseData($string) {
  219. // following functions require at least one character
  220. if ($string === '') return '';
  221. // subtracts amps that cannot possibly be escaped
  222. $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
  223. ($string[strlen($string)-1] === '&' ? 1 : 0);
  224. if (!$num_amp) return $string; // abort if no entities
  225. $num_esc_amp = substr_count($string, '&amp;');
  226. $string = strtr($string, $this->_special_entity2str);
  227. // code duplication for sake of optimization, see above
  228. $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
  229. ($string[strlen($string)-1] === '&' ? 1 : 0);
  230. if ($num_amp_2 <= $num_esc_amp) return $string;
  231. // hmm... now we have some uncommon entities. Use the callback.
  232. $string = $this->_entity_parser->substituteSpecialEntities($string);
  233. return $string;
  234. }
  235. /**
  236. * Lexes an HTML string into tokens.
  237. *
  238. * @param $string String HTML.
  239. * @return HTMLPurifier_Token array representation of HTML.
  240. */
  241. function tokenizeHTML($string, $config, &$context) {
  242. trigger_error('Call to abstract class', E_USER_ERROR);
  243. }
  244. /**
  245. * Translates CDATA sections into regular sections (through escaping).
  246. *
  247. * @static
  248. * @protected
  249. * @param $string HTML string to process.
  250. * @returns HTML with CDATA sections escaped.
  251. */
  252. function escapeCDATA($string) {
  253. return preg_replace_callback(
  254. '/<!\[CDATA\[(.+?)\]\]>/s',
  255. array('HTMLPurifier_Lexer', 'CDATACallback'),
  256. $string
  257. );
  258. }
  259. /**
  260. * Special CDATA case that is especiall convoluted for <script>
  261. */
  262. function escapeCommentedCDATA($string) {
  263. return preg_replace_callback(
  264. '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
  265. array('HTMLPurifier_Lexer', 'CDATACallback'),
  266. $string
  267. );
  268. }
  269. /**
  270. * Callback function for escapeCDATA() that does the work.
  271. *
  272. * @static
  273. * @warning Though this is public in order to let the callback happen,
  274. * calling it directly is not recommended.
  275. * @params $matches PCRE matches array, with index 0 the entire match
  276. * and 1 the inside of the CDATA section.
  277. * @returns Escaped internals of the CDATA section.
  278. */
  279. function CDATACallback($matches) {
  280. // not exactly sure why the character set is needed, but whatever
  281. return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
  282. }
  283. /**
  284. * Takes a piece of HTML and normalizes it by converting entities, fixing
  285. * encoding, extracting bits, and other good stuff.
  286. */
  287. function normalize($html, $config, &$context) {
  288. // extract body from document if applicable
  289. if ($config->get('Core', 'ConvertDocumentToFragment')) {
  290. $html = $this->extractBody($html);
  291. }
  292. // normalize newlines to \n
  293. $html = str_replace("\r\n", "\n", $html);
  294. $html = str_replace("\r", "\n", $html);
  295. if ($config->get('HTML', 'Trusted')) {
  296. // escape convoluted CDATA
  297. $html = $this->escapeCommentedCDATA($html);
  298. }
  299. // escape CDATA
  300. $html = $this->escapeCDATA($html);
  301. // expand entities that aren't the big five
  302. $html = $this->_entity_parser->substituteNonSpecialEntities($html);
  303. // clean into wellformed UTF-8 string for an SGML context: this has
  304. // to be done after entity expansion because the entities sometimes
  305. // represent non-SGML characters (horror, horror!)
  306. $html = HTMLPurifier_Encoder::cleanUTF8($html);
  307. return $html;
  308. }
  309. /**
  310. * Takes a string of HTML (fragment or document) and returns the content
  311. */
  312. function extractBody($html) {
  313. $matches = array();
  314. $result = preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches);
  315. if ($result) {
  316. return $matches[1];
  317. } else {
  318. return $html;
  319. }
  320. }
  321. }