PageRenderTime 130ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 3ms

/include/HTMLPurifier/HTMLPurifier.standalone.php

https://bitbucket.org/cviolette/sugarcrm
PHP | 14864 lines | 7789 code | 2556 blank | 4519 comment | 1503 complexity | 98906b387369850c456a850dbf20b9da MD5 | raw file
Possible License(s): LGPL-2.1, MPL-2.0-no-copyleft-exception, BSD-3-Clause
  1. <?php
  2. /**
  3. * @file
  4. * This file was auto-generated by generate-includes.php and includes all of
  5. * the core files required by HTML Purifier. Use this if performance is a
  6. * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
  7. * FILE, changes will be overwritten the next time the script is run.
  8. *
  9. * @version 4.3.0
  10. *
  11. * @warning
  12. * You must *not* include any other HTML Purifier files before this file,
  13. * because 'require' not 'require_once' is used.
  14. *
  15. * @warning
  16. * This file requires that the include path contains the HTML Purifier
  17. * library directory; this is not auto-set.
  18. */
  19. /*! @mainpage
  20. *
  21. * HTML Purifier is an HTML filter that will take an arbitrary snippet of
  22. * HTML and rigorously test, validate and filter it into a version that
  23. * is safe for output onto webpages. It achieves this by:
  24. *
  25. * -# Lexing (parsing into tokens) the document,
  26. * -# Executing various strategies on the tokens:
  27. * -# Removing all elements not in the whitelist,
  28. * -# Making the tokens well-formed,
  29. * -# Fixing the nesting of the nodes, and
  30. * -# Validating attributes of the nodes; and
  31. * -# Generating HTML from the purified tokens.
  32. *
  33. * However, most users will only need to interface with the HTMLPurifier
  34. * and HTMLPurifier_Config.
  35. */
  36. /*
  37. HTML Purifier 4.3.0 - Standards Compliant HTML Filtering
  38. Copyright (C) 2006-2008 Edward Z. Yang
  39. This library is free software; you can redistribute it and/or
  40. modify it under the terms of the GNU Lesser General Public
  41. License as published by the Free Software Foundation; either
  42. version 2.1 of the License, or (at your option) any later version.
  43. This library is distributed in the hope that it will be useful,
  44. but WITHOUT ANY WARRANTY; without even the implied warranty of
  45. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  46. Lesser General Public License for more details.
  47. You should have received a copy of the GNU Lesser General Public
  48. License along with this library; if not, write to the Free Software
  49. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  50. */
  51. /**
  52. * Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
  53. *
  54. * @note There are several points in which configuration can be specified
  55. * for HTML Purifier. The precedence of these (from lowest to
  56. * highest) is as follows:
  57. * -# Instance: new HTMLPurifier($config)
  58. * -# Invocation: purify($html, $config)
  59. * These configurations are entirely independent of each other and
  60. * are *not* merged (this behavior may change in the future).
  61. *
  62. * @todo We need an easier way to inject strategies using the configuration
  63. * object.
  64. */
  65. class HTMLPurifier
  66. {
  67. /** Version of HTML Purifier */
  68. public $version = '4.3.0';
  69. /** Constant with version of HTML Purifier */
  70. const VERSION = '4.3.0';
  71. /** Global configuration object */
  72. public $config;
  73. /** Array of extra HTMLPurifier_Filter objects to run on HTML, for backwards compatibility */
  74. private $filters = array();
  75. /** Single instance of HTML Purifier */
  76. private static $instance;
  77. protected $strategy, $generator;
  78. /**
  79. * Resultant HTMLPurifier_Context of last run purification. Is an array
  80. * of contexts if the last called method was purifyArray().
  81. */
  82. public $context;
  83. /**
  84. * Initializes the purifier.
  85. * @param $config Optional HTMLPurifier_Config object for all instances of
  86. * the purifier, if omitted, a default configuration is
  87. * supplied (which can be overridden on a per-use basis).
  88. * The parameter can also be any type that
  89. * HTMLPurifier_Config::create() supports.
  90. */
  91. public function __construct($config = null) {
  92. $this->config = HTMLPurifier_Config::create($config);
  93. $this->strategy = new HTMLPurifier_Strategy_Core();
  94. }
  95. /**
  96. * Adds a filter to process the output. First come first serve
  97. * @param $filter HTMLPurifier_Filter object
  98. */
  99. public function addFilter($filter) {
  100. trigger_error('HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom', E_USER_WARNING);
  101. $this->filters[] = $filter;
  102. }
  103. /**
  104. * Filters an HTML snippet/document to be XSS-free and standards-compliant.
  105. *
  106. * @param $html String of HTML to purify
  107. * @param $config HTMLPurifier_Config object for this operation, if omitted,
  108. * defaults to the config object specified during this
  109. * object's construction. The parameter can also be any type
  110. * that HTMLPurifier_Config::create() supports.
  111. * @return Purified HTML
  112. */
  113. public function purify($html, $config = null) {
  114. // :TODO: make the config merge in, instead of replace
  115. $config = $config ? HTMLPurifier_Config::create($config) : $this->config;
  116. // implementation is partially environment dependant, partially
  117. // configuration dependant
  118. $lexer = HTMLPurifier_Lexer::create($config);
  119. $context = new HTMLPurifier_Context();
  120. // setup HTML generator
  121. $this->generator = new HTMLPurifier_Generator($config, $context);
  122. $context->register('Generator', $this->generator);
  123. // set up global context variables
  124. if ($config->get('Core.CollectErrors')) {
  125. // may get moved out if other facilities use it
  126. $language_factory = HTMLPurifier_LanguageFactory::instance();
  127. $language = $language_factory->create($config, $context);
  128. $context->register('Locale', $language);
  129. $error_collector = new HTMLPurifier_ErrorCollector($context);
  130. $context->register('ErrorCollector', $error_collector);
  131. }
  132. // setup id_accumulator context, necessary due to the fact that
  133. // AttrValidator can be called from many places
  134. $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
  135. $context->register('IDAccumulator', $id_accumulator);
  136. $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
  137. // setup filters
  138. $filter_flags = $config->getBatch('Filter');
  139. $custom_filters = $filter_flags['Custom'];
  140. unset($filter_flags['Custom']);
  141. $filters = array();
  142. foreach ($filter_flags as $filter => $flag) {
  143. if (!$flag) continue;
  144. if (strpos($filter, '.') !== false) continue;
  145. $class = "HTMLPurifier_Filter_$filter";
  146. $filters[] = new $class;
  147. }
  148. foreach ($custom_filters as $filter) {
  149. // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
  150. $filters[] = $filter;
  151. }
  152. $filters = array_merge($filters, $this->filters);
  153. // maybe prepare(), but later
  154. for ($i = 0, $filter_size = count($filters); $i < $filter_size; $i++) {
  155. $html = $filters[$i]->preFilter($html, $config, $context);
  156. }
  157. // purified HTML
  158. $html =
  159. $this->generator->generateFromTokens(
  160. // list of tokens
  161. $this->strategy->execute(
  162. // list of un-purified tokens
  163. $lexer->tokenizeHTML(
  164. // un-purified HTML
  165. $html, $config, $context
  166. ),
  167. $config, $context
  168. )
  169. );
  170. for ($i = $filter_size - 1; $i >= 0; $i--) {
  171. $html = $filters[$i]->postFilter($html, $config, $context);
  172. }
  173. $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
  174. $this->context =& $context;
  175. return $html;
  176. }
  177. /**
  178. * Filters an array of HTML snippets
  179. * @param $config Optional HTMLPurifier_Config object for this operation.
  180. * See HTMLPurifier::purify() for more details.
  181. * @return Array of purified HTML
  182. */
  183. public function purifyArray($array_of_html, $config = null) {
  184. $context_array = array();
  185. foreach ($array_of_html as $key => $html) {
  186. $array_of_html[$key] = $this->purify($html, $config);
  187. $context_array[$key] = $this->context;
  188. }
  189. $this->context = $context_array;
  190. return $array_of_html;
  191. }
  192. /**
  193. * Singleton for enforcing just one HTML Purifier in your system
  194. * @param $prototype Optional prototype HTMLPurifier instance to
  195. * overload singleton with, or HTMLPurifier_Config
  196. * instance to configure the generated version with.
  197. */
  198. public static function instance($prototype = null) {
  199. if (!self::$instance || $prototype) {
  200. if ($prototype instanceof HTMLPurifier) {
  201. self::$instance = $prototype;
  202. } elseif ($prototype) {
  203. self::$instance = new HTMLPurifier($prototype);
  204. } else {
  205. self::$instance = new HTMLPurifier();
  206. }
  207. }
  208. return self::$instance;
  209. }
  210. /**
  211. * @note Backwards compatibility, see instance()
  212. */
  213. public static function getInstance($prototype = null) {
  214. return HTMLPurifier::instance($prototype);
  215. }
  216. }
  217. /**
  218. * Defines common attribute collections that modules reference
  219. */
  220. class HTMLPurifier_AttrCollections
  221. {
  222. /**
  223. * Associative array of attribute collections, indexed by name
  224. */
  225. public $info = array();
  226. /**
  227. * Performs all expansions on internal data for use by other inclusions
  228. * It also collects all attribute collection extensions from
  229. * modules
  230. * @param $attr_types HTMLPurifier_AttrTypes instance
  231. * @param $modules Hash array of HTMLPurifier_HTMLModule members
  232. */
  233. public function __construct($attr_types, $modules) {
  234. // load extensions from the modules
  235. foreach ($modules as $module) {
  236. foreach ($module->attr_collections as $coll_i => $coll) {
  237. if (!isset($this->info[$coll_i])) {
  238. $this->info[$coll_i] = array();
  239. }
  240. foreach ($coll as $attr_i => $attr) {
  241. if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) {
  242. // merge in includes
  243. $this->info[$coll_i][$attr_i] = array_merge(
  244. $this->info[$coll_i][$attr_i], $attr);
  245. continue;
  246. }
  247. $this->info[$coll_i][$attr_i] = $attr;
  248. }
  249. }
  250. }
  251. // perform internal expansions and inclusions
  252. foreach ($this->info as $name => $attr) {
  253. // merge attribute collections that include others
  254. $this->performInclusions($this->info[$name]);
  255. // replace string identifiers with actual attribute objects
  256. $this->expandIdentifiers($this->info[$name], $attr_types);
  257. }
  258. }
  259. /**
  260. * Takes a reference to an attribute associative array and performs
  261. * all inclusions specified by the zero index.
  262. * @param &$attr Reference to attribute array
  263. */
  264. public function performInclusions(&$attr) {
  265. if (!isset($attr[0])) return;
  266. $merge = $attr[0];
  267. $seen = array(); // recursion guard
  268. // loop through all the inclusions
  269. for ($i = 0; isset($merge[$i]); $i++) {
  270. if (isset($seen[$merge[$i]])) continue;
  271. $seen[$merge[$i]] = true;
  272. // foreach attribute of the inclusion, copy it over
  273. if (!isset($this->info[$merge[$i]])) continue;
  274. foreach ($this->info[$merge[$i]] as $key => $value) {
  275. if (isset($attr[$key])) continue; // also catches more inclusions
  276. $attr[$key] = $value;
  277. }
  278. if (isset($this->info[$merge[$i]][0])) {
  279. // recursion
  280. $merge = array_merge($merge, $this->info[$merge[$i]][0]);
  281. }
  282. }
  283. unset($attr[0]);
  284. }
  285. /**
  286. * Expands all string identifiers in an attribute array by replacing
  287. * them with the appropriate values inside HTMLPurifier_AttrTypes
  288. * @param &$attr Reference to attribute array
  289. * @param $attr_types HTMLPurifier_AttrTypes instance
  290. */
  291. public function expandIdentifiers(&$attr, $attr_types) {
  292. // because foreach will process new elements we add, make sure we
  293. // skip duplicates
  294. $processed = array();
  295. foreach ($attr as $def_i => $def) {
  296. // skip inclusions
  297. if ($def_i === 0) continue;
  298. if (isset($processed[$def_i])) continue;
  299. // determine whether or not attribute is required
  300. if ($required = (strpos($def_i, '*') !== false)) {
  301. // rename the definition
  302. unset($attr[$def_i]);
  303. $def_i = trim($def_i, '*');
  304. $attr[$def_i] = $def;
  305. }
  306. $processed[$def_i] = true;
  307. // if we've already got a literal object, move on
  308. if (is_object($def)) {
  309. // preserve previous required
  310. $attr[$def_i]->required = ($required || $attr[$def_i]->required);
  311. continue;
  312. }
  313. if ($def === false) {
  314. unset($attr[$def_i]);
  315. continue;
  316. }
  317. if ($t = $attr_types->get($def)) {
  318. $attr[$def_i] = $t;
  319. $attr[$def_i]->required = $required;
  320. } else {
  321. unset($attr[$def_i]);
  322. }
  323. }
  324. }
  325. }
  326. /**
  327. * Base class for all validating attribute definitions.
  328. *
  329. * This family of classes forms the core for not only HTML attribute validation,
  330. * but also any sort of string that needs to be validated or cleaned (which
  331. * means CSS properties and composite definitions are defined here too).
  332. * Besides defining (through code) what precisely makes the string valid,
  333. * subclasses are also responsible for cleaning the code if possible.
  334. */
  335. abstract class HTMLPurifier_AttrDef
  336. {
  337. /**
  338. * Tells us whether or not an HTML attribute is minimized. Has no
  339. * meaning in other contexts.
  340. */
  341. public $minimized = false;
  342. /**
  343. * Tells us whether or not an HTML attribute is required. Has no
  344. * meaning in other contexts
  345. */
  346. public $required = false;
  347. /**
  348. * Validates and cleans passed string according to a definition.
  349. *
  350. * @param $string String to be validated and cleaned.
  351. * @param $config Mandatory HTMLPurifier_Config object.
  352. * @param $context Mandatory HTMLPurifier_AttrContext object.
  353. */
  354. abstract public function validate($string, $config, $context);
  355. /**
  356. * Convenience method that parses a string as if it were CDATA.
  357. *
  358. * This method process a string in the manner specified at
  359. * <http://www.w3.org/TR/html4/types.html#h-6.2> by removing
  360. * leading and trailing whitespace, ignoring line feeds, and replacing
  361. * carriage returns and tabs with spaces. While most useful for HTML
  362. * attributes specified as CDATA, it can also be applied to most CSS
  363. * values.
  364. *
  365. * @note This method is not entirely standards compliant, as trim() removes
  366. * more types of whitespace than specified in the spec. In practice,
  367. * this is rarely a problem, as those extra characters usually have
  368. * already been removed by HTMLPurifier_Encoder.
  369. *
  370. * @warning This processing is inconsistent with XML's whitespace handling
  371. * as specified by section 3.3.3 and referenced XHTML 1.0 section
  372. * 4.7. However, note that we are NOT necessarily
  373. * parsing XML, thus, this behavior may still be correct. We
  374. * assume that newlines have been normalized.
  375. */
  376. public function parseCDATA($string) {
  377. $string = trim($string);
  378. $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
  379. return $string;
  380. }
  381. /**
  382. * Factory method for creating this class from a string.
  383. * @param $string String construction info
  384. * @return Created AttrDef object corresponding to $string
  385. */
  386. public function make($string) {
  387. // default implementation, return a flyweight of this object.
  388. // If $string has an effect on the returned object (i.e. you
  389. // need to overload this method), it is best
  390. // to clone or instantiate new copies. (Instantiation is safer.)
  391. return $this;
  392. }
  393. /**
  394. * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
  395. * properly. THIS IS A HACK!
  396. */
  397. protected function mungeRgb($string) {
  398. return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
  399. }
  400. /**
  401. * Parses a possibly escaped CSS string and returns the "pure"
  402. * version of it.
  403. */
  404. protected function expandCSSEscape($string) {
  405. // flexibly parse it
  406. $ret = '';
  407. for ($i = 0, $c = strlen($string); $i < $c; $i++) {
  408. if ($string[$i] === '\\') {
  409. $i++;
  410. if ($i >= $c) {
  411. $ret .= '\\';
  412. break;
  413. }
  414. if (ctype_xdigit($string[$i])) {
  415. $code = $string[$i];
  416. for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
  417. if (!ctype_xdigit($string[$i])) break;
  418. $code .= $string[$i];
  419. }
  420. // We have to be extremely careful when adding
  421. // new characters, to make sure we're not breaking
  422. // the encoding.
  423. $char = HTMLPurifier_Encoder::unichr(hexdec($code));
  424. if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
  425. $ret .= $char;
  426. if ($i < $c && trim($string[$i]) !== '') $i--;
  427. continue;
  428. }
  429. if ($string[$i] === "\n") continue;
  430. }
  431. $ret .= $string[$i];
  432. }
  433. return $ret;
  434. }
  435. }
  436. /**
  437. * Processes an entire attribute array for corrections needing multiple values.
  438. *
  439. * Occasionally, a certain attribute will need to be removed and popped onto
  440. * another value. Instead of creating a complex return syntax for
  441. * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
  442. * specialized object and have that do the special work. That is the
  443. * family of HTMLPurifier_AttrTransform.
  444. *
  445. * An attribute transformation can be assigned to run before or after
  446. * HTMLPurifier_AttrDef validation. See HTMLPurifier_HTMLDefinition for
  447. * more details.
  448. */
  449. abstract class HTMLPurifier_AttrTransform
  450. {
  451. /**
  452. * Abstract: makes changes to the attributes dependent on multiple values.
  453. *
  454. * @param $attr Assoc array of attributes, usually from
  455. * HTMLPurifier_Token_Tag::$attr
  456. * @param $config Mandatory HTMLPurifier_Config object.
  457. * @param $context Mandatory HTMLPurifier_Context object
  458. * @returns Processed attribute array.
  459. */
  460. abstract public function transform($attr, $config, $context);
  461. /**
  462. * Prepends CSS properties to the style attribute, creating the
  463. * attribute if it doesn't exist.
  464. * @param $attr Attribute array to process (passed by reference)
  465. * @param $css CSS to prepend
  466. */
  467. public function prependCSS(&$attr, $css) {
  468. $attr['style'] = isset($attr['style']) ? $attr['style'] : '';
  469. $attr['style'] = $css . $attr['style'];
  470. }
  471. /**
  472. * Retrieves and removes an attribute
  473. * @param $attr Attribute array to process (passed by reference)
  474. * @param $key Key of attribute to confiscate
  475. */
  476. public function confiscateAttr(&$attr, $key) {
  477. if (!isset($attr[$key])) return null;
  478. $value = $attr[$key];
  479. unset($attr[$key]);
  480. return $value;
  481. }
  482. }
  483. /**
  484. * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects
  485. */
  486. class HTMLPurifier_AttrTypes
  487. {
  488. /**
  489. * Lookup array of attribute string identifiers to concrete implementations
  490. */
  491. protected $info = array();
  492. /**
  493. * Constructs the info array, supplying default implementations for attribute
  494. * types.
  495. */
  496. public function __construct() {
  497. // pseudo-types, must be instantiated via shorthand
  498. $this->info['Enum'] = new HTMLPurifier_AttrDef_Enum();
  499. $this->info['Bool'] = new HTMLPurifier_AttrDef_HTML_Bool();
  500. $this->info['CDATA'] = new HTMLPurifier_AttrDef_Text();
  501. $this->info['ID'] = new HTMLPurifier_AttrDef_HTML_ID();
  502. $this->info['Length'] = new HTMLPurifier_AttrDef_HTML_Length();
  503. $this->info['MultiLength'] = new HTMLPurifier_AttrDef_HTML_MultiLength();
  504. $this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_HTML_Nmtokens();
  505. $this->info['Pixels'] = new HTMLPurifier_AttrDef_HTML_Pixels();
  506. $this->info['Text'] = new HTMLPurifier_AttrDef_Text();
  507. $this->info['URI'] = new HTMLPurifier_AttrDef_URI();
  508. $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang();
  509. $this->info['Color'] = new HTMLPurifier_AttrDef_HTML_Color();
  510. // unimplemented aliases
  511. $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text();
  512. $this->info['ContentTypes'] = new HTMLPurifier_AttrDef_Text();
  513. $this->info['Charsets'] = new HTMLPurifier_AttrDef_Text();
  514. $this->info['Character'] = new HTMLPurifier_AttrDef_Text();
  515. // "proprietary" types
  516. $this->info['Class'] = new HTMLPurifier_AttrDef_HTML_Class();
  517. // number is really a positive integer (one or more digits)
  518. // FIXME: ^^ not always, see start and value of list items
  519. $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true);
  520. }
  521. /**
  522. * Retrieves a type
  523. * @param $type String type name
  524. * @return Object AttrDef for type
  525. */
  526. public function get($type) {
  527. // determine if there is any extra info tacked on
  528. if (strpos($type, '#') !== false) list($type, $string) = explode('#', $type, 2);
  529. else $string = '';
  530. if (!isset($this->info[$type])) {
  531. trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
  532. return;
  533. }
  534. return $this->info[$type]->make($string);
  535. }
  536. /**
  537. * Sets a new implementation for a type
  538. * @param $type String type name
  539. * @param $impl Object AttrDef for type
  540. */
  541. public function set($type, $impl) {
  542. $this->info[$type] = $impl;
  543. }
  544. }
  545. /**
  546. * Validates the attributes of a token. Doesn't manage required attributes
  547. * very well. The only reason we factored this out was because RemoveForeignElements
  548. * also needed it besides ValidateAttributes.
  549. */
  550. class HTMLPurifier_AttrValidator
  551. {
  552. /**
  553. * Validates the attributes of a token, returning a modified token
  554. * that has valid tokens
  555. * @param $token Reference to token to validate. We require a reference
  556. * because the operation this class performs on the token are
  557. * not atomic, so the context CurrentToken to be updated
  558. * throughout
  559. * @param $config Instance of HTMLPurifier_Config
  560. * @param $context Instance of HTMLPurifier_Context
  561. */
  562. public function validateToken(&$token, &$config, $context) {
  563. $definition = $config->getHTMLDefinition();
  564. $e =& $context->get('ErrorCollector', true);
  565. // initialize IDAccumulator if necessary
  566. $ok =& $context->get('IDAccumulator', true);
  567. if (!$ok) {
  568. $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
  569. $context->register('IDAccumulator', $id_accumulator);
  570. }
  571. // initialize CurrentToken if necessary
  572. $current_token =& $context->get('CurrentToken', true);
  573. if (!$current_token) $context->register('CurrentToken', $token);
  574. if (
  575. !$token instanceof HTMLPurifier_Token_Start &&
  576. !$token instanceof HTMLPurifier_Token_Empty
  577. ) return $token;
  578. // create alias to global definition array, see also $defs
  579. // DEFINITION CALL
  580. $d_defs = $definition->info_global_attr;
  581. // don't update token until the very end, to ensure an atomic update
  582. $attr = $token->attr;
  583. // do global transformations (pre)
  584. // nothing currently utilizes this
  585. foreach ($definition->info_attr_transform_pre as $transform) {
  586. $attr = $transform->transform($o = $attr, $config, $context);
  587. if ($e) {
  588. if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
  589. }
  590. }
  591. // do local transformations only applicable to this element (pre)
  592. // ex. <p align="right"> to <p style="text-align:right;">
  593. foreach ($definition->info[$token->name]->attr_transform_pre as $transform) {
  594. $attr = $transform->transform($o = $attr, $config, $context);
  595. if ($e) {
  596. if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
  597. }
  598. }
  599. // create alias to this element's attribute definition array, see
  600. // also $d_defs (global attribute definition array)
  601. // DEFINITION CALL
  602. $defs = $definition->info[$token->name]->attr;
  603. $attr_key = false;
  604. $context->register('CurrentAttr', $attr_key);
  605. // iterate through all the attribute keypairs
  606. // Watch out for name collisions: $key has previously been used
  607. foreach ($attr as $attr_key => $value) {
  608. // call the definition
  609. if ( isset($defs[$attr_key]) ) {
  610. // there is a local definition defined
  611. if ($defs[$attr_key] === false) {
  612. // We've explicitly been told not to allow this element.
  613. // This is usually when there's a global definition
  614. // that must be overridden.
  615. // Theoretically speaking, we could have a
  616. // AttrDef_DenyAll, but this is faster!
  617. $result = false;
  618. } else {
  619. // validate according to the element's definition
  620. $result = $defs[$attr_key]->validate(
  621. $value, $config, $context
  622. );
  623. }
  624. } elseif ( isset($d_defs[$attr_key]) ) {
  625. // there is a global definition defined, validate according
  626. // to the global definition
  627. $result = $d_defs[$attr_key]->validate(
  628. $value, $config, $context
  629. );
  630. } else {
  631. // system never heard of the attribute? DELETE!
  632. $result = false;
  633. }
  634. // put the results into effect
  635. if ($result === false || $result === null) {
  636. // this is a generic error message that should replaced
  637. // with more specific ones when possible
  638. if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed');
  639. // remove the attribute
  640. unset($attr[$attr_key]);
  641. } elseif (is_string($result)) {
  642. // generally, if a substitution is happening, there
  643. // was some sort of implicit correction going on. We'll
  644. // delegate it to the attribute classes to say exactly what.
  645. // simple substitution
  646. $attr[$attr_key] = $result;
  647. } else {
  648. // nothing happens
  649. }
  650. // we'd also want slightly more complicated substitution
  651. // involving an array as the return value,
  652. // although we're not sure how colliding attributes would
  653. // resolve (certain ones would be completely overriden,
  654. // others would prepend themselves).
  655. }
  656. $context->destroy('CurrentAttr');
  657. // post transforms
  658. // global (error reporting untested)
  659. foreach ($definition->info_attr_transform_post as $transform) {
  660. $attr = $transform->transform($o = $attr, $config, $context);
  661. if ($e) {
  662. if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
  663. }
  664. }
  665. // local (error reporting untested)
  666. foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
  667. $attr = $transform->transform($o = $attr, $config, $context);
  668. if ($e) {
  669. if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
  670. }
  671. }
  672. $token->attr = $attr;
  673. // destroy CurrentToken if we made it ourselves
  674. if (!$current_token) $context->destroy('CurrentToken');
  675. }
  676. }
  677. // constants are slow, so we use as few as possible
  678. if (!defined('HTMLPURIFIER_PREFIX')) {
  679. define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
  680. set_include_path(HTMLPURIFIER_PREFIX . PATH_SEPARATOR . get_include_path());
  681. }
  682. // accomodations for versions earlier than 5.0.2
  683. // borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
  684. if (!defined('PHP_EOL')) {
  685. switch (strtoupper(substr(PHP_OS, 0, 3))) {
  686. case 'WIN':
  687. define('PHP_EOL', "\r\n");
  688. break;
  689. case 'DAR':
  690. define('PHP_EOL', "\r");
  691. break;
  692. default:
  693. define('PHP_EOL', "\n");
  694. }
  695. }
  696. /**
  697. * Bootstrap class that contains meta-functionality for HTML Purifier such as
  698. * the autoload function.
  699. *
  700. * @note
  701. * This class may be used without any other files from HTML Purifier.
  702. */
  703. class HTMLPurifier_Bootstrap
  704. {
  705. /**
  706. * Autoload function for HTML Purifier
  707. * @param $class Class to load
  708. */
  709. public static function autoload($class) {
  710. $file = HTMLPurifier_Bootstrap::getPath($class);
  711. if (!$file) return false;
  712. // Technically speaking, it should be ok and more efficient to
  713. // just do 'require', but Antonio Parraga reports that with
  714. // Zend extensions such as Zend debugger and APC, this invariant
  715. // may be broken. Since we have efficient alternatives, pay
  716. // the cost here and avoid the bug.
  717. require_once HTMLPURIFIER_PREFIX . '/' . $file;
  718. return true;
  719. }
  720. /**
  721. * Returns the path for a specific class.
  722. */
  723. public static function getPath($class) {
  724. if (strncmp('HTMLPurifier', $class, 12) !== 0) return false;
  725. // Custom implementations
  726. if (strncmp('HTMLPurifier_Language_', $class, 22) === 0) {
  727. $code = str_replace('_', '-', substr($class, 22));
  728. $file = 'HTMLPurifier/Language/classes/' . $code . '.php';
  729. } else {
  730. $file = str_replace('_', '/', $class) . '.php';
  731. }
  732. if (!file_exists(HTMLPURIFIER_PREFIX . '/' . $file)) return false;
  733. return $file;
  734. }
  /**
   * Puts HTMLPurifier_Bootstrap::autoload at the front of the SPL autoload
   * stack.
   *
   * When no SPL autoloaders exist yet, ours is simply registered. Otherwise
   * every existing autoloader is unregistered, ours is registered, and the
   * previous ones are re-registered after it in their original order.
   * Nothing happens if spl_autoload_unregister() is unavailable.
   *
   * @throws Exception On PHP versions older than 5.2.11 when an existing
   *         array-style autoloader refers to a non-static method.
   */
  public static function registerAutoload() {
      $ours = array('HTMLPurifier_Bootstrap', 'autoload');
      $existing = spl_autoload_functions();

      if ($existing === false) {
          // No SPL stack yet: plain registration is sufficient.
          spl_autoload_register($ours);
          return;
      }
      if (!function_exists('spl_autoload_unregister')) {
          return;
      }

      $hasBug44144 = version_compare(PHP_VERSION, '5.2.11', '<');
      $needsStringCallback = version_compare(PHP_VERSION, '5.1.0', '>=')
          && version_compare(PHP_VERSION, '5.1.2', '<=');

      foreach ($existing as $callback) {
          if ($hasBug44144 && is_array($callback)) {
              // :TRICKY: There are some compatibility issues and some
              // places where we need to error out
              $method = new ReflectionMethod($callback[0], $callback[1]);
              if (!$method->isStatic()) {
                  // The message text below is reproduced exactly as found.
                  throw new Exception('
HTML Purifier autoloader registrar is not compatible
with non-static object methods due to PHP Bug #44144;
Please do not use HTMLPurifier.autoload.php (or any
file that includes this file); instead, place the code:
spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
after your own autoloaders.
');
              }
              // Surprisingly, spl_autoload_register supports the
              // Class::staticMethod callback format, although
              // call_user_func doesn't
              if ($needsStringCallback) {
                  $callback = implode('::', $callback);
              }
          }
          spl_autoload_unregister($callback);
      }

      // Ours goes in first, then the previous autoloaders in original order.
      spl_autoload_register($ours);
      foreach ($existing as $callback) {
          spl_autoload_register($callback);
      }
  }
  771. }
  /**
   * Base class for definition datatype objects; provides the run-once
   * setup() wrapper that concrete definitions rely on.
   */
  abstract class HTMLPurifier_Definition
  {

      /**
       * Flag recording whether setup() has already run.
       */
      public $setup = false;

      /**
       * If true, write out the final definition object to the cache after
       * setup. This will be true only if all invocations to get a raw
       * definition object are also optimized. This does not cause file
       * system thrashing because on subsequent calls the cached object
       * is used and any writes to the raw definition object are short
       * circuited. See enduser-customize.html for the high-level
       * picture.
       */
      public $optimized = null;

      /**
       * Kind of definition this object represents.
       */
      public $type;

      /**
       * Builds the definition into its final form; this work is deliberately
       * not done by the constructor.
       * @param $config HTMLPurifier_Config instance
       */
      abstract protected function doSetup($config);

      /**
       * Runs doSetup() the first time it is called and does nothing on any
       * later call.
       * @param $config HTMLPurifier_Config instance
       */
      public function setup($config) {
          if (!$this->setup) {
              // Flip the flag before delegating, as the original did.
              $this->setup = true;
              $this->doSetup($config);
          }
      }

  }
  /**
   * Catalogue of the CSS properties that are accepted, each mapped to the
   * attribute definition object used for its value.
   * @see HTMLPurifier_HTMLDefinition
   */
  class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
  {

      public $type = 'CSS';

      /**
       * Lookup of CSS property name => definition object.
       */
      public $info = array();

      /**
       * Populates $info with the standard property set, adds the optional
       * groups enabled through configuration, wraps every entry in the
       * !important decorator and finally applies the allowed/forbidden
       * property filters.
       * @param $config HTMLPurifier_Config instance
       */
      protected function doSetup($config) {

          $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
              array('left', 'right', 'center', 'justify'),
              false
          );

          // One shared enum serves all four sides; the shorthand reuses it.
          $borderStyle =
          $this->info['border-bottom-style'] =
          $this->info['border-right-style'] =
          $this->info['border-left-style'] =
          $this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum(
              array(
                  'none', 'hidden', 'dotted', 'dashed', 'solid', 'double',
                  'groove', 'ridge', 'inset', 'outset'
              ),
              false
          );
          $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($borderStyle);

          $this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
              array('none', 'left', 'right', 'both'),
              false
          );
          $this->info['float'] = new HTMLPurifier_AttrDef_Enum(
              array('none', 'left', 'right'),
              false
          );
          $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
              array('normal', 'italic', 'oblique'),
              false
          );
          $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
              array('normal', 'small-caps'),
              false
          );

          // Either the keyword "none" or a CSS URI; shared by the image
          // properties below.
          $noneOrUri = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(array('none')),
                  new HTMLPurifier_AttrDef_CSS_URI()
              )
          );

          $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
              array('inside', 'outside'),
              false
          );
          $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
              array(
                  'disc', 'circle', 'square', 'decimal', 'lower-roman',
                  'upper-roman', 'lower-alpha', 'upper-alpha', 'none'
              ),
              false
          );
          $this->info['list-style-image'] = $noneOrUri;
          $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);

          $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
              array('capitalize', 'uppercase', 'lowercase', 'none'),
              false
          );
          $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();

          $this->info['background-image'] = $noneOrUri;
          $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
              array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
          );
          $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
              array('scroll', 'fixed')
          );
          $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();

          // "transparent" or a colour; shared by the border sides and the
          // background colour.
          $borderColor =
          $this->info['border-top-color'] =
          $this->info['border-bottom-color'] =
          $this->info['border-left-color'] =
          $this->info['border-right-color'] =
          $this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(array('transparent')),
                  new HTMLPurifier_AttrDef_CSS_Color()
              )
          );

          $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);
          $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($borderColor);

          $borderWidth =
          $this->info['border-top-width'] =
          $this->info['border-bottom-width'] =
          $this->info['border-left-width'] =
          $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
                  new HTMLPurifier_AttrDef_CSS_Length('0') // disallow negative
              )
          );
          $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($borderWidth);

          $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(array('normal')),
                  new HTMLPurifier_AttrDef_CSS_Length()
              )
          );
          $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(array('normal')),
                  new HTMLPurifier_AttrDef_CSS_Length()
              )
          );

          $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(
                      array(
                          'xx-small', 'x-small',
                          'small', 'medium', 'large', 'x-large', 'xx-large',
                          'larger', 'smaller'
                      )
                  ),
                  new HTMLPurifier_AttrDef_CSS_Percentage(),
                  new HTMLPurifier_AttrDef_CSS_Length()
              )
          );

          $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(array('normal')),
                  new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
                  new HTMLPurifier_AttrDef_CSS_Length('0'),
                  new HTMLPurifier_AttrDef_CSS_Percentage(true)
              )
          );

          $marginValue =
          $this->info['margin-top'] =
          $this->info['margin-bottom'] =
          $this->info['margin-left'] =
          $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_CSS_Length(),
                  new HTMLPurifier_AttrDef_CSS_Percentage(),
                  new HTMLPurifier_AttrDef_Enum(array('auto'))
              )
          );
          $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($marginValue);

          // non-negative
          $paddingValue =
          $this->info['padding-top'] =
          $this->info['padding-bottom'] =
          $this->info['padding-left'] =
          $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_CSS_Length('0'),
                  new HTMLPurifier_AttrDef_CSS_Percentage(true)
              )
          );
          $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($paddingValue);

          $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_CSS_Length(),
                  new HTMLPurifier_AttrDef_CSS_Percentage()
              )
          );

          // Width/height definition used when no image cap applies.
          $unboundedSize = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_CSS_Length('0'),
                  new HTMLPurifier_AttrDef_CSS_Percentage(true),
                  new HTMLPurifier_AttrDef_Enum(array('auto'))
              )
          );
          $imgLimit = $config->get('CSS.MaxImgLength');
          if ($imgLimit === null) {
              $sizeDef = $unboundedSize;
          } else {
              $sizeDef = new HTMLPurifier_AttrDef_Switch(
                  'img',
                  // For img tags:
                  new HTMLPurifier_AttrDef_CSS_Composite(
                      array(
                          new HTMLPurifier_AttrDef_CSS_Length('0', $imgLimit),
                          new HTMLPurifier_AttrDef_Enum(array('auto'))
                      )
                  ),
                  // For everyone else:
                  $unboundedSize
              );
          }
          $this->info['width'] =
          $this->info['height'] = $sizeDef;

          $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
          $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();

          // this could use specialized code
          $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
              array(
                  'normal', 'bold', 'bolder', 'lighter', '100', '200', '300',
                  '400', '500', '600', '700', '800', '900'
              ),
              false
          );

          // MUST be called after other font properties, as it references
          // a CSSDefinition object
          $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);

          // same here
          $this->info['border'] =
          $this->info['border-bottom'] =
          $this->info['border-top'] =
          $this->info['border-left'] =
          $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);

          $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(
              array('collapse', 'separate')
          );
          $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(
              array('top', 'bottom')
          );
          $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(
              array('auto', 'fixed')
          );
          $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(
                      array(
                          'baseline', 'sub', 'super',
                          'top', 'text-top', 'middle', 'bottom', 'text-bottom'
                      )
                  ),
                  new HTMLPurifier_AttrDef_CSS_Length(),
                  new HTMLPurifier_AttrDef_CSS_Percentage()
              )
          );
          $this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(
              new HTMLPurifier_AttrDef_CSS_Length(),
              2
          );

          // partial support
          $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap'));

          // Optional property groups, each gated by its own directive.
          if ($config->get('CSS.Proprietary')) {
              $this->doSetupProprietary($config);
          }
          if ($config->get('CSS.AllowTricky')) {
              $this->doSetupTricky($config);
          }
          if ($config->get('CSS.Trusted')) {
              $this->doSetupTrusted($config);
          }

          // wrap all attr-defs with decorator that handles !important
          $importantAllowed = $config->get('CSS.AllowImportant');
          foreach ($this->info as $property => $definition) {
              $this->info[$property] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator(
                  $definition,
                  $importantAllowed
              );
          }

          $this->setupConfigStuff($config);
      }

      /**
       * Registers the properties enabled by the CSS.Proprietary directive.
       * @param $config HTMLPurifier_Config instance (not used here)
       */
      protected function doSetupProprietary($config) {
          // Internet Explorer only scrollbar colors
          $this->info['scrollbar-arrow-color'] = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-base-color'] = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-darkshadow-color'] = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-face-color'] = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-highlight-color'] = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-shadow-color'] = new HTMLPurifier_AttrDef_CSS_Color();

          // technically not proprietary, but CSS3, and no one supports it
          $this->info['opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
          $this->info['-moz-opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
          $this->info['-khtml-opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();

          // only opacity, for now
          $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();
      }

      /**
       * Registers the properties enabled by the CSS.AllowTricky directive.
       * @param $config HTMLPurifier_Config instance (not used here)
       */
      protected function doSetupTricky($config) {
          $this->info['display'] = new HTMLPurifier_AttrDef_Enum(
              array(
                  'inline', 'block', 'list-item', 'run-in', 'compact',
                  'marker', 'table', 'inline-table', 'table-row-group',
                  'table-header-group', 'table-footer-group', 'table-row',
                  'table-column-group', 'table-column', 'table-cell', 'table-caption', 'none'
              )
          );
          $this->info['visibility'] = new HTMLPurifier_AttrDef_Enum(
              array('visible', 'hidden', 'collapse')
          );
          $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum(
              array('visible', 'hidden', 'auto', 'scroll')
          );
      }

      /**
       * Registers the properties enabled by the CSS.Trusted directive.
       * @param $config HTMLPurifier_Config instance (not used here)
       */
      protected function doSetupTrusted($config) {
          $this->info['position'] = new HTMLPurifier_AttrDef_Enum(
              array('static', 'relative', 'absolute', 'fixed')
          );
          $this->info['top'] =
          $this->info['left'] =
          $this->info['right'] =
          $this->info['bottom'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_CSS_Length(),
                  new HTMLPurifier_AttrDef_CSS_Percentage(),
                  new HTMLPurifier_AttrDef_Enum(array('auto'))
              )
          );
          $this->info['z-index'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Integer(),
                  new HTMLPurifier_AttrDef_Enum(array('auto'))
              )
          );
      }

      /**
       * Performs extra config-based processing. Based off of
       * HTMLPurifier_HTMLDefinition.
       *
       * Narrows $info to CSS.AllowedProperties when that directive is set,
       * raising a warning for each requested property that is not defined,
       * and then removes everything named in CSS.ForbiddenProperties.
       * @param $config HTMLPurifier_Config instance
       * @todo Refactor duplicate elements into common class (probably using
       *       composition, not inheritance).
       */
      protected function setupConfigStuff($config) {
          $hint = "(for information on implementing this, see the ".
                  "support forums) ";

          $whitelist = $config->get('CSS.AllowedProperties');
          if ($whitelist !== null) {
              foreach (array_keys($this->info) as $property) {
                  $isListed = isset($whitelist[$property]);
                  // Consume the entry so that only unknown names remain.
                  unset($whitelist[$property]);
                  if (!$isListed) {
                      unset($this->info[$property]);
                  }
              }
              // emit errors
              foreach ($whitelist as $property => $unused) {
                  // :TODO: Is this htmlspecialchars() call really necessary?
                  $property = htmlspecialchars($property);
                  trigger_error("Style attribute '$property' is not supported $hint", E_USER_WARNING);
              }
          }

          $blacklist = $config->get('CSS.ForbiddenProperties');
          if ($blacklist !== null) {
              foreach (array_keys($this->info) as $property) {
                  if (isset($blacklist[$property])) {
                      unset($this->info[$property]);
                  }
              }
          }
      }

  }
  /**
   * Describes which child nodes an element may contain and validates
   * tokens against that description.
   */
  abstract class HTMLPurifier_ChildDef
  {

      /**
       * Type of child definition, usually right-most part of class name
       * lowercase. Used occasionally in terms of context.
       */
      public $type;

      /**
       * Bool that indicates whether or not an empty array of children is
       * okay.
       *
       * This is necessary for redundant checking when changes affecting
       * a child node may cause a parent node to now be disallowed.
       */
      public $allow_empty;

      /**
       * Lookup array of all elements that this definition could possibly
       * allow.
       */
      public $elements = array();

      /**
       * Get lookup of tag names that should not close this element
       * automatically. All other elements will do so.
       *
       * This base implementation returns the $elements lookup unchanged.
       * @param $config HTMLPurifier_Config object (not used here)
       * @return array The $elements lookup
       */
      public function getAllowedElements($config) {
          $lookup = $this->elements;
          return $lookup;
      }

      /**
       * Validates nodes according to definition and returns modification.
       *
       * @param $tokens_of_children Array of HTMLPurifier_Token
       * @param $config HTMLPurifier_Config object
       * @param $context HTMLPurifier_Context object
       * @return bool true to leave nodes as is
       * @return bool false to remove parent node
       * @return array of replacement child tokens
       */
      abstract public function validateChildren($tokens_of_children, $config, $context);

  }
  1116. /**
  1117. * Configuration object that triggers customizable behavior.
  1118. *
  1119. * @warning This class is strongly defined: that means that the class
  1120. * will fail if an undefined directive is retrieved or set.
  1121. *
  1122. * @note Many classes that could (although many times don't) use the
  1123. * configuration object make it a mandatory parameter. This is
  1124. * because a configuration object should always be forwarded,
  1125. * otherwise, you run the risk of missing a parameter and then
  1126. * being stumped when a configuration directive doesn't work.
  1127. *
  1128. * @todo Reconsider some of the public member variables
  1129. */
  1130. class HTMLPurifier_Config
  1131. {
  1132. /**
  1133. * HTML Purifier's version
  1134. */
  1135. public $version = '4.3.0';
  1136. /**
  1137. * Bool indicator whether or not to automatically finalize
  1138. * the object if a read operation is done
  1139. */
  1140. public $autoFinalize = true;
  1141. // protected member variables
  1142. /**
  1143. * Namespace indexed array of serials for specific namespaces (see
  1144. * getSerial() for more info).
  1145. */
  1146. protected $serials = array();
  1147. /**
  1148. * Serial for entire configuration object
  1149. */
  1150. protected $serial;
  1151. /**
  1152. * Parser for variables
  1153. */
  1154. protected $parser;
  1155. /**
  1156. * Reference HTMLPurifier_ConfigSchema for value checking
  1157. * @note This is public for introspective purposes. Please don't
  1158. * abuse!
  1159. */
  1160. public $def;
  1161. /**
  1162. * Indexed array of definitions
  1163. */
  1164. protected $definitions;
  1165. /**
  1166. * Bool indicator whether or not config is finalized
  1167. */
  1168. protected $finalized = false;
  1169. /**
  1170. * Property list containing configuration directives.
  1171. */
  1172. protected $plist;
  1173. /**
  1174. * Whether or not a set is taking place due to an
  1175. * alias lookup.
  1176. */
  1177. private $aliasMode;
  1178. /**
  1179. * Set to false if you do not want line and file numbers in errors
  1180. * (useful when unit testing). This will also compress some errors
  1181. * and exceptions.
  1182. */
  1183. public $chatty = true;
  1184. /**
  1185. * Current lock; only gets to this namespace are allowed.
  1186. */
  1187. private $lock;
  /**
   * @param $definition HTMLPurifier_ConfigSchema that defines what directives
   *        are allowed.
   * @param $parent Optional parent handed to the new
   *        HTMLPurifier_PropertyList; when falsy, the schema's defaultPlist
   *        is used instead.
   */
  public function __construct($definition, $parent = null) {
      if (!$parent) {
          $parent = $definition->defaultPlist;
      }
      $this->plist = new HTMLPurifier_PropertyList($parent);
      // keep a copy around for checking
      $this->def = $definition;
      $this->parser = new HTMLPurifier_VarParser_Flexible();
  }
  /**
   * Convenience constructor that builds a config object from a mixed value.
   * @param mixed $config Variable that defines the state of the config
   *        object. Can be: a HTMLPurifier_Config() object (returned as is),
   *        an array of directives based on loadArray(),
   *        or a string filename of an ini file.
   * @param HTMLPurifier_ConfigSchema Schema object; when omitted the default
   *        configuration is created.
   * @return Configured HTMLPurifier_Config object
   */
  public static function create($config, $schema = null) {
      // pass-through
      if ($config instanceof HTMLPurifier_Config) {
          return $config;
      }

      $instance = $schema
          ? new HTMLPurifier_Config($schema)
          : HTMLPurifier_Config::createDefault();

      if (is_string($config)) {
          $instance->loadIni($config);
      } elseif (is_array($config)) {
          $instance->loadArray($config);
      }
      return $instance;
  }
  /**
   * Creates a new config object that inherits from a previous one.
   * @param HTMLPurifier_Config $config Configuration object to inherit
   *        from; its schema and property list are passed to the new object.
   * @return HTMLPurifier_Config object with $config as its parent.
   */
  public static function inherit(HTMLPurifier_Config $config) {
      $child = new HTMLPurifier_Config($config->def, $config->plist);
      return $child;
  }
  /**
   * Convenience constructor that creates a configuration object from the
   * schema returned by HTMLPurifier_ConfigSchema::instance().
   * @return Default HTMLPurifier_Config object.
   */
  public static function createDefault() {
      return new HTMLPurifier_Config(HTMLPurifier_ConfigSchema::instance());
  }
  /**
   * Retrieves a value from the configuration.
   *
   * Reports an error through triggerError() and returns nothing when the
   * directive is undefined, is an alias, or lies outside the namespace of
   * the currently active lock.
   * @param $key String key in "Namespace.Directive" form
   * @param $a Deprecated: directive name when $key carries only the
   *        namespace
   */
  public function get($key, $a = null) {
      if ($a !== null) {
          $this->triggerError("Using deprecated API: use \$config->get('$key.$a') instead", E_USER_WARNING);
          $key = "$key.$a";
      }
      if (!$this->finalized) {
          $this->autoFinalize();
      }

      $known = isset($this->def->info[$key]);
      if (!$known) {
          // can't add % due to SimpleTest bug
          $this->triggerError(
              'Cannot retrieve value of undefined directive ' . htmlspecialchars($key),
              E_USER_WARNING
          );
          return;
      }

      if (isset($this->def->info[$key]->isAlias)) {
          $alias = $this->def->info[$key];
          $this->triggerError(
              'Cannot get value from aliased directive, use real name ' . $alias->key,
              E_USER_ERROR
          );
          return;
      }

      if ($this->lock) {
          // While a lock is active, only its own namespace may be read.
          list($ns) = explode('.', $key);
          if ($ns !== $this->lock) {
              $this->triggerError('Cannot get value of namespace ' . $ns . ' when lock for ' . $this->lock . ' is active, this probably indicates a Definition setup method is accessing directives that are not within its namespace', E_USER_ERROR);
              return;
          }
      }

      return $this->plist->get($key);
  }
  /**
   * Retrieves an array of directives to values from a given namespace.
   *
   * Reports a warning through triggerError() and returns nothing when the
   * namespace is not present.
   * @param $namespace String namespace
   */
  public function getBatch($namespace) {
      if (!$this->finalized) {
          $this->autoFinalize();
      }
      $everything = $this->getAll();
      if (isset($everything[$namespace])) {
          return $everything[$namespace];
      }
      $this->triggerError(
          'Cannot retrieve undefined namespace ' . htmlspecialchars($namespace),
          E_USER_WARNING
      );
      return;
  }
  /**
   * Returns a md5 signature of a segment of the configuration object
   * that uniquely identifies that particular configuration. The signature
   * is cached per namespace in $serials.
   * @note Revision is handled specially and is removed from the batch
   *       before processing!
   * @param $namespace Namespace to get serial for
   */
  public function getBatchSerial($namespace) {
      if (!empty($this->serials[$namespace])) {
          return $this->serials[$namespace];
      }
      $directives = $this->getBatch($namespace);
      // DefinitionRev must not influence the signature.
      unset($directives['DefinitionRev']);
      $this->serials[$namespace] = md5(serialize($directives));
      return $this->serials[$namespace];
  }
  /**
   * Returns a md5 signature for the entire configuration object
   * that uniquely identifies that particular configuration. The value is
   * computed on first use and cached in $serial.
   */
  public function getSerial() {
      if (!empty($this->serial)) {
          return $this->serial;
      }
      $this->serial = md5(serialize($this->getAll()));
      return $this->serial;
  }
  /**
   * Retrieves all directives, organized by namespace.
   * @warning This is a pretty inefficient function, avoid if you can
   * @return array Two-level array: namespace => directive => value
   */
  public function getAll() {
      if (!$this->finalized) {
          $this->autoFinalize();
      }
      $grouped = array();
      $flat = $this->plist->squash();
      foreach ($flat as $qualified => $value) {
          // Split "Namespace.Directive" at the first dot only.
          list($ns, $directive) = explode('.', $qualified, 2);
          $grouped[$ns][$directive] = $value;
      }
      return $grouped;
  }
  /**
   * Sets a value to configuration.
   *
   * Every failure is reported through triggerError() and leaves the stored
   * configuration untouched: setting after finalization, an undefined
   * directive, an alias that points at another alias, a value of the wrong
   * type, or a value outside the directive's allowed list. Setting through
   * an alias stores the value under the real directive and raises a notice.
   *
   * @param $key String key in "Namespace.Directive" form
   * @param $value Mixed value
   * @param $a Deprecated: value for the legacy call shape
   *        set($namespace, $directive, $value)
   */
  public function set($key, $value, $a = null) {
      if (strpos($key, '.') === false) {
          // Legacy three-argument form: shift the arguments into place.
          $namespace = $key;
          $directive = $value;
          $value = $a;
          $key = "$key.$directive";
          $this->triggerError("Using deprecated API: use \$config->set('$key', ...) instead", E_USER_NOTICE);
      } else {
          list($namespace) = explode('.', $key);
      }
      if ($this->isFinalized('Cannot set directive after finalization')) return;
      if (!isset($this->def->info[$key])) {
          $this->triggerError('Cannot set undefined directive ' . htmlspecialchars($key) . ' to value',
              E_USER_WARNING);
          return;
      }
      $def = $this->def->info[$key];

      if (isset($def->isAlias)) {
          if ($this->aliasMode) {
              // Fixed: the message used to run "with" straight into the key.
              $this->triggerError('Double-aliases not allowed, please fix '.
                  'ConfigSchema bug with ' . $key, E_USER_ERROR);
              return;
          }
          // aliasMode marks the nested call so an alias chain is detected.
          $this->aliasMode = true;
          $this->set($def->key, $value);
          $this->aliasMode = false;
          $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE);
          return;
      }

      // Raw type might be negative when using the fully optimized form
      // of stdclass, which indicates allow_null == true
      $rtype = is_int($def) ? $def : $def->type;
      if ($rtype < 0) {
          $type = -$rtype;
          $allow_null = true;
      } else {
          $type = $rtype;
          $allow_null = isset($def->allow_null);
      }

      try {
          $value = $this->parser->parse($value, $type, $allow_null);
      } catch (HTMLPurifier_VarParserException $e) {
          $this->triggerError('Value for ' . $key . ' is of invalid type, should be ' . HTMLPurifier_VarParser::getTypeName($type), E_USER_WARNING);
          return;
      }
      if (is_string($value) && is_object($def)) {
          // resolve value alias if defined
          if (isset($def->aliases[$value])) {
              $value = $def->aliases[$value];
          }
          // check to see if the value is allowed
          if (isset($def->allowed) && !isset($def->allowed[$value])) {
              $this->triggerError('Value not supported, valid values are: ' .
                  $this->_listify($def->allowed), E_USER_WARNING);
              return;
          }
      }
      $this->plist->set($key, $value);

      // reset definitions if the directives they depend on changed
      // this is a very costly process, so it's discouraged
      // with finalization
      if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') {
          $this->definitions[$namespace] = null;
      }

      // Drop both cached signatures. getSerial() caches a hash of the whole
      // configuration, so leaving it in place would return a stale value
      // if it had been computed before this call.
      $this->serials[$namespace] = false;
      $this->serial = null;
  }
  /**
   * Convenience function for error reporting: joins the keys of a lookup
   * array into a comma-separated string.
   * @param $lookup Array whose keys are the names to list
   * @return string Keys joined with ", "
   */
  private function _listify($lookup) {
      return implode(', ', array_keys($lookup));
  }
  /**
   * Retrieves object reference to the HTML definition by delegating to
   * getDefinition('HTML', ...).
   * @param $raw Return a copy that has not been setup yet. Must be
   *        called before it's been setup, otherwise won't work.
   * @param $optimized If true, this method may return null, to
   *        indicate that a cached version of the modified
   *        definition object is available and no further edits
   *        are necessary. Consider using
   *        maybeGetRawHTMLDefinition, which is more explicitly
   *        named, instead.
   */
  public function getHTMLDefinition($raw = false, $optimized = false) {
      $definition = $this->getDefinition('HTML', $raw, $optimized);
      return $definition;
  }
  /**
   * Retrieves object reference to the CSS definition by delegating to
   * getDefinition('CSS', ...).
   * @param $raw Return a copy that has not been setup yet. Must be
   *        called before it's been setup, otherwise won't work.
   * @param $optimized If true, this method may return null, to
   *        indicate that a cached version of the modified
   *        definition object is available and no further edits
   *        are necessary. Consider using
   *        maybeGetRawCSSDefinition, which is more explicitly
   *        named, instead.
   */
  public function getCSSDefinition($raw = false, $optimized = false) {
      $definition = $this->getDefinition('CSS', $raw, $optimized);
      return $definition;
  }
  /**
   * Retrieves object reference to the URI definition by delegating to
   * getDefinition('URI', ...).
   * @param $raw Return a copy that has not been setup yet. Must be
   *        called before it's been setup, otherwise won't work.
   * @param $optimized If true, this method may return null, to
   *        indicate that a cached version of the modified
   *        definition object is available and no further edits
   *        are necessary. Consider using
   *        maybeGetRawURIDefinition, which is more explicitly
   *        named, instead.
   */
  public function getURIDefinition($raw = false, $optimized = false) {
      $definition = $this->getDefinition('URI', $raw, $optimized);
      return $definition;
  }
  1443. /**
  1444. * Retrieves a definition
  1445. * @param $type Type of definition: HTML, CSS, etc
  1446. * @param $raw Whether or not definition should be returned raw
  1447. * @param $optimized Only has an effect when $raw is true. Whether
  1448. * or not to return null if the result is already present in
  1449. * the cache. This is off by default for backwards
  1450. * compatibility reasons, but you need to do things this
  1451. * way in order to ensure that caching is done properly.
  1452. * Check out enduser-customize.html for more details.
  1453. * We probably won't ever change this default, as much as the
  1454. * maybe semantics is the "right thing to do."
  1455. */
  1456. public function getDefinition($type, $raw = false, $optimized = false) {
  1457. if ($optimized && !$raw) {
  1458. throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
  1459. }
  1460. if (!$this->finalized) $this->autoFinalize();
  1461. // temporarily suspend locks, so we can handle recursive definition calls
  1462. $lock = $this->lock;
  1463. $this->lock = null;
  1464. $factory = HTMLPurifier_DefinitionCacheFactory::instance();
  1465. $cache = $factory->create($type, $this);
  1466. $this->lock = $lock;
  1467. if (!$raw) {
  1468. // full definition
  1469. // ---------------
  1470. // check if definition is in memory
  1471. if (!empty($this->definitions[$type])) {
  1472. $def = $this->definitions[$type];
  1473. // check if the definition is setup
  1474. if ($def->setup) {
  1475. return $def;
  1476. } else {
  1477. $def->setup($this);
  1478. if ($def->optimized) $cache->add($def, $this);
  1479. return $def;
  1480. }
  1481. }
  1482. // check if definition is in cache
  1483. $def = $cache->get($this);
  1484. if ($def) {
  1485. // definition in cache, save to memory and return it
  1486. $this->definitions[$type] = $def;
  1487. return $def;
  1488. }
  1489. // initialize it
  1490. $def = $this->initDefinition($type);
  1491. // set it up
  1492. $this->lock = $type;
  1493. $def->setup($this);
  1494. $this->lock = null;
  1495. // save in cache
  1496. $cache->add($def, $this);
  1497. // return it
  1498. return $def;
  1499. } else {
  1500. // raw definition
  1501. // --------------
  1502. // check preconditions
  1503. $def = null;
  1504. if ($optimized) {
  1505. if (is_null($this->get($type . '.DefinitionID'))) {
  1506. // fatally error out if definition ID not set
  1507. throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
  1508. }
  1509. }
  1510. if (!empty($this->definitions[$type])) {
  1511. $def = $this->definitions[$type];
  1512. if ($def->setup && !$optimized) {
  1513. $extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : "";
  1514. throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra);
  1515. }
  1516. if ($def->optimized === null) {
  1517. $extra = $this->chatty ? " (try flushing your cache)" : "";
  1518. throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra);
  1519. }
  1520. if ($def->optimized !== $optimized) {
  1521. $msg = $optimized ? "optimized" : "unoptimized";
  1522. $extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : "";
  1523. throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra);
  1524. }
  1525. }
  1526. // check if definition was in memory
  1527. if ($def) {
  1528. if ($def->setup) {
  1529. // invariant: $optimized === true (checked above)
  1530. return null;
  1531. } else {
  1532. return $def;
  1533. }
  1534. }
  1535. // if optimized, check if definition was in cache
  1536. // (because we do the memory check first, this formulation
  1537. // is prone to cache slamming, but I think
  1538. // guaranteeing that either /all/ of the raw
  1539. // setup code or /none/ of it is run is more important.)
  1540. if ($optimized) {
  1541. // This code path only gets run once; once we put
  1542. // something in $definitions (which is guaranteed by the
  1543. // trailing code), we always short-circuit above.
  1544. $def = $cache->get($this);
  1545. if ($def) {
  1546. // save the full definition for later, but don't
  1547. // return it yet
  1548. $this->definitions[$type] = $def;
  1549. return null;
  1550. }
  1551. }
  1552. // check invariants for creation
  1553. if (!$optimized) {
  1554. if (!is_null($this->get($type . '.DefinitionID'))) {
  1555. if ($this->chatty) {
  1556. $this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING);
  1557. } else {
  1558. $this->triggerError("Useless DefinitionID declaration", E_USER_WARNING);
  1559. }
  1560. }
  1561. }
  1562. // initialize it
  1563. $def = $this->initDefinition($type);
  1564. $def->optimized = $optimized;
  1565. return $def;
  1566. }
  1567. throw new HTMLPurifier_Exception("The impossible happened!");
  1568. }
  /**
   * Creates a fresh definition object for the requested type and records
   * it in $this->definitions under that type.
   * @param $type Definition type: 'HTML', 'CSS' or 'URI'
   * @return The newly created definition object
   * @throws HTMLPurifier_Exception when $type is none of the supported types
   */
  private function initDefinition($type) {
      // switch compares loosely, just like a chain of == tests would
      switch ($type) {
          case 'HTML':
              $definition = new HTMLPurifier_HTMLDefinition();
              break;
          case 'CSS':
              $definition = new HTMLPurifier_CSSDefinition();
              break;
          case 'URI':
              $definition = new HTMLPurifier_URIDefinition();
              break;
          default:
              throw new HTMLPurifier_Exception("Definition of $type type not supported");
      }
      // remember the object so later lookups find it in memory
      $this->definitions[$type] = $definition;
      return $definition;
  }
  /**
   * Requests the raw definition of the given type in optimized mode.
   * @param $name Definition type, passed straight through to getDefinition()
   * @return Result of getDefinition(); null signals that a cached version
   *         is available and no raw edits are necessary
   */
  public function maybeGetRawDefinition($name) {
      // NOTE(review): presumably these map onto getDefinition()'s $raw and
      // $optimized arguments, in that order -- confirm against its signature
      $raw = true;
      $optimized = true;
      return $this->getDefinition($name, $raw, $optimized);
  }
  /**
   * Requests the raw HTML definition in optimized mode.
   * @return Result of getDefinition(); null signals that a cached version
   *         is available and no raw edits are necessary
   */
  public function maybeGetRawHTMLDefinition() {
      // NOTE(review): presumably these map onto getDefinition()'s $raw and
      // $optimized arguments, in that order -- confirm against its signature
      $raw = true;
      $optimized = true;
      return $this->getDefinition('HTML', $raw, $optimized);
  }
  /**
   * Requests the raw CSS definition in optimized mode.
   * @return Result of getDefinition(); null signals that a cached version
   *         is available and no raw edits are necessary
   */
  public function maybeGetRawCSSDefinition() {
      // NOTE(review): presumably these map onto getDefinition()'s $raw and
      // $optimized arguments, in that order -- confirm against its signature
      $raw = true;
      $optimized = true;
      return $this->getDefinition('CSS', $raw, $optimized);
  }
  /**
   * Requests the raw URI definition in optimized mode.
   * @return Result of getDefinition(); null signals that a cached version
   *         is available and no raw edits are necessary
   */
  public function maybeGetRawURIDefinition() {
      // NOTE(review): presumably these map onto getDefinition()'s $raw and
      // $optimized arguments, in that order -- confirm against its signature
      $raw = true;
      $optimized = true;
      return $this->getDefinition('URI', $raw, $optimized);
  }
  1595. /**
  1596. * Loads configuration values from an array with the following structure:
  1597. * Namespace.Directive => Value
  1598. * @param $config_array Configuration associative array
  1599. */
  public function loadArray($config_array) {
      // a finalized configuration refuses any further directives
      if ($this->isFinalized('Cannot load directives after finalization')) return;
      foreach ($config_array as $raw_key => $entry) {
          // underscores are accepted as an alternative to the dot separator
          $dotted = str_replace('_', '.', $raw_key);
          if (strpos($dotted, '.') === false) {
              // no separator at all: the key names a namespace and the
              // entry holds its directive => value pairs
              foreach ($entry as $directive => $directive_value) {
                  $this->set($dotted . '.' . $directive, $directive_value);
              }
              continue;
          }
          // fully qualified Namespace.Directive key
          $this->set($dotted, $entry);
      }
  }
  1615. /**
  1616. * Returns a list of array(namespace, directive) for all directives
  1617. * that are allowed in a web-form context as per an allowed
  1618. * namespaces/directives list.
  1619. * @param $allowed List of allowed namespaces/directives
  1620. */
  public static function getAllowedDirectivesForForm($allowed, $schema = null) {
      if (!$schema) {
          $schema = HTMLPurifier_ConfigSchema::instance();
      }
      // true means "no restriction"; anything else is a filter list
      $unrestricted = ($allowed === true);
      $namespaces = array();
      $whitelist  = array();
      $blacklist  = array();
      if (!$unrestricted) {
          if (is_string($allowed)) $allowed = array($allowed);
          foreach ($allowed as $item) {
              if (strpos($item, '.') === false) {
                  // no dot: the entry enables a whole namespace
                  $namespaces[$item] = true;
              } elseif ($item[0] == '-') {
                  // leading minus: the directive is explicitly excluded
                  $blacklist[substr($item, 1)] = true;
              } else {
                  $whitelist[$item] = true;
              }
          }
      }
      $result = array();
      foreach ($schema->info as $key => $def) {
          list($ns, $directive) = explode('.', $key, 2);
          if (!$unrestricted) {
              $full_name = "$ns.$directive";
              if (isset($blacklist[$full_name])) continue;
              if (!isset($whitelist[$full_name]) && !isset($namespaces[$ns])) continue;
          }
          // aliases and the definition bookkeeping directives are never listed
          if (isset($def->isAlias)) continue;
          if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') continue;
          $result[] = array($ns, $directive);
      }
      return $result;
  }
  1657. /**
  1658. * Loads configuration values from $_GET/$_POST that were posted
  1659. * via ConfigForm
  1660. * @param $array $_GET or $_POST array to import
  1661. * @param $index Index/name that the config variables are in
  1662. * @param $allowed List of allowed namespaces/directives
  1663. * @param $mq_fix Boolean whether or not to enable magic quotes fix
  1664. * @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy
  1665. */
  public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
      // filter the submitted values first, then build a config object from them
      return HTMLPurifier_Config::create(
          HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema),
          $schema
      );
  }
  1671. /**
  1672. * Merges in configuration values from $_GET/$_POST to object. NOT STATIC.
  1673. * @note Same parameters as loadArrayFromForm
  1674. */
  public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true) {
      // $this->def is handed over as the schema the form values are checked against
      $this->loadArray(
          HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $this->def)
      );
  }
  1679. /**
  1680. * Prepares an array from a form into something usable for the more
  1681. * strict parts of HTMLPurifier_Config
  1682. */
  public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
      // narrow the input down to the sub-array that holds the configuration
      if ($index !== false) $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
      // Magic quotes no longer exist as of PHP 5.4 and calling
      // get_magic_quotes_gpc() raises E_DEPRECATED on PHP 7.4, so the
      // function is only consulted on runtimes where it can return true.
      $mq = $mq_fix
          && version_compare(PHP_VERSION, '5.4.0', '<')
          && function_exists('get_magic_quotes_gpc')
          && get_magic_quotes_gpc();
      $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema);
      $ret = array();
      foreach ($allowed as $key) {
          list($ns, $directive) = $key;
          $skey = "$ns.$directive";
          // a non-empty Null_<directive> entry forces the directive to null
          if (!empty($array["Null_$skey"])) {
              $ret[$ns][$directive] = null;
              continue;
          }
          // directives that were not submitted are left out entirely
          if (!isset($array[$skey])) continue;
          $value = $mq ? stripslashes($array[$skey]) : $array[$skey];
          $ret[$ns][$directive] = $value;
      }
      return $ret;
  }
  1701. /**
  1702. * Loads configuration values from an ini file
  1703. * @param $filename Name of ini file
  1704. */
  public function loadIni($filename) {
      if ($this->isFinalized('Cannot load directives after finalization')) return;
      // second argument true: sections are returned as nested arrays
      $array = parse_ini_file($filename, true);
      if ($array === false) {
          // parse_ini_file() returns false on failure; handing that to
          // loadArray() would only add a misleading foreach warning
          return;
      }
      $this->loadArray($array);
  }
  1710. /**
  1711. * Checks whether or not the configuration object is finalized.
  1712. * @param $error String error message, or false for no error
  1713. */
  public function isFinalized($error = false) {
      $locked = $this->finalized;
      // complain only when the caller supplied a message to complain with
      if ($locked) {
          if ($error) {
              $this->triggerError($error, E_USER_ERROR);
          }
      }
      return $this->finalized;
  }
  1720. /**
  1721. * Finalizes configuration only if auto finalize is on and not
  1722. * already finalized
  1723. */
  public function autoFinalize() {
      if (!$this->autoFinalize) {
          // automatic finalization is off: only squash the property list
          $this->plist->squash(true);
          return;
      }
      $this->finalize();
  }
  1731. /**
  1732. * Finalizes a configuration object, prohibiting further change
  1733. */
  public function finalize() {
      // drop the parser property and flag the object as finalized
      unset($this->parser);
      $this->finalized = true;
  }
  1738. /**
  1739. * Produces a nicely formatted error message by supplying the
  1740. * stack frame information OUTSIDE of HTMLPurifier_Config.
  1741. */
  protected function triggerError($msg, $no) {
      // Location suffix; stays empty unless chatty mode is on and a caller
      // outside of HTMLPurifier_Config can be identified.
      $extra = '';
      if ($this->chatty) {
          $trace = debug_backtrace();
          // Frame $i records where its function was called from, and frame
          // $i + 1 describes the function that made that call. The wanted
          // frame is the first one whose caller is not a method of
          // HTMLPurifier_Config. The last frame is included on purpose:
          // it has no caller frame when the call came from top-level code.
          for ($i = 0, $c = count($trace); $i < $c; $i++) {
              // 'class' is absent for plain functions, so guard the lookup
              $called_from_config = isset($trace[$i + 1]['class'])
                  && $trace[$i + 1]['class'] === 'HTMLPurifier_Config';
              if ($called_from_config) {
                  continue;
              }
              $frame = $trace[$i];
              // 'line' and 'file' can be missing from a backtrace frame
              if (isset($frame['line'], $frame['file'])) {
                  $extra = " invoked on line {$frame['line']} in file {$frame['file']}";
              }
              break;
          }
      }
      trigger_error($msg . $extra, $no);
  }
  1759. /**
  1760. * Returns a serialized form of the configuration object that can
  1761. * be reconstituted.
  1762. */
  public function serialize() {
      // request every definition once before the object is serialized
      foreach (array('HTML', 'CSS', 'URI') as $type) {
          $this->getDefinition($type);
      }
      return serialize($this);
  }
  1769. }
  1770. /**
  1771. * Configuration definition, defines directives and their defaults.
  1772. */
  class HTMLPurifier_ConfigSchema {

      /**
       * Default values of the directives, indexed by directive key.
       * @note This shares the exact same structure as HTMLPurifier_Config::$conf
       */
      public $defaults = array();

      /**
       * The default property list. Do not edit this property list.
       */
      public $defaultPlist;

      /**
       * Definition of the directives, indexed by the same directive key
       * that is passed to add() and addAlias():
       *
       *  array(
       *      'Namespace.Directive' => new stdclass(),
       *  )
       *
       * The stdclass may have the following properties:
       *
       *  - If isAlias isn't set:
       *      - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
       *      - allow_null: If set, this directive allows null values
       *      - aliases: If set, an associative array of value aliases to real values
       *      - allowed: If set, a lookup array of allowed (string) values
       *  - If isAlias is set:
       *      - key: Directive key this directive aliases to
       *
       * In certain degenerate cases, stdclass will actually be an integer. In
       * that case, the value is equivalent to an stdclass with the type
       * property set to the integer. If the integer is negative, type is
       * equal to the absolute value of integer, and allow_null is true.
       *
       * This class is friendly with HTMLPurifier_Config. If you need introspection
       * about the schema, you're better off using the ConfigSchema_Interchange,
       * which uses more memory but has much richer information.
       */
      public $info = array();

      /**
       * Application-wide singleton
       */
      static protected $singleton;

      public function __construct() {
          $this->defaultPlist = new HTMLPurifier_PropertyList();
      }

      /**
       * Unserializes the default ConfigSchema from the schema.ser file
       * below HTMLPURIFIER_PREFIX.
       * @return The unserialized value; a fatal user error is triggered
       *         when unserialization yields a falsy result
       */
      public static function makeFromSerial() {
          $contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser');
          $r = unserialize($contents);
          if (!$r) {
              // the hash helps to tell a corrupted file from a wrong one
              $hash = sha1($contents);
              trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR);
          }
          return $r;
      }

      /**
       * Retrieves an instance of the application-wide configuration definition.
       * @param $prototype Object to install as the new singleton, true to
       *        force a rebuild from the serialized schema, or null to
       *        reuse the current singleton (building it on first use)
       * @return The current singleton
       */
      public static function instance($prototype = null) {
          if ($prototype === true) {
              // Must be tested before the generic "not null" case,
              // otherwise the boolean itself would become the singleton.
              HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
          } elseif ($prototype !== null) {
              HTMLPurifier_ConfigSchema::$singleton = $prototype;
          } elseif (HTMLPurifier_ConfigSchema::$singleton === null) {
              HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
          }
          return HTMLPurifier_ConfigSchema::$singleton;
      }

      /**
       * Defines a directive for configuration
       * @warning This method's signature is slightly different from the legacy
       *          define() static method! Beware!
       * @param $key Key of the directive
       * @param $default Default value of directive
       * @param $type Allowed type of the directive: either the integer type
       *        itself or a name that is looked up in
       *        HTMLPurifier_VarParser::$types
       * @param $allow_null Whether or not to allow null values
       */
      public function add($key, $default, $type, $allow_null) {
          $obj = new stdclass();
          $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
          // the property is only present when null is allowed
          if ($allow_null) $obj->allow_null = true;
          $this->info[$key] = $obj;
          $this->defaults[$key] = $default;
          $this->defaultPlist->set($key, $default);
      }

      /**
       * Defines a directive value alias.
       *
       * Directive value aliases are convenient for developers because it lets
       * them set a directive to several values and get the same result.
       * @param $key Key of the directive
       * @param $aliases Hash of aliased values to the real alias
       */
      public function addValueAliases($key, $aliases) {
          if (!isset($this->info[$key]->aliases)) {
              $this->info[$key]->aliases = array();
          }
          foreach ($aliases as $alias => $real) {
              $this->info[$key]->aliases[$alias] = $real;
          }
      }

      /**
       * Defines a set of allowed values for a directive.
       * @warning This is slightly different from the corresponding static
       *          method definition.
       * @param $key Key of the directive
       * @param $allowed Lookup array of allowed values
       */
      public function addAllowedValues($key, $allowed) {
          $this->info[$key]->allowed = $allowed;
      }

      /**
       * Defines a directive alias for backwards compatibility
       * @param $key Directive key that will be aliased
       * @param $new_key Directive key that the alias will point to
       */
      public function addAlias($key, $new_key) {
          $obj = new stdclass;
          $obj->key = $new_key;
          $obj->isAlias = true;
          $this->info[$key] = $obj;
      }

      /**
       * Replaces any stdclass that only has the type property with type integer.
       * An stdclass holding exactly type and allow_null becomes the
       * negated type integer.
       */
      public function postProcess() {
          foreach ($this->info as $key => $v) {
              if (count((array) $v) == 1) {
                  $this->info[$key] = $v->type;
              } elseif (count((array) $v) == 2 && isset($v->allow_null)) {
                  $this->info[$key] = -$v->type;
              }
          }
      }

  }
  1917. /**
  1918. * @todo Unit test
  1919. */
  class HTMLPurifier_ContentSets
  {

      /**
       * List of content set strings (pipe separators) indexed by name.
       */
      public $info = array();

      /**
       * List of content set lookups (element => true) indexed by name.
       * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
       */
      public $lookup = array();

      /**
       * Synchronized list of defined content sets (keys of info)
       */
      protected $keys = array();

      /**
       * Synchronized list of defined content values (values of info)
       */
      protected $values = array();

      /**
       * Merges in module's content sets, expands identifiers in the content
       * sets and populates the keys, values and lookup member variables.
       * @param $modules List of HTMLPurifier_HTMLModule
       */
      public function __construct($modules) {
          if (!is_array($modules)) $modules = array($modules);
          // populate content_sets based on module hints
          // sorry, no way of overloading
          foreach ($modules as $module) {
              foreach ($module->content_sets as $key => $value) {
                  $temp = $this->convertToLookup($value);
                  if (isset($this->lookup[$key])) {
                      // add it into the existing content set
                      $this->lookup[$key] = array_merge($this->lookup[$key], $temp);
                  } else {
                      $this->lookup[$key] = $temp;
                  }
              }
          }
          // replace references to other content sets by the members of
          // those sets; repeat until a full pass changes nothing
          $old_lookup = false;
          while ($old_lookup !== $this->lookup) {
              $old_lookup = $this->lookup;
              foreach ($this->lookup as $i => $set) {
                  $add = array();
                  foreach ($set as $element => $x) {
                      if (isset($this->lookup[$element])) {
                          $add += $this->lookup[$element];
                          unset($this->lookup[$i][$element]);
                      }
                  }
                  $this->lookup[$i] += $add;
              }
          }

          foreach ($this->lookup as $key => $lookup) {
              $this->info[$key] = implode(' | ', array_keys($lookup));
          }
          $this->keys   = array_keys($this->info);
          $this->values = array_values($this->info);
      }

      /**
       * Accepts a definition; generates and assigns a ChildDef for it
       * @param $def HTMLPurifier_ElementDef reference
       * @param $module Module that defined the ElementDef
       */
      public function generateChildDef(&$def, $module) {
          if (!empty($def->child)) return; // already done!
          $content_model = $def->content_model;
          // Without any content set the alternation would be empty and the
          // pattern would match the empty string at every word boundary,
          // sending '' to the callback; nothing can be expanded then anyway.
          if (is_string($content_model) && $this->keys) {
              // quote the names so they are always matched literally
              $names = array();
              foreach ($this->keys as $key) {
                  $names[] = preg_quote($key, '/');
              }
              $def->content_model = preg_replace_callback(
                  '/\b(' . implode('|', $names) . ')\b/',
                  array($this, 'generateChildDefCallback'),
                  $content_model
              );
          }
          $def->child = $this->getChildDef($def, $module);
      }

      /**
       * preg_replace_callback() handler of generateChildDef().
       * @param $matches Match array; element 0 is the content set name found
       * @return Expanded content set string for that name
       */
      public function generateChildDefCallback($matches) {
          return $this->info[$matches[0]];
      }

      /**
       * Instantiates a ChildDef based on content_model and content_model_type
       * member variables in HTMLPurifier_ElementDef
       * @note This will also defer to modules for custom HTMLPurifier_ChildDef
       *       subclasses that need content set expansion
       * @param $def HTMLPurifier_ElementDef to have ChildDef extracted
       * @param $module Module that is asked when the content model type is
       *        not one of the built-in ones
       * @return HTMLPurifier_ChildDef corresponding to ElementDef, or false
       *         (after a fatal user error) when none could be determined
       */
      public function getChildDef($def, $module) {
          $value = $def->content_model;
          if (is_object($value)) {
              trigger_error(
                  'Literal object child definitions should be stored in '.
                  'ElementDef->child not ElementDef->content_model',
                  E_USER_NOTICE
              );
              return $value;
          }
          switch ($def->content_model_type) {
              case 'required':
                  return new HTMLPurifier_ChildDef_Required($value);
              case 'optional':
                  return new HTMLPurifier_ChildDef_Optional($value);
              case 'empty':
                  return new HTMLPurifier_ChildDef_Empty();
              case 'custom':
                  return new HTMLPurifier_ChildDef_Custom($value);
          }
          // defer to its module
          $return = false;
          if ($module->defines_child_def) { // save a func call
              $return = $module->getChildDef($def);
          }
          if ($return !== false) return $return;
          // error-out
          trigger_error(
              'Could not determine which ChildDef class to instantiate',
              E_USER_ERROR
          );
          return false;
      }

      /**
       * Converts a string list of elements separated by pipes into
       * a lookup array. Spaces in the string are ignored.
       * @param $string List of elements
       * @return Lookup array of elements
       */
      protected function convertToLookup($string) {
          $array = explode('|', str_replace(' ', '', $string));
          $ret = array();
          foreach ($array as $k) {
              $ret[$k] = true;
          }
          return $ret;
      }

  }
  2058. /**
  2059. * Registry object that contains information about the current context.
  2060. * @warning Is a bit buggy when variables are set to null: it thinks
  2061. * they don't exist! So use false instead, please.
  2062. * @note Since the variables Context deals with may not be objects,
  2063. * references are very important here! Do not remove!
  2064. */
  class HTMLPurifier_Context
  {

      /**
       * Name => reference map of everything registered so far.
       */
      private $_storage = array();

      /**
       * Stores a reference to a variable under the given name.
       * A name that is already taken triggers a fatal user error and the
       * existing entry is left untouched.
       * @param $name String name
       * @param $ref Reference to variable to be registered
       */
      public function register($name, &$ref) {
          if (!isset($this->_storage[$name])) {
              $this->_storage[$name] =& $ref;
              return;
          }
          trigger_error(
              "Name $name produces collision, cannot re-register",
              E_USER_ERROR
          );
      }

      /**
       * Hands back the reference stored under a name.
       * @param $name String name
       * @param $ignore_error Boolean, true suppresses the fatal user error
       *        that is otherwise triggered for unknown names
       * @return Reference to the stored variable, or to a null variable
       *         when nothing is stored under $name
       */
      public function &get($name, $ignore_error = false) {
          if (isset($this->_storage[$name])) {
              return $this->_storage[$name];
          }
          if (!$ignore_error) {
              trigger_error(
                  "Attempted to retrieve non-existent variable $name",
                  E_USER_ERROR
              );
          }
          // returning by reference needs an actual variable to point at
          $nothing = null;
          return $nothing;
      }

      /**
       * Removes a name from the context.
       * Unknown names trigger a fatal user error.
       * @param $name String name
       */
      public function destroy($name) {
          if (isset($this->_storage[$name])) {
              unset($this->_storage[$name]);
              return;
          }
          trigger_error(
              "Attempted to destroy non-existent variable $name",
              E_USER_ERROR
          );
      }

      /**
       * Tells whether something non-null is stored under a name.
       * @param $name String name
       */
      public function exists($name) {
          return isset($this->_storage[$name]);
      }

      /**
       * Registers every entry of an associative array under its key.
       * @param $context_array Assoc array of variables to load
       */
      public function loadArray($context_array) {
          foreach (array_keys($context_array) as $name) {
              // register() takes a reference, so pass the array element itself
              $this->register($name, $context_array[$name]);
          }
      }

  }
  2129. /**
  2130. * Abstract class representing Definition cache managers that implements
  2131. * useful common methods and is a factory.
  2132. * @todo Create a separate maintenance file advanced users can use to
  2133. * cache their custom HTMLDefinition, which can be loaded
  2134. * via a configuration directive
  2135. * @todo Implement memcached
  2136. */
  abstract class HTMLPurifier_DefinitionCache
  {

      /**
       * Type of definition objects this cache instance handles.
       */
      public $type;

      /**
       * @param $type Type of definition objects this instance of the
       *      cache will handle.
       */
      public function __construct($type) {
          $this->type = $type;
      }

      /**
       * Builds the cache key for a configuration: library version, batch
       * serial and definition revision, joined by commas.
       * @param $config Instance of HTMLPurifier_Config
       */
      public function generateKey($config) {
          $parts = array(
              $config->version,
              $config->getBatchSerial($this->type),
              $config->get($this->type . '.DefinitionRev'),
          );
          return implode(',', $parts);
      }

      /**
       * Tests whether or not a key is old with respect to the configuration's
       * version and revision number.
       * @param $key Key to test
       * @param $config Instance of HTMLPurifier_Config to test against
       * @return Boolean true if the key counts as old
       */
      public function isOld($key, $config) {
          // a well-formed key has at least three comma separated parts
          $parts = explode(',', $key, 3);
          if (count($parts) < 3) return true;
          list($version, $hash, $revision) = $parts;
          // any version mismatch makes the key old
          if (version_compare($version, $config->version) != 0) return true;
          // same version: old only if the serial matches and the revision is lower
          $same_serial = ($hash == $config->getBatchSerial($this->type));
          return $same_serial && $revision < $config->get($this->type . '.DefinitionRev');
      }

      /**
       * Checks if a definition's type matches the cache's type
       * @note Triggers an error on failure
       * @param $def Definition object to check
       * @return Boolean true if good, false if not
       */
      public function checkDefType($def) {
          if ($def->type === $this->type) return true;
          trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
          return false;
      }

      /**
       * Adds a definition object to the cache
       */
      abstract public function add($def, $config);

      /**
       * Unconditionally saves a definition object to the cache
       */
      abstract public function set($def, $config);

      /**
       * Replaces an object in the cache
       */
      abstract public function replace($def, $config);

      /**
       * Retrieves a definition object from the cache
       */
      abstract public function get($config);

      /**
       * Removes a definition object from the cache
       */
      abstract public function remove($config);

      /**
       * Clears all objects from cache
       */
      abstract public function flush($config);

      /**
       * Clears all expired (older version or revision) objects from cache
       * @note Be careful when implementing this method as flush. Flush must
       *       not interfere with other Definition types, and cleanup()
       *       should not be repeatedly called by userland code.
       */
      abstract public function cleanup($config);

  }
  2220. /**
  2221. * Responsible for creating definition caches.
  2222. */
  class HTMLPurifier_DefinitionCacheFactory
  {

      /**
       * Cache objects created so far, indexed by implementation name and
       * then by definition type.
       */
      protected $caches = array('Serializer' => array());

      /**
       * Registered implementations: short name => class name.
       */
      protected $implementations = array();

      /**
       * Decorators applied to every newly created cache, indexed by name.
       */
      protected $decorators = array();

      /**
       * Initialize default decorators
       */
      public function setup() {
          $this->addDecorator('Cleanup');
      }

      /**
       * Retrieves an instance of global definition cache factory.
       * @param $prototype Object to install as the new global instance,
       *        true to force creation of a fresh factory, or null to reuse
       *        the current instance (creating it on first use)
       * @return The current global instance
       */
      public static function instance($prototype = null) {
          static $instance;
          if ($prototype === true) {
              // Must be tested before the generic "not null" case,
              // otherwise the boolean itself would become the instance.
              $instance = new HTMLPurifier_DefinitionCacheFactory();
              $instance->setup();
          } elseif ($prototype !== null) {
              $instance = $prototype;
          } elseif ($instance === null) {
              $instance = new HTMLPurifier_DefinitionCacheFactory();
              $instance->setup();
          }
          return $instance;
      }

      /**
       * Registers a new definition cache object
       * @param $short Short name of cache object, for reference
       * @param $long Full class name of cache object, for construction
       */
      public function register($short, $long) {
          $this->implementations[$short] = $long;
      }

      /**
       * Factory method that creates a cache object based on configuration
       * @param $type Type of definitions handled by cache
       * @param $config Instance of HTMLPurifier_Config
       * @return Cache object; a null cache when %Cache.DefinitionImpl is null
       */
      public function create($type, $config) {
          $method = $config->get('Cache.DefinitionImpl');
          if ($method === null) {
              return new HTMLPurifier_DefinitionCache_Null($type);
          }
          // reuse a cache that was already built for this method and type
          if (!empty($this->caches[$method][$type])) {
              return $this->caches[$method][$type];
          }
          if (
              isset($this->implementations[$method]) &&
              class_exists($class = $this->implementations[$method], false)
          ) {
              $cache = new $class($type);
          } else {
              // unknown or unloaded implementation: fall back to Serializer
              if ($method != 'Serializer') {
                  trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
              }
              $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
          }
          foreach ($this->decorators as $decorator) {
              $new_cache = $decorator->decorate($cache);
              // prevent infinite recursion in PHP 4
              unset($cache);
              $cache = $new_cache;
          }
          $this->caches[$method][$type] = $cache;
          return $this->caches[$method][$type];
      }

      /**
       * Registers a decorator to add to all new cache objects
       * @param $decorator Decorator object, or a string that is appended to
       *        "HTMLPurifier_DefinitionCache_Decorator_" to form the name
       *        of the class to instantiate
       */
      public function addDecorator($decorator) {
          if (is_string($decorator)) {
              $class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
              $decorator = new $class;
          }
          $this->decorators[$decorator->name] = $decorator;
      }

  }
  2300. /**
  2301. * Represents a document type, contains information on which modules
  2302. * need to be loaded.
  2303. * @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
  2304. * If structure changes, please update that function.
  2305. */
  class HTMLPurifier_Doctype
  {

      /**
       * Full name of the doctype.
       */
      public $name;

      /**
       * Standard modules used by this doctype, given as string
       * identifiers or literal objects.
       */
      public $modules = array();

      /**
       * Modules used for tidying up code.
       */
      public $tidyModules = array();

      /**
       * Whether the language is derived from XML (i.e. XHTML).
       */
      public $xml = true;

      /**
       * Alias names of this doctype.
       */
      public $aliases = array();

      /**
       * Public DTD identifier.
       */
      public $dtdPublic;

      /**
       * System DTD identifier.
       */
      public $dtdSystem;

      /**
       * Copies every argument into the property it corresponds to.
       * @param $name Full name of the doctype
       * @param $xml Whether the language is derived from XML
       * @param $modules List of standard modules
       * @param $tidyModules List of tidy modules
       * @param $aliases List of aliases
       * @param $dtd_public Public DTD identifier
       * @param $dtd_system System DTD identifier
       */
      public function __construct(
          $name = null,
          $xml = true,
          $modules = array(),
          $tidyModules = array(),
          $aliases = array(),
          $dtd_public = null,
          $dtd_system = null
      ) {
          // identification
          $this->name        = $name;
          $this->aliases     = $aliases;
          $this->dtdPublic   = $dtd_public;
          $this->dtdSystem   = $dtd_system;
          // behaviour
          $this->xml         = $xml;
          $this->modules     = $modules;
          $this->tidyModules = $tidyModules;
      }

  }
  /**
   * Registry of doctype objects, addressable by name or by alias.
   */
  class HTMLPurifier_DoctypeRegistry
  {

      /**
       * Hash of doctype names to doctype objects
       */
      protected $doctypes;

      /**
       * Lookup table of aliases to real doctype names
       */
      protected $aliases;

      /**
       * Registers a doctype to the registry
       * @note Accepts a fully-formed doctype object, or the
       *       parameters for constructing a doctype object
       * @param $doctype      Name of doctype or literal doctype object
       * @param $xml          Whether the doctype is XML-derived
       * @param $modules      Modules doctype will load (a scalar is wrapped
       *                      into a one-element array)
       * @param $tidy_modules Modules used for tidying (same wrapping)
       * @param $aliases      Alias names for doctype (same wrapping)
       * @param $dtd_public   Public DTD identifier
       * @param $dtd_system   System DTD identifier
       * @return Editable registered doctype
       */
      public function register($doctype, $xml = true, $modules = array(),
          $tidy_modules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
      ) {
          $modules      = is_array($modules)      ? $modules      : array($modules);
          $tidy_modules = is_array($tidy_modules) ? $tidy_modules : array($tidy_modules);
          $aliases      = is_array($aliases)      ? $aliases      : array($aliases);

          // anything that is not already an object is treated as a name
          if (!is_object($doctype)) {
              $doctype = new HTMLPurifier_Doctype(
                  $doctype, $xml, $modules, $tidy_modules, $aliases, $dtd_public, $dtd_system
              );
          }

          $name = $doctype->name;
          $this->doctypes[$name] = $doctype;

          // hook up aliases, but never let an alias shadow a real doctype
          foreach ($doctype->aliases as $alias) {
              if (!isset($this->doctypes[$alias])) {
                  $this->aliases[$alias] = $name;
              }
          }

          // a real doctype takes over any alias that used its name
          if (isset($this->aliases[$name])) {
              unset($this->aliases[$name]);
          }

          return $doctype;
      }

      /**
       * Retrieves reference to a doctype of a certain name
       * @note This function resolves aliases
       * @note When possible, use the more fully-featured make()
       * @note For an unknown name an E_USER_ERROR is triggered; if execution
       *       continues, a fresh doctype carrying just that name is returned.
       * @param $doctype Name of doctype
       * @return Editable doctype object
       */
      public function get($doctype) {
          if (isset($this->aliases[$doctype])) {
              $doctype = $this->aliases[$doctype];
          }
          if (isset($this->doctypes[$doctype])) {
              return $this->doctypes[$doctype];
          }
          trigger_error('Doctype ' . htmlspecialchars($doctype) . ' does not exist', E_USER_ERROR);
          return new HTMLPurifier_Doctype($doctype);
      }

      /**
       * Creates a doctype based on a configuration object,
       * will perform initialization on the doctype
       * @note Use this function to get a copy of doctype that config
       *       can hold on to (this is necessary in order to tell
       *       Generator whether or not the current document is XML
       *       based or not).
       * @param $config Configuration object to read the doctype name from
       * @return Clone of the registered doctype
       */
      public function make($config) {
          $name = $this->getDoctypeFromConfig($config);
          return clone $this->get($name);
      }

      /**
       * Retrieves the doctype from the configuration object
       * @param $config Configuration object exposing get()
       * @return Doctype name
       */
      public function getDoctypeFromConfig($config) {
          // recommended directives, in order of precedence
          foreach (array('HTML.Doctype', 'HTML.CustomDoctype') as $directive) {
              $explicit = $config->get($directive);
              if (!empty($explicit)) {
                  return $explicit;
              }
          }
          // backwards-compatibility: assemble the name from two flags
          $family  = $config->get('HTML.XHTML')  ? 'XHTML 1.0' : 'HTML 4.01';
          $flavour = $config->get('HTML.Strict') ? ' Strict'   : ' Transitional';
          return $family . $flavour;
      }

  }
  /**
   * Structure that stores an HTML element definition. Used by
   * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
   * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
   *       Please update that class too.
   * @warning If you add new properties to this class, you MUST update
   *          the mergeIn() method.
   */
  class HTMLPurifier_ElementDef
  {

      /**
       * Does the definition work by itself, or is it created solely
       * for the purpose of merging into another definition?
       */
      public $standalone = true;

      /**
       * Associative array of attribute name to HTMLPurifier_AttrDef
       * @note Before being processed by HTMLPurifier_AttrCollections
       *       when modules are finalized during
       *       HTMLPurifier_HTMLDefinition->setup(), this array may also
       *       contain an array at index 0 that indicates which attribute
       *       collections to load into the full array. It may also
       *       contain string identifiers in lieu of HTMLPurifier_AttrDef,
       *       see HTMLPurifier_AttrTypes on how they are expanded during
       *       HTMLPurifier_HTMLDefinition->setup() processing.
       */
      public $attr = array();

      /**
       * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
       */
      public $attr_transform_pre = array();

      /**
       * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
       */
      public $attr_transform_post = array();

      /**
       * HTMLPurifier_ChildDef of this tag.
       */
      public $child;

      /**
       * Abstract string representation of internal ChildDef rules. See
       * HTMLPurifier_ContentSets for how this is parsed and then transformed
       * into an HTMLPurifier_ChildDef.
       * @warning This is a temporary variable that is not available after
       *          being processed by HTMLDefinition
       */
      public $content_model;

      /**
       * Value of $child->type, used to determine which ChildDef to use,
       * used in combination with $content_model.
       * @warning This must be lowercase
       * @warning This is a temporary variable that is not available after
       *          being processed by HTMLDefinition
       */
      public $content_model_type;

      /**
       * Does the element have a content model (#PCDATA | Inline)*? This
       * is important for chameleon ins and del processing in
       * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
       * have to worry about this one.
       */
      public $descendants_are_inline = false;

      /**
       * List of the names of required attributes this element has. Dynamically
       * populated by HTMLPurifier_HTMLDefinition::getElement
       */
      public $required_attr = array();

      /**
       * Lookup table of tags excluded from all descendants of this tag.
       * @note SGML permits exclusions for all descendants, but this is
       *       not possible with DTDs or XML Schemas. W3C has elected to
       *       use complicated compositions of content_models to simulate
       *       exclusion for children, but we go the simpler, SGML-style
       *       route of flat-out exclusions, which correctly apply to
       *       all descendants and not just children. Note that the XHTML
       *       Modularization Abstract Modules are blithely unaware of such
       *       distinctions.
       */
      public $excludes = array();

      /**
       * This tag is explicitly auto-closed by the following tags.
       */
      public $autoclose = array();

      /**
       * If a foreign element is found in this element, test if it is
       * allowed by this sub-element; if it is, instead of closing the
       * current element, place it inside this element.
       */
      public $wrap;

      /**
       * Whether or not this is a formatting element affected by the
       * "Active Formatting Elements" algorithm.
       */
      public $formatting;

      /**
       * Low-level factory constructor for creating new standalone element defs
       * @param $content_model      Value for the content_model property
       * @param $content_model_type Value for the content_model_type property
       * @param $attr               Value for the attr property
       * @return New HTMLPurifier_ElementDef with those three properties set
       */
      public static function create($content_model, $content_model_type, $attr) {
          $def = new HTMLPurifier_ElementDef();
          $def->content_model = $content_model;
          $def->content_model_type = $content_model_type;
          $def->attr = $attr;
          return $def;
      }

      /**
       * Merges the values of another element definition into this one.
       * Values from the new element def take precedence if a value is
       * not mergeable.
       *
       * Only attr, attr_transform_pre, attr_transform_post, excludes,
       * content_model, content_model_type, child, formatting and
       * descendants_are_inline are merged; the other properties of $def
       * are not consulted.
       *
       * @param $def HTMLPurifier_ElementDef whose values are merged in
       */
      public function mergeIn($def) {
          // later keys takes precedence
          foreach ($def->attr as $k => $v) {
              if ($k === 0) {
                  // merge in the includes
                  // sorry, no way to override an include
                  foreach ($v as $v2) {
                      $this->attr[0][] = $v2;
                  }
                  continue;
              }
              if ($v === false) {
                  // an explicit false removes the attribute
                  if (isset($this->attr[$k])) unset($this->attr[$k]);
                  continue;
              }
              $this->attr[$k] = $v;
          }

          // NOTE(review): the transform lists are documented as indexed, yet
          // they are merged by key here, so entries of $def replace entries
          // of $this that share the same numeric index. Left unchanged;
          // confirm whether appending was intended.
          $this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
          $this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
          $this->_mergeAssocArray($this->excludes, $def->excludes);

          if (!empty($def->content_model)) {
              // $this->content_model may still be null; cast it so that
              // str_replace() is never handed null as its replacement
              // (deprecated as of PHP 8.1). The result is unchanged: a null
              // replacement and an empty-string replacement are equivalent.
              $this->content_model =
                  str_replace("#SUPER", (string) $this->content_model, $def->content_model);
              // a changed content model invalidates any child definition
              $this->child = false;
          }
          if (!empty($def->content_model_type)) {
              $this->content_model_type = $def->content_model_type;
              $this->child = false;
          }
          if (!is_null($def->child)) $this->child = $def->child;
          if (!is_null($def->formatting)) $this->formatting = $def->formatting;
          if ($def->descendants_are_inline) $this->descendants_are_inline = $def->descendants_are_inline;
      }

      /**
       * Merges one array into another, removes values which equal false
       * @param $a1 Array by reference that is merged into
       * @param $a2 Array that merges into $a1
       */
      private function _mergeAssocArray(&$a1, $a2) {
          foreach ($a2 as $k => $v) {
              if ($v === false) {
                  if (isset($a1[$k])) unset($a1[$k]);
                  continue;
              }
              $a1[$k] = $v;
          }
      }

  }
  /**
   * A UTF-8 specific character encoder that handles cleaning and transforming.
   * @note All functions in this class should be static.
   */
  class HTMLPurifier_Encoder
  {

      /**
       * Constructor throws fatal error if you attempt to instantiate class
       */
      private function __construct() {
          trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
      }

      /**
       * Error-handler that mutes errors, alternative to shut-up operator.
       */
      public static function muteErrorHandler() {}

      /**
       * Cleans a UTF-8 string for well-formedness and SGML validity
       *
       * It will parse according to UTF-8 and return a valid UTF8 string, with
       * non-SGML codepoints excluded.
       *
       * @param $str       String to clean
       * @param $force_php Accepted for interface compatibility; this
       *                   implementation does not read it.
       * @return Cleaned string. A string that already passes the fast-path
       *         regexp is returned untouched; otherwise it is rebuilt octet
       *         by octet and the byte order mark U+FEFF is dropped as well.
       *
       * @note Just for reference, the non-SGML code points are 0 to 31 and
       *       127 to 159, inclusive.  However, we allow code points 9, 10
       *       and 13, which are the tab, line feed and carriage return
       *       respectively. 128 and above the code points map to multibyte
       *       UTF-8 representations.
       *
       * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
       *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
       *       LGPL license.  Notes on what changed are inside, but in general,
       *       the original code transformed UTF-8 text into an array of integer
       *       Unicode codepoints. Understandably, transforming that back to
       *       a string would be somewhat expensive, so the function was modded to
       *       directly operate on the string.  However, this discourages code
       *       reuse, and the logic enumerated here would be useful for any
       *       function that needs to be able to understand UTF-8 characters.
       *       As of right now, only smart lossless character encoding converters
       *       would need that, and I'm probably not going to implement them.
       *       Once again, PHP 6 should solve all our problems.
       */
      public static function cleanUTF8($str, $force_php = false) {

          // UTF-8 validity is checked since PHP 4.3.5
          // This is an optimization: if the string is already valid UTF-8, no
          // need to do PHP stuff. 99% of the time, this will be the case.
          // The regexp matches the XML char production, as well as excluding
          // non-SGML codepoints U+007F to U+009F
          if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
              return $str;
          }

          $mState = 0; // cached expected number of octets after the current octet
                       // until the beginning of the next UTF8 character sequence
          $mUcs4  = 0; // cached Unicode character
          $mBytes = 1; // cached expected number of octets in the current sequence

          // original code involved an $out that was an array of Unicode
          // codepoints.  Instead of having to convert back into UTF-8, we've
          // decided to directly append valid UTF-8 characters onto a string
          // $out once they're done.  $char accumulates raw bytes, while $mUcs4
          // turns into the Unicode code point, so there's some redundancy.

          $out = '';
          $char = '';

          $len = strlen($str);
          for ($i = 0; $i < $len; $i++) {
              // square-bracket offsets only: the curly-brace form $str{$i}
              // is no longer valid syntax as of PHP 8.0
              $in = ord($str[$i]);
              $char .= $str[$i]; // append byte to char
              if (0 == $mState) {
                  // When mState is zero we expect either a US-ASCII character
                  // or a multi-octet sequence.
                  if (0 == (0x80 & ($in))) {
                      // US-ASCII, pass straight through.
                      if (($in <= 31 || $in == 127) &&
                          !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
                      ) {
                          // control characters, remove
                      } else {
                          $out .= $char;
                      }
                      // reset
                      $char = '';
                      $mBytes = 1;
                  } elseif (0xC0 == (0xE0 & ($in))) {
                      // First octet of 2 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x1F) << 6;
                      $mState = 1;
                      $mBytes = 2;
                  } elseif (0xE0 == (0xF0 & ($in))) {
                      // First octet of 3 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x0F) << 12;
                      $mState = 2;
                      $mBytes = 3;
                  } elseif (0xF0 == (0xF8 & ($in))) {
                      // First octet of 4 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x07) << 18;
                      $mState = 3;
                      $mBytes = 4;
                  } elseif (0xF8 == (0xFC & ($in))) {
                      // First octet of 5 octet sequence.
                      //
                      // This is illegal because the encoded codepoint must be
                      // either:
                      // (a) not the shortest form or
                      // (b) outside the Unicode range of 0-0x10FFFF.
                      // Rather than trying to resynchronize, we will carry on
                      // until the end of the sequence and let the later error
                      // handling code catch it.
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x03) << 24;
                      $mState = 4;
                      $mBytes = 5;
                  } elseif (0xFC == (0xFE & ($in))) {
                      // First octet of 6 octet sequence, see comments for 5
                      // octet sequence.
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 1) << 30;
                      $mState = 5;
                      $mBytes = 6;
                  } else {
                      // Current octet is neither in the US-ASCII range nor a
                      // legal first octet of a multi-octet sequence.
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                      $char = '';
                  }
              } else {
                  // When mState is non-zero, we expect a continuation of the
                  // multi-octet sequence
                  if (0x80 == (0xC0 & ($in))) {
                      // Legal continuation.
                      $shift = ($mState - 1) * 6;
                      $tmp = $in;
                      $tmp = ($tmp & 0x0000003F) << $shift;
                      $mUcs4 |= $tmp;

                      if (0 == --$mState) {
                          // End of the multi-octet sequence. mUcs4 now contains
                          // the final Unicode codepoint to be output

                          // Check for illegal sequences and codepoints.

                          // From Unicode 3.1, non-shortest form is illegal
                          if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                              ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                              ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                              (4 < $mBytes) ||
                              // From Unicode 3.2, surrogate characters = illegal
                              (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                              // Codepoints outside the Unicode range are illegal
                              ($mUcs4 > 0x10FFFF)
                          ) {
                              // illegal sequence: drop it
                          } elseif (0xFEFF != $mUcs4 && // omit BOM
                              // check for valid Char unicode codepoints
                              (
                                  0x9 == $mUcs4 ||
                                  0xA == $mUcs4 ||
                                  0xD == $mUcs4 ||
                                  (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                                  // 7F-9F is not strictly prohibited by XML,
                                  // but it is non-SGML, and thus we don't allow it
                                  (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                                  // same range the fast-path regexp accepts;
                                  // without it these characters were dropped
                                  // whenever the slow path ran
                                  (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                                  (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                              )
                          ) {
                              $out .= $char;
                          }
                          // initialize UTF8 cache (reset)
                          $mState = 0;
                          $mUcs4  = 0;
                          $mBytes = 1;
                          $char = '';
                      }
                  } else {
                      // ((0xC0 & (*in) != 0x80) && (mState != 0))
                      // Incomplete multi-octet sequence.
                      // used to result in complete fail, but we'll reset
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                      $char = '';
                  }
              }
          }
          return $out;
      }

      /**
       * Translates a Unicode codepoint into its corresponding UTF-8 character.
       * @param $code Integer codepoint
       * @return UTF-8 encoded character, or an empty string when the
       *         codepoint is negative, above 0x10FFFF or a surrogate.
       * @note Based on Feyd's function at
       *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
       *       which is in public domain.
       * @note While we're going to do code point parsing anyway, a good
       *       optimization would be to refuse to translate code points that
       *       are non-SGML characters.  However, this could lead to duplication.
       * @note This is very similar to the unichr function in
       *       maintenance/generate-entity-file.php (although this is superior,
       *       due to its sanity checks).
       */

      // +----------+----------+----------+----------+
      // | 33222222 | 22221111 | 111111   |          |
      // | 10987654 | 32109876 | 54321098 | 76543210 | bit
      // +----------+----------+----------+----------+
      // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
      // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
      // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
      // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
      // +----------+----------+----------+----------+
      // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
      // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
      // +----------+----------+----------+----------+

      public static function unichr($code) {
          if ($code > 1114111 or $code < 0 or
              ($code >= 55296 and $code <= 57343)) {
              // bits are set outside the "valid" range as defined
              // by UNICODE 4.1.0
              return '';
          }

          $x = $y = $z = $w = 0;
          if ($code < 128) {
              // regular ASCII character
              $x = $code;
          } else {
              // set up bits for UTF-8
              $x = ($code & 63) | 128;
              if ($code < 2048) {
                  $y = (($code & 2047) >> 6) | 192;
              } else {
                  $y = (($code & 4032) >> 6) | 128;
                  if ($code < 65536) {
                      $z = (($code >> 12) & 15) | 224;
                  } else {
                      $z = (($code >> 12) & 63) | 128;
                      $w = (($code >> 18) & 7)  | 240;
                  }
              }
          }
          // set up the actual character
          $ret = '';
          if ($w) $ret .= chr($w);
          if ($z) $ret .= chr($z);
          if ($y) $ret .= chr($y);
          $ret .= chr($x);

          return $ret;
      }

      /**
       * Converts a string to UTF-8 based on configuration.
       * @param $str     String in the encoding named by Core.Encoding
       * @param $config  Configuration object exposing get()
       * @param $context Not read by this method
       * @return Converted string; '' after an 'Invalid encoding' error;
       *         nothing when the encoding cannot be handled at all
       */
      public static function convertToUTF8($str, $config, $context) {
          $encoding = $config->get('Core.Encoding');
          if ($encoding === 'utf-8') return $str;
          static $iconv = null;
          if ($iconv === null) $iconv = function_exists('iconv');
          set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
          if ($iconv && !$config->get('Test.ForceNoIconv')) {
              $str = iconv($encoding, 'utf-8//IGNORE', $str);
              if ($str === false) {
                  // $encoding is not a valid encoding
                  restore_error_handler();
                  trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                  return '';
              }
              // If the string is bjorked by Shift_JIS or a similar encoding
              // that doesn't support all of ASCII, convert the naughty
              // characters to their true byte-wise ASCII/UTF-8 equivalents.
              $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
              restore_error_handler();
              return $str;
          } elseif ($encoding === 'iso-8859-1') {
              $str = utf8_encode($str);
              restore_error_handler();
              return $str;
          }
          // The mute handler must come off before reporting: otherwise it
          // would swallow this very error and stay installed for the caller.
          restore_error_handler();
          trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
      }

      /**
       * Converts a string from UTF-8 based on configuration.
       * @param $str     UTF-8 string
       * @param $config  Configuration object exposing get()
       * @param $context Not read by this method
       * @return String in the encoding named by Core.Encoding
       * @note Currently, this is a lossy conversion, with unexpressable
       *       characters being omitted.
       */
      public static function convertFromUTF8($str, $config, $context) {
          $encoding = $config->get('Core.Encoding');
          if ($encoding === 'utf-8') return $str;
          static $iconv = null;
          if ($iconv === null) $iconv = function_exists('iconv');
          if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
              $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
          }
          set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
          if ($iconv && !$config->get('Test.ForceNoIconv')) {
              // Undo our previous fix in convertToUTF8, otherwise iconv will barf
              $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
              if (!is_array($ascii_fix)) {
                  // false means iconv rejected the encoding: there is nothing
                  // to undo, and array_flip()/strtr() must not be fed a
                  // boolean. The iconv() call below still yields false.
                  $ascii_fix = array();
              }
              if (!$escape && !empty($ascii_fix)) {
                  $clear_fix = array();
                  foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
                  $str = strtr($str, $clear_fix);
              }
              $str = strtr($str, array_flip($ascii_fix));
              // Normal stuff
              $str = iconv('utf-8', $encoding . '//IGNORE', $str);
              restore_error_handler();
              return $str;
          } elseif ($encoding === 'iso-8859-1') {
              $str = utf8_decode($str);
              restore_error_handler();
              return $str;
          }
          // see convertToUTF8(): unmute before reporting
          restore_error_handler();
          trigger_error('Encoding not supported', E_USER_ERROR);
      }

      /**
       * Lossless (character-wise) conversion of HTML to ASCII
       * @param $str UTF-8 string to be converted to ASCII
       * @returns ASCII encoded string with non-ASCII character entity-ized
       * @warning Adapted from MediaWiki, claiming fair use: this is a common
       *       algorithm. If you disagree with this license fudgery,
       *       implement it yourself.
       * @note Uses decimal numeric entities since they are best supported.
       * @note This is a DUMB function: it has no concept of keeping
       *       character entities that the projected character encoding
       *       can allow. We could possibly implement a smart version
       *       but that would require it to also know which Unicode
       *       codepoints the charset supported (not an easy task).
       * @note Sort of with cleanUTF8() but it assumes that $str is
       *       well-formed UTF-8
       */
      public static function convertToASCIIDumbLossless($str) {
          $bytesleft = 0;
          $result = '';
          $working = 0;
          $len = strlen($str);
          for ($i = 0; $i < $len; $i++) {
              $bytevalue = ord($str[$i]);
              if ($bytevalue <= 0x7F) { //0xxx xxxx
                  $result .= chr($bytevalue);
                  $bytesleft = 0;
              } elseif ($bytevalue <= 0xBF) { //10xx xxxx
                  $working = $working << 6;
                  $working += ($bytevalue & 0x3F);
                  $bytesleft--;
                  if ($bytesleft <= 0) {
                      $result .= "&#" . $working . ";";
                  }
              } elseif ($bytevalue <= 0xDF) { //110x xxxx
                  $working = $bytevalue & 0x1F;
                  $bytesleft = 1;
              } elseif ($bytevalue <= 0xEF) { //1110 xxxx
                  $working = $bytevalue & 0x0F;
                  $bytesleft = 2;
              } else { //1111 0xxx
                  $working = $bytevalue & 0x07;
                  $bytesleft = 3;
              }
          }
          return $result;
      }

      /**
       * This expensive function tests whether or not a given character
       * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
       * fail this test, and require special processing. Variable width
       * encodings shouldn't ever fail.
       *
       * @param string $encoding Encoding name to test, as per iconv format
       * @param bool $bypass Whether or not to bypass the precompiled arrays.
       * @return Array of UTF-8 characters to their corresponding ASCII,
       *      which can be used to "undo" any overzealous iconv action;
       *      false when iconv cannot convert to $encoding at all.
       */
      public static function testEncodingSupportsASCII($encoding, $bypass = false) {
          static $encodings = array();
          if (!$bypass) {
              if (isset($encodings[$encoding])) return $encodings[$encoding];
              $lenc = strtolower($encoding);
              switch ($lenc) {
                  case 'shift_jis':
                      return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
                  case 'johab':
                      return array("\xE2\x82\xA9" => '\\');
              }
              if (strpos($lenc, 'iso-8859-') === 0) return array();
          }
          $ret = array();
          set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
          if (iconv('UTF-8', $encoding, 'a') === false) {
              // do not leave the mute handler installed on the early exit
              restore_error_handler();
              return false;
          }
          for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
              $c = chr($i); // UTF-8 char
              $r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
              if (
                  $r === '' ||
                  // This line is needed for iconv implementations that do not
                  // omit characters that do not exist in the target character set
                  ($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
              ) {
                  // Reverse engineer: what's the UTF-8 equiv of this byte
                  // sequence? This assumes that there's no variable width
                  // encoding that doesn't support ASCII.
                  $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
              }
          }
          restore_error_handler();
          $encodings[$encoding] = $ret;
          return $ret;
      }

  }
  /**
   * Object that provides entity lookup table from entity name to character
   */
  class HTMLPurifier_EntityLookup {

      /**
       * Assoc array of entity name to character represented.
       */
      public $table;

      /**
       * Sets up the entity lookup table from the serialized file contents.
       * @param $file Path of the serialized table; when falsy, the bundled
       *              entities.ser below HTMLPURIFIER_PREFIX is read instead
       * @note The serialized contents are versioned, but were generated
       *       using the maintenance script generate_entity_file.php
       * @warning This is not in constructor to help enforce the Singleton
       */
      public function setup($file = false) {
          $path = $file
              ? $file
              : HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
          $serialized = file_get_contents($path);
          $this->table = unserialize($serialized);
      }

      /**
       * Retrieves sole instance of the object.
       * @param Optional prototype of custom lookup table to overload with.
       * @return The shared instance; a truthy prototype replaces it, and it
       *         is created and set up on first use otherwise.
       */
      public static function instance($prototype = false) {
          // no references, since PHP doesn't copy unless modified
          static $instance = null;
          if ($prototype) {
              $instance = $prototype;
              return $instance;
          }
          if (!$instance) {
              $instance = new HTMLPurifier_EntityLookup();
              $instance->setup();
          }
          return $instance;
      }

  }
  // If we want to implement error collecting here, we'll need to use some
  // sort of global data (probably trigger_error) because it's impossible to
  // pass $config or $context to the callback functions.

  /**
   * Handles referencing and dereferencing character entities
   */
  class HTMLPurifier_EntityParser
  {

      /**
       * Reference to entity lookup table.
       */
      protected $_entity_lookup;

      /**
       * Callback regex string for parsing entities.
       */
      protected $_substituteEntitiesRegex =
  '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
  //     1. hex             2. dec      3. string (XML style)

      /**
       * Decimal to parsed string conversion table for special entities.
       */
      protected $_special_dec2str =
              array(
                      34 => '"',
                      38 => '&',
                      39 => "'",
                      60 => '<',
                      62 => '>'
              );

      /**
       * Stripped entity names to decimal conversion table for special entities.
       */
      protected $_special_ent2dec =
              array(
                      'quot' => 34,
                      'amp'  => 38,
                      'lt'   => 60,
                      'gt'   => 62
              );

      /**
       * Substitutes non-special entities with their parsed equivalents. Since
       * running this whenever you have parsed character is t3h 5uck, we run
       * it before everything else.
       *
       * @param $string String to have non-special entities parsed.
       * @returns Parsed string.
       */
      public function substituteNonSpecialEntities($string) {
          // it will try to detect missing semicolons, but don't rely on it
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'nonSpecialEntityCallback'),
              $string
          );
      }

      /**
       * Callback function for substituteNonSpecialEntities() that does the work.
       *
       * @param $matches  PCRE matches array, with 0 the entire match, and
       *                  either index 1, 2 or 3 set with a hex value, dec value,
       *                  or string (respectively).
       * @returns Replacement string.
       */
      protected function nonSpecialEntityCallback($matches) {
          // replaces all but big five
          $entity = $matches[0];
          // every match is '&' plus at least one more character, so offset 1
          // always exists; numeric matches are at least three characters long
          $is_num = ($entity[1] === '#');
          if ($is_num) {
              $is_hex = ($entity[2] === 'x');
              $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];

              // abort for special characters
              if (isset($this->_special_dec2str[$code])) return $entity;

              return HTMLPurifier_Encoder::unichr($code);
          } else {
              if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
              if (!$this->_entity_lookup) {
                  $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
              }
              if (isset($this->_entity_lookup->table[$matches[3]])) {
                  return $this->_entity_lookup->table[$matches[3]];
              } else {
                  // unknown name: leave the text as it was
                  return $entity;
              }
          }
      }

      /**
       * Substitutes only special entities with their parsed equivalents.
       *
       * @notice We try to avoid calling this function because otherwise, it
       * would have to be called a lot (for every parsed section).
       *
       * @param $string String to have special entities parsed.
       * @returns Parsed string.
       */
      public function substituteSpecialEntities($string) {
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'specialEntityCallback'),
              $string);
      }

      /**
       * Callback function for substituteSpecialEntities() that does the work.
       *
       * This callback has same syntax as nonSpecialEntityCallback().
       *
       * @param $matches  PCRE-style matches array, with 0 the entire match, and
       *                  either index 1, 2 or 3 set with a hex value, dec value,
       *                  or string (respectively).
       * @returns Replacement string.
       */
      protected function specialEntityCallback($matches) {
          $entity = $matches[0];
          $is_num = ($entity[1] === '#');
          if ($is_num) {
              $is_hex = ($entity[2] === 'x');
              $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
              return isset($this->_special_dec2str[$int]) ?
                  $this->_special_dec2str[$int] :
                  $entity;
          } else {
              // The name table yields a decimal code, not a character: map
              // it through the decimal table so that "&amp;" becomes "&"
              // rather than the literal text "38".
              return isset($this->_special_ent2dec[$matches[3]]) ?
                  $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
                  $entity;
          }
      }

  }
  3159. /**
  3160. * Error collection class that enables HTML Purifier to report HTML
  3161. * problems back to the user
  3162. */
  class HTMLPurifier_ErrorCollector
  {

      /**
       * Keys of a raw error entry. They are deliberately numeric so that an
       * entry can be unpacked with list().
       */
      const LINENO   = 0;
      const SEVERITY = 1;
      const MESSAGE  = 2;
      const CHILDREN = 3;

      /** Raw error entries; aliased to $_stacks[0] by the constructor. */
      protected $errors;

      /** List that send() appends raw entries to; also aliased to $_stacks[0]. */
      protected $_current;

      protected $_stacks = array(array());

      /** Object fetched from the context under 'Locale'; supplies message text. */
      protected $locale;

      /** HTMLPurifier_Generator created by getHTMLFormatted(), used for escaping. */
      protected $generator;

      protected $context;

      /**
       * HTMLPurifier_ErrorStruct objects keyed as [line][column]. Key -1 holds
       * the single struct used for errors without an integer line and column.
       */
      protected $lines = array();

      /**
       * @param $context Context object; must provide 'Locale' via get().
       */
      public function __construct($context) {
          $this->locale   =& $context->get('Locale');
          $this->context  = $context;
          $this->_current =& $this->_stacks[0];
          $this->errors   =& $this->_stacks[0];
      }

      /**
       * Records an error, both in the flat raw list and in the per-position
       * struct tree.
       * @param $severity int Error severity, PHP error style (don't use E_USER_)
       * @param $msg string Error message key, resolved through the locale
       * @param $subst1 string First substitution for $msg
       * @param $subst2 string ...
       */
      public function send($severity, $msg) {

          // Extra arguments are substitutions. Shifting once and unsetting
          // index 0 drops $severity and $msg and leaves the substitutions
          // keyed from 1 upwards.
          $args = array();
          if (func_num_args() > 2) {
              $args = func_get_args();
              array_shift($args);
              unset($args[0]);
          }

          // Position comes from the current token when there is one,
          // otherwise from the context's own line/column entries.
          $token = $this->context->get('CurrentToken', true);
          if ($token) {
              $line = $token->line;
              $col  = $token->col;
          } else {
              $line = $this->context->get('CurrentLine', true);
              $col  = $this->context->get('CurrentCol', true);
          }
          $attr = $this->context->get('CurrentAttr', true);

          // Special substitutions and custom parameters.
          $subst = array();
          if ($token !== null) {
              $args['CurrentToken'] = $token;
          }
          if ($attr !== null) {
              $subst['$CurrentAttr.Name'] = $attr;
              if (isset($token->attr[$attr])) {
                  $subst['$CurrentAttr.Value'] = $token->attr[$attr];
              }
          }

          $msg = empty($args)
              ? $this->locale->getMessage($msg)
              : $this->locale->formatMessage($msg, $args);

          if (!empty($subst)) {
              $msg = strtr($msg, $subst);
          }

          // Raw (numerically indexed) entry.
          $this->_current[] = array(
              self::LINENO   => $line,
              self::SEVERITY => $severity,
              self::MESSAGE  => $msg,
              self::CHILDREN => array()
          );

          // Struct tree. A top-level struct is TOKEN typed; its value stays
          // unset when there is no current token.
          $fresh = new HTMLPurifier_ErrorStruct();
          $fresh->type = HTMLPurifier_ErrorStruct::TOKEN;
          if ($token) {
              $fresh->value = clone $token;
          }

          if (is_int($line) && is_int($col)) {
              if (!isset($this->lines[$line][$col])) {
                  $this->lines[$line][$col] = $fresh;
              }
              $struct = $this->lines[$line][$col];
              // These ksorts may present a performance problem
              ksort($this->lines[$line], SORT_NUMERIC);
          } else {
              if (!isset($this->lines[-1])) {
                  $this->lines[-1] = $fresh;
              }
              $struct = $this->lines[-1];
          }
          ksort($this->lines, SORT_NUMERIC);

          // Descend to a more specific struct when an attribute is current.
          if (!empty($attr)) {
              $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
              if (!$struct->value) {
                  $struct->value = array($attr, 'PUT VALUE HERE');
              }
          }
          // $cssprop is never assigned in this method, so this branch does
          // not run at present.
          if (!empty($cssprop)) {
              $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
              if (!$struct->value) {
                  // if we tokenize CSS this might be a little more difficult to do
                  $struct->value = array($cssprop, 'PUT VALUE HERE');
              }
          }

          // Structs are in place; register the error on the chosen one.
          $struct->addError($severity, $msg);
      }

      /**
       * Retrieves raw error data for a custom formatter to use.
       * @return List of arrays in format of array(line of error,
       *         error severity, error message,
       *         recursive sub-errors array)
       */
      public function getRaw() {
          return $this->errors;
      }

      /**
       * Default HTML formatting implementation for error messages.
       * @param $config Configuration object, passed on to the generator
       * @param $errors Errors array; only consulted to decide whether there is
       *        anything to report. Defaults to the collected raw errors.
       * @return HTML string: a list of errors, or a "no errors" paragraph.
       */
      public function getHTMLFormatted($config, $errors = null) {
          $ret = array();

          $this->generator = new HTMLPurifier_Generator($config, $this->context);
          if ($errors === null) {
              $errors = $this->errors;
          }

          // Positioned errors first, in line/column order ...
          foreach ($this->lines as $line => $col_array) {
              if ($line == -1) {
                  continue;
              }
              foreach ($col_array as $col => $struct) {
                  $this->_renderStruct($ret, $struct, $line, $col);
              }
          }
          // ... then the bucket of errors without a position.
          if (isset($this->lines[-1])) {
              $this->_renderStruct($ret, $this->lines[-1]);
          }

          if (!empty($errors)) {
              return '<ul><li>' . implode('</li><li>', $ret) . '</li></ul>';
          }
          return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
      }

      /**
       * Appends one HTML fragment per error found in $struct and its
       * descendants to $ret, walking the tree with an explicit stack.
       */
      private function _renderStruct(&$ret, $struct, $line = null, $col = null) {
          $has_position = !is_null($line) && !is_null($col);

          $stack = array($struct);
          $context_stack = array(array());
          while ($current = array_pop($stack)) {
              $context = array_pop($context_stack);

              foreach ($current->errors as $error) {
                  list($severity, $msg) = $error;
                  // W3C uses an icon to indicate the severity of the error.
                  $label = $this->locale->getErrorName($severity);
                  if ($has_position) {
                      $location = "<em class=\"location\">Line $line, Column $col: </em> ";
                  } else {
                      $location = '<em class="location">End of Document: </em> ';
                  }
                  $ret[] = '<div>'
                      . "<span class=\"error e$severity\"><strong>$label</strong></span> "
                      . $location
                      . '<strong class="description">' . $this->generator->escape($msg) . '</strong> '
                      . '</div>';
              }

              foreach ($current->children as $type => $array) {
                  $context[] = $current;
                  $stack = array_merge($stack, array_reverse($array, true));
                  // one context entry per child pushed above
                  for ($remaining = count($array); $remaining > 0; $remaining--) {
                      $context_stack[] = $context;
                  }
              }
          }
      }

  }
  3339. /**
  3340. * Records errors for particular segments of an HTML document such as tokens,
  3341. * attributes or CSS properties. They can contain error structs (which apply
  3342. * to components of what they represent), but their main purpose is to hold
  3343. * errors applying to whatever struct is being used.
  3344. */
  class HTMLPurifier_ErrorStruct
  {

      /**
       * Possible values for the first key of $children. Top-level structures
       * are automatically token-level.
       */
      const TOKEN   = 0;
      const ATTR    = 1;
      const CSSPROP = 2;

      /**
       * Type of this struct (one of the constants above).
       */
      public $type;

      /**
       * What this struct records errors for:
       *  - TOKEN: Instance of HTMLPurifier_Token
       *  - ATTR: array('attr-name', 'value')
       *  - CSSPROP: array('prop-name', 'value')
       */
      public $value;

      /**
       * Errors registered for this structure, each as array(severity, message).
       */
      public $errors = array();

      /**
       * Child ErrorStructs of this structure, e.g. a TOKEN struct holding ATTR
       * structs. Two-dimensional: [TYPE]['identifier'].
       */
      public $children = array();

      /**
       * Returns the child struct for the given type and identifier, creating
       * and storing it on first request.
       */
      public function getChild($type, $id) {
          if (isset($this->children[$type][$id])) {
              return $this->children[$type][$id];
          }
          $child = new HTMLPurifier_ErrorStruct();
          $child->type = $type;
          $this->children[$type][$id] = $child;
          return $child;
      }

      /**
       * Appends an error to this struct.
       */
      public function addError($severity, $message) {
          $entry = array($severity, $message);
          $this->errors[] = $entry;
      }

  }
  3387. /**
  3388. * Global exception class for HTML Purifier; any exceptions we throw
  3389. * are from here.
  3390. */
  class HTMLPurifier_Exception extends Exception
  {
      // Adds nothing to Exception; exists so the library's exceptions
      // can be caught by their own type.
  }
  3394. /**
  3395. * Represents a pre or post processing filter on HTML Purifier's output
  3396. *
  3397. * Sometimes, a little ad-hoc fixing of HTML has to be done before
  3398. * it gets sent through HTML Purifier: you can use filters to achieve
  3399. * this effect. For instance, YouTube videos can be preserved using
  3400. * this manner. You could have used a decorator for this task, but
  3401. * PHP's support for them is not terribly robust, so we're going
  3402. * to just loop through the filters.
  3403. *
  3404. * Filters should be exited first in, last out. If there are three filters,
  3405. * named 1, 2 and 3, the order of execution should go 1->preFilter,
  3406. * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
  3407. * 1->postFilter.
  3408. *
  3409. * @note Methods are not declared abstract as it is perfectly legitimate
  3410. * for an implementation not to want anything to happen on a step
  3411. */
  class HTMLPurifier_Filter
  {

      /**
       * Name of the filter for identification purposes
       */
      public $name;

      /**
       * Hook run on the HTML before purification. The base implementation
       * passes the HTML through unchanged.
       */
      public function preFilter($html, $config, $context) {
          $unchanged = $html;
          return $unchanged;
      }

      /**
       * Hook run on the HTML after purification. The base implementation
       * passes the HTML through unchanged.
       */
      public function postFilter($html, $config, $context) {
          $unchanged = $html;
          return $unchanged;
      }

  }
  3431. /**
  3432. * Generates HTML from tokens.
  3433. * @todo Refactor interface so that configuration/context is determined
  3434. * upon instantiation, no need for messy generateFromTokens() calls
  3435. * @todo Make some of the more internal functions protected, and have
  3436. * unit tests work around that
  3437. */
  class HTMLPurifier_Generator
  {

      /**
       * Whether or not generator should produce XML output
       */
      private $_xhtml = true;

      /**
       * :HACK: Whether or not generator should comment the insides of <script> tags
       */
      private $_scriptFix = false;

      /**
       * Cache of HTMLDefinition during HTML output to determine whether or
       * not attributes should be minimized.
       */
      private $_def;

      /**
       * Cache of %Output.SortAttr
       */
      private $_sortAttr;

      /**
       * Cache of %Output.FlashCompat
       */
      private $_flashCompat;

      /**
       * Cache of %Output.FixInnerHTML
       */
      private $_innerHTMLFix;

      /**
       * Stack for keeping track of object information when outputting IE
       * compatibility code.
       */
      private $_flashStack = array();

      /**
       * Configuration for the generator
       */
      protected $config;

      /**
       * Reads and caches the output-related settings from the configuration.
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       */
      public function __construct($config, $context) {
          $this->config        = $config;
          $this->_scriptFix    = $config->get('Output.CommentScriptContents');
          $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
          $this->_sortAttr     = $config->get('Output.SortAttr');
          $this->_flashCompat  = $config->get('Output.FlashCompat');
          $this->_def          = $config->getHTMLDefinition();
          $this->_xhtml        = $this->_def->doctype->xml;
      }

      /**
       * Generates HTML from an array of tokens.
       * @param $tokens Array of HTMLPurifier_Token
       * @return Generated HTML
       */
      public function generateFromTokens($tokens) {
          if (!$tokens) {
              return '';
          }

          $html = '';
          $size = count($tokens);
          for ($i = 0; $i < $size; $i++) {
              // Script special case: start tag, ONE content token, end tag.
              // The first two are emitted here (advancing $i twice); the end
              // tag is then emitted by the regular call below.
              if ($this->_scriptFix && $tokens[$i]->name === 'script'
                  && $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
                  $html .= $this->generateFromToken($tokens[$i++]);
                  $html .= $this->generateScriptFromToken($tokens[$i++]);
              }
              $html .= $this->generateFromToken($tokens[$i]);
          }

          // Optional reformatting through the tidy extension
          if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
              $tidy_options = array(
                  'indent'=> true,
                  'output-xhtml' => $this->_xhtml,
                  'show-body-only' => true,
                  'indent-spaces' => 2,
                  'wrap' => 68,
              );
              $tidy = new Tidy;
              $tidy->parseString($html, $tidy_options, 'utf8');
              $tidy->cleanRepair();
              $html = (string) $tidy; // explicit cast necessary
          }

          // Normalize newlines to the configured (or system) value
          if ($this->config->get('Core.NormalizeNewlines')) {
              $nl = $this->config->get('Output.Newline');
              if ($nl === null) {
                  $nl = PHP_EOL;
              }
              if ($nl !== "\n") {
                  $html = str_replace("\n", $nl, $html);
              }
          }
          return $html;
      }

      /**
       * Generates HTML from a single token.
       * @param $token HTMLPurifier_Token object.
       * @return Generated HTML; empty string (after a warning) for anything
       *         that is not a token, and for unrecognized token types.
       */
      public function generateFromToken($token) {
          if (!$token instanceof HTMLPurifier_Token) {
              trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
              return '';
          }

          if ($token instanceof HTMLPurifier_Token_Start) {
              $attr = $this->generateAttributes($token->attr, $token->name);
              if ($this->_flashCompat && $token->name == "object") {
                  // remember the object so later <param> tokens can attach to it
                  $flash = new stdclass();
                  $flash->attr = $token->attr;
                  $flash->param = array();
                  $this->_flashStack[] = $flash;
              }
              return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
          }

          if ($token instanceof HTMLPurifier_Token_End) {
              // Nothing extra is emitted for closing tags; the flash stack
              // is not consumed here.
              return '</' . $token->name . '>';
          }

          if ($token instanceof HTMLPurifier_Token_Empty) {
              if ($this->_flashCompat && $token->name == "param" && !empty($this->_flashStack)) {
                  $top = count($this->_flashStack) - 1;
                  $this->_flashStack[$top]->param[$token->attr['name']] = $token->attr['value'];
              }
              $attr = $this->generateAttributes($token->attr, $token->name);
              $close = $this->_xhtml ? ' /' : ''; // <br /> v. <br>
              return '<' . $token->name . ($attr ? ' ' : '') . $attr . $close . '>';
          }

          if ($token instanceof HTMLPurifier_Token_Text) {
              return $this->escape($token->data, ENT_NOQUOTES);
          }

          if ($token instanceof HTMLPurifier_Token_Comment) {
              return '<!--' . $token->data . '-->';
          }

          return '';
      }

      /**
       * Special case processor for the contents of script tags
       * @warning This runs into problems if there's already a literal
       *          --> somewhere inside the script contents.
       */
      public function generateScriptFromToken($token) {
          if (!$token instanceof HTMLPurifier_Token_Text) {
              return $this->generateFromToken($token);
          }
          // Thanks <http://lachy.id.au/log/2005/05/script-comments>
          $script = trim(preg_replace('#//\s*$#', '', $token->data));
          return '<!--//--><![CDATA[//><!--' . "\n" . $script . "\n" . '//--><!]]>';
      }

      /**
       * Generates attribute declarations from attribute array.
       * @note This does not include the leading or trailing space.
       * @param $assoc_array_of_attributes Attribute array
       * @param $element Name of element attributes are for, used to check
       *        attribute minimization.
       * @return Generated HTML fragment for insertion.
       */
      public function generateAttributes($assoc_array_of_attributes, $element = false) {
          if ($this->_sortAttr) {
              ksort($assoc_array_of_attributes);
          }

          $html = '';
          foreach ($assoc_array_of_attributes as $key => $value) {
              if (!$this->_xhtml) {
                  // Namespaced attributes are dropped in non-XML output
                  if (strpos($key, ':') !== false) {
                      continue;
                  }
                  // Minimized attributes: val="val" -> val
                  if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
                      $html .= $key . ' ';
                      continue;
                  }
              }

              // Workaround for Internet Explorer innerHTML bug.
              // When computing innerHTML, IE omits the quotes around a value
              // that has no angled brackets, quotes or spaces; when parsing
              // HTML it treats backticks as quotes. So
              //      <img alt="``" />
              // becomes
              //      <img alt=`` />
              // becomes
              //      <img alt='' />
              // Appending a space forces IE to quote the value. The W3C spec
              // allows user agents to ignore leading/trailing whitespace, and
              // a trailing space is barely noticeable. This is configurable
              // because it is unnecessary if you don't process user input
              // with innerHTML or don't support Internet Explorer.
              if ($this->_innerHTMLFix
                  && strpos($value, '`') !== false
                  // only when correct quoting would not already be triggered
                  && strcspn($value, '"\' <>') === strlen($value)) {
                  $value .= ' ';
              }

              $html .= $key.'="'.$this->escape($value).'" ';
          }
          return rtrim($html);
      }

      /**
       * Escapes raw text data.
       * @todo This really ought to be protected, but until we have a facility
       *       for properly generating HTML here w/o using tokens, it stays
       *       public.
       * @param $string String data to escape for HTML.
       * @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
       *        permissible for non-attribute output.
       * @return String escaped data.
       */
      public function escape($string, $quote = null) {
          // The default is resolved here rather than in the signature:
          // workaround for APC bug on Mac Leopard reported by sidepodcast
          // http://htmlpurifier.org/phorum/read.php?3,4823,4846
          $style = ($quote === null) ? ENT_COMPAT : $quote;
          return htmlspecialchars($string, $style, 'UTF-8');
      }

  }
  3657. /**
  3658. * Definition of the purified HTML that describes allowed children,
  3659. * attributes, and many other things.
  3660. *
  3661. * Conventions:
  3662. *
  3663. * All member variables that are prefixed with info
  3664. * (including the main $info array) are used by HTML Purifier internals
  3665. * and should not be directly edited when customizing the HTMLDefinition.
  3666. * They can usually be set via configuration directives or custom
  3667. * modules.
  3668. *
  3669. * On the other hand, member variables without the info prefix are used
  3670. * internally by the HTMLDefinition and MUST NOT be used by other HTML
  3671. * Purifier internals. Many of them, however, are public, and may be
  3672. * edited by userspace code to tweak the behavior of HTMLDefinition.
  3673. *
  3674. * @note This class is inspected by Printer_HTMLDefinition; please
  3675. * update that class if things here change.
  3676. *
  3677. * @warning Directives that change this object's structure must be in
  3678. * the HTML or Attr namespace!
  3679. */
  3680. class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
  3681. {
  3682. // FULLY-PUBLIC VARIABLES ---------------------------------------------
  3683. /**
  3684. * Associative array of element names to HTMLPurifier_ElementDef
  3685. */
  3686. public $info = array();
  3687. /**
  3688. * Associative array of global attribute name to attribute definition.
  3689. */
  3690. public $info_global_attr = array();
  3691. /**
  3692. * String name of parent element HTML will be going into.
  3693. */
  3694. public $info_parent = 'div';
  3695. /**
  3696. * Definition for parent element, allows parent element to be a
  3697. * tag that's not allowed inside the HTML fragment.
  3698. */
  3699. public $info_parent_def;
  3700. /**
  3701. * String name of element used to wrap inline elements in block context
  3702. * @note This is rarely used except for BLOCKQUOTEs in strict mode
  3703. */
  3704. public $info_block_wrapper = 'p';
  3705. /**
  3706. * Associative array of deprecated tag name to HTMLPurifier_TagTransform
  3707. */
  3708. public $info_tag_transform = array();
  3709. /**
  3710. * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
  3711. */
  3712. public $info_attr_transform_pre = array();
  3713. /**
  3714. * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
  3715. */
  3716. public $info_attr_transform_post = array();
  3717. /**
  3718. * Nested lookup array of content set name (Block, Inline) to
  3719. * element name to whether or not it belongs in that content set.
  3720. */
  3721. public $info_content_sets = array();
  3722. /**
  3723. * Indexed list of HTMLPurifier_Injector to be used.
  3724. */
  3725. public $info_injector = array();
  3726. /**
  3727. * Doctype object
  3728. */
  3729. public $doctype;
  3730. // RAW CUSTOMIZATION STUFF --------------------------------------------
  3731. /**
  3732. * Adds a custom attribute to a pre-existing element
  3733. * @note This is strictly convenience, and does not have a corresponding
  3734. * method in HTMLPurifier_HTMLModule
  3735. * @param $element_name String element name to add attribute to
  3736. * @param $attr_name String name of attribute
  3737. * @param $def Attribute definition, can be string or object, see
  3738. * HTMLPurifier_AttrTypes for details
  3739. */
  public function addAttribute($element_name, $attr_name, $def) {
      $module = $this->getAnonymousModule();
      // Reuse the anonymous module's element when it already has one,
      // otherwise start from a blank element of that name.
      $element = isset($module->info[$element_name])
          ? $module->info[$element_name]
          : $module->addBlankElement($element_name);
      $element->attr[$attr_name] = $def;
  }
  3749. /**
  3750. * Adds a custom element to your HTML definition
  3751. * @note See HTMLPurifier_HTMLModule::addElement for detailed
  3752. * parameter and return value descriptions.
  3753. */
  public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array()) {
      // assume that if the user is calling this, the element
      // is safe. This may not be a good idea
      return $this->getAnonymousModule()->addElement(
          $element_name,
          $type,
          $contents,
          $attr_collections,
          $attributes
      );
  }
  3761. /**
  3762. * Adds a blank element to your HTML definition, for overriding
  3763. * existing behavior
  3764. * @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
  3765. * parameter and return value descriptions.
  3766. */
  public function addBlankElement($element_name) {
      // Delegates to the anonymous module and hands back what it returns.
      return $this->getAnonymousModule()->addBlankElement($element_name);
  }
  3772. /**
  3773. * Retrieves a reference to the anonymous module, so you can
  3774. * bust out advanced features without having to make your own
  3775. * module.
  3776. */
  public function getAnonymousModule() {
      if ($this->_anonModule) {
          return $this->_anonModule;
      }
      // First request: create the module and keep it for later calls.
      $module = new HTMLPurifier_HTMLModule();
      $module->name = 'Anonymous';
      $this->_anonModule = $module;
      return $this->_anonModule;
  }
  3784. private $_anonModule;
  3785. // PUBLIC BUT INTERNAL VARIABLES --------------------------------------
  3786. public $type = 'HTML';
  3787. public $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */
  3788. /**
  3789. * Performs low-cost, preliminary initialization.
  3790. */
  public function __construct() {
      // Only the module manager is created here.
      $manager = new HTMLPurifier_HTMLModuleManager();
      $this->manager = $manager;
  }
  /**
   * Runs the module and configuration setup steps, then discards the
   * manager and the content model fields of every element definition.
   */
  protected function doSetup($config) {
      $this->processModules($config);
      $this->setupConfigStuff($config);
      unset($this->manager);

      // cleanup some of the element definitions
      foreach (array_keys($this->info) as $name) {
          unset(
              $this->info[$name]->content_model,
              $this->info[$name]->content_model_type
          );
      }
  }
  3804. /**
  3805. * Extract out the information from the manager
  3806. */
  protected function processModules($config) {

      if ($this->_anonModule) {
          // for user specific changes
          // this is late-loaded so we don't have to deal with PHP4
          // reference wonky-ness
          $this->manager->addModule($this->_anonModule);
          unset($this->_anonModule);
      }

      $this->manager->setup($config);
      $this->doctype = $this->manager->doctype;

      // Each of these per-module maps is folded into the property of the
      // same name on this definition: a value of false removes the key,
      // anything else sets or overwrites it.
      $merged_properties = array(
          'info_tag_transform',
          'info_attr_transform_pre',
          'info_attr_transform_post',
          'info_injector',
      );
      foreach ($this->manager->modules as $module) {
          foreach ($merged_properties as $property) {
              foreach ($module->{$property} as $k => $v) {
                  if ($v === false) {
                      unset($this->{$property}[$k]);
                  } else {
                      $this->{$property}[$k] = $v;
                  }
              }
          }
      }

      $this->info = $this->manager->getElements();
      $this->info_content_sets = $this->manager->contentSets->lookup;
  }
  3838. /**
  3839. * Sets up stuff based on config. We need a better way of doing this.
  3840. */
  protected function setupConfigStuff($config) {
      // Applies, in order: block wrapper, parent element, allowed
      // elements/attributes, forbidden elements/attributes, and finally
      // drops injectors whose checkNeeded() does not return false.
      // Problems in the configuration are reported through trigger_error().

      $block_wrapper = $config->get('HTML.BlockWrapper');
      if (isset($this->info_content_sets['Block'][$block_wrapper])) {
          $this->info_block_wrapper = $block_wrapper;
      } else {
          trigger_error('Cannot use non-block element as block wrapper',
              E_USER_ERROR);
      }

      $parent = $config->get('HTML.Parent');
      $def = $this->manager->getElement($parent, true);
      if ($def) {
          $this->info_parent = $parent;
          $this->info_parent_def = $def;
      } else {
          trigger_error('Cannot use unrecognized element as parent',
              E_USER_ERROR);
          // fall back to the definition of the current parent name
          $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
      }

      // support template text
      $support = "(for information on implementing this, see the ".
                 "support forums) ";

      // setup allowed elements -----------------------------------------

      $allowed_elements = $config->get('HTML.AllowedElements');
      $allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early

      // HTML.Allowed is only consulted when neither specific list is set
      if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
          $allowed = $config->get('HTML.Allowed');
          if (is_string($allowed)) {
              list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
          }
      }

      if (is_array($allowed_elements)) {
          foreach ($this->info as $name => $d) {
              if (!isset($allowed_elements[$name])) {
                  unset($this->info[$name]);
              }
              // whatever is left afterwards named an unknown element
              unset($allowed_elements[$name]);
          }
          // emit errors
          foreach ($allowed_elements as $element => $d) {
              $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
              trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
          }
      }

      // setup allowed attributes ---------------------------------------

      // Working copy: every key that matches a known attribute is removed,
      // so the remainder are the keys that matched nothing.
      $allowed_attributes_mutable = $allowed_attributes; // by copy!
      if (is_array($allowed_attributes)) {

          // This actually doesn't do anything, since we went away from
          // global attributes. It's possible that userland code uses
          // it, but HTMLModuleManager doesn't!
          foreach ($this->info_global_attr as $attr => $x) {
              $keys = array($attr, "*@$attr", "*.$attr");
              $delete = true;
              foreach ($keys as $key) {
                  if ($delete && isset($allowed_attributes[$key])) {
                      $delete = false;
                  }
                  if (isset($allowed_attributes_mutable[$key])) {
                      unset($allowed_attributes_mutable[$key]);
                  }
              }
              if ($delete) {
                  unset($this->info_global_attr[$attr]);
              }
          }

          foreach ($this->info as $tag => $info) {
              foreach ($info->attr as $attr => $x) {
                  $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
                  $delete = true;
                  foreach ($keys as $key) {
                      if ($delete && isset($allowed_attributes[$key])) {
                          $delete = false;
                      }
                      if (isset($allowed_attributes_mutable[$key])) {
                          unset($allowed_attributes_mutable[$key]);
                      }
                  }
                  if ($delete) {
                      if ($this->info[$tag]->attr[$attr]->required) {
                          trigger_error("Required attribute '$attr' in element '$tag' was not allowed, which means '$tag' will not be allowed either", E_USER_WARNING);
                      }
                      unset($this->info[$tag]->attr[$attr]);
                  }
              }
          }

          // emit errors
          foreach ($allowed_attributes_mutable as $elattr => $d) {
              $bits = preg_split('/[.@]/', $elattr, 2);
              $c = count($bits);
              switch ($c) {
                  case 2:
                      if ($bits[0] !== '*') {
                          $element = htmlspecialchars($bits[0]);
                          $attribute = htmlspecialchars($bits[1]);
                          if (!isset($this->info[$element])) {
                              trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support");
                          } else {
                              trigger_error("Attribute '$attribute' in element '$element' not supported $support",
                                  E_USER_WARNING);
                          }
                          break;
                      }
                      // otherwise fall through
                  case 1:
                      // The attribute name is the last component: for a
                      // wildcard key ("*@attr" / "*.attr") $bits[0] is the
                      // '*' itself, which must not be reported as the name.
                      $attribute = htmlspecialchars($bits[$c - 1]);
                      trigger_error("Global attribute '$attribute' is not ".
                          "supported in any elements $support",
                          E_USER_WARNING);
                      break;
              }
          }

      }

      // setup forbidden elements ---------------------------------------

      $forbidden_elements   = $config->get('HTML.ForbiddenElements');
      $forbidden_attributes = $config->get('HTML.ForbiddenAttributes');

      foreach ($this->info as $tag => $info) {
          if (isset($forbidden_elements[$tag])) {
              unset($this->info[$tag]);
              continue;
          }
          foreach ($info->attr as $attr => $x) {
              if (
                  isset($forbidden_attributes["$tag@$attr"]) ||
                  isset($forbidden_attributes["*@$attr"]) ||
                  isset($forbidden_attributes[$attr])
              ) {
                  unset($this->info[$tag]->attr[$attr]);
                  continue;
              } // this segment might get removed eventually
              elseif (isset($forbidden_attributes["$tag.$attr"])) {
                  // $tag.$attr are not user supplied, so no worries!
                  trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
              }
          }
      }
      // warn about the unsupported "*.attr" spelling
      foreach ($forbidden_attributes as $key => $v) {
          if (strlen($key) < 2) continue;
          if ($key[0] != '*') continue;
          if ($key[1] == '.') {
              trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
          }
      }

      // setup injectors -----------------------------------------------------
      foreach ($this->info_injector as $i => $injector) {
          if ($injector->checkNeeded($config) !== false) {
              // remove injector that does not have its required
              // elements/attributes present, and is thus not needed.
              unset($this->info_injector[$i]);
          }
      }
  }
  /**
   * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
   * separate lists for processing. Format is element[attr1|attr2],element2...
   * @warning Although it's largely drawn from TinyMCE's implementation,
   *          it is different, and you'll probably have to modify your lists
   * @param $list String list to parse
   * @return array($allowed_elements, $allowed_attributes)
   * @todo Give this its own class, probably static interface
   */
  public function parseTinyMCEAllowedList($list) {

      // Spaces and tabs carry no meaning in the list format.
      $list = str_replace(array(' ', "\t"), '', $list);

      $elements = array();   // lookup: element name => true
      $attributes = array(); // lookup: "element.attribute" => true

      // Entries are separated by commas and/or line breaks.
      $chunks = preg_split('/(,|[\n\r]+)/', $list);
      foreach ($chunks as $chunk) {
          if (empty($chunk)) continue;
          // strpos() is tested for truthiness on purpose: a chunk without
          // any '[' (false) and a chunk that starts with '[' (0) are both
          // taken as a bare element name with no attribute list.
          if (!strpos($chunk, '[')) {
              $element = $chunk;
              $attr = false;
          } else {
              list($element, $attr) = explode('[', $chunk);
          }
          // '*' only carries global attributes, it is not an element itself
          if ($element !== '*') $elements[$element] = true;
          if (!$attr) continue;
          // remove the trailing ], but only when it is really there, so a
          // missing bracket no longer eats the last letter of the name
          if (substr($attr, -1) === ']') {
              $attr = substr($attr, 0, -1);
          }
          foreach (explode('|', $attr) as $key) {
              // "a[]" and "a[href|]" produce empty names; those are not
              // attributes and must not end up as "a." in the result
              if ($key === '') continue;
              $attributes["$element.$key"] = true;
          }
      }

      return array($elements, $attributes);

  }
  4020. }
  4021. /**
  4022. * Represents an XHTML 1.1 module, with information on elements, tags
  4023. * and attributes.
   * @note Even though this is technically XHTML 1.1, it is also used for
   *       regular HTML parsing. We are using modularization as a convenient
   *       way to represent the internals of HTMLDefinition, and our
   *       implementation is by no means conforming and does not directly
   *       use the normative DTDs or XML schemas.
  4029. * @note The public variables in a module should almost directly
  4030. * correspond to the variables in HTMLPurifier_HTMLDefinition.
  4031. * However, the prefix info carries no special meaning in these
  4032. * objects (include it anyway if that's the correspondence though).
  4033. * @todo Consider making some member functions protected
  4034. */
  class HTMLPurifier_HTMLModule
  {

      // -- Overloadable ----------------------------------------------------

      /**
       * Short unique string identifier of the module
       */
      public $name;

      /**
       * Informally, a list of elements this module changes. Not used in
       * any significant way.
       */
      public $elements = array();

      /**
       * Associative array of element names to element definitions.
       * Some definitions may be incomplete, to be merged in later
       * with the full definition.
       */
      public $info = array();

      /**
       * Associative array of content set names to content set additions.
       * This is commonly used to, say, add an A element to the Inline
       * content set. This corresponds to an internal variable $content_sets
       * and NOT info_content_sets member variable of HTMLDefinition.
       */
      public $content_sets = array();

      /**
       * Associative array of attribute collection names to attribute
       * collection additions. More rarely used for adding attributes to
       * the global collections. Example is the StyleAttribute module adding
       * the style attribute to the Core. Corresponds to HTMLDefinition's
       * attr_collections->info, since the object's data is only info,
       * with extra behavior associated with it.
       */
      public $attr_collections = array();

      /**
       * Associative array of deprecated tag name to HTMLPurifier_TagTransform
       */
      public $info_tag_transform = array();

      /**
       * List of HTMLPurifier_AttrTransform to be performed before validation.
       */
      public $info_attr_transform_pre = array();

      /**
       * List of HTMLPurifier_AttrTransform to be performed after validation.
       */
      public $info_attr_transform_post = array();

      /**
       * List of HTMLPurifier_Injector to be performed during well-formedness fixing.
       * An injector will only be invoked if all of its pre-requisites are met;
       * if an injector fails setup, there will be no error; it will simply be
       * silently disabled.
       */
      public $info_injector = array();

      /**
       * Boolean flag that indicates whether or not getChildDef is implemented.
       * For optimization reasons: may save a call to a function. Be sure
       * to set it if you do implement getChildDef(), otherwise it will have
       * no effect!
       */
      public $defines_child_def = false;

      /**
       * Boolean flag whether or not this module is safe. If it is not safe, all
       * of its members are unsafe. Modules are safe by default (this might be
       * slightly dangerous, but it doesn't make much sense to force HTML Purifier,
       * which is based off of safe HTML, to explicitly say, "This is safe," even
       * though there are modules which are "unsafe")
       *
       * @note Previously, safety could be applied at an element level granularity.
       *       We've removed this ability, so in order to add "unsafe" elements
       *       or attributes, a dedicated module with this property set to false
       *       must be used.
       */
      public $safe = true;

      /**
       * Retrieves a proper HTMLPurifier_ChildDef subclass based on
       * content_model and content_model_type member variables of
       * the HTMLPurifier_ElementDef class. There is a similar function
       * in HTMLPurifier_HTMLDefinition.
       * @param $def HTMLPurifier_ElementDef instance
       * @return HTMLPurifier_ChildDef subclass; this base implementation
       *         always returns false
       */
      public function getChildDef($def) {return false;}

      // -- Convenience -----------------------------------------------------

      /**
       * Convenience function that sets up a new element
       * @param $element Name of element to add
       * @param $type What content set should element be registered to?
       *              Set as false to skip this step.
       * @param $contents Allowed children in form of:
       *              "$content_model_type: $content_model"
       * @param $attr_includes What attribute collections to register to
       *              element?
       * @param $attr What unique attributes does the element define?
       * @note See ElementDef for in-depth descriptions of these parameters.
       * @return Created element definition object, so you
       *         can set advanced parameters
       */
      public function addElement($element, $type, $contents, $attr_includes = array(), $attr = array()) {
          $this->elements[] = $element;
          // parse content_model
          list($content_model_type, $content_model) = $this->parseContents($contents);
          // merge in attribute inclusions
          $this->mergeInAttrIncludes($attr, $attr_includes);
          // add element to content sets
          if ($type) $this->addElementToContentSet($element, $type);
          // create element
          $this->info[$element] = HTMLPurifier_ElementDef::create(
              $content_model, $content_model_type, $attr
          );
          // literal object $contents means direct child manipulation
          if (!is_string($contents)) $this->info[$element]->child = $contents;
          return $this->info[$element];
      }

      /**
       * Convenience function that creates a totally blank, non-standalone
       * element. If the element is already defined in this module, an error
       * is triggered and the existing definition is returned untouched.
       * @param $element Name of element to create
       * @return Created (or pre-existing) element
       */
      public function addBlankElement($element) {
          if (!isset($this->info[$element])) {
              $this->elements[] = $element;
              $this->info[$element] = new HTMLPurifier_ElementDef();
              $this->info[$element]->standalone = false;
          } else {
              trigger_error("Definition for $element already exists in module, cannot redefine");
          }
          return $this->info[$element];
      }

      /**
       * Convenience function that registers an element to a content set.
       * Members of one content set are joined with ' | '.
       * @param $element Element to register
       * @param $type Name content set (warning: case sensitive, usually
       *              upper-case first letter)
       */
      public function addElementToContentSet($element, $type) {
          if (!isset($this->content_sets[$type])) $this->content_sets[$type] = '';
          else $this->content_sets[$type] .= ' | ';
          $this->content_sets[$type] .= $element;
      }

      /**
       * Convenience function that transforms single-string contents
       * into separate content model and content model type
       * @param $contents Allowed children in form of:
       *                  "$content_model_type: $content_model"
       * @return array($content_model_type, $content_model); the type is
       *         lower-cased and both parts are trimmed
       * @note If contents is an object, an array of two nulls will be
       *       returned, and the caller needs to take the original $contents
       *       and use it directly.
       * @note Only the first colon separates type from model, so colons
       *       inside the content model are preserved. A string without any
       *       colon yields an empty content model.
       */
      public function parseContents($contents) {
          if (!is_string($contents)) return array(null, null); // defer
          switch ($contents) {
              // check for shorthand content model forms
              case 'Empty':
                  return array('empty', '');
              case 'Inline':
                  return array('optional', 'Inline | #PCDATA');
              case 'Flow':
                  return array('optional', 'Flow | #PCDATA');
          }
          // limit of 2 keeps everything after the first colon together;
          // padding guarantees both slots exist when there is no colon
          $parts = array_pad(explode(':', $contents, 2), 2, '');
          $content_model_type = strtolower(trim($parts[0]));
          $content_model      = trim($parts[1]);
          return array($content_model_type, $content_model);
      }

      /**
       * Convenience function that merges a list of attribute includes into
       * an attribute array. The includes are stored under index 0 of $attr,
       * replacing whatever was there.
       * @param $attr Reference to attr array to modify
       * @param $attr_includes Array of includes / string include to merge in
       */
      public function mergeInAttrIncludes(&$attr, $attr_includes) {
          if (!is_array($attr_includes)) {
              // normalize a single include (or an empty value) into a list
              if (empty($attr_includes)) $attr_includes = array();
              else $attr_includes = array($attr_includes);
          }
          $attr[0] = $attr_includes;
      }

      /**
       * Convenience function that generates a lookup table with boolean
       * true as value.
       * @param $list List of values to turn into a lookup
       * @note You can also pass an arbitrary number of arguments in
       *       place of the regular argument
       * @note Null values are skipped.
       * @return Lookup array equivalent of list
       */
      public function makeLookup($list) {
          // a string first argument means the variadic calling form was used
          if (is_string($list)) $list = func_get_args();
          $ret = array();
          foreach ($list as $value) {
              if (is_null($value)) continue;
              $ret[$value] = true;
          }
          return $ret;
      }

      /**
       * Lazy load construction of the module after determining whether
       * or not it's needed, and also when a finalized configuration object
       * is available. Does nothing in this base class.
       * @param $config Instance of HTMLPurifier_Config
       */
      public function setup($config) {}

  }
  class HTMLPurifier_HTMLModuleManager
  {

      /**
       * Instance of HTMLPurifier_DoctypeRegistry
       */
      public $doctypes;

      /**
       * Instance of current doctype
       */
      public $doctype;

      /**
       * Instance of HTMLPurifier_AttrTypes
       */
      public $attrTypes;

      /**
       * Active instances of modules for the specified doctype are
       * indexed, by name, in this array.
       */
      public $modules = array();

      /**
       * Array of recognized HTMLPurifier_Module instances, indexed by
       * module's class name. This array is usually lazy loaded, but a
       * user can overload a module by pre-emptively registering it.
       */
      public $registeredModules = array();

      /**
       * List of extra modules that were added by the user using addModule().
       * These get unconditionally merged into the current doctype, whatever
       * it may be.
       */
      public $userModules = array();

      /**
       * Associative array of element name to list of modules that have
       * definitions for the element; this array is dynamically filled.
       */
      public $elementLookup = array();

      /** List of prefixes we should use for registering small names */
      public $prefixes = array('HTMLPurifier_HTMLModule_');

      public $contentSets;     /**< Instance of HTMLPurifier_ContentSets */
      public $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */

      /** If set to true, unsafe elements and attributes will be allowed */
      public $trusted = false;

      /**
       * Creates the attribute type and doctype registries and registers
       * the built-in doctypes together with their module lists.
       */
      public function __construct() {

          // editable internal objects
          $this->attrTypes = new HTMLPurifier_AttrTypes();
          $this->doctypes  = new HTMLPurifier_DoctypeRegistry();

          // setup basic modules
          $common = array(
              'CommonAttributes', 'Text', 'Hypertext', 'List',
              'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
              'StyleAttribute',
              // Unsafe:
              'Scripting', 'Object', 'Forms',
              // Sorta legacy, but present in strict:
              'Name',
          );
          $transitional = array('Legacy', 'Target');
          $xml = array('XMLCommonAttributes');
          $non_xml = array('NonXMLCommonAttributes');

          // setup basic doctypes
          $this->doctypes->register(
              'HTML 4.01 Transitional', false,
              array_merge($common, $transitional, $non_xml),
              array('Tidy_Transitional', 'Tidy_Proprietary'),
              array(),
              '-//W3C//DTD HTML 4.01 Transitional//EN',
              'http://www.w3.org/TR/html4/loose.dtd'
          );

          $this->doctypes->register(
              'HTML 4.01 Strict', false,
              array_merge($common, $non_xml),
              array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD HTML 4.01//EN',
              'http://www.w3.org/TR/html4/strict.dtd'
          );

          $this->doctypes->register(
              'XHTML 1.0 Transitional', true,
              array_merge($common, $transitional, $xml, $non_xml),
              array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD XHTML 1.0 Transitional//EN',
              'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
          );

          // NOTE(review): 'Tidy_Strict' appears twice in the next two lists;
          // presumably harmless, but confirm it is intentional.
          $this->doctypes->register(
              'XHTML 1.0 Strict', true,
              array_merge($common, $xml, $non_xml),
              array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD XHTML 1.0 Strict//EN',
              'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
          );

          $this->doctypes->register(
              'XHTML 1.1', true,
              array_merge($common, $xml, array('Ruby')),
              array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1
              array(),
              '-//W3C//DTD XHTML 1.1//EN',
              'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
          );

      }

      /**
       * Registers a module to the recognized module list, useful for
       * overloading pre-existing modules.
       * @param $module Mixed: string module name, with or without
       *                HTMLPurifier_HTMLModule prefix, or instance of
       *                subclass of HTMLPurifier_HTMLModule.
       * @param $overload Boolean whether or not to overload previous modules.
       *                  If this is not set, and you do overload a module,
       *                  HTML Purifier will complain with a warning.
       * @note This function will not call autoload, you must instantiate
       *       (and thus invoke) autoload outside the method.
       * @note If a string is passed as a module name, different variants
       *       will be tested in this order:
       *          - Check for HTMLPurifier_HTMLModule_$name
       *          - Check all prefixes with $name in order they were added
       *          - Check for literal object name
       *          - Throw fatal error
       *       If your object name collides with an internal class, specify
       *       your module manually. All modules must have been included
       *       externally: registerModule will not perform inclusions for you!
       */
      public function registerModule($module, $overload = false) {
          if (is_string($module)) {
              // attempt to load the module
              $original_module = $module;
              $ok = false;
              foreach ($this->prefixes as $prefix) {
                  $module = $prefix . $original_module;
                  if (class_exists($module)) {
                      $ok = true;
                      break;
                  }
              }
              if (!$ok) {
                  // no prefixed class matched: try the name as given
                  $module = $original_module;
                  if (!class_exists($module)) {
                      trigger_error($original_module . ' module does not exist',
                          E_USER_ERROR);
                      return;
                  }
              }
              $module = new $module();
          }
          if (empty($module->name)) {
              trigger_error('Module instance of ' . get_class($module) . ' must have name');
              return;
          }
          if (!$overload && isset($this->registeredModules[$module->name])) {
              // warn, but still replace the earlier registration below
              trigger_error('Overloading ' . $module->name . ' without explicit overload parameter', E_USER_WARNING);
          }
          $this->registeredModules[$module->name] = $module;
      }

      /**
       * Adds a module to the current doctype by first registering it,
       * and then tacking it on to the active doctype
       * @param $module Module name or module instance, see registerModule()
       */
      public function addModule($module) {
          $this->registerModule($module);
          // userModules only ever holds names
          if (is_object($module)) $module = $module->name;
          $this->userModules[] = $module;
      }

      /**
       * Adds a class prefix that registerModule() will use to resolve a
       * string name to a concrete class
       * @param $prefix Class name prefix to try in addition to the defaults
       */
      public function addPrefix($prefix) {
          $this->prefixes[] = $prefix;
      }

      /**
       * Performs processing on modules, after being called you may
       * use getElement() and getElements()
       * @param $config Instance of HTMLPurifier_Config
       */
      public function setup($config) {

          $this->trusted = $config->get('HTML.Trusted');

          // generate
          $this->doctype = $this->doctypes->make($config);
          $modules = $this->doctype->modules;

          // take out the default modules that aren't allowed
          $lookup = $config->get('HTML.AllowedModules');
          $special_cases = $config->get('HTML.CoreModules');

          if (is_array($lookup)) {
              foreach ($modules as $k => $m) {
                  if (isset($special_cases[$m])) continue;
                  if (!isset($lookup[$m])) unset($modules[$k]);
              }
          }

          // custom modules
          if ($config->get('HTML.Proprietary')) {
              $modules[] = 'Proprietary';
          }
          if ($config->get('HTML.SafeObject')) {
              $modules[] = 'SafeObject';
          }
          if ($config->get('HTML.SafeEmbed')) {
              $modules[] = 'SafeEmbed';
          }
          if ($config->get('HTML.Nofollow')) {
              $modules[] = 'Nofollow';
          }

          // merge in custom modules
          $modules = array_merge($modules, $this->userModules);

          foreach ($modules as $module) {
              $this->processModule($module);
              $this->modules[$module]->setup($config);
          }

          foreach ($this->doctype->tidyModules as $module) {
              $this->processModule($module);
              $this->modules[$module]->setup($config);
          }

          // prepare any injectors: names are turned into instances and the
          // list is re-keyed by injector name
          foreach ($this->modules as $module) {
              $n = array();
              foreach ($module->info_injector as $injector) {
                  if (!is_object($injector)) {
                      $class = "HTMLPurifier_Injector_$injector";
                      $injector = new $class;
                  }
                  $n[$injector->name] = $injector;
              }
              $module->info_injector = $n;
          }

          // setup lookup table based on all valid modules
          foreach ($this->modules as $module) {
              foreach ($module->info as $name => $def) {
                  if (!isset($this->elementLookup[$name])) {
                      $this->elementLookup[$name] = array();
                  }
                  $this->elementLookup[$name][] = $module->name;
              }
          }

          // note the different choice
          $this->contentSets = new HTMLPurifier_ContentSets(
              // content set assembly deals with all possible modules,
              // not just ones deemed to be "safe"
              $this->modules
          );
          $this->attrCollections = new HTMLPurifier_AttrCollections(
              $this->attrTypes,
              // there is no way to directly disable a global attribute,
              // but using AllowedAttributes or simply not including
              // the module in your custom doctype should be sufficient
              $this->modules
          );
      }

      /**
       * Takes a module and adds it to the active module collection,
       * registering it if necessary.
       * @param $module Module name, or module instance. An instance is
       *                (re-)registered unless that very instance is already
       *                registered, and is then activated under its name.
       */
      public function processModule($module) {
          if (is_object($module)) {
              $instance = $module;
              if (empty($instance->name)) {
                  // registerModule() reports the missing name; there is
                  // nothing that could be activated afterwards
                  $this->registerModule($instance);
                  return;
              }
              // arrays cannot be keyed by objects: continue with the name
              $module = $instance->name;
              if (!isset($this->registeredModules[$module]) ||
                  $this->registeredModules[$module] !== $instance) {
                  $this->registerModule($instance);
              }
          } elseif (!isset($this->registeredModules[$module])) {
              $this->registerModule($module);
          }
          $this->modules[$module] = $this->registeredModules[$module];
      }

      /**
       * Retrieves merged element definitions.
       * @return Array of HTMLPurifier_ElementDef
       */
      public function getElements() {

          $elements = array();
          foreach ($this->modules as $module) {
              // unsafe modules contribute nothing unless trusted mode is on
              if (!$this->trusted && !$module->safe) continue;
              foreach ($module->info as $name => $v) {
                  if (isset($elements[$name])) continue;
                  $elements[$name] = $this->getElement($name);
              }
          }

          // remove dud elements, this happens when an element that
          // appeared to be safe actually wasn't
          foreach ($elements as $n => $v) {
              if ($v === false) unset($elements[$n]);
          }

          return $elements;

      }

      /**
       * Retrieves a single merged element definition
       * @param $name Name of element
       * @param $trusted Boolean trusted overriding parameter: set to true
       *                 if you want the full version of an element
       * @return Merged HTMLPurifier_ElementDef, or false if no module
       *         defines the element or no usable definition could be built
       * @note You may notice that modules are getting iterated over twice (once
       *       in getElements() and once here). getElements() walks the modules
       *       to collect element names, while this method walks only the
       *       modules listed for $name in elementLookup in order to merge
       *       their definitions.
       */
      public function getElement($name, $trusted = null) {

          if (!isset($this->elementLookup[$name])) {
              return false;
          }

          // setup global state variables
          $def = false;
          if ($trusted === null) $trusted = $this->trusted;

          // iterate through each module that has registered itself to this
          // element
          foreach($this->elementLookup[$name] as $module_name) {

              $module = $this->modules[$module_name];

              // refuse to create/merge from a module that is deemed unsafe--
              // pretend the module doesn't exist--when trusted mode is not on.
              if (!$trusted && !$module->safe) {
                  continue;
              }

              // clone is used because, ideally speaking, the original
              // definition should not be modified. Usually, this will
              // make no difference, but for consistency's sake
              $new_def = clone $module->info[$name];

              if (!$def && $new_def->standalone) {
                  $def = $new_def;
              } elseif ($def) {
                  // This will occur even if $new_def is standalone. In practice,
                  // this will usually result in a full replacement.
                  $def->mergeIn($new_def);
              } else {
                  // :TODO:
                  // non-standalone definitions that don't have a standalone
                  // to merge into could be deferred to the end
                  continue;
              }

              // attribute value expansions
              $this->attrCollections->performInclusions($def->attr);
              $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);

              // descendants_are_inline, for ChildDef_Chameleon
              if (is_string($def->content_model) &&
                  strpos($def->content_model, 'Inline') !== false) {
                  if ($name != 'del' && $name != 'ins') {
                      // this is for you, ins/del
                      $def->descendants_are_inline = true;
                  }
              }

              $this->contentSets->generateChildDef($def, $module);
          }

          // This can occur if there is a blank definition, but no base to
          // mix it in with
          if (!$def) return false;

          // add information on required attributes
          foreach ($def->attr as $attr_name => $attr_def) {
              if ($attr_def->required) {
                  $def->required_attr[] = $attr_name;
              }
          }

          return $def;

      }

  }
  4581. /**
  4582. * Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
  4583. * @note In Slashdot-speak, dupe means duplicate.
   * @note The default constructor does not accept $config or $context objects:
   *       you must use the static build() factory method to perform initialization.
  4586. */
  class HTMLPurifier_IDAccumulator
  {

      /**
       * Every ID recorded so far, stored as ID => true.
       * @public
       */
      public $ids = array();

      /**
       * Factory: creates an accumulator that already contains the IDs
       * listed in the Attr.IDBlacklist configuration directive.
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context (not used here)
       * @return Fully initialized HTMLPurifier_IDAccumulator
       */
      public static function build($config, $context)
      {
          $accumulator = new self();
          $blacklist = $config->get('Attr.IDBlacklist');
          $accumulator->load($blacklist);
          return $accumulator;
      }

      /**
       * Records a single ID.
       * @param $id ID to be added.
       * @return Bool status, true if the ID was new, false if it is a dupe
       */
      public function add($id)
      {
          if (!isset($this->ids[$id])) {
              $this->ids[$id] = true;
              return true;
          }
          return false;
      }

      /**
       * Records a whole list of IDs at once.
       * @param $array_of_ids Array of IDs to load
       * @note Duplicates are simply overwritten, nothing is reported
       */
      public function load($array_of_ids)
      {
          foreach ($array_of_ids as $known_id) {
              $this->ids[$known_id] = true;
          }
      }

  }
  4625. /**
  4626. * Injects tokens into the document while parsing for well-formedness.
  4627. * This enables "formatter-like" functionality such as auto-paragraphing,
  4628. * smiley-ification and linkification to take place.
  4629. *
   * A note on how handlers create changes; this is done by assigning a new
   * value to the $token reference. These values can take a variety of forms and
   * are best described in the HTMLPurifier_Strategy_MakeWellFormed->processToken()
   * documentation.
  4634. *
  4635. * @todo Allow injectors to request a re-run on their output. This
  4636. * would help if an operation is recursive.
  4637. */
  4638. abstract class HTMLPurifier_Injector
  4639. {
  4640. /**
  4641. * Advisory name of injector, this is for friendly error messages
  4642. */
  4643. public $name;
  4644. /**
  4645. * Instance of HTMLPurifier_HTMLDefinition
  4646. */
  4647. protected $htmlDefinition;
  4648. /**
  4649. * Reference to CurrentNesting variable in Context. This is an array
  4650. * list of tokens that we are currently "inside"
  4651. */
  4652. protected $currentNesting;
  4653. /**
  4654. * Reference to InputTokens variable in Context. This is an array
  4655. * list of the input tokens that are being processed.
  4656. */
  4657. protected $inputTokens;
  4658. /**
  4659. * Reference to InputIndex variable in Context. This is an integer
  4660. * array index for $this->inputTokens that indicates what token
  4661. * is currently being processed.
  4662. */
  4663. protected $inputIndex;
  4664. /**
  4665. * Array of elements and attributes this injector creates and therefore
  4666. * need to be allowed by the definition. Takes form of
  4667. * array('element' => array('attr', 'attr2'), 'element2')
  4668. */
  4669. public $needed = array();
  4670. /**
  4671. * Index of inputTokens to rewind to.
  4672. */
  4673. protected $rewind = false;
  4674. /**
  4675. * Rewind to a spot to re-perform processing. This is useful if you
  4676. * deleted a node, and now need to see if this change affected any
  4677. * earlier nodes. Rewinding does not affect other injectors, and can
  4678. * result in infinite loops if not used carefully.
  4679. * @warning HTML Purifier will prevent you from fast-forwarding with this
  4680. * function.
  4681. */
  public function rewind($index)
  {
      // Only records the request; getRewind() hands it out and clears it.
      $this->rewind = $index;
  }
  4685. /**
  4686. * Retrieves rewind, and then unsets it.
  4687. */
  public function getRewind()
  {
      // Read-and-reset: the pending value is returned once, after which
      // the property is back at false.
      $pending = $this->rewind;
      $this->rewind = false;
      return $pending;
  }
  4693. /**
  4694. * Prepares the injector by giving it the config and context objects:
  4695. * this allows references to important variables to be made within
  4696. * the injector. This function also checks if the HTML environment
  4697. * will work with the Injector (see checkNeeded()).
  4698. * @param $config Instance of HTMLPurifier_Config
  4699. * @param $context Instance of HTMLPurifier_Context
  4700. * @return Boolean false if success, string of missing needed element/attribute if failure
  4701. */
  public function prepare($config, $context)
  {
      $this->htmlDefinition = $config->getHTMLDefinition();
      // Even though this might fail, some unit tests ignore this and
      // still test checkNeeded, so be careful. Maybe get rid of that
      // dependency.
      $missing = $this->checkNeeded($config);
      if ($missing === false) {
          // Everything needed is available: reference-assign the parser
          // state held by the context so this injector can follow it.
          $this->currentNesting =& $context->get('CurrentNesting');
          $this->inputTokens    =& $context->get('InputTokens');
          $this->inputIndex     =& $context->get('InputIndex');
      }
      // false on success, otherwise the name of what is missing
      return $missing;
  }
  /**
   * This function checks if the HTML environment will work with the
   * Injector: for example, if p tags are not allowed, the
   * Auto-Paragraphing injector should not be enabled.
   * @param $config Instance of HTMLPurifier_Config
   * @return Boolean false if success, string of missing needed element/attribute if failure
   */
  4722. public function checkNeeded($config) {
  4723. $def = $config->getHTMLDefinition();
  4724. foreach ($this->needed as $element => $attributes) {
  4725. if (is_int($element)) $element = $attributes;
  4726. if (!isset($def->info[$element])) return $element;
  4727. if (!is_array($attributes)) continue;
  4728. foreach ($attributes as $name) {
  4729. if (!isset($def->info[$element]->attr[$name])) return "$element.$name";
  4730. }
  4731. }
  4732. return false;
  4733. }
  /**
   * Tests if the context node allows a certain element.
   *
   * The element must be a permitted child of the innermost open element
   * (or of the parent definition when nothing is open) and must not be
   * excluded by that element or by any of its ancestors.
   *
   * @param $name Name of element to test for
   * @return True if element is allowed, false if it is not
   */
  public function allowsElement($name) {
      if (empty($this->currentNesting)) {
          $parent = $this->htmlDefinition->info_parent_def;
      } else {
          // innermost open element is the last entry of the stack
          $innermost = end($this->currentNesting);
          $parent = $this->htmlDefinition->info[$innermost->name];
      }

      if (isset($parent->excludes[$name])) {
          return false;
      }
      if (!isset($parent->child->elements[$name])) {
          return false;
      }

      // check for exclusion by any ancestor of the innermost element
      $depth = count($this->currentNesting);
      for ($level = $depth - 2; $level >= 0; $level--) {
          $ancestor = $this->currentNesting[$level];
          $ancestor_def = $this->htmlDefinition->info[$ancestor->name];
          if (isset($ancestor_def->excludes[$name])) {
              return false;
          }
      }

      return true;
  }
  /**
   * Iterator function, which starts with the token after the current
   * input index and continues until the end of the input tokens.
   *
   * @warning Please prevent previous references from interfering with
   *          this function by setting $i = null beforehand!
   * @param &$i Current integer index variable for inputTokens; null on
   *            the first call
   * @param &$current Current token variable. Do NOT use $token, as that
   *                  variable is also a reference
   * @return True if $current was set to a token, false if there is no
   *         token at the new index
   */
  protected function forward(&$i, &$current) {
      if ($i === null) {
          $i = $this->inputIndex + 1;
      } else {
          $i++;
      }

      if (isset($this->inputTokens[$i])) {
          $current = $this->inputTokens[$i];
          return true;
      }

      return false;
  }
  /**
   * Similar to forward(), but accepts a third parameter $nesting (which
   * should be initialized at 0; null is treated as 0) and stops when we
   * hit the end tag for the node $this->inputIndex starts in.
   *
   * @param &$i Current integer index variable for inputTokens
   * @param &$current Current token variable
   * @param &$nesting Count of start tokens seen that have not yet been
   *                  matched by an end token
   * @return True while iteration may continue, false at the end of the
   *         input or at an end token seen while $nesting is zero
   */
  protected function forwardUntilEndToken(&$i, &$current, &$nesting) {
      if (!$this->forward($i, $current)) {
          return false;
      }

      if ($nesting === null) {
          $nesting = 0;
      }

      if ($current instanceof HTMLPurifier_Token_Start) {
          $nesting++;
          return true;
      }

      if ($current instanceof HTMLPurifier_Token_End) {
          // an unmatched end token closes the node we started in
          if ($nesting <= 0) {
              return false;
          }
          $nesting--;
      }

      return true;
  }
  /**
   * Iterator function, starts with the token before the current input
   * index and continues until the beginning of the input tokens.
   *
   * @warning Please prevent previous references from interfering with
   *          this function by setting $i = null beforehand!
   * @param &$i Current integer index variable for inputTokens; null on
   *            the first call
   * @param &$current Current token variable. Do NOT use $token, as that
   *                  variable is also a reference
   * @return True if $current was set to a token, false once the index
   *         drops below zero
   */
  protected function backward(&$i, &$current) {
      if ($i === null) {
          $i = $this->inputIndex - 1;
      } else {
          $i--;
      }

      if ($i >= 0) {
          $current = $this->inputTokens[$i];
          return true;
      }

      return false;
  }
  /**
   * Initializes the iterator at the current position. Use in a
   * do {} while; loop to force the forward() and backward() functions
   * to start at the current location.
   *
   * @warning Please prevent previous references from interfering with
   *          this function by setting $i = null beforehand!
   * @param &$i Current integer index variable for inputTokens; set to
   *            the input index when null
   * @param &$current Current token variable. Do NOT use $token, as that
   *                  variable is also a reference
   */
  protected function current(&$i, &$current) {
      if ($i === null) {
          $i = $this->inputIndex;
      }
      $current = $this->inputTokens[$i];
  }
  /**
   * Handler that is called when a text token is processed.
   * Does nothing by default.
   *
   * @param &$token Token being processed, passed by reference
   */
  public function handleText(&$token) {
  }
  /**
   * Handler that is called when a start or empty token is processed.
   * Does nothing by default.
   *
   * @param &$token Token being processed, passed by reference
   */
  public function handleElement(&$token) {
  }
  /**
   * Handler that is called when an end token is processed.
   * By default it only forwards the token to notifyEnd().
   *
   * @param &$token Token being processed, passed by reference
   */
  public function handleEnd(&$token) {
      $this->notifyEnd($token);
  }
  /**
   * Notifier that is called when an end token is processed.
   * Does nothing by default.
   *
   * @note This differs from handlers in that the token is read-only
   *       (it is received by value, not by reference)
   * @deprecated
   * @param $token End token that was processed
   */
  public function notifyEnd($token) {
  }
  4837. }
  4838. /**
  4839. * Represents a language and defines localizable string formatting and
  4840. * other functions, as well as the localized messages for HTML Purifier.
  4841. */
  class HTMLPurifier_Language
  {

      /**
       * ISO 639 language code of language. Prefers shortest possible version
       */
      public $code = 'en';

      /**
       * Fallback language code
       */
      public $fallback = false;

      /**
       * Array of localizable messages, keyed by message identifier
       */
      public $messages = array();

      /**
       * Array of localizable error codes, keyed by error number
       */
      public $errorNames = array();

      /**
       * True if no message file was found for this language, so English
       * is being used instead. Check this if you'd like to notify the
       * user that they've used a non-supported language.
       */
      public $error = false;

      /**
       * Has the language object been loaded yet?
       * @todo Make it private, fix usage in HTMLPurifier_LanguageTest
       */
      public $_loaded = false;

      /**
       * Instances of HTMLPurifier_Config and HTMLPurifier_Context
       */
      protected $config, $context;

      /**
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       */
      public function __construct($config, $context) {
          $this->config  = $config;
          $this->context = $context;
      }

      /**
       * Loads language object with necessary info from factory cache.
       * Every key listed by the factory is copied onto the property of
       * the same name.
       * @note This is a lazy loader: it does nothing once loaded
       */
      public function load() {
          if ($this->_loaded) {
              return;
          }
          $factory = HTMLPurifier_LanguageFactory::instance();
          $factory->loadLanguage($this->code);
          foreach ($factory->keys as $property) {
              $this->$property = $factory->cache[$this->code][$property];
          }
          $this->_loaded = true;
      }

      /**
       * Retrieves a localised message.
       * @param $key string identifier of message
       * @return string localised message, or "[$key]" if there is none
       */
      public function getMessage($key) {
          if (!$this->_loaded) {
              $this->load();
          }
          return isset($this->messages[$key]) ? $this->messages[$key] : "[$key]";
      }

      /**
       * Retrieves a localised error name.
       * @param $int integer error number, corresponding to PHP's error
       *             reporting
       * @return string localised message, or "[Error: $int]" if there
       *         is none
       */
      public function getErrorName($int) {
          if (!$this->_loaded) {
              $this->load();
          }
          return isset($this->errorNames[$int]) ? $this->errorNames[$int] : "[Error: $int]";
      }

      /**
       * Converts an array list into a string readable representation.
       * Items are joined with the 'Item separator' message, except for
       * the last join, which uses 'Item separator last'.
       * @param $array List (indexed from 0) of items to join
       * @return string joined items; empty string for an empty list
       */
      public function listify($array) {
          $sep      = $this->getMessage('Item separator');
          $sep_last = $this->getMessage('Item separator last');
          $total    = count($array);
          $ret      = '';
          for ($i = 0; $i < $total; $i++) {
              if ($i > 0) {
                  // the final join gets the special "last" separator
                  $ret .= ($i + 1 < $total) ? $sep : $sep_last;
              }
              $ret .= $array[$i];
          }
          return $ret;
      }

      /**
       * Formats a localised message with passed parameters.
       *
       * For an argument with key N the following placeholders are
       * substituted: scalars fill "$N"; lists fill "$N" (listified);
       * associative arrays fill "$N.Keys" and "$N.Values"; tokens fill
       * "$N.Name", "$N.Data", "$N.Serialized", "$N.Compact" and
       * "$N.Line". Other objects are ignored.
       *
       * @param $key string identifier of message
       * @param $args Parameters to substitute in
       * @return string localised message, or "[$key]" if there is none
       * @todo Implement conditionals? Right now, some messages make
       *     reference to line numbers, but those aren't always available
       */
      public function formatMessage($key, $args = array()) {
          if (!$this->_loaded) {
              $this->load();
          }
          if (!isset($this->messages[$key])) {
              return "[$key]";
          }

          $subst     = array();
          $generator = false;

          foreach ($args as $i => $value) {
              $prefix = '$' . $i;

              if (is_object($value)) {
                  if (!($value instanceof HTMLPurifier_Token)) {
                      // objects other than tokens contribute nothing
                      continue;
                  }
                  // factor this out some time
                  if (!$generator) {
                      $generator = $this->context->get('Generator');
                  }
                  if (isset($value->name)) {
                      $subst[$prefix . '.Name'] = $value->name;
                  }
                  if (isset($value->data)) {
                      $subst[$prefix . '.Data'] = $value->data;
                  }
                  $serialized = $generator->generateFromToken($value);
                  $subst[$prefix . '.Serialized'] = $serialized;
                  $subst[$prefix . '.Compact']    = $serialized;
                  // a more complex algorithm for compact representation
                  // could be introduced for all types of tokens. This
                  // may need to be factored out into a dedicated class
                  if (!empty($value->attr)) {
                      $bare = clone $value;
                      $bare->attr = array();
                      $subst[$prefix . '.Compact'] = $generator->generateFromToken($bare);
                  }
                  if ($value->line) {
                      $subst[$prefix . '.Line'] = $value->line;
                  } else {
                      $subst[$prefix . '.Line'] = 'unknown';
                  }
                  continue;
              }

              if (is_array($value)) {
                  $keys = array_keys($value);
                  if (array_keys($keys) === $keys) {
                      // list
                      $subst[$prefix] = $this->listify($value);
                  } else {
                      // associative array
                      // no $i implementation yet, sorry
                      $subst[$prefix . '.Keys']   = $this->listify($keys);
                      $subst[$prefix . '.Values'] = $this->listify(array_values($value));
                  }
                  continue;
              }

              $subst[$prefix] = $value;
          }

          return strtr($this->messages[$key], $subst);
      }

  }
  4983. /**
  4984. * Class responsible for generating HTMLPurifier_Language objects, managing
  4985. * caching and fallbacks.
  4986. * @note Thanks to MediaWiki for the general logic, although this version
  4987. * has been entirely rewritten
  4988. * @todo Serialized cache for languages
  4989. */
  class HTMLPurifier_LanguageFactory
  {

      /**
       * Cache of language code information used to load HTMLPurifier_Language objects
       * Structure is: $factory->cache[$language_code][$key] = $value
       * @value array map
       */
      public $cache;

      /**
       * Valid keys in the HTMLPurifier_Language object. Designates which
       * variables to slurp out of a message file.
       * @value array list
       */
      public $keys = array('fallback', 'messages', 'errorNames');

      /**
       * Instance of HTMLPurifier_AttrDef_Lang to validate language codes
       * @value object HTMLPurifier_AttrDef_Lang
       */
      protected $validator;

      /**
       * Base directory under which the Language/ directory is looked up
       * (HTMLPURIFIER_PREFIX . '/HTMLPurifier'), without trailing slash
       * @value string filename
       */
      protected $dir;

      /**
       * Keys whose contents are a hash map and can be merged
       * @value array lookup
       */
      protected $mergeable_keys_map = array('messages' => true, 'errorNames' => true);

      /**
       * Keys whose contents are a list and can be merged
       * @value array lookup
       */
      protected $mergeable_keys_list = array();

      /**
       * Language codes whose fallback chain is currently being loaded by
       * loadLanguage(); used to detect circular fallback references.
       * Entries are removed again once the load has finished.
       * @value array lookup
       */
      protected $languages_loading = array();

      /**
       * Retrieve sole instance of the factory.
       * @param $prototype Optional prototype to overload sole instance with,
       *                   or bool true to reset to default factory.
       * @return The sole factory instance
       */
      public static function instance($prototype = null) {
          static $instance = null;
          if ($prototype === true || ($prototype === null && $instance === null)) {
              // first use, or an explicit request to reset: build the
              // default factory (true must never be stored as the instance)
              $instance = new HTMLPurifier_LanguageFactory();
              $instance->setup();
          } elseif ($prototype !== null) {
              $instance = $prototype;
          }
          return $instance;
      }

      /**
       * Sets up the singleton, much like a constructor
       * @note Prevents people from getting this outside of the singleton
       */
      public function setup() {
          $this->validator = new HTMLPurifier_AttrDef_Lang();
          $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
      }

      /**
       * Creates a language object, handles class fallbacks
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @param $code Code to override configuration with. Private parameter.
       * @return Language object whose code is set to the validated code;
       *         its error flag is set when no fallback was available and
       *         English had to be used
       */
      public function create($config, $context, $code = false) {

          // validate language code
          if ($code === false) {
              $code = $this->validator->validate(
                $config->get('Core.Language'), $config, $context
              );
          } else {
              $code = $this->validator->validate($code, $config, $context);
          }
          if ($code === false) $code = 'en'; // malformed code becomes English

          $pcode = str_replace('-', '_', $code); // make valid PHP classname

          if ($code == 'en') {
              $lang = new HTMLPurifier_Language($config, $context);
          } else {
              $class = 'HTMLPurifier_Language_' . $pcode;
              $file  = $this->dir . '/Language/classes/' . $code . '.php';
              if (file_exists($file) || class_exists($class, false)) {
                  $lang = new $class($config, $context);
              } else {
                  // Go fallback
                  $raw_fallback = $this->getFallbackFor($code);
                  $fallback = $raw_fallback ? $raw_fallback : 'en';
                  $lang = $this->create($config, $context, $fallback);
                  if (!$raw_fallback) {
                      $lang->error = true;
                  }
              }
          }

          $lang->code = $code;

          return $lang;

      }

      /**
       * Returns the fallback language for language
       * @note Loads the original language into cache
       * @param $code string language code
       * @return Fallback value stored in the cache for that language
       */
      public function getFallbackFor($code) {
          $this->loadLanguage($code);
          return $this->cache[$code]['fallback'];
      }

      /**
       * Loads language into the cache, handles message file and fallbacks
       *
       * The message file is included in this method's scope, so the
       * variables named in $this->keys ($fallback, $messages,
       * $errorNames) are picked up from it. A circular fallback chain is
       * reported through trigger_error() and then broken by falling back
       * to English.
       *
       * @param $code string language code
       */
      public function loadLanguage($code) {

          // abort if we've already loaded it
          if (isset($this->cache[$code])) return;

          // generate filename
          $filename = $this->dir . '/Language/messages/' . $code . '.php';

          // default fallback : may be overwritten by the ensuing include
          $fallback = ($code != 'en') ? 'en' : false;

          // load primary localisation
          if (!file_exists($filename)) {
              // skip the include: will rely solely on fallback
              $cache = array();
          } else {
              include $filename;
              $cache = compact($this->keys);
          }

          // infinite recursion guard: this language is already being
          // loaded further up the call stack, so its fallback chain loops
          $already_loading = isset($this->languages_loading[$code]);
          if (!empty($fallback) && $already_loading) {
              trigger_error('Circular fallback reference in language ' .
                  $code, E_USER_ERROR);
              // English is the last resort; English itself gets none
              $fallback = ($code != 'en') ? 'en' : false;
          }

          // load fallback localisation
          if (!empty($fallback)) {

              $this->languages_loading[$code] = true;

              // load the fallback recursively
              $this->loadLanguage($fallback);
              $fallback_cache = $this->cache[$fallback];

              // only the outermost load of a language clears its marker
              if (!$already_loading) {
                  unset($this->languages_loading[$code]);
              }

              // merge fallback with current language
              foreach ( $this->keys as $key ) {
                  if (isset($cache[$key]) && isset($fallback_cache[$key])) {
                      if (isset($this->mergeable_keys_map[$key])) {
                          // own entries win over the fallback's
                          $cache[$key] = $cache[$key] + $fallback_cache[$key];
                      } elseif (isset($this->mergeable_keys_list[$key])) {
                          $cache[$key] = array_merge( $fallback_cache[$key], $cache[$key] );
                      }
                  } else {
                      $cache[$key] = $fallback_cache[$key];
                  }
              }

          }

          // save to cache for later retrieval
          $this->cache[$code] = $cache;

          return;
      }

  }
  5148. /**
  5149. * Represents a measurable length, with a string numeric magnitude
  5150. * and a unit. This object is immutable.
  5151. */
  class HTMLPurifier_Length
  {

      /**
       * String numeric magnitude.
       */
      protected $n;

      /**
       * String unit. False is permitted if $n = 0.
       */
      protected $unit;

      /**
       * Whether or not this length is valid. Null if not calculated yet.
       */
      protected $isValid;

      /**
       * Lookup array of units recognized by CSS 2.1
       */
      protected static $allowedUnits = array(
          'em' => true, 'ex' => true, 'px' => true, 'in' => true,
          'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true
      );

      /**
       * @param number $n Magnitude
       * @param string $u Unit, or false for none
       */
      public function __construct($n = '0', $u = false) {
          $this->n = (string) $n;
          $this->unit = $u !== false ? (string) $u : false;
      }

      /**
       * @param string $s Unit string, like '2em' or '3.4in'. An existing
       *                  HTMLPurifier_Length is returned unchanged.
       * @return HTMLPurifier_Length
       * @warning Does not perform validation.
       */
      static public function make($s) {
          if ($s instanceof HTMLPurifier_Length) return $s;
          // the leading run of digits, signs and dots is the magnitude,
          // whatever follows is the unit
          $n_length = strspn($s, '1234567890.+-');
          $n = substr($s, 0, $n_length);
          $unit = substr($s, $n_length);
          if ($unit === '') $unit = false;
          return new HTMLPurifier_Length($n, $unit);
      }

      /**
       * Validates the number and unit. As a side effect the unit is
       * lower-cased and the magnitude is replaced by its validated form.
       * @return bool true if the length is valid
       */
      protected function validate() {
          // Special case: signed zero is plain zero
          if ($this->n === '+0' || $this->n === '-0') $this->n = '0';
          if ($this->n === '0' && $this->unit === false) return true;
          // Only zero may go without a unit; bail out before handing a
          // non-string to the string functions below
          if ($this->unit === false) return false;
          if (!ctype_lower($this->unit)) $this->unit = strtolower($this->unit);
          if (!isset(self::$allowedUnits[$this->unit])) return false;
          // Hack:
          $def = new HTMLPurifier_AttrDef_CSS_Number();
          $result = $def->validate($this->n, false, false);
          if ($result === false) return false;
          $this->n = $result;
          return true;
      }

      /**
       * Returns string representation of number.
       * @return string magnitude followed by unit, or false if this
       *         length is not valid
       */
      public function toString() {
          if (!$this->isValid()) return false;
          return $this->n . $this->unit;
      }

      /**
       * Retrieves string numeric magnitude.
       */
      public function getN() {return $this->n;}

      /**
       * Retrieves string unit.
       */
      public function getUnit() {return $this->unit;}

      /**
       * Returns true if this length unit is valid. The result of the
       * first validation is remembered.
       */
      public function isValid() {
          if ($this->isValid === null) $this->isValid = $this->validate();
          return $this->isValid;
      }

      /**
       * Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
       * If the units differ, the other length is first converted to this
       * length's unit.
       * @param $l HTMLPurifier_Length to compare against, or false
       * @return int 1, -1 or 0; false if $l is false or cannot be converted
       * @warning If both values are too large or small, this calculation will
       *          not work properly
       */
      public function compareTo($l) {
          if ($l === false) return false;
          if ($l->unit !== $this->unit) {
              $converter = new HTMLPurifier_UnitConverter();
              $l = $converter->convert($l, $this->unit);
              if ($l === false) return false;
          }
          // reduce the numeric difference to its sign, as documented
          $difference = $this->n - $l->n;
          if ($difference > 0) return 1;
          if ($difference < 0) return -1;
          return 0;
      }

  }
  5246. /**
  5247. * Forgivingly lexes HTML (SGML-style) markup into tokens.
  5248. *
  5249. * A lexer parses a string of SGML-style markup and converts them into
  5250. * corresponding tokens. It doesn't check for well-formedness, although its
  5251. * internal mechanism may make this automatic (such as the case of
  5252. * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
  5253. * from.
  5254. *
  5255. * A lexer is HTML-oriented: it might work with XML, but it's not
  5256. * recommended, as we adhere to a subset of the specification for optimization
  5257. * reasons. This might change in the future. Also, most tokenizers are not
  5258. * expected to handle DTDs or PIs.
  5259. *
  5260. * This class should not be directly instantiated, but you may use create() to
  5261. * retrieve a default copy of the lexer. Being a supertype, this class
  5262. * does not actually define any implementation, but offers commonly used
  5263. * convenience functions for subclasses.
  5264. *
  5265. * @note The unit tests will instantiate this class for testing purposes, as
  5266. * many of the utility functions require a class to be instantiated.
  5267. * This means that, even though this class is not runnable, it will
  5268. * not be declared abstract.
  5269. *
  5270. * @par
  5271. *
  5272. * @note
  5273. * We use tokens rather than create a DOM representation because DOM would:
  5274. *
  5275. * @par
  5276. * -# Require more processing and memory to create,
  5277. * -# Is not streamable, and
  5278. * -# Has the entire document structure (html and body not needed).
  5279. *
  5280. * @par
  5281. * However, DOM is helpful in that it makes it easy to move around nodes
  5282. * without a lot of lookaheads to see when a tag is closed. This is a
  5283. * limitation of the token system and some workarounds would be nice.
  5284. */
  class HTMLPurifier_Lexer
  {

      /**
       * Whether or not this lexer implements line-number/column-number tracking.
       * If it does, set to true.
       */
      public $tracksLineNumbers = false;

      /**
       * Entity parser used by parseData() and normalize(); created in the
       * constructor. Declared public because it used to be created as an
       * undeclared (and therefore public) property.
       */
      public $_entity_parser;

      // -- STATIC ----------------------------------------------------------

      /**
       * Retrieves or sets the default Lexer as a Prototype Factory.
       *
       * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
       * a few exceptions involving special features that only DirectLex
       * implements.
       *
       * @note The behavior of this class has changed, rather than accepting
       *       a prototype object, it now accepts a configuration object.
       *       To specify your own prototype, set %Core.LexerImpl to it.
       *       This change in behavior de-singletonizes the lexer object.
       *
       * @param $config Instance of HTMLPurifier_Config. Passing a lexer
       *                prototype instead is deprecated: it raises a
       *                warning and the prototype is returned as-is.
       * @return Concrete lexer.
       * @throws HTMLPurifier_Exception if the lexer type is not recognized,
       *         no lexer could be instantiated, or line numbers are
       *         needed but the lexer does not track them
       */
      public static function create($config) {

          if (!($config instanceof HTMLPurifier_Config)) {
              $lexer = $config;
              trigger_error("Passing a prototype to\nHTMLPurifier_Lexer::create() is deprecated, please instead\nuse %Core.LexerImpl", E_USER_WARNING);
              // a prototype carries no configuration, so there is nothing
              // to ask about line-number tracking
              $needs_tracking = false;
          } else {
              $lexer = $config->get('Core.LexerImpl');
              $needs_tracking =
                  $config->get('Core.MaintainLineNumbers') ||
                  $config->get('Core.CollectErrors');
          }

          $inst = null;
          if (is_object($lexer)) {
              $inst = $lexer;
          } else {

              if (is_null($lexer)) {
                  // auto-detection algorithm
                  if ($needs_tracking) {
                      $lexer = 'DirectLex';
                  } elseif (
                      class_exists('DOMDocument') &&
                      method_exists('DOMDocument', 'loadHTML') &&
                      !extension_loaded('domxml')
                  ) {
                      // check for DOM support, because while it's part of the
                      // core, it can be disabled compile time. Also, the PECL
                      // domxml extension overrides the default DOM, and is evil
                      // and nasty and we shan't bother to support it
                      $lexer = 'DOMLex';
                  } else {
                      $lexer = 'DirectLex';
                  }
              }

              // instantiate recognized string names
              switch ($lexer) {
                  case 'DOMLex':
                      $inst = new HTMLPurifier_Lexer_DOMLex();
                      break;
                  case 'DirectLex':
                      $inst = new HTMLPurifier_Lexer_DirectLex();
                      break;
                  case 'PH5P':
                      $inst = new HTMLPurifier_Lexer_PH5P();
                      break;
                  default:
                      throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
              }
          }

          if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');

          // once PHP DOM implements native line numbers, or we
          // hack out something using XSLT, remove this stipulation
          if ($needs_tracking && !$inst->tracksLineNumbers) {
              throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
          }

          return $inst;

      }

      // -- CONVENIENCE MEMBERS ---------------------------------------------

      public function __construct() {
          $this->_entity_parser = new HTMLPurifier_EntityParser();
      }

      /**
       * Most common entity to raw value conversion table for special entities.
       */
      protected $_special_entity2str =
              array(
                      '&quot;' => '"',
                      '&amp;'  => '&',
                      '&lt;'   => '<',
                      '&gt;'   => '>',
                      '&#39;'  => "'",
                      '&#039;' => "'",
                      '&#x27;' => "'"
              );

      /**
       * Parses special entities into the proper characters.
       *
       * This string will translate escaped versions of the special characters
       * into the correct ones.
       *
       * @warning
       * You should be able to treat the output of this function as
       * completely parsed, but that's only because all other entities should
       * have been handled previously in substituteNonSpecialEntities()
       *
       * @param $string String character data to be parsed.
       * @returns Parsed character data.
       */
      public function parseData($string) {

          // following functions require at least one character
          if ($string === '') return '';

          // subtracts amps that cannot possibly be escaped
          $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
              ($string[strlen($string)-1] === '&' ? 1 : 0);

          if (!$num_amp) return $string; // abort if no entities
          $num_esc_amp = substr_count($string, '&amp;');
          $string = strtr($string, $this->_special_entity2str);

          // code duplication for sake of optimization, see above
          $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
              ($string[strlen($string)-1] === '&' ? 1 : 0);

          // every remaining ampersand came from an &amp;, so we are done
          if ($num_amp_2 <= $num_esc_amp) return $string;

          // hmm... now we have some uncommon entities. Use the callback.
          $string = $this->_entity_parser->substituteSpecialEntities($string);
          return $string;
      }

      /**
       * Lexes an HTML string into tokens.
       *
       * This base implementation only raises an error; subclasses
       * provide the actual lexing.
       *
       * @param $string String HTML.
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return HTMLPurifier_Token array representation of HTML.
       */
      public function tokenizeHTML($string, $config, $context) {
          trigger_error('Call to abstract class', E_USER_ERROR);
      }

      /**
       * Translates CDATA sections into regular sections (through escaping).
       *
       * @param $string HTML string to process.
       * @returns HTML with CDATA sections escaped.
       */
      protected static function escapeCDATA($string) {
          return preg_replace_callback(
              '/<!\[CDATA\[(.+?)\]\]>/s',
              array('HTMLPurifier_Lexer', 'CDATACallback'),
              $string
          );
      }

      /**
       * Special CDATA case that is especially convoluted for <script>
       *
       * @param $string HTML string to process.
       * @returns HTML with commented CDATA sections escaped.
       */
      protected static function escapeCommentedCDATA($string) {
          return preg_replace_callback(
              '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
              array('HTMLPurifier_Lexer', 'CDATACallback'),
              $string
          );
      }

      /**
       * Special Internet Explorer conditional comments should be removed.
       *
       * @param $string HTML string to process.
       * @returns HTML with IE conditional comments stripped.
       */
      protected static function removeIEConditional($string) {
          return preg_replace(
              '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
              '',
              $string
          );
      }

      /**
       * Callback function for escapeCDATA() that does the work.
       *
       * @warning Calling it directly is not recommended; it exists to be
       *          used as the preg_replace_callback() callback.
       * @params $matches PCRE matches array, with index 0 the entire match
       *                  and 1 the inside of the CDATA section.
       * @returns Escaped internals of the CDATA section.
       */
      protected static function CDATACallback($matches) {
          // not exactly sure why the character set is needed, but whatever
          return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
      }

      /**
       * Takes a piece of HTML and normalizes it by converting entities, fixing
       * encoding, extracting bits, and other good stuff.
       *
       * @param $html String HTML to normalize
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return Normalized HTML string
       * @todo Consider making protected
       */
      public function normalize($html, $config, $context) {

          // normalize newlines to \n
          if ($config->get('Core.NormalizeNewlines')) {
              $html = str_replace("\r\n", "\n", $html);
              $html = str_replace("\r", "\n", $html);
          }

          if ($config->get('HTML.Trusted')) {
              // escape convoluted CDATA
              $html = $this->escapeCommentedCDATA($html);
          }

          // escape CDATA
          $html = $this->escapeCDATA($html);

          $html = $this->removeIEConditional($html);

          // extract body from document if applicable
          if ($config->get('Core.ConvertDocumentToFragment')) {
              $e = false;
              if ($config->get('Core.CollectErrors')) {
                  $e =& $context->get('ErrorCollector');
              }
              $new_html = $this->extractBody($html);
              if ($e && $new_html != $html) {
                  $e->send(E_WARNING, 'Lexer: Extracted body');
              }
              $html = $new_html;
          }

          // expand entities that aren't the big five
          $html = $this->_entity_parser->substituteNonSpecialEntities($html);

          // clean into wellformed UTF-8 string for an SGML context: this has
          // to be done after entity expansion because the entities sometimes
          // represent non-SGML characters (horror, horror!)
          $html = HTMLPurifier_Encoder::cleanUTF8($html);

          // if processing instructions are to be removed, remove them now
          if ($config->get('Core.RemoveProcessingInstructions')) {
              $html = preg_replace('#<\?.+?\?>#s', '', $html);
          }

          return $html;
      }

      /**
       * Takes a string of HTML (fragment or document) and returns the content
       * of its body element, or the string itself if no body element is found.
       *
       * @param $html String HTML fragment or document
       * @return String contents of the body, or $html unchanged
       * @todo Consider making protected
       */
      public function extractBody($html) {
          $matches = array();
          $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
          if ($result) {
              return $matches[1];
          } else {
              return $html;
          }
      }

  }
  5526. /**
  5527. * Class that handles operations involving percent-encoding in URIs.
  5528. *
  5529. * @warning
  5530. * Be careful when reusing instances of PercentEncoder. The object
  5531. * you use for normalize() SHOULD NOT be used for encode(), or
  5532. * vice-versa.
  5533. */
  5534. class HTMLPurifier_PercentEncoder
  5535. {
  5536. /**
  5537. * Lookup of byte values (ord() results, used as array keys) that encode() leaves unescaped.
  5538. */
  5539. protected $preserve = array();
  5540. /**
  5541. * String of characters that should be preserved while using encode().
  5542. */
  5543. public function __construct($preserve = false) {
  5544. // unreserved letters, ought to const-ify
  5545. for ($i = 48; $i <= 57; $i++) $this->preserve[$i] = true; // digits
  5546. for ($i = 65; $i <= 90; $i++) $this->preserve[$i] = true; // upper-case
  5547. for ($i = 97; $i <= 122; $i++) $this->preserve[$i] = true; // lower-case
  5548. $this->preserve[45] = true; // Dash -
  5549. $this->preserve[46] = true; // Period .
  5550. $this->preserve[95] = true; // Underscore _
  5551. $this->preserve[126]= true; // Tilde ~
  5552. // extra letters not to escape
  5553. if ($preserve !== false) {
  5554. for ($i = 0, $c = strlen($preserve); $i < $c; $i++) {
  5555. $this->preserve[ord($preserve[$i])] = true;
  5556. }
  5557. }
  5558. }
  5559. /**
  5560. * Our replacement for urlencode, it encodes all non-reserved characters,
  5561. * as well as any extra characters that were instructed to be preserved.
  5562. * @note
  5563. * Assumes that the string has already been normalized, making any
  5564. * and all percent escape sequences valid. Percents will not be
  5565. * re-escaped, regardless of their status in $preserve
  5566. * @param $string String to be encoded
  5567. * @return Encoded string.
  5568. */
  5569. public function encode($string) {
  5570. $ret = '';
  5571. for ($i = 0, $c = strlen($string); $i < $c; $i++) {
  5572. if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])]) ) {
  5573. $ret .= '%' . sprintf('%02X', $int);
  5574. } else {
  5575. $ret .= $string[$i];
  5576. }
  5577. }
  5578. return $ret;
  5579. }
  5580. /**
  5581. * Fix up percent-encoding by decoding unreserved characters and normalizing.
  5582. * @warning This function is affected by $preserve, even though the
  5583. * usual desired behavior is for this not to preserve those
  5584. * characters. Be careful when reusing instances of PercentEncoder!
  5585. * @param $string String to normalize
  5586. */
  5587. public function normalize($string) {
  5588. if ($string == '') return '';
  5589. $parts = explode('%', $string);
  5590. $ret = array_shift($parts);
  5591. foreach ($parts as $part) {
  5592. $length = strlen($part);
  5593. if ($length < 2) {
  5594. $ret .= '%25' . $part;
  5595. continue;
  5596. }
  5597. $encoding = substr($part, 0, 2);
  5598. $text = substr($part, 2);
  5599. if (!ctype_xdigit($encoding)) {
  5600. $ret .= '%25' . $part;
  5601. continue;
  5602. }
  5603. $int = hexdec($encoding);
  5604. if (isset($this->preserve[$int])) {
  5605. $ret .= chr($int) . $text;
  5606. continue;
  5607. }
  5608. $encoding = strtoupper($encoding);
  5609. $ret .= '%' . $encoding . $text;
  5610. }
  5611. return $ret;
  5612. }
  5613. }
  /**
   * Generic property list implementation
   */
  class HTMLPurifier_PropertyList
  {
      /**
       * Internal data-structure for properties
       */
      protected $data = array();

      /**
       * Parent plist
       */
      protected $parent;

      /**
       * Result of the last squash(), or null when it must be rebuilt.
       */
      protected $cache;

      /**
       * @param $parent Optional parent plist consulted for missing keys
       */
      public function __construct($parent = null) {
          $this->parent = $parent;
      }

      /**
       * Recursively retrieves the value for a key
       * @param $name Key to look up
       * @return Value stored in this plist or, failing that, in a parent
       * @throws HTMLPurifier_Exception if no plist in the chain has the key
       */
      public function get($name) {
          if ($this->has($name)) return $this->data[$name];
          // possible performance bottleneck, convert to iterative if necessary
          if ($this->parent) return $this->parent->get($name);
          throw new HTMLPurifier_Exception("Key '$name' not found");
      }

      /**
       * Sets the value of a key, for this plist
       * @param $name Key to set
       * @param $value Value to store
       */
      public function set($name, $value) {
          $this->data[$name] = $value;
          // the squashed array no longer reflects $data
          $this->cache = null;
      }

      /**
       * Returns true if a given key exists in this plist (parents are
       * not consulted). A key holding null still counts as existing.
       */
      public function has($name) {
          return array_key_exists($name, $this->data);
      }

      /**
       * Resets a value to the value of it's parent, usually the default. If
       * no value is specified, the entire plist is reset.
       * @note The comparison against null is loose, so '' and 0 also
       *       reset the entire plist.
       * @param $name Key to reset, or null for everything
       */
      public function reset($name = null) {
          if ($name == null) $this->data = array();
          else unset($this->data[$name]);
          // the squashed array no longer reflects $data
          $this->cache = null;
      }

      /**
       * Squashes this property list and all of its property lists into a single
       * array, and returns the array. This value is cached by default; the
       * cache is dropped whenever this plist itself is modified.
       * @note Changes made to a parent plist do not invalidate the cache
       *       of this plist; pass $force = true in that case.
       * @param $force If true, ignores the cache and regenerates the array.
       * @return Array of key => value, own values overriding parent values
       */
      public function squash($force = false) {
          if ($this->cache !== null && !$force) return $this->cache;
          if ($this->parent) {
              return $this->cache = array_merge($this->parent->squash($force), $this->data);
          } else {
              return $this->cache = $this->data;
          }
      }

      /**
       * Returns the parent plist.
       */
      public function getParent() {
          return $this->parent;
      }

      /**
       * Sets the parent plist.
       * @param $plist New parent plist
       */
      public function setParent($plist) {
          $this->parent = $plist;
          // inherited values may differ under the new parent
          $this->cache = null;
      }
  }
  /**
   * Property list iterator. Do not instantiate this class directly.
   */
  class HTMLPurifier_PropertyListIterator extends FilterIterator
  {

      /**
       * Length, in bytes, of the prefix in $filter.
       */
      protected $l;

      /**
       * Key prefix an entry must start with to be accepted.
       */
      protected $filter;

      /**
       * @param $iterator Iterator over the data to filter
       * @param $filter Optional prefix; only entries whose key starts with
       *        it are allowed through. Null (the default) allows everything.
       */
      public function __construct(Iterator $iterator, $filter = null) {
          parent::__construct($iterator);
          // cast first: handing null to strlen() is deprecated as of PHP 8.1
          $this->l = strlen((string) $filter);
          $this->filter = $filter;
      }

      /**
       * Decides whether the current entry of the inner iterator is kept.
       * @return True if the current key starts with the filter prefix
       */
      #[\ReturnTypeWillChange]
      public function accept() {
          $key = $this->getInnerIterator()->key();
          // with an empty prefix the compared length is 0, which always matches
          return strncmp((string) $key, (string) $this->filter, $this->l) === 0;
      }

  }
  /**
   * Supertype for classes that define a strategy for modifying/purifying tokens.
   *
   * While HTMLPurifier's core purpose is fixing HTML into something proper,
   * strategies provide plug points for extra configuration or even extra
   * features, such as custom tags, custom parsing of text, etc.
   */
  abstract class HTMLPurifier_Strategy
  {

      /**
       * Runs the strategy over a list of tokens.
       *
       * @param $tokens Array of HTMLPurifier_Token objects to be operated on.
       * @param $config Configuration options
       * @param $context Context object handed to the strategy
       * @returns Processed array of token objects.
       */
      abstract public function execute($tokens, $config, $context);

  }
  /**
   * This is in almost every respect equivalent to an array except
   * that it keeps track of which keys were accessed.
   *
   * @warning For the sake of backwards compatibility with early versions
   *     of PHP 5, you must not use the $hash[$key] syntax; if you do
   *     our version of offsetGet is never called.
   */
  class HTMLPurifier_StringHash extends ArrayObject
  {
      /**
       * Lookup of the indexes read so far: array($index => true).
       */
      protected $accessed = array();

      /**
       * Retrieves a value, and logs the access.
       * @note The attribute below silences the PHP 8.1+ deprecation about
       *       the missing return type of the overridden method; older
       *       PHP versions parse that line as a comment.
       * @param $index Index to read
       * @return Value stored at $index
       */
      #[\ReturnTypeWillChange]
      public function offsetGet($index) {
          $this->accessed[$index] = true;
          return parent::offsetGet($index);
      }

      /**
       * Returns a lookup array of all array indexes that have been accessed.
       * @return Array in form array($index => true).
       */
      public function getAccessed() {
          return $this->accessed;
      }

      /**
       * Resets the access array.
       */
      public function resetAccessed() {
          $this->accessed = array();
      }
  }
  /**
   * Parses string hash files. File format is as such:
   *
   *      DefaultKeyValue
   *      KEY: Value
   *      KEY2: Value2
   *      --MULTILINE-KEY--
   *      Multiline
   *      value.
   *
   * Which would output something similar to:
   *
   *      array(
   *          'ID' => 'DefaultKeyValue',
   *          'KEY' => 'Value',
   *          'KEY2' => 'Value2',
   *          'MULTILINE-KEY' => "Multiline\nvalue.\n",
   *      )
   *
   * We use this as an easy to use file-format for configuration schema
   * files, but the class itself is usage agnostic.
   *
   * You can use ---- to forcibly terminate parsing of a single string-hash;
   * this marker is used in multi string-hashes to delimit boundaries.
   */
  class HTMLPurifier_StringHashParser
  {

      /**
       * Key under which a single line without a "KEY:" prefix is stored.
       */
      public $default = 'ID';

      /**
       * Parses a file that contains a single string-hash.
       * @param $file Path of the file to parse
       * @return Parsed array, or false if the file is missing or cannot
       *         be opened.
       */
      public function parseFile($file) {
          if (!file_exists($file)) return false;
          $handle = fopen($file, 'r');
          if (!$handle) return false;
          $hash = $this->parseHandle($handle);
          fclose($handle);
          return $hash;
      }

      /**
       * Parses a file that contains multiple string-hashes delimited by '----'
       * @param $file Path of the file to parse
       * @return List of parsed arrays, or false if the file is missing or
       *         cannot be opened.
       */
      public function parseMultiFile($file) {
          if (!file_exists($file)) return false;
          $hashes = array();
          $handle = fopen($file, 'r');
          if (!$handle) return false;
          while (!feof($handle)) {
              $hashes[] = $this->parseHandle($handle);
          }
          fclose($handle);
          return $hashes;
      }

      /**
       * Internal parser that accepts a file handle. Reads lines until the
       * end of the file or a '----' line and returns what was collected.
       * @note While it's possible to simulate in-memory parsing by using
       *       custom stream wrappers, if such a use-case arises we should
       *       factor out the file handle into its own class.
       * @param $fh File handle with pointer at start of valid string-hash
       *            block.
       * @return Array of key => string value
       */
      protected function parseHandle($fh) {
          // key of the multiline block being filled, falsy outside of one
          $key = false;
          $hash = array();
          do {
              $raw = fgets($fh);
              if ($raw === false) break;
              $text = rtrim($raw, "\n\r");

              // blank lines only matter inside a multiline block
              if ($text === '' && !$key) continue;
              // end of this string-hash
              if ($text === '----') break;
              // comment
              if (strncmp('--#', $text, 3) === 0) continue;

              if (strncmp('--', $text, 2) === 0) {
                  // multiline declaration: following lines belong to this key
                  $key = trim($text, '- ');
                  if (!isset($hash[$key])) $hash[$key] = '';
                  continue;
              }

              if ($key) {
                  // body line of the current multiline block
                  $hash[$key] .= "$text\n";
                  continue;
              }

              if (strpos($text, ':') !== false) {
                  // single-line declaration
                  $pair = explode(':', $text, 2);
                  $hash[$pair[0]] = trim($pair[1]);
              } else {
                  // use default declaration
                  $hash[$this->default] = $text;
              }
              $key = false;
          } while (!feof($fh));
          return $hash;
      }

  }
  /**
   * Defines a mutation of an obsolete tag into a valid tag.
   */
  abstract class HTMLPurifier_TagTransform
  {

      /**
       * Tag name to transform the tag to.
       */
      public $transform_to;

      /**
       * Transforms the obsolete tag into the valid tag.
       * @param $tag Tag to be transformed.
       * @param $config Mandatory HTMLPurifier_Config object
       * @param $context Mandatory HTMLPurifier_Context object
       */
      abstract public function transform($tag, $config, $context);

      /**
       * Puts CSS in front of whatever the style attribute already holds;
       * a missing style attribute is treated as an empty one.
       * @warning Copied over from AttrTransform, be sure to keep in sync
       * @param $attr Attribute array to process (passed by reference)
       * @param $css CSS to prepend
       */
      protected function prependCSS(&$attr, $css) {
          $existing = '';
          if (isset($attr['style'])) {
              $existing = $attr['style'];
          }
          $attr['style'] = $css . $existing;
      }

  }
  /**
   * Base token class that all others inherit from.
   */
  class HTMLPurifier_Token {
      public $line; /**< Line number node was on in source document. Null if unknown. */
      public $col;  /**< Column of line node was on in source document. Null if unknown. */

      /**
       * Lookup array of processing that this token is exempt from.
       * Currently, valid values are "ValidateAttributes" and
       * "MakeWellFormed_TagClosedError"
       */
      public $armor = array();

      /**
       * Used during MakeWellFormed.
       */
      public $skip;
      public $rewind;
      public $carryover;

      /**
       * Emulates the deprecated "type" property. Reading it raises an
       * E_USER_NOTICE and yields a name derived from the token's class;
       * reading any other undefined property yields null.
       * @param $n Name of the property being read
       */
      public function __get($n) {
          if ($n !== 'type') {
              return;
          }
          trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
          $type_by_class = array(
              'HTMLPurifier_Token_Start'   => 'start',
              'HTMLPurifier_Token_Empty'   => 'empty',
              'HTMLPurifier_Token_End'     => 'end',
              'HTMLPurifier_Token_Text'    => 'text',
              'HTMLPurifier_Token_Comment' => 'comment',
          );
          $class = get_class($this);
          return isset($type_by_class[$class]) ? $type_by_class[$class] : null;
      }

      /**
       * Sets the position of the token in the source document.
       * @param $l Line number, null if unknown
       * @param $c Column number, null if unknown
       */
      public function position($l = null, $c = null) {
          $this->line = $l;
          $this->col  = $c;
      }

      /**
       * Convenience function for DirectLex settings line/col position.
       * A column of exactly -1 bumps the stored line number by one.
       * @param $l Line number
       * @param $c Column number
       */
      public function rawPosition($l, $c) {
          $this->line = ($c === -1) ? $l + 1 : $l;
          $this->col  = $c;
      }

  }
  /**
   * Factory for token generation.
   *
   * @note Doing some benchmarking indicates that the new operator is much
   *       slower than the clone operator (even discounting the cost of the
   *       constructor).  This class is for that optimization.
   *       Other then that, there's not much point as we don't
   *       maintain parallel HTMLPurifier_Token hierarchies (the main reason why
   *       you'd want to use an abstract factory).
   * @todo Port DirectLex to use this
   */
  class HTMLPurifier_TokenFactory
  {

      /**
       * Prototypes that will be cloned ("p" stands for prototype).
       * @private
       */
      private $p_start, $p_end, $p_empty, $p_text, $p_comment;

      /**
       * Generates blank prototypes for cloning.
       */
      public function __construct() {
          $this->p_start   = new HTMLPurifier_Token_Start('', array());
          $this->p_end     = new HTMLPurifier_Token_End('');
          $this->p_empty   = new HTMLPurifier_Token_Empty('', array());
          $this->p_text    = new HTMLPurifier_Token_Text('');
          $this->p_comment = new HTMLPurifier_Token_Comment('');
      }

      /**
       * Creates a HTMLPurifier_Token_Start by cloning the start prototype
       * and re-running its constructor with the given values.
       * @param $name Tag name
       * @param $attr Associative array of attributes
       * @return Generated HTMLPurifier_Token_Start
       */
      public function createStart($name, $attr = array()) {
          $token = clone $this->p_start;
          $token->__construct($name, $attr);
          return $token;
      }

      /**
       * Creates a HTMLPurifier_Token_End by cloning the end prototype
       * and re-running its constructor with the given name.
       * @param $name Tag name
       * @return Generated HTMLPurifier_Token_End
       */
      public function createEnd($name) {
          $token = clone $this->p_end;
          $token->__construct($name);
          return $token;
      }

      /**
       * Creates a HTMLPurifier_Token_Empty by cloning the empty prototype
       * and re-running its constructor with the given values.
       * @param $name Tag name
       * @param $attr Associative array of attributes
       * @return Generated HTMLPurifier_Token_Empty
       */
      public function createEmpty($name, $attr = array()) {
          $token = clone $this->p_empty;
          $token->__construct($name, $attr);
          return $token;
      }

      /**
       * Creates a HTMLPurifier_Token_Text by cloning the text prototype
       * and re-running its constructor with the given data.
       * @param $data Data of text token
       * @return Generated HTMLPurifier_Token_Text
       */
      public function createText($data) {
          $token = clone $this->p_text;
          $token->__construct($data);
          return $token;
      }

      /**
       * Creates a HTMLPurifier_Token_Comment by cloning the comment
       * prototype and re-running its constructor with the given data.
       * @param $data Data of comment token
       * @return Generated HTMLPurifier_Token_Comment
       */
      public function createComment($data) {
          $token = clone $this->p_comment;
          $token->__construct($data);
          return $token;
      }

  }
  /**
   * HTML Purifier's internal representation of a URI.
   * @note
   *      Internal data-structures are completely escaped. If the data needs
   *      to be used in a non-URI context (which is very unlikely), be sure
   *      to decode it first. The URI may not necessarily be well-formed until
   *      validate() is called.
   */
  class HTMLPurifier_URI
  {

      public $scheme, $userinfo, $host, $port, $path, $query, $fragment;

      /**
       * Stores the components; a non-null scheme is lower-cased and a
       * non-null port is cast to an integer.
       * @note Automatically normalizes scheme and port
       */
      public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
          if (!is_null($scheme) && !ctype_lower($scheme)) {
              $scheme = strtolower($scheme);
          }
          $this->scheme   = $scheme;
          $this->userinfo = $userinfo;
          $this->host     = $host;
          $this->port     = is_null($port) ? null : (int) $port;
          $this->path     = $path;
          $this->query    = $query;
          $this->fragment = $fragment;
      }

      /**
       * Retrieves a scheme object corresponding to the URI's scheme/default
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return Scheme object appropriate for validating this URI, or
       *         false when no scheme object could be obtained.
       */
      public function getSchemeObj($config, $context) {
          $registry = HTMLPurifier_URISchemeRegistry::instance();

          if ($this->scheme !== null) {
              $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
              // invalid scheme, clean it out
              return $scheme_obj ? $scheme_obj : false;
          }

          // no scheme: retrieve the default one
          $def = $config->getDefinition('URI');
          $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
          if ($scheme_obj) {
              return $scheme_obj;
          }
          // something funky happened to the default scheme object
          trigger_error(
              'Default scheme object "' . $def->defaultScheme . '" was not readable',
              E_USER_WARNING
          );
          return false;
      }

      /**
       * Generic validation method applicable for all schemes. May modify
       * this URI in order to get it into a compliant form.
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return True if validation/filtering succeeds, false if failure
       */
      public function validate($config, $context) {

          // ABNF definitions from RFC 3986
          $sub_delims = '!$&\'()*+,;=';
          $pchar = $sub_delims . ':@';

          // validate host: a host the validator rejects is dropped
          if (!is_null($this->host)) {
              $host_def = new HTMLPurifier_AttrDef_URI_Host();
              $checked_host = $host_def->validate($this->host, $config, $context);
              $this->host = ($checked_host === false) ? null : $checked_host;
          }

          // validate scheme
          // NOTE: It's not appropriate to check whether or not this
          // scheme is in our registry, since a URIFilter may convert a
          // URI that we don't allow into one we do.  So instead, we just
          // check if the scheme can be dropped because there is no host
          // and it is our default scheme.
          $scheme_without_host = !is_null($this->scheme) && is_null($this->host);
          if ($scheme_without_host || $this->host === '') {
              // support for relative paths is pretty abysmal when the
              // scheme is present, so axe it when possible
              $def = $config->getDefinition('URI');
              if ($def->defaultScheme === $this->scheme) {
                  $this->scheme = null;
              }
          }

          // validate username
          if (!is_null($this->userinfo)) {
              $userinfo_encoder = new HTMLPurifier_PercentEncoder($sub_delims . ':');
              $this->userinfo = $userinfo_encoder->encode($this->userinfo);
          }

          // validate port: anything outside 1..65535 is dropped
          if (!is_null($this->port) && ($this->port < 1 || $this->port > 65535)) {
              $this->port = null;
          }

          // validate path
          $segments_encoder = new HTMLPurifier_PercentEncoder($pchar . '/');
          if (!is_null($this->host)) { // this catches $this->host === ''
              // path-abempty (hier and relative)
              // http://www.example.com/my/path
              // //www.example.com/my/path (looks odd, but works, and
              //                            recognized by most browsers)
              // (this set is valid or invalid on a scheme by scheme
              // basis, so we'll deal with it later)
              // file:///my/path
              // ///my/path
              $this->path = $segments_encoder->encode($this->path);
          } elseif ($this->path === '') {
              // path-empty (hier and relative)
              $this->path = ''; // just to be safe
          } elseif ($this->path[0] === '/') {
              // path-absolute (hier and relative)
              // http:/my/path
              // /my/path
              if (strlen($this->path) >= 2 && $this->path[1] === '/') {
                  // This could happen if both the host gets stripped
                  // out
                  // http://my/path
                  // //my/path
                  $this->path = '';
              } else {
                  $this->path = $segments_encoder->encode($this->path);
              }
          } elseif (!is_null($this->scheme)) {
              // path-rootless (hier)
              // http:my/path
              // Short circuit evaluation means we don't need to check nz
              $this->path = $segments_encoder->encode($this->path);
          } else {
              // path-noscheme (relative)
              // my/path
              // (once again, not checking nz)
              // the first segment is encoded without ':' in the safe list
              $segment_nc_encoder = new HTMLPurifier_PercentEncoder($sub_delims . '@');
              $slash = strpos($this->path, '/');
              if ($slash === false) {
                  $this->path = $segment_nc_encoder->encode($this->path);
              } else {
                  $first_segment = substr($this->path, 0, $slash);
                  $remainder = substr($this->path, $slash);
                  $this->path =
                      $segment_nc_encoder->encode($first_segment) .
                      $segments_encoder->encode($remainder);
              }
          }

          // qf = query and fragment
          $qf_encoder = new HTMLPurifier_PercentEncoder($pchar . '/?');

          if (!is_null($this->query)) {
              $this->query = $qf_encoder->encode($this->query);
          }

          if (!is_null($this->fragment)) {
              $this->fragment = $qf_encoder->encode($this->fragment);
          }

          return true;

      }

      /**
       * Convert URI back to string
       * @return String URI appropriate for output
       */
      public function toString() {
          $result = '';

          // Reconstruct the result
          // One might wonder about parsing quirks from browsers after
          // this reconstruction.  Unfortunately, parsing behavior depends
          // on what *scheme* was employed (file:///foo is handled *very*
          // differently than http:///foo), so unfortunately we have to
          // defer to the schemes to do the right thing.
          if (!is_null($this->scheme)) {
              $result .= $this->scheme . ':';
          }

          // there is a rendering difference between a null authority
          // (http:foo-bar) and an empty string authority
          // (http:///foo-bar).
          if (!is_null($this->host)) {
              $result .= '//';
              if (!is_null($this->userinfo)) {
                  $result .= $this->userinfo . '@';
              }
              $result .= $this->host;
              if (!is_null($this->port)) {
                  $result .= ':' . $this->port;
              }
          }

          $result .= $this->path;
          if (!is_null($this->query)) {
              $result .= '?' . $this->query;
          }
          if (!is_null($this->fragment)) {
              $result .= '#' . $this->fragment;
          }

          return $result;
      }

  }
  /**
   * Definition holding the URI settings (base, host, default scheme) and
   * the chains of URI filters that are run before and after scheme
   * validation.
   */
  class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
  {

      public $type = 'URI';

      /**
       * Active filters with a false "post" flag, keyed by filter name.
       */
      protected $filters = array();

      /**
       * Active filters with a true "post" flag, keyed by filter name.
       */
      protected $postFilters = array();

      /**
       * Filters available for activation during setup, keyed by name.
       */
      protected $registeredFilters = array();

      /**
       * HTMLPurifier_URI object of the base specified at %URI.Base
       */
      public $base;

      /**
       * String host to consider "home" base, derived off of $base
       */
      public $host;

      /**
       * Name of default scheme based on %URI.DefaultScheme and %URI.Base
       */
      public $defaultScheme;

      /**
       * Registers the filters that ship with the library.
       */
      public function __construct() {
          $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
          $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
          $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
          $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
          $this->registerFilter(new HTMLPurifier_URIFilter_Munge());
      }

      /**
       * Makes a filter available under its name; it only becomes active
       * if setupFilters() finds the matching URI.* directive set.
       * @param $filter Filter object to register
       */
      public function registerFilter($filter) {
          $this->registeredFilters[$filter->name] = $filter;
      }

      /**
       * Prepares a filter and activates it in the chain selected by its
       * "post" flag. A filter whose prepare() returns false is skipped.
       * @param $filter Filter object to activate
       * @param $config Instance of HTMLPurifier_Config
       */
      public function addFilter($filter, $config) {
          $prepared = $filter->prepare($config);
          if ($prepared === false) return; // null is ok, for backwards compat
          if ($filter->post) {
              $this->postFilters[$filter->name] = $filter;
          } else {
              $this->filters[$filter->name] = $filter;
          }
      }

      /**
       * Sets up member variables first, then the filters.
       * @param $config Instance of HTMLPurifier_Config
       */
      protected function doSetup($config) {
          $this->setupMemberVariables($config);
          $this->setupFilters($config);
      }

      /**
       * Activates every registered filter whose "URI.<name>" directive is
       * neither false nor null, then discards the registration list.
       * @param $config Instance of HTMLPurifier_Config
       */
      protected function setupFilters($config) {
          foreach ($this->registeredFilters as $name => $filter) {
              $directive = $config->get('URI.' . $name);
              if ($directive === false || $directive === null) {
                  continue;
              }
              $this->addFilter($filter, $config);
          }
          unset($this->registeredFilters);
      }

      /**
       * Derives $base, $host and $defaultScheme from the URI.Host,
       * URI.Base and URI.DefaultScheme directives. An explicit URI.Host
       * wins over the host of the base; the scheme of the base wins over
       * URI.DefaultScheme.
       * @param $config Instance of HTMLPurifier_Config
       */
      protected function setupMemberVariables($config) {
          $this->host = $config->get('URI.Host');
          $base_uri = $config->get('URI.Base');
          if (!is_null($base_uri)) {
              $parser = new HTMLPurifier_URIParser();
              $this->base = $parser->parse($base_uri);
              $this->defaultScheme = $this->base->scheme;
              if (is_null($this->host)) {
                  $this->host = $this->base->host;
              }
          }
          if (is_null($this->defaultScheme)) {
              $this->defaultScheme = $config->get('URI.DefaultScheme');
          }
      }

      /**
       * Runs the URI through the regular filter chain, stopping at the
       * first filter that returns a falsy result.
       * @param $uri Reference to URI object variable
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return True if every filter passed, false otherwise
       */
      public function filter(&$uri, $config, $context) {
          foreach ($this->filters as $active_filter) {
              if (!$active_filter->filter($uri, $config, $context)) {
                  return false;
              }
          }
          return true;
      }

      /**
       * Runs the URI through the post filter chain, stopping at the
       * first filter that returns a falsy result.
       * @param $uri Reference to URI object variable
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return True if every filter passed, false otherwise
       */
      public function postFilter(&$uri, $config, $context) {
          foreach ($this->postFilters as $active_filter) {
              if (!$active_filter->filter($uri, $config, $context)) {
                  return false;
              }
          }
          return true;
      }

  }
  /**
   * Chainable filters for custom URI processing.
   *
   * These filters can perform custom actions on a URI filter object,
   * including transformation or blacklisting.
   *
   * @warning This filter is called before scheme object validation occurs.
   *          Make sure, if you require a specific scheme object, you
   *          you check that it exists. This allows filters to convert
   *          proprietary URI schemes into regular ones.
   */
  abstract class HTMLPurifier_URIFilter
  {

      /**
       * Unique identifier of filter
       */
      public $name;

      /**
       * True if this filter should be run after scheme validation.
       */
      public $post = false;

      /**
       * Performs initialization for the filter. The default
       * implementation does nothing and reports success.
       * @param $config Instance of HTMLPurifier_Config
       * @return True
       */
      public function prepare($config) {
          return true;
      }

      /**
       * Filter a URI object
       * @param $uri Reference to URI object variable
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return bool Whether or not to continue processing: false indicates
       *         URL is no good, true indicates continue processing. Note that
       *         all changes are committed directly on the URI object
       */
      abstract public function filter(&$uri, $config, $context);

  }
  /**
   * Parses a URI into the components and fragment identifier as specified
   * by RFC 3986.
   */
  class HTMLPurifier_URIParser
  {

      /**
       * Instance of HTMLPurifier_PercentEncoder to do normalization with.
       */
      protected $percentEncoder;

      public function __construct() {
          $this->percentEncoder = new HTMLPurifier_PercentEncoder();
      }

      /**
       * Parses a URI. The percent-encoding is normalized first, then the
       * string is split into scheme, authority, path, query and fragment,
       * and the authority is split further into userinfo, host and port.
       * @note Presence of a component is tested with empty(), so a host
       *       of "0" is reported as an empty string.
       * @param $uri string URI to parse
       * @return HTMLPurifier_URI representation of URI. This representation has
       *         not been validated yet and may not conform to RFC. False
       *         is returned if the pattern does not match at all.
       */
      public function parse($uri) {

          $uri = $this->percentEncoder->normalize($uri);

          // Regexp is as per Appendix B.
          // Note that ["<>] are an addition to the RFC's recommended
          // characters, because they represent external delimeters.
          $r_URI = '!'.
              '(([^:/?#"<>]+):)?'. // 2. Scheme
              '(//([^/?#"<>]*))?'. // 4. Authority
              '([^?#"<>]*)'.       // 5. Path
              '(\?([^#"<>]*))?'.   // 7. Query
              '(#([^"<>]*))?'.     // 8. Fragment
              '!';

          $parts = array();
          if (!preg_match($r_URI, $uri, $parts)) {
              return false; // *really* invalid URI
          }

          // separate out parts; an absent component is null
          $scheme    = !empty($parts[1]) ? $parts[2] : null;
          $authority = !empty($parts[3]) ? $parts[4] : null;
          $path      = $parts[5]; // always present, can be empty
          $query     = !empty($parts[6]) ? $parts[7] : null;
          $fragment  = !empty($parts[8]) ? $parts[9] : null;

          // further parse authority
          $userinfo = null;
          $host = null;
          $port = null;
          if ($authority !== null) {
              $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
              $auth_parts = array();
              preg_match($r_authority, $authority, $auth_parts);
              if (!empty($auth_parts[1])) {
                  $userinfo = $auth_parts[2];
              }
              // an authority without a usable host yields an empty string
              $host = !empty($auth_parts[3]) ? $auth_parts[3] : '';
              if (!empty($auth_parts[4])) {
                  $port = (int) $auth_parts[5];
              }
          }

          return new HTMLPurifier_URI(
              $scheme, $userinfo, $host, $port, $path, $query, $fragment);
      }

  }
  /**
   * Validator for the components of a URI for a specific scheme
   */
  abstract class HTMLPurifier_URIScheme
  {

      /**
       * Scheme's default port (integer). If an explicit port number is
       * specified that coincides with the default port, it will be
       * elided.
       */
      public $default_port = null;

      /**
       * Whether or not URIs of this scheme are locatable by a browser
       * http and ftp are accessible, while mailto and news are not.
       */
      public $browsable = false;

      /**
       * Whether or not the URI always uses <hier_part>, resolves edge cases
       * with making relative URIs absolute
       */
      public $hierarchical = false;

      /**
       * Whether or not the URI may omit a hostname when the scheme is
       * explicitly specified, ala file:///path/to/file. As of writing,
       * 'file' is the only scheme that browsers support this properly.
       */
      public $may_omit_host = false;

      /**
       * Validates the components of a URI for a specific scheme.
       * @param $uri Reference to a HTMLPurifier_URI object
       * @param $config HTMLPurifier_Config object
       * @param $context HTMLPurifier_Context object
       * @return Bool success or failure
       */
      abstract public function doValidate(&$uri, $config, $context);

      /**
       * Public interface for validating components of a URI.  Performs a
       * bunch of default actions: elides the default port and repairs or
       * rejects URIs with a missing or blank host, then hands over to
       * doValidate(). Don't overload this method.
       * @param $uri Reference to a HTMLPurifier_URI object
       * @param $config HTMLPurifier_Config object
       * @param $context HTMLPurifier_Context object
       * @return Bool success or failure
       */
      public function validate(&$uri, $config, $context) {
          if ($this->default_port == $uri->port) {
              $uri->port = null;
          }

          $has_scheme = !is_null($uri->scheme);

          // kludge: browsers do funny things when the scheme but not the
          // authority is set
          // if the scheme is present, a missing host is always in error
          // (unless this scheme may omit its host)
          $scheme_lacks_host = !$this->may_omit_host
              && $has_scheme
              && ($uri->host === '' || is_null($uri->host));
          // if the scheme is not present, a *blank* host is in error,
          // since this translates into '///path' which most browsers
          // interpret as being 'http://path'.
          $blank_host_only = !$has_scheme && $uri->host === '';

          if ($scheme_lacks_host || $blank_host_only) {
              if (!$has_scheme && substr($uri->path, 0, 2) != '//') {
                  $uri->host = null;
              } else {
                  // Either a scheme is present, or the URI is '////path'
                  // and we cannot nullify the host to preserve semantics.
                  // See if we can manually insert a hostname.
                  $host = $config->get('URI.Host');
                  if (is_null($host)) {
                      // we can't do anything sensible, reject the URL.
                      return false;
                  }
                  $uri->host = $host;
              }
          }

          return $this->doValidate($uri, $config, $context);
      }

  }
  /**
   * Registry for retrieving specific URI scheme validator objects.
   */
  class HTMLPurifier_URISchemeRegistry
  {

      /**
       * Retrieve sole instance of the registry.
       * @param $prototype Optional prototype to overload sole instance with,
       *                   or bool true to reset to default registry.
       * @note Pass a registry object $prototype with a compatible interface
       *       and the function will store it and return it all further
       *       times.
       * @return The current registry instance
       */
      public static function instance($prototype = null) {
          static $instance = null;
          if ($prototype === true) {
              // Reset request. This must be tested before the generic
              // "prototype given" branch, otherwise the boolean itself
              // would be stored as the registry.
              $instance = new HTMLPurifier_URISchemeRegistry();
          } elseif ($prototype !== null) {
              $instance = $prototype;
          } elseif ($instance === null) {
              $instance = new HTMLPurifier_URISchemeRegistry();
          }
          return $instance;
      }

      /**
       * Cache of retrieved schemes.
       */
      protected $schemes = array();

      /**
       * Retrieves a scheme validator object.
       * @param $scheme String scheme name like http or mailto
       * @param $config HTMLPurifier_Config object
       * @param $context HTMLPurifier_Context object
       * @return Scheme validator object, or null if the scheme is not
       *         allowed or no validator class exists for it
       */
      public function getScheme($scheme, $config, $context) {
          if (!$config) $config = HTMLPurifier_Config::createDefault();

          // important, otherwise attacker could include arbitrary file
          $allowed_schemes = $config->get('URI.AllowedSchemes');
          if (!$config->get('URI.OverrideAllowedSchemes') &&
              !isset($allowed_schemes[$scheme])
          ) {
              return;
          }

          // Schemes already cached (including ones added via register())
          // are returned before the second whitelist check below.
          if (isset($this->schemes[$scheme])) return $this->schemes[$scheme];

          // Never build a class name (and trigger class loading) from a
          // scheme that is not explicitly whitelisted.
          if (!isset($allowed_schemes[$scheme])) return;

          $class = 'HTMLPurifier_URIScheme_' . $scheme;
          if (!class_exists($class)) return;
          $this->schemes[$scheme] = new $class();
          return $this->schemes[$scheme];
      }

      /**
       * Registers a custom scheme to the cache, bypassing reflection.
       * @param $scheme Scheme name
       * @param $scheme_obj HTMLPurifier_URIScheme object
       */
      public function register($scheme, $scheme_obj) {
          $this->schemes[$scheme] = $scheme_obj;
      }

  }
  /**
   * Class for converting between different unit-lengths as specified by
   * CSS.
   */
  class HTMLPurifier_UnitConverter
  {

      const ENGLISH = 1;
      const METRIC = 2;
      const DIGITAL = 3;

      /**
       * Units information array. Units are grouped into measuring systems
       * (English, Metric), and are assigned an integer representing
       * the conversion factor between that unit and the smallest unit in
       * the system. Numeric indexes are actually magical constants that
       * encode conversion data from one system to the next, with a O(n^2)
       * constraint on memory (this is generally not a problem, since
       * the number of measuring systems is small.)
       */
      protected static $units = array(
          self::ENGLISH => array(
              'px' => 3, // This is as per CSS 2.1 and Firefox. Your mileage may vary
              'pt' => 4,
              'pc' => 48,
              'in' => 288,
              self::METRIC => array('pt', '0.352777778', 'mm'),
          ),
          self::METRIC => array(
              'mm' => 1,
              'cm' => 10,
              self::ENGLISH => array('mm', '2.83464567', 'pt'),
          ),
      );

      /**
       * Minimum bcmath precision for output.
       */
      protected $outputPrecision;

      /**
       * Bcmath precision for internal calculations.
       */
      protected $internalPrecision;

      /**
       * Whether or not BCMath is available
       */
      private $bcmath;

      /**
       * @param $output_precision   Minimum number of significant figures in results
       * @param $internal_precision Scale used for intermediate calculations
       * @param $force_no_bcmath    When true, use native float arithmetic even
       *                            if the bcmath functions exist
       */
      public function __construct($output_precision = 4, $internal_precision = 10, $force_no_bcmath = false) {
          $this->outputPrecision   = $output_precision;
          $this->internalPrecision = $internal_precision;
          $this->bcmath = !$force_no_bcmath && function_exists('bcmul');
      }

      /**
       * Converts a length object of one unit into another unit.
       * @param HTMLPurifier_Length $length
       *      Instance of HTMLPurifier_Length to convert. You must validate()
       *      it before passing it here!
       * @param string $to_unit
       *      Unit to convert to.
       * @return New HTMLPurifier_Length, or false if the length is invalid
       *      or one of the units is unknown.
       * @note
       *      About precision: This conversion function pays very special
       *      attention to the incoming precision of values and attempts
       *      to maintain a number of significant figure. Results are
       *      fairly accurate up to nine digits. Some caveats:
       *          - If a number is zero-padded as a result of this significant
       *            figure tracking, the zeroes will be eliminated.
       *          - If a number contains less than four sigfigs ($outputPrecision)
       *            and this causes some decimals to be excluded, those
       *            decimals will be added on.
       */
      public function convert($length, $to_unit) {

          if (!$length->isValid()) {
              return false;
          }

          $n    = $length->getN();
          $unit = $length->getUnit();

          if ($n === '0' || $unit === false) {
              return new HTMLPurifier_Length('0', false);
          }

          // Find the measuring system of the source and destination units.
          $state = false;
          $dest_state = false;
          foreach (self::$units as $system => $table) {
              if (isset($table[$unit])) {
                  $state = $system;
              }
              if (isset($table[$to_unit])) {
                  $dest_state = $system;
              }
          }
          if (!$state || !$dest_state) {
              return false;
          }

          // Initial precision of the number; needed for the final rounding.
          $sigfigs = $this->getSigFigs($n);
          if ($sigfigs < $this->outputPrecision) {
              $sigfigs = $this->outputPrecision;
          }

          // BCMath's internal precision deals only with decimals. Use
          // our default if the initial number has no decimals, or increase
          // it by how ever many decimals, thus, the number of guard digits
          // will always be greater than or equal to internalPrecision.
          $magnitude = (int) floor(log(abs($n), 10));
          $cp = $this->internalPrecision; // internal precision
          if ($magnitude < 0) {
              $cp = $this->internalPrecision - $magnitude;
          }

          for ($pass = 0; $pass < 2; $pass++) {

              // Determine what unit IN THIS SYSTEM we need to convert to:
              // the target itself for a same-system conversion, otherwise
              // the unit the system shift starts from.
              $dest_unit = ($dest_state === $state)
                  ? $to_unit
                  : self::$units[$state][$dest_state][0];

              // Do the conversion if necessary
              if ($dest_unit !== $unit) {
                  $factor = $this->div(self::$units[$state][$unit], self::$units[$state][$dest_unit], $cp);
                  $n = $this->mul($n, $factor, $cp);
                  $unit = $dest_unit;
              }

              // Output was zero, so bail out early. Shouldn't ever happen.
              if ($n === '') {
                  $n = '0';
                  $unit = $to_unit;
                  break;
              }

              // It was a simple conversion, so bail out
              if ($dest_state === $state) {
                  break;
              }

              if ($pass !== 0) {
                  // Conversion failed! Apparently, the system we forwarded
                  // to didn't have this unit. This should never happen!
                  return false;
              }

              // Pre-condition: $pass == 0
              // Perform conversion to next system of units
              $shift = self::$units[$state][$dest_state];
              $n     = $this->mul($n, $shift[1], $cp);
              $unit  = $shift[2];
              $state = $dest_state;

              // One more loop around to convert the unit in the new system.
          }

          // Post-condition: $unit == $to_unit
          if ($unit !== $to_unit) {
              return false;
          }

          $n = $this->round($n, $sigfigs);
          if (strpos($n, '.') !== false) {
              $n = rtrim($n, '0');
          }
          $n = rtrim($n, '.');

          return new HTMLPurifier_Length($n, $unit);
      }

      /**
       * Returns the number of significant figures in a string number.
       * @param string $n Decimal number
       * @return int number of sigfigs
       */
      public function getSigFigs($n) {
          $n  = ltrim($n, '0+-');
          $dp = strpos($n, '.'); // decimal position
          if ($dp === false) {
              // Integer: trailing zeroes are not counted.
              return strlen(rtrim($n, '0'));
          }
          $count = strlen(ltrim($n, '0.'));
          // A decimal point that is not in front is still part of the
          // remaining string; do not count it.
          return ($dp !== 0) ? $count - 1 : $count;
      }

      /**
       * Adds two numbers, using arbitrary precision when available.
       */
      private function add($s1, $s2, $scale) {
          if (!$this->bcmath) {
              return $this->scale($s1 + $s2, $scale);
          }
          return bcadd($s1, $s2, $scale);
      }

      /**
       * Multiplies two numbers, using arbitrary precision when available.
       */
      private function mul($s1, $s2, $scale) {
          if (!$this->bcmath) {
              return $this->scale($s1 * $s2, $scale);
          }
          return bcmul($s1, $s2, $scale);
      }

      /**
       * Divides two numbers, using arbitrary precision when available.
       */
      private function div($s1, $s2, $scale) {
          if (!$this->bcmath) {
              return $this->scale($s1 / $s2, $scale);
          }
          return bcdiv($s1, $s2, $scale);
      }

      /**
       * Rounds a number according to the number of sigfigs it should have,
       * using arbitrary precision when available.
       */
      private function round($n, $sigfigs) {
          $new_log = (int) floor(log(abs($n), 10)); // Number of digits left of decimal - 1
          $rp  = $sigfigs - $new_log - 1; // Number of decimal places needed
          $neg = $n < 0 ? '-' : ''; // Negative sign

          if (!$this->bcmath) {
              return $this->scale(round($n, $rp), $rp + 1);
          }

          if ($rp >= 0) {
              // Add half a unit in the last kept place, then truncate.
              $n = bcadd($n, $neg . '0.' .  str_repeat('0', $rp) . '5', $rp + 1);
              return bcdiv($n, '1', $rp);
          }

          // This algorithm partially depends on the standardized
          // form of numbers that comes out of bcmath.
          $n = bcadd($n, $neg . '5' . str_repeat('0', $new_log - $sigfigs), 0);
          return substr($n, 0, $sigfigs + strlen($neg)) . str_repeat('0', $new_log - $sigfigs + 1);
      }

      /**
       * Scales a float to $scale digits right of decimal point, like BCMath.
       */
      private function scale($r, $scale) {
          if ($scale >= 0) {
              return sprintf('%.' . $scale . 'f', (float) $r);
          }

          // The f sprintf type doesn't support negative numbers, so we
          // need to cludge things manually. First get the string.
          $r = sprintf('%.0f', (float) $r);

          // Due to floating point precision loss, $r will more than likely
          // look something like 4652999999999.9234. We grab one more digit
          // than we need to precise from $r and then use that to round
          // appropriately.
          $precise = (string) round(substr($r, 0, strlen($r) + $scale), -1);

          // Now we return it, truncating the zero that was rounded off.
          return substr($precise, 0, -1) . str_repeat('0', -$scale + 1);
      }

  }
  /**
   * Parses string representations into their corresponding native PHP
   * variable type. The base implementation does a simple type-check.
   */
  class HTMLPurifier_VarParser
  {

      const STRING    = 1;
      const ISTRING   = 2;
      const TEXT      = 3;
      const ITEXT     = 4;
      const INT       = 5;
      const FLOAT     = 6;
      const BOOL      = 7;
      const LOOKUP    = 8;
      const ALIST     = 9;
      const HASH      = 10;
      const MIXED     = 11;

      /**
       * Lookup table of allowed types. Mainly for backwards compatibility, but
       * also convenient for transforming string type names to the integer constants.
       */
      static public $types = array(
          'string'    => self::STRING,
          'istring'   => self::ISTRING,
          'text'      => self::TEXT,
          'itext'     => self::ITEXT,
          'int'       => self::INT,
          'float'     => self::FLOAT,
          'bool'      => self::BOOL,
          'lookup'    => self::LOOKUP,
          'list'      => self::ALIST,
          'hash'      => self::HASH,
          'mixed'     => self::MIXED
      );

      /**
       * Lookup table of types that are string, and can have aliases or
       * allowed value lists.
       */
      static public $stringTypes = array(
          self::STRING    => true,
          self::ISTRING   => true,
          self::TEXT      => true,
          self::ITEXT     => true,
      );

      /**
       * Validate a variable according to type. Throws
       * HTMLPurifier_VarParserException if invalid.
       * It may return NULL as a valid type if $allow_null is true.
       *
       * @param $var Variable to validate
       * @param $type Type of variable (integer constant or its string
       *              name), see HTMLPurifier_VarParser->types
       * @param $allow_null Whether or not to permit null as a value
       * @return Validated and type-coerced variable
       */
      final public function parse($var, $type, $allow_null = false) {
          // Translate a string type name into its integer constant.
          if (is_string($type)) {
              if (isset(self::$types[$type])) {
                  $type = self::$types[$type];
              } else {
                  throw new HTMLPurifier_VarParserException("Invalid type '$type'");
              }
          }

          $var = $this->parseImplementation($var, $type, $allow_null);
          if ($allow_null && $var === null) {
              return null;
          }

          // These are basic checks, to make sure nothing horribly wrong
          // happened in our implementations. A case that `break`s has
          // seen a value of the wrong native type.
          switch ($type) {
              case self::STRING:
              case self::ISTRING:
              case self::TEXT:
              case self::ITEXT:
                  if (is_string($var)) {
                      // The "I" variants are case-insensitive: fold to lowercase.
                      $fold_case = ($type == self::ISTRING || $type == self::ITEXT);
                      return $fold_case ? strtolower($var) : $var;
                  }
                  break;
              case self::INT:
                  if (is_int($var)) {
                      return $var;
                  }
                  break;
              case self::FLOAT:
                  if (is_float($var)) {
                      return $var;
                  }
                  break;
              case self::BOOL:
                  if (is_bool($var)) {
                      return $var;
                  }
                  break;
              case self::LOOKUP:
              case self::ALIST:
              case self::HASH:
                  if (is_array($var)) {
                      if ($type === self::LOOKUP) {
                          foreach ($var as $flag) {
                              if ($flag !== true) {
                                  $this->error('Lookup table contains value other than true');
                              }
                          }
                      } elseif ($type === self::ALIST) {
                          // A list must have the keys 0..n-1 in order.
                          $keys = array_keys($var);
                          if (array_keys($keys) !== $keys) {
                              $this->error('Indices for list are not uniform');
                          }
                      }
                      return $var;
                  }
                  break;
              case self::MIXED:
                  return $var;
              default:
                  $this->errorInconsistent(get_class($this), $type);
          }

          $this->errorGeneric($var, $type);
      }

      /**
       * Actually implements the parsing. Base implementation is to not
       * do anything to $var. Subclasses should overload this!
       */
      protected function parseImplementation($var, $type, $allow_null) {
          return $var;
      }

      /**
       * Throws an exception.
       */
      protected function error($msg) {
          throw new HTMLPurifier_VarParserException($msg);
      }

      /**
       * Throws an inconsistency exception.
       * @note This should not ever be called. It would be called if we
       *       extend the allowed values of HTMLPurifier_VarParser without
       *       updating subclasses.
       */
      protected function errorInconsistent($class, $type) {
          throw new HTMLPurifier_Exception("Inconsistency in $class: ".HTMLPurifier_VarParser::getTypeName($type)." not implemented");
      }

      /**
       * Generic error for if a type didn't work.
       */
      protected function errorGeneric($var, $type) {
          $actual = gettype($var);
          $this->error("Expected type ".HTMLPurifier_VarParser::getTypeName($type).", got $actual");
      }

      /**
       * Returns the string name of a type constant, or 'unknown' if the
       * value is not one of the known types.
       */
      static public function getTypeName($type) {
          static $lookup;
          if (!$lookup) {
              // Lazy load the alternative lookup table
              $lookup = array_flip(HTMLPurifier_VarParser::$types);
          }
          return isset($lookup[$type]) ? $lookup[$type] : 'unknown';
      }

  }
  /**
   * Exception raised by HTMLPurifier_VarParser when a value cannot be
   * validated against the requested type.
   */
  class HTMLPurifier_VarParserException extends HTMLPurifier_Exception
  {

  }
  /**
   * Validates the HTML attribute style, otherwise known as CSS.
   * @note We don't implement the whole CSS specification, so it might be
   *       difficult to reuse this component in the context of validating
   *       actual stylesheet declarations.
   * @note If we were really serious about validating the CSS, we would
   *       tokenize the styles and then parse the tokens. Obviously, we
   *       are not doing that. Doing that could seriously harm performance,
   *       but would make these components a lot more viable for a CSS
   *       filtering solution.
   */
  class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
  {

      /**
       * Filters a style attribute value declaration by declaration.
       * @param $css Raw value of the style attribute
       * @param $config HTMLPurifier_Config object
       * @param $context HTMLPurifier_Context object
       * @return String of the surviving "property:value;" pairs, or false
       *         if nothing survived
       */
      public function validate($css, $config, $context) {

          $css = $this->parseCDATA($css);
          $definition = $config->getCSSDefinition();

          // we're going to break the spec and explode by semicolons.
          // This is because semicolon rarely appears in escaped form
          // Doing this is generally flaky but fast
          // IT MIGHT APPEAR IN URIs, see HTMLPurifier_AttrDef_CSSURI
          // for details
          $declarations = explode(';', $css);
          $propvalues = array();

          /**
           * Name of the current CSS property being validated. It is
           * registered with the context before the loop and reassigned
           * in place for every declaration.
           */
          $property = false;
          $context->register('CurrentCSSProperty', $property);

          foreach ($declarations as $declaration) {
              if (!$declaration) continue;
              // No colon (or a colon in first position) is skipped.
              if (!strpos($declaration, ':')) continue;

              $pieces   = explode(':', $declaration, 2);
              $property = trim($pieces[0]);
              $value    = trim($pieces[1]);

              // Look the property up as written; if unknown and not
              // already lowercase, retry with the lowercased name.
              if (!isset($definition->info[$property])) {
                  if (ctype_lower($property)) continue;
                  $property = strtolower($property);
                  if (!isset($definition->info[$property])) continue;
              }

              // inefficient call, since the validator will do this again
              if (strtolower(trim($value)) === 'inherit') {
                  // inherit works for everything (but only on the base property)
                  $result = 'inherit';
              } else {
                  $result = $definition->info[$property]->validate(
                      $value, $config, $context );
              }

              if ($result === false) continue;
              $propvalues[$property] = $result;
          }

          $context->destroy('CurrentCSSProperty');

          // procedure does not write the new CSS simultaneously, so it's
          // slightly inefficient, but it's the only way of getting rid of
          // duplicates. Perhaps config to optimize it, but not now.
          $serialized = '';
          foreach ($propvalues as $name => $cleaned) {
              $serialized .= $name . ':' . $cleaned . ';';
          }

          if (!$serialized) {
              return false;
          }
          return $serialized;
      }

  }
  6932. // Enum = Enumerated
  /**
   * Validates a keyword against a list of valid values.
   * @warning The case-insensitive compare of this function uses PHP's
   *          built-in strtolower and ctype_lower functions, which may
   *          cause problems with international comparisons
   */
  class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
  {

      /**
       * Lookup table of valid values (the values are the array keys).
       * @todo Make protected
       */
      public $valid_values   = array();

      /**
       * Bool indicating whether or not enumeration is case sensitive.
       * @note In general this is always case insensitive.
       */
      protected $case_sensitive = false; // values according to W3C spec

      /**
       * @param $valid_values List of valid values
       * @param $case_sensitive Bool indicating whether or not case sensitive
       */
      public function __construct(
          $valid_values = array(), $case_sensitive = false
      ) {
          // Flip so that membership can be tested with isset().
          $this->valid_values   = array_flip($valid_values);
          $this->case_sensitive = $case_sensitive;
      }

      /**
       * @param $string Keyword to check
       * @return The trimmed (and, if case-insensitive, lowercased) keyword,
       *         or false if it is not a valid value
       */
      public function validate($string, $config, $context) {
          $string = trim($string);
          // we may want to do full case-insensitive libraries
          if (!$this->case_sensitive && !ctype_lower($string)) {
              $string = strtolower($string);
          }
          if (isset($this->valid_values[$string])) {
              return $string;
          }
          return false;
      }

      /**
       * @param $string In form of comma-delimited list of case-insensitive
       *      valid values. Example: "foo,bar,baz". Prepend "s:" to make
       *      case sensitive
       */
      public function make($string) {
          $sensitive = strlen($string) > 2 && $string[0] == 's' && $string[1] == ':';
          if ($sensitive) {
              // Strip the "s:" marker.
              $string = substr($string, 2);
          }
          return new HTMLPurifier_AttrDef_Enum(explode(',', $string), $sensitive);
      }

  }
  /**
   * Validates an integer.
   * @note While this class was modeled off the CSS definition, no currently
   *       allowed CSS uses this type.  The properties that do are: widows,
   *       orphans, z-index, counter-increment, counter-reset.  Some of the
   *       HTML attributes, however, find use for a non-negative version of this.
   */
  class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
  {

      /**
       * Bool indicating whether or not negative values are allowed
       */
      protected $negative = true;

      /**
       * Bool indicating whether or not zero is allowed
       */
      protected $zero = true;

      /**
       * Bool indicating whether or not positive values are allowed
       */
      protected $positive = true;

      /**
       * @param $negative Bool indicating whether or not negative values are allowed
       * @param $zero Bool indicating whether or not zero is allowed
       * @param $positive Bool indicating whether or not positive values are allowed
       */
      public function __construct(
          $negative = true, $zero = true, $positive = true
      ) {
          $this->negative = $negative;
          $this->zero     = $zero;
          $this->positive = $positive;
      }

      /**
       * @param $integer String to check
       * @return The normalized integer string (leading "+" and the sign of
       *         "-0" removed), or false if it is not an acceptable integer
       */
      public function validate($integer, $config, $context) {

          $integer = $this->parseCDATA($integer);
          if ($integer === '') {
              return false;
          }

          // we could possibly simply typecast it to integer, but there are
          // certain fringe cases that must not return an integer.

          // clip leading sign
          $lead = $integer[0];
          if ($this->negative && $lead === '-') {
              $digits = substr($integer, 1);
              if ($digits === '0') {
                  $integer = '0'; // rm minus sign for zero
              }
          } elseif ($this->positive && $lead === '+') {
              $integer = substr($integer, 1); // rm unnecessary plus
              $digits  = $integer;
          } else {
              // A sign that is not permitted stays in $digits and fails
              // the digit test below.
              $digits = $integer;
          }

          // test if it's numeric
          if (!ctype_digit($digits)) {
              return false;
          }

          // perform scope tests
          if (!$this->zero     && $integer == 0) return false;
          if (!$this->positive && $integer > 0)  return false;
          if (!$this->negative && $integer < 0)  return false;

          return $integer;
      }

  }
  /**
   * Validates the HTML attribute lang, effectively a language code.
   * @note Built according to RFC 3066, which obsoleted RFC 1766
   */
  class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
  {

      /**
       * @param $string Language code to check
       * @return The lowercased code, cut off before the first subtag that
       *         is not acceptable; false if the primary subtag itself is
       *         not acceptable
       */
      public function validate($string, $config, $context) {

          $string = trim($string);
          if (!$string) {
              return false;
          }

          $subtags = explode('-', $string);
          $num_subtags = count($subtags);
          if ($num_subtags == 0) {
              return false; // sanity check
          }

          // process primary subtag : $subtags[0]
          $primary = $subtags[0];
          $length  = strlen($primary);
          if ($length < 1 || $length > 3) {
              return false;
          }
          if ($length == 1) {
              // Only 'x' and 'i' are accepted as one-letter primary subtags.
              if ($primary != 'x' && $primary != 'i') {
                  return false;
              }
          } else {
              // Two or three characters: must be purely alphabetic.
              if (!ctype_alpha($primary)) {
                  return false;
              }
              if (!ctype_lower($primary)) {
                  $primary = strtolower($primary);
              }
          }

          $new_string = $primary;
          if ($num_subtags == 1) {
              return $new_string;
          }

          // process second subtag : $subtags[1]
          $second = $subtags[1];
          $length = strlen($second);
          $second_rejected =
              $length == 0 ||
              ($length == 1 && $second != 'x') ||
              $length > 8 ||
              !ctype_alnum($second);
          if ($second_rejected) {
              return $new_string;
          }
          if (!ctype_lower($second)) {
              $second = strtolower($second);
          }
          $new_string .= '-' . $second;
          if ($num_subtags == 2) {
              return $new_string;
          }

          // process all other subtags, index 2 and up
          for ($i = 2; $i < $num_subtags; $i++) {
              $subtag = $subtags[$i];
              $length = strlen($subtag);
              if ($length == 0 || $length > 8 || !ctype_alnum($subtag)) {
                  return $new_string;
              }
              if (!ctype_lower($subtag)) {
                  $subtag = strtolower($subtag);
              }
              $new_string .= '-' . $subtag;
          }

          return $new_string;
      }

  }
  /**
   * Decorator that, depending on a token, switches between two definitions.
   */
  class HTMLPurifier_AttrDef_Switch
  {

      protected $tag;
      protected $withTag, $withoutTag;

      /**
       * @param string $tag Tag name to switch upon
       * @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
       * @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
       */
      public function __construct($tag, $with_tag, $without_tag) {
          $this->tag        = $tag;
          $this->withTag    = $with_tag;
          $this->withoutTag = $without_tag;
      }

      /**
       * Delegates validation to one of the two wrapped definitions,
       * chosen by the name of the 'CurrentToken' entry in the context.
       */
      public function validate($string, $config, $context) {
          $token = $context->get('CurrentToken', true);
          $matches_tag = $token && $token->name === $this->tag;
          $delegate = $matches_tag ? $this->withTag : $this->withoutTag;
          return $delegate->validate($string, $config, $context);
      }

  }
  /**
   * Validates arbitrary text according to the HTML spec.
   */
  class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
  {

      /**
       * @param $string Attribute text
       * @return The text after CDATA parsing via parseCDATA()
       */
      public function validate($string, $config, $context) {
          $text = $this->parseCDATA($string);
          return $text;
      }

  }
  /**
   * Validates a URI as defined by RFC 3986.
   * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
   */
  class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
  {

      protected $parser;
      protected $embedsResource;

      /**
       * @param $embeds_resource Does the URI here result in an extra HTTP request?
       */
      public function __construct($embeds_resource = false) {
          $this->parser = new HTMLPurifier_URIParser();
          $this->embedsResource = (bool) $embeds_resource;
      }

      /**
       * Factory: builds a URI definition whose embedded-resource flag is
       * the boolean value of $string.
       */
      public function make($string) {
          return new HTMLPurifier_AttrDef_URI((bool) $string);
      }

      /**
       * Runs a URI through parsing, generic validation, the configured URI
       * filters and the scheme-specific validator.
       * @param $uri URI string to validate
       * @param $config HTMLPurifier_Config object
       * @param $context HTMLPurifier_Context object
       * @return The URI converted back to a string, or false if it was
       *         rejected at any stage
       */
      public function validate($uri, $config, $context) {

          if ($config->get('URI.Disable')) {
              return false;
          }

          // parse the URI
          $uri = $this->parser->parse($this->parseCDATA($uri));
          if ($uri === false) {
              return false;
          }

          // add embedded flag to context for validators
          $context->register('EmbeddedURI', $this->embedsResource);

          // Each stage only runs if every earlier stage passed.

          // generic validation
          $ok = (bool) $uri->validate($config, $context);

          // chained filtering
          if ($ok) {
              $uri_def = $config->getDefinition('URI');
              $ok = (bool) $uri_def->filter($uri, $config, $context);
          }

          // scheme-specific validation; an embedded resource additionally
          // requires a browsable scheme
          if ($ok) {
              $scheme_obj = $uri->getSchemeObj($config, $context);
              $ok = $scheme_obj && (!$this->embedsResource || $scheme_obj->browsable);
          }
          if ($ok) {
              $ok = (bool) $scheme_obj->validate($uri, $config, $context);
          }

          // Post chained filtering
          if ($ok) {
              $ok = (bool) $uri_def->postFilter($uri, $config, $context);
          }

          $context->destroy('EmbeddedURI');

          if (!$ok) {
              return false;
          }

          // survived gauntlet: back to string
          return $uri->toString();
      }

  }
  /**
   * Validates a number as defined by the CSS spec.
   */
  class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
  {

      /**
       * Bool indicating whether or not only positive values allowed.
       */
      protected $non_negative = false;

      /**
       * @param $non_negative Bool indicating whether negatives are forbidden
       */
      public function __construct($non_negative = false) {
          $this->non_negative = $non_negative;
      }

      /**
       * @param $number String to check
       * @return The number as a string with its "+" sign, leading integer
       *         zeroes and trailing decimal zeroes removed, or false if it
       *         is not a valid number
       * @warning Some contexts do not pass $config, $context. These
       *          variables should not be used without checking HTMLPurifier_Length
       */
      public function validate($number, $config, $context) {

          $number = $this->parseCDATA($number);

          if ($number === '') return false;
          if ($number === '0') return '0';

          // Strip a leading sign; only a minus is remembered.
          $sign  = '';
          $first = $number[0];
          if ($first === '-') {
              if ($this->non_negative) {
                  return false;
              }
              $sign   = '-';
              $number = substr($number, 1);
          } elseif ($first === '+') {
              $number = substr($number, 1);
          }

          // Plain integer.
          if (ctype_digit($number)) {
              $number = ltrim($number, '0');
              if ($number) {
                  return $sign . $number;
              }
              return '0';
          }

          // Period is the only non-numeric character allowed
          if (strpos($number, '.') === false) {
              return false;
          }

          $halves = explode('.', $number, 2);
          $left   = $halves[0];
          $right  = $halves[1];

          if ($left === '' && $right === '') return false;
          if ($left !== '' && !ctype_digit($left)) return false;

          $left  = ltrim($left,  '0');
          $right = rtrim($right, '0');

          if ($right === '') {
              // Nothing significant after the period: emit an integer.
              if ($left) {
                  return $sign . $left;
              }
              return '0';
          }
          if (!ctype_digit($right)) {
              return false;
          }

          return $sign . $left . '.' . $right;
      }

  }
  /**
   * Validates a CSS number and clamps it into the range 0 to 1.
   */
  class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
  {

      public function __construct() {
          // opacity is non-negative, but we will clamp it instead of
          // rejecting negatives, so the parent must accept them
          parent::__construct(false);
      }

      /**
       * @param $number String to validate
       * @return '0' or '1' when the number falls outside [0, 1], the parent's
       *         normalized number otherwise, or false if it is not a number
       */
      public function validate($number, $config, $context) {
          $normalized = parent::validate($number, $config, $context);
          if ($normalized === false) return false;

          $value = (float) $normalized;
          if ($value > 1.0) return '1';
          if ($value < 0.0) return '0';
          return $normalized;
      }
  }
  /**
   * Validates shorthand CSS property background.
   * @warning Does not support url tokens that have internal spaces.
   */
  class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of component validators.
       * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
       */
      protected $info;

      /**
       * @param $config Configuration object used to fetch the CSS definition
       */
      public function __construct($config) {
          $def = $config->getCSSDefinition();
          $this->info['background-color'] = $def->info['background-color'];
          $this->info['background-image'] = $def->info['background-image'];
          $this->info['background-repeat'] = $def->info['background-repeat'];
          $this->info['background-attachment'] = $def->info['background-attachment'];
          $this->info['background-position'] = $def->info['background-position'];
      }

      /**
       * Splits the shorthand on spaces and hands each piece to the first
       * component validator (color, image, repeat, attachment) that has not
       * matched yet and accepts it. Pieces no component accepts are collected
       * and validated together as the background-position.
       *
       * @param $string Shorthand value to validate
       * @return Accepted components joined by spaces in the fixed order
       *         color, image, repeat, attachment, position; false if none
       */
      public function validate($string, $config, $context) {

          // regular pre-processing
          $string = $this->parseCDATA($string);
          if ($string === '') return false;

          // munge rgb() decl if necessary
          $string = $this->mungeRgb($string);

          // assumes URI doesn't have spaces in it
          // NOTE(review): the whole value is lowercased here, which also
          // lowercases any url(...) piece before it is validated
          $bits = explode(' ', strtolower($string)); // bits to process

          $caught = array();
          $caught['color'] = false;
          $caught['image'] = false;
          $caught['repeat'] = false;
          $caught['attachment'] = false;
          $caught['position'] = false;

          $i = 0; // number of catches

          foreach ($bits as $bit) {
              if ($bit === '') continue;
              foreach ($caught as $key => $status) {
                  if ($key != 'position') {
                      // each non-position component may be set only once
                      if ($status !== false) continue;
                      $r = $this->info['background-' . $key]->validate($bit, $config, $context);
                  } else {
                      // position is last in $caught and takes whatever is
                      // left; it is validated as a whole after the loop
                      $r = $bit;
                  }
                  if ($r === false) continue;
                  if ($key == 'position') {
                      if ($caught[$key] === false) $caught[$key] = '';
                      $caught[$key] .= $r . ' ';
                  } else {
                      $caught[$key] = $r;
                  }
                  $i++;
                  break;
              }
          }

          if (!$i) return false;
          if ($caught['position'] !== false) {
              // may turn back into false if the collected pieces are invalid
              $caught['position'] = $this->info['background-position']->
                  validate($caught['position'], $config, $context);
          }

          $ret = array();
          foreach ($caught as $value) {
              if ($value === false) continue;
              $ret[] = $value;
          }

          if (empty($ret)) return false;
          return implode(' ', $ret);
      }
  }
  7321. /* W3C says:
  7322. [ // adjective and number must be in correct order, even if
  7323. // you could switch them without introducing ambiguity.
  7324. // some browsers support that syntax
  7325. [
  7326. <percentage> | <length> | left | center | right
  7327. ]
  7328. [
  7329. <percentage> | <length> | top | center | bottom
  7330. ]?
  7331. ] |
  7332. [ // this signifies that the vertical and horizontal adjectives
  7333. // can be arbitrarily ordered, however, there can only be two,
  7334. // one of each, or none at all
  7335. [
  7336. left | center | right
  7337. ] ||
  7338. [
  7339. top | center | bottom
  7340. ]
  7341. ]
  7342. top, left = 0%
  7343. center, (none) = 50%
  7344. bottom, right = 100%
  7345. */
  7346. /* QuirksMode says:
  keyword + length/percentage must be ordered correctly, as per W3C.
  7348. Internet Explorer and Opera, however, support arbitrary ordering. We
  7349. should fix it up.
  7350. Minor issue though, not strictly necessary.
  7351. */
  7352. // control freaks may appreciate the ability to convert these to
  7353. // percentages or something, but it's not necessary
  /**
   * Validates the value of background-position.
   */
  class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
  {

      /** Validator used to recognize length pieces. */
      protected $length;

      /** Validator used to recognize percentage pieces. */
      protected $percentage;

      public function __construct() {
          $this->length = new HTMLPurifier_AttrDef_CSS_Length();
          $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
      }

      /**
       * Sorts the space-separated pieces into keywords and measures, then
       * emits at most two values: first a horizontal one, then a vertical one.
       *
       * @param $string Value to validate
       * @return Normalized position string, or false if nothing was recognized
       */
      public function validate($string, $config, $context) {
          $string = $this->parseCDATA($string);

          // axis of each keyword; 'c' (center) is resolved while scanning
          $axis_of = array(
              'top'    => 'v',
              'bottom' => 'v',
              'left'   => 'h',
              'right'  => 'h',
              'center' => 'c'
          );

          $found = array(
              'h'  => false, // left, right
              'v'  => false, // top, bottom
              'ch' => false, // center (first word)
              'cv' => false  // center (second word)
          );
          $measures = array();
          $hits = 0;

          foreach (explode(' ', $string) as $piece) {
              if ($piece === '') continue;

              // test for keyword
              $lowered = ctype_lower($piece) ? $piece : strtolower($piece);
              if (isset($axis_of[$lowered])) {
                  $slot = $axis_of[$lowered];
                  if ($slot == 'c') {
                      // center is horizontal only when nothing came before it
                      $slot = ($hits == 0) ? 'ch' : 'cv';
                  }
                  $found[$slot] = $lowered;
                  $hits++;
              }

              // test for length
              $as_length = $this->length->validate($piece, $config, $context);
              if ($as_length !== false) {
                  $measures[] = $as_length;
                  $hits++;
              }

              // test for percentage
              $as_percentage = $this->percentage->validate($piece, $config, $context);
              if ($as_percentage !== false) {
                  $measures[] = $as_percentage;
                  $hits++;
              }
          }

          if (!$hits) return false; // no valid values were caught

          $out = array();

          // first value: horizontal keyword, else center, else a measure
          if ($found['h']) {
              $out[] = $found['h'];
          } elseif ($found['ch']) {
              $out[] = $found['ch'];
              $found['cv'] = false; // prevent re-use: center = center center
          } elseif (count($measures)) {
              $out[] = array_shift($measures);
          }

          // second value: vertical keyword, else center, else a measure
          if ($found['v']) {
              $out[] = $found['v'];
          } elseif ($found['cv']) {
              $out[] = $found['cv'];
          } elseif (count($measures)) {
              $out[] = array_shift($measures);
          }

          return empty($out) ? false : implode(' ', $out);
      }
  }
  /**
   * Validates the border property as defined by CSS.
   */
  class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of properties this property is shorthand for.
       */
      protected $info = array();

      /**
       * @param $config Configuration object used to fetch the CSS definition
       */
      public function __construct($config) {
          $def = $config->getCSSDefinition();
          $this->info['border-width'] = $def->info['border-width'];
          $this->info['border-style'] = $def->info['border-style'];
          $this->info['border-top-color'] = $def->info['border-top-color'];
      }

      /**
       * Matches each space-separated piece against the width, style and
       * color validators; every component may be matched at most once and
       * pieces nothing accepts are dropped.
       *
       * @param $string Shorthand value to validate
       * @return Accepted components joined by spaces in input order, or
       *         false when no piece validates
       */
      public function validate($string, $config, $context) {
          $string = $this->parseCDATA($string);
          $string = $this->mungeRgb($string);
          $bits = explode(' ', $string);
          $done = array(); // segments we've finished
          $ret = ''; // return value
          foreach ($bits as $bit) {
              foreach ($this->info as $propname => $validator) {
                  if (isset($done[$propname])) continue;
                  $r = $validator->validate($bit, $config, $context);
                  if ($r !== false) {
                      $ret .= $r . ' ';
                      $done[$propname] = true;
                      break;
                  }
              }
          }
          $ret = rtrim($ret);
          // Report failure the way the other shorthand validators do rather
          // than handing an empty string back as if it were a valid value.
          if ($ret === '') return false;
          return $ret;
      }
  }
  /**
   * Validates Color as defined by CSS.
   */
  class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
  {

      /**
       * Accepts a color keyword, an rgb() literal or a hexadecimal color.
       *
       * @param $color String to validate
       * @param $config Configuration object; read for Core.ColorKeywords
       * @return The keyword's mapped value, a normalized "rgb(a,b,c)" string,
       *         or a '#'-prefixed hex color; false if none of the forms match
       */
      public function validate($color, $config, $context) {

          // Keyword table is fetched once and reused by every later call,
          // whichever $config that call receives.
          static $colors = null;
          if ($colors === null) $colors = $config->get('Core.ColorKeywords');

          $color = trim($color);
          if ($color === '') return false;

          $lower = strtolower($color);
          if (isset($colors[$lower])) return $colors[$lower];

          // The literal must *start* with "rgb(": the offsets used below
          // assume the prefix sits at position 0, so a match further into the
          // string (e.g. "foo rgb(1,2,3)") must not take this branch.
          if (strpos($color, 'rgb(') === 0) {
              // rgb literal handling
              $length = strlen($color);
              // the first closing paren must also be the last character
              if (strpos($color, ')') !== $length - 1) return false;
              $triad = substr($color, 4, $length - 4 - 1);
              $parts = explode(',', $triad);
              if (count($parts) !== 3) return false;
              $type = false; // to ensure that they're all the same type
              $new_parts = array();
              foreach ($parts as $part) {
                  $part = trim($part);
                  if ($part === '') return false;
                  $length = strlen($part);
                  if ($part[$length - 1] === '%') {
                      // handle percents
                      if (!$type) {
                          $type = 'percentage';
                      } elseif ($type !== 'percentage') {
                          return false;
                      }
                      $num = (float) substr($part, 0, $length - 1);
                      // clamp to 0-100
                      if ($num < 0) $num = 0;
                      if ($num > 100) $num = 100;
                      $new_parts[] = "$num%";
                  } else {
                      // handle integers
                      if (!$type) {
                          $type = 'integer';
                      } elseif ($type !== 'integer') {
                          return false;
                      }
                      $num = (int) $part;
                      // clamp to 0-255
                      if ($num < 0) $num = 0;
                      if ($num > 255) $num = 255;
                      $new_parts[] = (string) $num;
                  }
              }
              $new_triad = implode(',', $new_parts);
              $color = "rgb($new_triad)";
          } else {
              // hexadecimal handling
              if ($color[0] === '#') {
                  $hex = substr($color, 1);
              } else {
                  // bare hex digits are accepted and given their '#'
                  $hex = $color;
                  $color = '#' . $color;
              }
              $length = strlen($hex);
              if ($length !== 3 && $length !== 6) return false;
              if (!ctype_xdigit($hex)) return false;
          }

          return $color;
      }
  }
  /**
   * Allows multiple validators to attempt to validate attribute.
   *
   * Composite is just what it sounds like: a composite of many validators.
   * Each HTMLPurifier_AttrDef object gets a turn at the string, in order,
   * and the first result that is not false is returned. This is especially
   * useful for CSS values, which often are a choice between an enumerated
   * set of predefined values or a flexible data type.
   */
  class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
  {

      /**
       * List of HTMLPurifier_AttrDef objects that may process strings
       * @todo Make protected
       */
      public $defs;

      /**
       * @param $defs List of HTMLPurifier_AttrDef objects
       */
      public function __construct($defs) {
          $this->defs = $defs;
      }

      /**
       * @param $string String to validate
       * @return Result of the first validator that accepts the string, or
       *         false when every validator rejects it
       */
      public function validate($string, $config, $context) {
          foreach ($this->defs as $candidate) {
              $outcome = $candidate->validate($string, $config, $context);
              if ($outcome === false) continue;
              return $outcome;
          }
          return false;
      }
  }
  /**
   * Decorator which enables CSS properties to be disabled for specific elements.
   */
  class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
  {

      /** Wrapped definition that performs the real validation. */
      public $def;

      /** Name of the element for which the property is refused. */
      public $element;

      /**
       * @param $def Definition to wrap
       * @param $element Element to deny
       */
      public function __construct($def, $element) {
          $this->def = $def;
          $this->element = $element;
      }

      /**
       * Checks if CurrentToken is set and equal to $this->element; if so the
       * value is rejected, otherwise the wrapped definition decides.
       */
      public function validate($string, $config, $context) {
          $current = $context->get('CurrentToken', true);
          $denied = $current && $current->name == $this->element;
          if ($denied) {
              return false;
          }
          return $this->def->validate($string, $config, $context);
      }
  }
  /**
   * Microsoft's proprietary filter: CSS property
   * @note Currently supports the alpha filter. In the future, this will
   *       probably need an extensible framework
   */
  class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef
  {

      /** Validator applied to the opacity parameter. */
      protected $intValidator;

      public function __construct() {
          $this->intValidator = new HTMLPurifier_AttrDef_Integer();
      }

      /**
       * Accepts 'none' or a single alpha filter call and rebuilds the call
       * keeping only the first valid opacity parameter, clamped to 0-100.
       *
       * @param $value Filter declaration to validate
       * @return 'none', the rebuilt "function(params)" string (the parameter
       *         list may be empty), or false for an unsupported function
       */
      public function validate($value, $config, $context) {
          $value = $this->parseCDATA($value);
          if ($value === 'none') return $value;
          // if we looped this we could support multiple filters
          $function_length = strcspn($value, '(');
          $function = trim(substr($value, 0, $function_length));
          if ($function !== 'alpha' &&
              $function !== 'Alpha' &&
              $function !== 'progid:DXImageTransform.Microsoft.Alpha'
              ) return false;
          // parameters run from just past '(' up to the first ')'
          $cursor = $function_length + 1;
          $parameters_length = strcspn($value, ')', $cursor);
          $parameters = substr($value, $cursor, $parameters_length);
          $params = explode(',', $parameters);
          $ret_params = array();
          $lookup = array();
          foreach ($params as $param) {
              $pair = explode('=', $param);
              // A parameter without '=' has no value to validate; skip it
              // instead of reading a missing array element.
              if (count($pair) < 2) continue;
              $key = trim($pair[0]);
              $param_value = trim($pair[1]);
              if (isset($lookup[$key])) continue;
              if ($key !== 'opacity') continue;
              $param_value = $this->intValidator->validate($param_value, $config, $context);
              if ($param_value === false) continue;
              $int = (int) $param_value;
              if ($int > 100) $param_value = '100';
              if ($int < 0) $param_value = '0';
              $ret_params[] = "$key=$param_value";
              $lookup[$key] = true;
          }
          $ret_parameters = implode(',', $ret_params);
          $ret_function = "$function($ret_parameters)";
          return $ret_function;
      }
  }
  /**
   * Validates shorthand CSS property font.
   */
  class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of component validators.
       *
       * @note If we moved specific CSS property definitions to their own
       *       classes instead of having them be assembled at run time by
       *       CSSDefinition, this wouldn't be necessary. We'd instantiate
       *       our own copies.
       */
      protected $info = array();

      /**
       * @param $config Configuration object used to fetch the CSS definition
       */
      public function __construct($config) {
          $def = $config->getCSSDefinition();
          $this->info['font-style'] = $def->info['font-style'];
          $this->info['font-variant'] = $def->info['font-variant'];
          $this->info['font-weight'] = $def->info['font-weight'];
          $this->info['font-size'] = $def->info['font-size'];
          $this->info['line-height'] = $def->info['line-height'];
          $this->info['font-family'] = $def->info['font-family'];
      }

      /**
       * Validates the font shorthand in three stages: optional style,
       * variant and weight words (stage 0), a mandatory font-size with an
       * optional "/line-height" (stage 1), and finally everything that is
       * left as the font-family (stage 2).
       *
       * @param $string Shorthand value to validate
       * @return A lowercased system font keyword, the rebuilt shorthand, or
       *         false if the size or family is missing or invalid
       */
      public function validate($string, $config, $context) {

          static $system_fonts = array(
              'caption' => true,
              'icon' => true,
              'menu' => true,
              'message-box' => true,
              'small-caption' => true,
              'status-bar' => true
          );

          // regular pre-processing
          $string = $this->parseCDATA($string);
          if ($string === '') return false;

          // check if it's one of the keywords
          $lowercase_string = strtolower($string);
          if (isset($system_fonts[$lowercase_string])) {
              return $lowercase_string;
          }

          $bits = explode(' ', $string); // bits to process
          $stage = 0; // this indicates what we're looking for
          $caught = array(); // which stage 0 properties have we caught?
          $stage_1 = array('font-style', 'font-variant', 'font-weight');
          $final = ''; // output

          for ($i = 0, $size = count($bits); $i < $size; $i++) {
              if ($bits[$i] === '') continue;
              switch ($stage) {

                  // attempting to catch font-style, font-variant or font-weight
                  case 0:
                      foreach ($stage_1 as $validator_name) {
                          if (isset($caught[$validator_name])) continue;
                          $r = $this->info[$validator_name]->validate(
                              $bits[$i], $config, $context);
                          if ($r !== false) {
                              $final .= $r . ' ';
                              $caught[$validator_name] = true;
                              break;
                          }
                      }
                      // all three caught, continue on
                      if (count($caught) >= 3) $stage = 1;
                      // the bit was consumed by stage 0: leave the switch;
                      // otherwise fall through and try it as a font-size
                      if ($r !== false) break;

                  // attempting to catch font-size and perhaps line-height
                  case 1:
                      $found_slash = false;
                      if (strpos($bits[$i], '/') !== false) {
                          list($font_size, $line_height) =
                              explode('/', $bits[$i]);
                          if ($line_height === '') {
                              // ooh, there's a space after the slash!
                              $line_height = false;
                              $found_slash = true;
                          }
                      } else {
                          $font_size = $bits[$i];
                          $line_height = false;
                      }
                      $r = $this->info['font-size']->validate(
                          $font_size, $config, $context);
                      if ($r !== false) {
                          $final .= $r;
                          // attempt to catch line-height
                          if ($line_height === false) {
                              // we need to scroll forward
                              for ($j = $i + 1; $j < $size; $j++) {
                                  if ($bits[$j] === '') continue;
                                  if ($bits[$j] === '/') {
                                      // a second stand-alone slash is invalid
                                      if ($found_slash) {
                                          return false;
                                      } else {
                                          $found_slash = true;
                                          continue;
                                      }
                                  }
                                  // first real bit after the size; it only
                                  // counts as line-height if a slash was seen
                                  $line_height = $bits[$j];
                                  break;
                              }
                          } else {
                              // slash already found
                              $found_slash = true;
                              $j = $i;
                          }
                          if ($found_slash) {
                              // skip the outer loop past the bits used here
                              $i = $j;
                              $r = $this->info['line-height']->validate(
                                  $line_height, $config, $context);
                              // an invalid line-height is dropped silently
                              if ($r !== false) {
                                  $final .= '/' . $r;
                              }
                          }
                          $final .= ' ';
                          $stage = 2;
                          break;
                      }
                      // font-size is mandatory
                      return false;

                  // attempting to catch font-family
                  case 2:
                      // everything from here to the end is the family list
                      $font_family =
                          implode(' ', array_slice($bits, $i, $size - $i));
                      $r = $this->info['font-family']->validate(
                          $font_family, $config, $context);
                      if ($r !== false) {
                          $final .= $r . ' ';
                          // processing completed successfully
                          return rtrim($final);
                      }
                      return false;
              }
          }
          // ran out of bits before a font-family was found
          return false;
      }
  }
  /**
   * Validates a font family list according to CSS spec
   */
  class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
  {

      /**
       * Every byte allowed in a font name that is emitted quoted: dash,
       * space, ASCII letters and digits, and the bytes 0x80-0xFF.
       */
      protected $mask = null;

      public function __construct() {
          // The alphabets are spelled out literally. A string-increment loop
          // such as "for ($c = 'a'; $c <= 'z'; $c++)" does not stop at 'z':
          // 'z'++ is 'aa', which still compares <= 'z', so it keeps going up
          // to 'yz' and fills the mask with redundant characters.
          $this->mask = '- '
              . 'abcdefghijklmnopqrstuvwxyz'
              . 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
              . '0123456789';
          // special bytes used by UTF-8
          for ($i = 0x80; $i <= 0xFF; $i++) {
              // We don't bother excluding invalid bytes in this range,
              // because our restriction of well-formed UTF-8 will
              // prevent these from ever occurring.
              $this->mask .= chr($i);
          }

          /*
              PHP's internal strcspn implementation is
              O(length of string * length of mask), making it inefficient
              for large masks.  However, it's still faster than
              preg_match 8)
                for (p = s1;;) {
                  spanp = s2;
                  do {
                    if (*spanp == c || p == s1_end) {
                      return p - s1;
                    }
                  } while (spanp++ < (s2_end - 1));
                  c = *++p;
                }
           */
          // possible optimization: invert the mask.
      }

      /**
       * Validates a comma-separated font-family list. Entries that are
       * rejected are dropped; the remaining ones are re-emitted, quoted
       * unless they are a generic name or purely alphanumeric.
       *
       * @param $string Font-family list to validate
       * @param $config Configuration object; read for CSS.AllowedFonts
       * @return Cleaned list joined with ', ', or false if no entry survives
       */
      public function validate($string, $config, $context) {
          static $generic_names = array(
              'serif' => true,
              'sans-serif' => true,
              'monospace' => true,
              'fantasy' => true,
              'cursive' => true
          );
          // null means no restriction on which fonts may be used
          $allowed_fonts = $config->get('CSS.AllowedFonts');

          // assume that no font names contain commas in them
          $fonts = explode(',', $string);
          $final = '';
          foreach($fonts as $font) {
              $font = trim($font);
              if ($font === '') continue;
              // match a generic name
              if (isset($generic_names[$font])) {
                  if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
                      $final .= $font . ', ';
                  }
                  continue;
              }
              // match a quoted name
              if ($font[0] === '"' || $font[0] === "'") {
                  $length = strlen($font);
                  if ($length <= 2) continue;
                  $quote = $font[0];
                  // opening and closing quote must be the same character
                  if ($font[$length - 1] !== $quote) continue;
                  $font = substr($font, 1, $length - 2);
              }

              $font = $this->expandCSSEscape($font);

              // $font is a pure representation of the font name

              if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
                  continue;
              }

              if (ctype_alnum($font) && $font !== '') {
                  // very simple font, allow it in unharmed
                  $final .= $font . ', ';
                  continue;
              }

              // bugger out on whitespace.  form feed (0C) really
              // shouldn't show up regardless
              $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);

              // Here, there are various classes of characters which need
              // to be treated differently:
              //  - Alphanumeric characters are essentially safe.  We
              //    handled these above.
              //  - Spaces require quoting, though most parsers will do
              //    the right thing if there aren't any characters that
              //    can be misinterpreted
              //  - Dashes rarely occur, but they are fairly unproblematic
              //    for parsing/rendering purposes.
              //  The above characters cover the majority of Western font
              //  names.
              //  - Arbitrary Unicode characters not in ASCII.  Because
              //    most parsers give little thought to Unicode, treatment
              //    of these codepoints is basically uniform, even for
              //    punctuation-like codepoints.  These characters can
              //    show up in non-Western pages and are supported by most
              //    major browsers, for example: "MS 明朝" is a
              //    legitimate font-name
              //    <http://ja.wikipedia.org/wiki/MS_明朝>.  See
              //    the CSS3 spec for more examples:
              //    <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
              //    You can see live samples of these on the Internet:
              //    <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
              //    However, most of these fonts have ASCII equivalents:
              //    for example, 'MS Mincho', and it's considered
              //    professional to use ASCII font names instead of
              //    Unicode font names.  Thanks Takeshi Terada for
              //    providing this information.
              //  The following characters, to my knowledge, have not been
              //  used to name font names.
              //  - Single quote.  While theoretically you might find a
              //    font name that has a single quote in its name (serving
              //    as an apostrophe, e.g. Dave's Scribble), I haven't
              //    been able to find any actual examples of this.
              //    Internet Explorer's cssText translation (which I
              //    believe is invoked by innerHTML) normalizes any
              //    quoting to single quotes, and fails to escape single
              //    quotes.  (Note that this is not IE's behavior for all
              //    CSS properties, just some sort of special casing for
              //    font-family).  So a single quote *cannot* be used
              //    safely in the font-family context if there will be an
              //    innerHTML/cssText translation.  Note that Firefox 3.x
              //    does this too.
              //  - Double quote.  In IE, these get normalized to
              //    single-quotes, no matter what the encoding.  (Fun
              //    fact, in IE8, the 'content' CSS property gained
              //    support, where they special cased to preserve encoded
              //    double quotes, but still translate unadorned double
              //    quotes into single quotes.)  So, because their
              //    fixpoint behavior is identical to single quotes, they
              //    cannot be allowed either.  Firefox 3.x displays
              //    single-quote style behavior.
              //  - Backslashes are reduced by one (so \\ -> \) every
              //    iteration, so they cannot be used safely.  This shows
              //    up in IE7, IE8 and FF3
              //  - Semicolons, commas and backticks are handled properly.
              //  - The rest of the ASCII punctuation is handled properly.
              // We haven't checked what browsers do to unadorned
              // versions, but this is not important as long as the
              // browser doesn't /remove/ surrounding quotes (as IE does
              // for HTML).
              //
              // With these results in hand, we conclude that there are
              // various levels of safety:
              //  - Paranoid: alphanumeric, spaces and dashes(?)
              //  - International: Paranoid + non-ASCII Unicode
              //  - Edgy: Everything except quotes, backslashes
              //  - NoJS: Standards compliance, e.g. sod IE. Note that
              //    with some judicious character escaping (since certain
              //    types of escaping doesn't work) this is theoretically
              //    OK as long as innerHTML/cssText is not called.
              // We believe that international is a reasonable default
              // (that we will implement now), and once we do more
              // extensive research, we may feel comfortable with dropping
              // it down to edgy.

              // International: alphanumeric, spaces, dashes and Unicode.
              // Use of str(c)spn assumes that the string was already well
              // formed Unicode (which of course it is).
              if (strspn($font, $this->mask) !== strlen($font)) {
                  continue;
              }

              // Historical:
              // In the absence of innerHTML/cssText, these ugly
              // transforms don't pose a security risk (as \\ and \"
              // might--these escapes are not supported by most browsers).
              // We could try to be clever and use single-quote wrapping
              // when there is a double quote present, but I have chosen
              // not to implement that.  (NOTE: you can reduce the amount
              // of escapes by one depending on what quoting style you use)
              // $font = str_replace('\\', '\\5C ', $font);
              // $font = str_replace('"',  '\\22 ', $font);
              // $font = str_replace("'",  '\\27 ', $font);

              // font possibly with spaces, requires quoting
              $final .= "'$font', ";
          }
          $final = rtrim($final, ', ');
          if ($final === '') return false;
          return $final;
      }
  }
  /**
   * Decorator which enables !important to be used in CSS values.
   */
  class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef
  {

      /** Wrapped definition that validates the value itself. */
      public $def;

      /** Whether a trailing !important is re-attached to a valid value. */
      public $allow;

      /**
       * @param $def Definition to wrap
       * @param $allow Whether or not to allow !important
       */
      public function __construct($def, $allow = false) {
          $this->def = $def;
          $this->allow = $allow;
      }

      /**
       * Intercepts and removes !important if necessary
       *
       * @param $string Value to validate, possibly ending in "!important"
       * @return The wrapped definition's result, with ' !important' appended
       *         when it was present and is allowed; false if the wrapped
       *         definition rejects the value
       */
      public function validate($string, $config, $context) {
          // test for ! and important tokens
          $string = trim($string);
          $is_important = false;
          // :TODO: optimization: test directly for !important and ! important
          if (strlen($string) >= 9 && substr($string, -9) === 'important') {
              $temp = rtrim(substr($string, 0, -9));
              // use a temp, because we might want to restore important
              if (strlen($temp) >= 1 && substr($temp, -1) === '!') {
                  $string = rtrim(substr($temp, 0, -1));
                  $is_important = true;
              }
          }
          $string = $this->def->validate($string, $config, $context);
          // A rejected value must stay rejected: appending to false would
          // turn it into the non-false string ' !important'.
          if ($string === false) return false;
          if ($this->allow && $is_important) $string .= ' !important';
          return $string;
      }
  }
  /**
   * Represents a Length as defined by CSS.
   */
  class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
  {

      /** Lower bound produced by HTMLPurifier_Length::make(), or null. */
      protected $min;

      /** Upper bound produced by HTMLPurifier_Length::make(), or null. */
      protected $max;

      /**
       * @param HTMLPurifier_Length $min Minimum length, or null for no bound. String is also acceptable.
       * @param HTMLPurifier_Length $max Maximum length, or null for no bound. String is also acceptable.
       */
      public function __construct($min = null, $max = null) {
          $this->min = ($min === null) ? null : HTMLPurifier_Length::make($min);
          $this->max = ($max === null) ? null : HTMLPurifier_Length::make($max);
      }

      /**
       * @param $string Length to validate
       * @return String form of the parsed length, or false if it is invalid,
       *         cannot be compared with a bound, or lies outside the bounds
       */
      public function validate($string, $config, $context) {
          $string = $this->parseCDATA($string);

          // Optimizations: the empty string is never valid, and the only
          // valid one-character value is a bare zero
          if ($string === '') return false;
          if (strlen($string) === 1) {
              return ($string === '0') ? '0' : false;
          }

          $parsed = HTMLPurifier_Length::make($string);
          if (!$parsed->isValid()) return false;

          if ($this->min) {
              $order = $parsed->compareTo($this->min);
              // false means the two lengths could not be compared
              if ($order === false || $order < 0) return false;
          }
          if ($this->max) {
              $order = $parsed->compareTo($this->max);
              if ($order === false || $order > 0) return false;
          }

          return $parsed->toString();
      }
  }
  /**
   * Validates shorthand CSS property list-style.
   * @warning Does not support url tokens that have internal spaces.
   */
  class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of component validators.
       * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
       */
      protected $info;

      /**
       * @param $config Configuration object used to fetch the CSS definition
       */
      public function __construct($config) {
          $def = $config->getCSSDefinition();
          $this->info['list-style-type'] = $def->info['list-style-type'];
          $this->info['list-style-position'] = $def->info['list-style-position'];
          $this->info['list-style-image'] = $def->info['list-style-image'];
      }

      /**
       * Hands each space-separated piece to the first of the type, position
       * and image validators that has not matched yet and accepts it.
       *
       * @param $string Shorthand value to validate
       * @return Accepted components joined by spaces in the fixed order
       *         type, image, position; false if none
       */
      public function validate($string, $config, $context) {

          // regular pre-processing
          $string = $this->parseCDATA($string);
          if ($string === '') return false;

          // assumes URI doesn't have spaces in it
          $bits = explode(' ', strtolower($string)); // bits to process

          $caught = array();
          $caught['type'] = false;
          $caught['position'] = false;
          $caught['image'] = false;

          $i = 0; // number of catches
          $none = false; // whether a 'none' has already been consumed

          foreach ($bits as $bit) {
              // optimization bit: all three components are set, so stop
              // scanning but still return what was caught (a bare return
              // here would hand back NULL and lose the result)
              if ($i >= 3) break;
              if ($bit === '') continue;
              foreach ($caught as $key => $status) {
                  if ($status !== false) continue;
                  $r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
                  if ($r === false) continue;
                  if ($r === 'none') {
                      // only the first 'none' is kept, and never as image
                      if ($none) continue;
                      else $none = true;
                      if ($key == 'image') continue;
                  }
                  $caught[$key] = $r;
                  $i++;
                  break;
              }
          }

          if (!$i) return false;

          $ret = array();

          // construct type
          if ($caught['type']) $ret[] = $caught['type'];

          // construct image
          if ($caught['image']) $ret[] = $caught['image'];

          // construct position
          if ($caught['position']) $ret[] = $caught['position'];

          if (empty($ret)) return false;
          return implode(' ', $ret);
      }
  }
  8066. /**
  8067. * Framework class for strings that involve multiple values.
  8068. *
  8069. * Certain CSS properties such as border-width and margin allow multiple
  8070. * lengths to be specified. This class can take a vanilla border-width
  8071. * definition and multiply it, usually into a max of four.
  8072. *
  8073. * @note Even though the CSS specification isn't clear about it, inherit
  8074. * can only be used alone: it will never manifest as part of a multi
  8075. * shorthand declaration. Thus, this class does not allow inherit.
  8076. */
  8077. class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
  8078. {
  8079. /**
  8080. * Instance of component definition to defer validation to.
  8081. * @todo Make protected
  8082. */
  8083. public $single;
  8084. /**
  8085. * Max number of values allowed.
  8086. * @todo Make protected
  8087. */
  8088. public $max;
  8089. /**
  8090. * @param $single HTMLPurifier_AttrDef to multiply
  8091. * @param $max Max number of values allowed (usually four)
  8092. */
  8093. public function __construct($single, $max = 4) {
  8094. $this->single = $single;
  8095. $this->max = $max;
  8096. }
  8097. public function validate($string, $config, $context) {
  8098. $string = $this->parseCDATA($string);
  8099. if ($string === '') return false;
  8100. $parts = explode(' ', $string); // parseCDATA replaced \r, \t and \n
  8101. $length = count($parts);
  8102. $final = '';
  8103. for ($i = 0, $num = 0; $i < $length && $num < $this->max; $i++) {
  8104. if (ctype_space($parts[$i])) continue;
  8105. $result = $this->single->validate($parts[$i], $config, $context);
  8106. if ($result !== false) {
  8107. $final .= $result . ' ';
  8108. $num++;
  8109. }
  8110. }
  8111. if ($final === '') return false;
  8112. return rtrim($final);
  8113. }
  8114. }
  8115. /**
  8116. * Validates a Percentage as defined by the CSS spec.
  8117. */
  8118. class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
  8119. {
  8120. /**
  8121. * Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
  8122. */
  8123. protected $number_def;
  8124. /**
  8125. * @param Bool indicating whether to forbid negative values
  8126. */
  8127. public function __construct($non_negative = false) {
  8128. $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
  8129. }
  8130. public function validate($string, $config, $context) {
  8131. $string = $this->parseCDATA($string);
  8132. if ($string === '') return false;
  8133. $length = strlen($string);
  8134. if ($length === 1) return false;
  8135. if ($string[$length - 1] !== '%') return false;
  8136. $number = substr($string, 0, $length - 1);
  8137. $number = $this->number_def->validate($number, $config, $context);
  8138. if ($number === false) return false;
  8139. return "$number%";
  8140. }
  8141. }
  8142. /**
  8143. * Validates the value for the CSS property text-decoration
  8144. * @note This class could be generalized into a version that acts sort of
  8145. * like Enum except you can compound the allowed values.
  8146. */
  8147. class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
  8148. {
  8149. public function validate($string, $config, $context) {
  8150. static $allowed_values = array(
  8151. 'line-through' => true,
  8152. 'overline' => true,
  8153. 'underline' => true,
  8154. );
  8155. $string = strtolower($this->parseCDATA($string));
  8156. if ($string === 'none') return $string;
  8157. $parts = explode(' ', $string);
  8158. $final = '';
  8159. foreach ($parts as $part) {
  8160. if (isset($allowed_values[$part])) {
  8161. $final .= $part . ' ';
  8162. }
  8163. }
  8164. $final = rtrim($final);
  8165. if ($final === '') return false;
  8166. return $final;
  8167. }
  8168. }
  8169. /**
  8170. * Validates a URI in CSS syntax, which uses url('http://example.com')
  8171. * @note While theoretically speaking a URI in a CSS document could
  8172. * be non-embedded, as of CSS2 there is no such usage so we're
  8173. * generalizing it. This may need to be changed in the future.
  8174. * @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
  8175. * the separator, you cannot put a literal semicolon in
  8176. * in the URI. Try percent encoding it, in that case.
  8177. */
  8178. class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
  8179. {
  8180. public function __construct() {
  8181. parent::__construct(true); // always embedded
  8182. }
  8183. public function validate($uri_string, $config, $context) {
  8184. // parse the URI out of the string and then pass it onto
  8185. // the parent object
  8186. $uri_string = $this->parseCDATA($uri_string);
  8187. if (strpos($uri_string, 'url(') !== 0) return false;
  8188. $uri_string = substr($uri_string, 4);
  8189. $new_length = strlen($uri_string) - 1;
  8190. if ($uri_string[$new_length] != ')') return false;
  8191. $uri = trim(substr($uri_string, 0, $new_length));
  8192. if (!empty($uri) && ($uri[0] == "'" || $uri[0] == '"')) {
  8193. $quote = $uri[0];
  8194. $new_length = strlen($uri) - 1;
  8195. if ($uri[$new_length] !== $quote) return false;
  8196. $uri = substr($uri, 1, $new_length - 1);
  8197. }
  8198. $uri = $this->expandCSSEscape($uri);
  8199. $result = parent::validate($uri, $config, $context);
  8200. if ($result === false) return false;
  8201. // extra sanity check; should have been done by URI
  8202. $result = str_replace(array('"', "\\", "\n", "\x0c", "\r"), "", $result);
  8203. // suspicious characters are ()'; we're going to percent encode
  8204. // them for safety.
  8205. $result = str_replace(array('(', ')', "'"), array('%28', '%29', '%27'), $result);
  8206. // there's an extra bug where ampersands lose their escaping on
  8207. // an innerHTML cycle, so a very unlucky query parameter could
  8208. // then change the meaning of the URL. Unfortunately, there's
  8209. // not much we can do about that...
  8210. return "url(\"$result\")";
  8211. }
  8212. }
  8213. /**
  8214. * Validates a boolean attribute
  8215. */
  8216. class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
  8217. {
  8218. protected $name;
  8219. public $minimized = true;
  8220. public function __construct($name = false) {$this->name = $name;}
  8221. public function validate($string, $config, $context) {
  8222. if (empty($string)) return false;
  8223. return $this->name;
  8224. }
  8225. /**
  8226. * @param $string Name of attribute
  8227. */
  8228. public function make($string) {
  8229. return new HTMLPurifier_AttrDef_HTML_Bool($string);
  8230. }
  8231. }
  8232. /**
  8233. * Validates contents based on NMTOKENS attribute type.
  8234. */
  8235. class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
  8236. {
  8237. public function validate($string, $config, $context) {
  8238. $string = trim($string);
  8239. // early abort: '' and '0' (strings that convert to false) are invalid
  8240. if (!$string) return false;
  8241. $tokens = $this->split($string, $config, $context);
  8242. $tokens = $this->filter($tokens, $config, $context);
  8243. if (empty($tokens)) return false;
  8244. return implode(' ', $tokens);
  8245. }
  8246. /**
  8247. * Splits a space separated list of tokens into its constituent parts.
  8248. */
  8249. protected function split($string, $config, $context) {
  8250. // OPTIMIZABLE!
  8251. // do the preg_match, capture all subpatterns for reformulation
  8252. // we don't support U+00A1 and up codepoints or
  8253. // escaping because I don't know how to do that with regexps
  8254. // and plus it would complicate optimization efforts (you never
  8255. // see that anyway).
  8256. $pattern = '/(?:(?<=\s)|\A)'. // look behind for space or string start
  8257. '((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)'.
  8258. '(?:(?=\s)|\z)/'; // look ahead for space or string end
  8259. preg_match_all($pattern, $string, $matches);
  8260. return $matches[1];
  8261. }
  8262. /**
  8263. * Template method for removing certain tokens based on arbitrary criteria.
  8264. * @note If we wanted to be really functional, we'd do an array_filter
  8265. * with a callback. But... we're not.
  8266. */
  8267. protected function filter($tokens, $config, $context) {
  8268. return $tokens;
  8269. }
  8270. }
  8271. /**
  8272. * Implements special behavior for class attribute (normally NMTOKENS)
  8273. */
  8274. class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
  8275. {
  8276. protected function split($string, $config, $context) {
  8277. // really, this twiddle should be lazy loaded
  8278. $name = $config->getDefinition('HTML')->doctype->name;
  8279. if ($name == "XHTML 1.1" || $name == "XHTML 2.0") {
  8280. return parent::split($string, $config, $context);
  8281. } else {
  8282. return preg_split('/\s+/', $string);
  8283. }
  8284. }
  8285. protected function filter($tokens, $config, $context) {
  8286. $allowed = $config->get('Attr.AllowedClasses');
  8287. $forbidden = $config->get('Attr.ForbiddenClasses');
  8288. $ret = array();
  8289. foreach ($tokens as $token) {
  8290. if (
  8291. ($allowed === null || isset($allowed[$token])) &&
  8292. !isset($forbidden[$token]) &&
  8293. // We need this O(n) check because of PHP's array
  8294. // implementation that casts -0 to 0.
  8295. !in_array($token, $ret, true)
  8296. ) {
  8297. $ret[] = $token;
  8298. }
  8299. }
  8300. return $ret;
  8301. }
  8302. }
  8303. /**
  8304. * Validates a color according to the HTML spec.
  8305. */
  8306. class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
  8307. {
  8308. public function validate($string, $config, $context) {
  8309. static $colors = null;
  8310. if ($colors === null) $colors = $config->get('Core.ColorKeywords');
  8311. $string = trim($string);
  8312. if (empty($string)) return false;
  8313. if (isset($colors[$string])) return $colors[$string];
  8314. if ($string[0] === '#') $hex = substr($string, 1);
  8315. else $hex = $string;
  8316. $length = strlen($hex);
  8317. if ($length !== 3 && $length !== 6) return false;
  8318. if (!ctype_xdigit($hex)) return false;
  8319. if ($length === 3) $hex = $hex[0].$hex[0].$hex[1].$hex[1].$hex[2].$hex[2];
  8320. return "#$hex";
  8321. }
  8322. }
  8323. /**
  8324. * Special-case enum attribute definition that lazy loads allowed frame targets
  8325. */
  8326. class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
  8327. {
  8328. public $valid_values = false; // uninitialized value
  8329. protected $case_sensitive = false;
  8330. public function __construct() {}
  8331. public function validate($string, $config, $context) {
  8332. if ($this->valid_values === false) $this->valid_values = $config->get('Attr.AllowedFrameTargets');
  8333. return parent::validate($string, $config, $context);
  8334. }
  8335. }
  8336. /**
  8337. * Validates the HTML attribute ID.
  8338. * @warning Even though this is the id processor, it
  8339. * will ignore the directive Attr:IDBlacklist, since it will only
  8340. * go according to the ID accumulator. Since the accumulator is
  8341. * automatically generated, it will have already absorbed the
  8342. * blacklist. If you're hacking around, make sure you use load()!
  8343. */
  8344. class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
  8345. {
  8346. // ref functionality disabled, since we also have to verify
  8347. // whether or not the ID it refers to exists
  8348. public function validate($id, $config, $context) {
  8349. if (!$config->get('Attr.EnableID')) return false;
  8350. $id = trim($id); // trim it first
  8351. if ($id === '') return false;
  8352. $prefix = $config->get('Attr.IDPrefix');
  8353. if ($prefix !== '') {
  8354. $prefix .= $config->get('Attr.IDPrefixLocal');
  8355. // prevent re-appending the prefix
  8356. if (strpos($id, $prefix) !== 0) $id = $prefix . $id;
  8357. } elseif ($config->get('Attr.IDPrefixLocal') !== '') {
  8358. trigger_error('%Attr.IDPrefixLocal cannot be used unless '.
  8359. '%Attr.IDPrefix is set', E_USER_WARNING);
  8360. }
  8361. //if (!$this->ref) {
  8362. $id_accumulator =& $context->get('IDAccumulator');
  8363. if (isset($id_accumulator->ids[$id])) return false;
  8364. //}
  8365. // we purposely avoid using regex, hopefully this is faster
  8366. if (ctype_alpha($id)) {
  8367. $result = true;
  8368. } else {
  8369. if (!ctype_alpha(@$id[0])) return false;
  8370. $trim = trim( // primitive style of regexps, I suppose
  8371. $id,
  8372. 'A..Za..z0..9:-._'
  8373. );
  8374. $result = ($trim === '');
  8375. }
  8376. $regexp = $config->get('Attr.IDBlacklistRegexp');
  8377. if ($regexp && preg_match($regexp, $id)) {
  8378. return false;
  8379. }
  8380. if (/*!$this->ref && */$result) $id_accumulator->add($id);
  8381. // if no change was made to the ID, return the result
  8382. // else, return the new id if stripping whitespace made it
  8383. // valid, or return false.
  8384. return $result ? $id : false;
  8385. }
  8386. }
  8387. /**
  8388. * Validates an integer representation of pixels according to the HTML spec.
  8389. */
  8390. class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
  8391. {
  8392. protected $max;
  8393. public function __construct($max = null) {
  8394. $this->max = $max;
  8395. }
  8396. public function validate($string, $config, $context) {
  8397. $string = trim($string);
  8398. if ($string === '0') return $string;
  8399. if ($string === '') return false;
  8400. $length = strlen($string);
  8401. if (substr($string, $length - 2) == 'px') {
  8402. $string = substr($string, 0, $length - 2);
  8403. }
  8404. if (!is_numeric($string)) return false;
  8405. $int = (int) $string;
  8406. if ($int < 0) return '0';
  8407. // upper-bound value, extremely high values can
  8408. // crash operating systems, see <http://ha.ckers.org/imagecrash.html>
  8409. // WARNING, above link WILL crash you if you're using Windows
  8410. if ($this->max !== null && $int > $this->max) return (string) $this->max;
  8411. return (string) $int;
  8412. }
  8413. public function make($string) {
  8414. if ($string === '') $max = null;
  8415. else $max = (int) $string;
  8416. $class = get_class($this);
  8417. return new $class($max);
  8418. }
  8419. }
  8420. /**
  8421. * Validates the HTML type length (not to be confused with CSS's length).
  8422. *
  8423. * This accepts integer pixels or percentages as lengths for certain
  8424. * HTML attributes.
  8425. */
  8426. class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
  8427. {
  8428. public function validate($string, $config, $context) {
  8429. $string = trim($string);
  8430. if ($string === '') return false;
  8431. $parent_result = parent::validate($string, $config, $context);
  8432. if ($parent_result !== false) return $parent_result;
  8433. $length = strlen($string);
  8434. $last_char = $string[$length - 1];
  8435. if ($last_char !== '%') return false;
  8436. $points = substr($string, 0, $length - 1);
  8437. if (!is_numeric($points)) return false;
  8438. $points = (int) $points;
  8439. if ($points < 0) return '0%';
  8440. if ($points > 100) return '100%';
  8441. return ((string) $points) . '%';
  8442. }
  8443. }
  8444. /**
  8445. * Validates a rel/rev link attribute against a directive of allowed values
  8446. * @note We cannot use Enum because link types allow multiple
  8447. * values.
  8448. * @note Assumes link types are ASCII text
  8449. */
  8450. class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
  8451. {
  8452. /** Name config attribute to pull. */
  8453. protected $name;
  8454. public function __construct($name) {
  8455. $configLookup = array(
  8456. 'rel' => 'AllowedRel',
  8457. 'rev' => 'AllowedRev'
  8458. );
  8459. if (!isset($configLookup[$name])) {
  8460. trigger_error('Unrecognized attribute name for link '.
  8461. 'relationship.', E_USER_ERROR);
  8462. return;
  8463. }
  8464. $this->name = $configLookup[$name];
  8465. }
  8466. public function validate($string, $config, $context) {
  8467. $allowed = $config->get('Attr.' . $this->name);
  8468. if (empty($allowed)) return false;
  8469. $string = $this->parseCDATA($string);
  8470. $parts = explode(' ', $string);
  8471. // lookup to prevent duplicates
  8472. $ret_lookup = array();
  8473. foreach ($parts as $part) {
  8474. $part = strtolower(trim($part));
  8475. if (!isset($allowed[$part])) continue;
  8476. $ret_lookup[$part] = true;
  8477. }
  8478. if (empty($ret_lookup)) return false;
  8479. $string = implode(' ', array_keys($ret_lookup));
  8480. return $string;
  8481. }
  8482. }
  8483. /**
  8484. * Validates a MultiLength as defined by the HTML spec.
  8485. *
  8486. * A multilength is either a integer (pixel count), a percentage, or
  8487. * a relative number.
  8488. */
  8489. class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
  8490. {
  8491. public function validate($string, $config, $context) {
  8492. $string = trim($string);
  8493. if ($string === '') return false;
  8494. $parent_result = parent::validate($string, $config, $context);
  8495. if ($parent_result !== false) return $parent_result;
  8496. $length = strlen($string);
  8497. $last_char = $string[$length - 1];
  8498. if ($last_char !== '*') return false;
  8499. $int = substr($string, 0, $length - 1);
  8500. if ($int == '') return '*';
  8501. if (!is_numeric($int)) return false;
  8502. $int = (int) $int;
  8503. if ($int < 0) return false;
  8504. if ($int == 0) return '0';
  8505. if ($int == 1) return '*';
  8506. return ((string) $int) . '*';
  8507. }
  8508. }
  8509. abstract class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
  8510. {
  8511. /**
  8512. * Unpacks a mailbox into its display-name and address
  8513. */
  8514. function unpack($string) {
  8515. // needs to be implemented
  8516. }
  8517. }
  8518. // sub-implementations
  8519. /**
  8520. * Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
  8521. */
  8522. class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
  8523. {
  8524. /**
  8525. * Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
  8526. */
  8527. protected $ipv4;
  8528. /**
  8529. * Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
  8530. */
  8531. protected $ipv6;
  8532. public function __construct() {
  8533. $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
  8534. $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
  8535. }
  8536. public function validate($string, $config, $context) {
  8537. $length = strlen($string);
  8538. // empty hostname is OK; it's usually semantically equivalent:
  8539. // the default host as defined by a URI scheme is used:
  8540. //
  8541. // If the URI scheme defines a default for host, then that
  8542. // default applies when the host subcomponent is undefined
  8543. // or when the registered name is empty (zero length).
  8544. if ($string === '') return '';
  8545. if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
  8546. //IPv6
  8547. $ip = substr($string, 1, $length - 2);
  8548. $valid = $this->ipv6->validate($ip, $config, $context);
  8549. if ($valid === false) return false;
  8550. return '['. $valid . ']';
  8551. }
  8552. // need to do checks on unusual encodings too
  8553. $ipv4 = $this->ipv4->validate($string, $config, $context);
  8554. if ($ipv4 !== false) return $ipv4;
  8555. // A regular domain name.
  8556. // This breaks I18N domain names, but we don't have proper IRI support,
  8557. // so force users to insert Punycode. If there's complaining we'll
  8558. // try to fix things into an international friendly form.
  8559. // The productions describing this are:
  8560. $a = '[a-z]'; // alpha
  8561. $an = '[a-z0-9]'; // alphanum
  8562. $and = '[a-z0-9-]'; // alphanum | "-"
  8563. // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
  8564. $domainlabel = "$an($and*$an)?";
  8565. // toplabel = alpha | alpha *( alphanum | "-" ) alphanum
  8566. $toplabel = "$a($and*$an)?";
  8567. // hostname = *( domainlabel "." ) toplabel [ "." ]
  8568. $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
  8569. if (!$match) return false;
  8570. return $string;
  8571. }
  8572. }
  8573. /**
  8574. * Validates an IPv4 address
  8575. * @author Feyd @ forums.devnetwork.net (public domain)
  8576. */
  8577. class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
  8578. {
  8579. /**
  8580. * IPv4 regex, protected so that IPv6 can reuse it
  8581. */
  8582. protected $ip4;
  8583. public function validate($aIP, $config, $context) {
  8584. if (!$this->ip4) $this->_loadRegex();
  8585. if (preg_match('#^' . $this->ip4 . '$#s', $aIP))
  8586. {
  8587. return $aIP;
  8588. }
  8589. return false;
  8590. }
  8591. /**
  8592. * Lazy load function to prevent regex from being stuffed in
  8593. * cache.
  8594. */
  8595. protected function _loadRegex() {
  8596. $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
  8597. $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
  8598. }
  8599. }
  8600. /**
  8601. * Validates an IPv6 address.
  8602. * @author Feyd @ forums.devnetwork.net (public domain)
  8603. * @note This function requires brackets to have been removed from address
  8604. * in URI.
  8605. */
  8606. class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
  8607. {
  8608. public function validate($aIP, $config, $context) {
  8609. if (!$this->ip4) $this->_loadRegex();
  8610. $original = $aIP;
  8611. $hex = '[0-9a-fA-F]';
  8612. $blk = '(?:' . $hex . '{1,4})';
  8613. $pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))'; // /0 - /128
  8614. // prefix check
  8615. if (strpos($aIP, '/') !== false)
  8616. {
  8617. if (preg_match('#' . $pre . '$#s', $aIP, $find))
  8618. {
  8619. $aIP = substr($aIP, 0, 0-strlen($find[0]));
  8620. unset($find);
  8621. }
  8622. else
  8623. {
  8624. return false;
  8625. }
  8626. }
  8627. // IPv4-compatiblity check
  8628. if (preg_match('#(?<=:'.')' . $this->ip4 . '$#s', $aIP, $find))
  8629. {
  8630. $aIP = substr($aIP, 0, 0-strlen($find[0]));
  8631. $ip = explode('.', $find[0]);
  8632. $ip = array_map('dechex', $ip);
  8633. $aIP .= $ip[0] . $ip[1] . ':' . $ip[2] . $ip[3];
  8634. unset($find, $ip);
  8635. }
  8636. // compression check
  8637. $aIP = explode('::', $aIP);
  8638. $c = count($aIP);
  8639. if ($c > 2)
  8640. {
  8641. return false;
  8642. }
  8643. elseif ($c == 2)
  8644. {
  8645. list($first, $second) = $aIP;
  8646. $first = explode(':', $first);
  8647. $second = explode(':', $second);
  8648. if (count($first) + count($second) > 8)
  8649. {
  8650. return false;
  8651. }
  8652. while(count($first) < 8)
  8653. {
  8654. array_push($first, '0');
  8655. }
  8656. array_splice($first, 8 - count($second), 8, $second);
  8657. $aIP = $first;
  8658. unset($first,$second);
  8659. }
  8660. else
  8661. {
  8662. $aIP = explode(':', $aIP[0]);
  8663. }
  8664. $c = count($aIP);
  8665. if ($c != 8)
  8666. {
  8667. return false;
  8668. }
  8669. // All the pieces should be 16-bit hex strings. Are they?
  8670. foreach ($aIP as $piece)
  8671. {
  8672. if (!preg_match('#^[0-9a-fA-F]{4}$#s', sprintf('%04s', $piece)))
  8673. {
  8674. return false;
  8675. }
  8676. }
  8677. return $original;
  8678. }
  8679. }
  8680. /**
  8681. * Primitive email validation class based on the regexp found at
  8682. * http://www.regular-expressions.info/email.html
  8683. */
  8684. class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
  8685. {
  8686. public function validate($string, $config, $context) {
  8687. // no support for named mailboxes i.e. "Bob <bob@example.com>"
  8688. // that needs more percent encoding to be done
  8689. if ($string == '') return false;
  8690. $string = trim($string);
  8691. $result = preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$/i', $string);
  8692. return $result ? $string : false;
  8693. }
  8694. }
  8695. /**
  8696. * Pre-transform that changes proprietary background attribute to CSS.
  8697. */
  8698. class HTMLPurifier_AttrTransform_Background extends HTMLPurifier_AttrTransform {
  8699. public function transform($attr, $config, $context) {
  8700. if (!isset($attr['background'])) return $attr;
  8701. $background = $this->confiscateAttr($attr, 'background');
  8702. // some validation should happen here
  8703. $this->prependCSS($attr, "background-image:url($background);");
  8704. return $attr;
  8705. }
  8706. }
  8707. // this MUST be placed in post, as it assumes that any value in dir is valid
  8708. /**
  8709. * Post-trasnform that ensures that bdo tags have the dir attribute set.
  8710. */
  8711. class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
  8712. {
  8713. public function transform($attr, $config, $context) {
  8714. if (isset($attr['dir'])) return $attr;
  8715. $attr['dir'] = $config->get('Attr.DefaultTextDir');
  8716. return $attr;
  8717. }
  8718. }
  8719. /**
  8720. * Pre-transform that changes deprecated bgcolor attribute to CSS.
  8721. */
  8722. class HTMLPurifier_AttrTransform_BgColor extends HTMLPurifier_AttrTransform {
  8723. public function transform($attr, $config, $context) {
  8724. if (!isset($attr['bgcolor'])) return $attr;
  8725. $bgcolor = $this->confiscateAttr($attr, 'bgcolor');
  8726. // some validation should happen here
  8727. $this->prependCSS($attr, "background-color:$bgcolor;");
  8728. return $attr;
  8729. }
  8730. }
  8731. /**
  8732. * Pre-transform that changes converts a boolean attribute to fixed CSS
  8733. */
  8734. class HTMLPurifier_AttrTransform_BoolToCSS extends HTMLPurifier_AttrTransform {
  8735. /**
  8736. * Name of boolean attribute that is trigger
  8737. */
  8738. protected $attr;
  8739. /**
  8740. * CSS declarations to add to style, needs trailing semicolon
  8741. */
  8742. protected $css;
  8743. /**
  8744. * @param $attr string attribute name to convert from
  8745. * @param $css string CSS declarations to add to style (needs semicolon)
  8746. */
  8747. public function __construct($attr, $css) {
  8748. $this->attr = $attr;
  8749. $this->css = $css;
  8750. }
  8751. public function transform($attr, $config, $context) {
  8752. if (!isset($attr[$this->attr])) return $attr;
  8753. unset($attr[$this->attr]);
  8754. $this->prependCSS($attr, $this->css);
  8755. return $attr;
  8756. }
  8757. }
  8758. /**
  8759. * Pre-transform that changes deprecated border attribute to CSS.
  8760. */
  8761. class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform {
  8762. public function transform($attr, $config, $context) {
  8763. if (!isset($attr['border'])) return $attr;
  8764. $border_width = $this->confiscateAttr($attr, 'border');
  8765. // some validation should happen here
  8766. $this->prependCSS($attr, "border:{$border_width}px solid;");
  8767. return $attr;
  8768. }
  8769. }
  8770. /**
  8771. * Generic pre-transform that converts an attribute with a fixed number of
  8772. * values (enumerated) to CSS.
  8773. */
  8774. class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform {
  8775. /**
  8776. * Name of attribute to transform from
  8777. */
  8778. protected $attr;
  8779. /**
  8780. * Lookup array of attribute values to CSS
  8781. */
  8782. protected $enumToCSS = array();
  8783. /**
  8784. * Case sensitivity of the matching
  8785. * @warning Currently can only be guaranteed to work with ASCII
  8786. * values.
  8787. */
  8788. protected $caseSensitive = false;
  8789. /**
  8790. * @param $attr String attribute name to transform from
  8791. * @param $enumToCSS Lookup array of attribute values to CSS
  8792. * @param $case_sensitive Boolean case sensitivity indicator, default false
  8793. */
  8794. public function __construct($attr, $enum_to_css, $case_sensitive = false) {
  8795. $this->attr = $attr;
  8796. $this->enumToCSS = $enum_to_css;
  8797. $this->caseSensitive = (bool) $case_sensitive;
  8798. }
  8799. public function transform($attr, $config, $context) {
  8800. if (!isset($attr[$this->attr])) return $attr;
  8801. $value = trim($attr[$this->attr]);
  8802. unset($attr[$this->attr]);
  8803. if (!$this->caseSensitive) $value = strtolower($value);
  8804. if (!isset($this->enumToCSS[$value])) {
  8805. return $attr;
  8806. }
  8807. $this->prependCSS($attr, $this->enumToCSS[$value]);
  8808. return $attr;
  8809. }
  8810. }
  8811. // must be called POST validation
  8812. /**
  8813. * Transform that supplies default values for the src and alt attributes
  8814. * in img tags, as well as prevents the img tag from being removed
  8815. * because of a missing alt tag. This needs to be registered as both
  8816. * a pre and post attribute transform.
  8817. */
  8818. class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
  8819. {
  8820. public function transform($attr, $config, $context) {
  8821. $src = true;
  8822. if (!isset($attr['src'])) {
  8823. if ($config->get('Core.RemoveInvalidImg')) return $attr;
  8824. $attr['src'] = $config->get('Attr.DefaultInvalidImage');
  8825. $src = false;
  8826. }
  8827. if (!isset($attr['alt'])) {
  8828. if ($src) {
  8829. $alt = $config->get('Attr.DefaultImageAlt');
  8830. if ($alt === null) {
  8831. // truncate if the alt is too long
  8832. $attr['alt'] = substr(basename($attr['src']),0,40);
  8833. } else {
  8834. $attr['alt'] = $alt;
  8835. }
  8836. } else {
  8837. $attr['alt'] = $config->get('Attr.DefaultInvalidImageAlt');
  8838. }
  8839. }
  8840. return $attr;
  8841. }
  8842. }
  8843. /**
  8844. * Pre-transform that changes deprecated hspace and vspace attributes to CSS
  8845. */
  8846. class HTMLPurifier_AttrTransform_ImgSpace extends HTMLPurifier_AttrTransform {
  8847. protected $attr;
  8848. protected $css = array(
  8849. 'hspace' => array('left', 'right'),
  8850. 'vspace' => array('top', 'bottom')
  8851. );
  8852. public function __construct($attr) {
  8853. $this->attr = $attr;
  8854. if (!isset($this->css[$attr])) {
  8855. trigger_error(htmlspecialchars($attr) . ' is not valid space attribute');
  8856. }
  8857. }
  8858. public function transform($attr, $config, $context) {
  8859. if (!isset($attr[$this->attr])) return $attr;
  8860. $width = $this->confiscateAttr($attr, $this->attr);
  8861. // some validation could happen here
  8862. if (!isset($this->css[$this->attr])) return $attr;
  8863. $style = '';
  8864. foreach ($this->css[$this->attr] as $suffix) {
  8865. $property = "margin-$suffix";
  8866. $style .= "$property:{$width}px;";
  8867. }
  8868. $this->prependCSS($attr, $style);
  8869. return $attr;
  8870. }
  8871. }
  8872. /**
  8873. * Performs miscellaneous cross attribute validation and filtering for
  8874. * input elements. This is meant to be a post-transform.
  8875. */
  class HTMLPurifier_AttrTransform_Input extends HTMLPurifier_AttrTransform {

      /** Validator applied to the size attribute. */
      protected $pixels;

      public function __construct() {
          $this->pixels = new HTMLPurifier_AttrDef_HTML_Pixels();
      }

      /**
       * Removes or normalizes attributes that do not fit the input's type.
       * A missing type attribute is treated as "text".
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          $type = isset($attr['type']) ? strtolower($attr['type']) : 'text';
          $is_toggle  = ($type === 'radio' || $type === 'checkbox');
          $is_textual = ($type === 'text' || $type === 'password');

          // checked survives only on radio buttons and checkboxes
          if (!$is_toggle && isset($attr['checked'])) {
              unset($attr['checked']);
          }

          // maxlength survives only on text and password fields
          if (!$is_textual && isset($attr['maxlength'])) {
              unset($attr['maxlength']);
          }

          // for every other type, size has to pass the pixel validator
          if (!$is_textual && isset($attr['size'])) {
              $checked = $this->pixels->validate($attr['size'], $config, $context);
              if ($checked === false) {
                  unset($attr['size']);
              } else {
                  $attr['size'] = $checked;
              }
          }

          // src survives only on image inputs
          if ($type !== 'image' && isset($attr['src'])) {
              unset($attr['src']);
          }

          // radio buttons and checkboxes always get a value attribute
          if ($is_toggle && !isset($attr['value'])) {
              $attr['value'] = '';
          }

          return $attr;
      }
  }
  8904. /**
  8905. * Post-transform that copies lang's value to xml:lang (and vice-versa)
  8906. * @note Theoretically speaking, this could be a pre-transform, but putting
  8907. * post is more efficient.
  8908. */
  class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
  {

      /**
       * Makes lang and xml:lang agree. When xml:lang is present its value is
       * written to lang; otherwise a present lang is copied to xml:lang.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          // false marks "attribute not present"
          $plain = false;
          if (isset($attr['lang'])) {
              $plain = $attr['lang'];
          }
          $xml = false;
          if (isset($attr['xml:lang'])) {
              $xml = $attr['xml:lang'];
          }

          if ($xml !== false) {
              $attr['lang'] = $xml;
              return $attr;
          }
          if ($plain !== false) {
              $attr['xml:lang'] = $plain;
          }
          return $attr;
      }
  }
  8922. /**
  8923. * Class for handling width/height length attribute transformations to CSS
  8924. */
  class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform
  {

      /** Attribute name read by transform(). */
      protected $name;

      /** CSS property name written by transform(). */
      protected $cssName;

      /**
       * @param $name Attribute to convert.
       * @param $css_name CSS property to emit; a falsy value means "same
       *        as $name".
       */
      public function __construct($name, $css_name = null) {
          $this->name = $name;
          if ($css_name) {
              $this->cssName = $css_name;
          } else {
              $this->cssName = $name;
          }
      }

      /**
       * Moves the attribute into a "<cssName>:<value>;" declaration passed to
       * prependCSS(). Values made up of digits only get a "px" suffix.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          $source = $this->name;
          if (!isset($attr[$source])) {
              return $attr;
          }
          $value = $this->confiscateAttr($attr, $source);
          if (ctype_digit($value)) {
              $value = $value . 'px';
          }
          $this->prependCSS($attr, $this->cssName . ':' . $value . ';');
          return $attr;
      }
  }
  8941. /**
  8942. * Pre-transform that changes deprecated name attribute to ID if necessary
  8943. */
  class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform
  {

      /**
       * Takes the name attribute out of $attr and, when no id is present,
       * stores its value as the id. Nothing happens when
       * %HTML.Attr.Name.UseCDATA is enabled or no name attribute exists.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          // the relaxed definition of name leaves the attribute untouched
          if ($config->get('HTML.Attr.Name.UseCDATA') || !isset($attr['name'])) {
              return $attr;
          }
          $former_name = $this->confiscateAttr($attr, 'name');
          // an id that is already there wins over the name
          if (!isset($attr['id'])) {
              $attr['id'] = $former_name;
          }
          return $attr;
      }
  }
  8956. /**
  8957. * Post-transform that performs validation to the name attribute; if
  8958. * it is present with an equivalent id attribute, it is passed through;
  8959. * otherwise validation is performed.
  8960. */
  class HTMLPurifier_AttrTransform_NameSync extends HTMLPurifier_AttrTransform
  {

      /**
       * ID validator used for name values. Declared explicitly (public, to
       * match the visibility the former dynamic property had) so that the
       * constructor no longer creates a dynamic property, which PHP 8.2
       * deprecates.
       */
      public $idDef;

      public function __construct() {
          $this->idDef = new HTMLPurifier_AttrDef_HTML_ID();
      }

      /**
       * Passes name through untouched when it is identical to id; otherwise
       * runs it through the ID validator and either stores the validated
       * value or removes the attribute.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          if (!isset($attr['name'])) return $attr;
          $name = $attr['name'];
          if (isset($attr['id']) && $attr['id'] === $name) return $attr;
          $result = $this->idDef->validate($name, $config, $context);
          if ($result === false) unset($attr['name']);
          else $attr['name'] = $result;
          return $attr;
      }
  }
  8976. // must be called POST validation
  8977. /**
  8978. * Adds rel="nofollow" to all outbound links. This transform is
  8979. * only attached if Attr.Nofollow is TRUE.
  8980. */
  class HTMLPurifier_AttrTransform_Nofollow extends HTMLPurifier_AttrTransform
  {

      /** URI parser used to inspect href values. */
      private $parser;

      public function __construct() {
          $this->parser = new HTMLPurifier_URIParser();
      }

      /**
       * Adds the "nofollow" link type to rel when href has a host and a
       * browsable scheme. A rel value that already lists nofollow (compared
       * case-insensitively, as a whitespace-separated token) is left alone so
       * the keyword is never duplicated.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          if (!isset($attr['href'])) {
              return $attr;
          }
          // XXX Kind of inefficient
          // NOTE(review): assumes parse() yields an object here - confirm
          // that HTMLPurifier_URIParser cannot return a non-object.
          $url = $this->parser->parse($attr['href']);
          $scheme = $url->getSchemeObj($config, $context);
          if (!is_null($url->host) && $scheme !== false && $scheme->browsable) {
              if (isset($attr['rel'])) {
                  $rels = preg_split('/\s+/', strtolower($attr['rel']), -1, PREG_SPLIT_NO_EMPTY);
                  if (!in_array('nofollow', $rels, true)) {
                      $attr['rel'] .= ' nofollow';
                  }
              } else {
                  $attr['rel'] = 'nofollow';
              }
          }
          return $attr;
      }
  }
  /**
   * Forces a fixed set of attribute values onto the element it is applied to.
   */
  class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform
  {
      public $name = "SafeEmbed";

      /**
       * Overwrites allowscriptaccess, allownetworking and type with fixed
       * values, whatever the input contained.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          $forced = array(
              'allowscriptaccess' => 'never',
              'allownetworking'   => 'internal',
              'type'              => 'application/x-shockwave-flash',
          );
          foreach ($forced as $key => $value) {
              $attr[$key] = $value;
          }
          return $attr;
      }
  }
  9014. /**
  9015. * Writes default type for all objects. Currently only supports flash.
  9016. */
  class HTMLPurifier_AttrTransform_SafeObject extends HTMLPurifier_AttrTransform
  {
      public $name = "SafeObject";

      /**
       * Supplies the Flash MIME type when the element carries no type
       * attribute; an existing type is kept.
       * Visibility is now spelled out (it was implicitly public) to match
       * the other transforms in this file.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          if (!isset($attr['type'])) $attr['type'] = 'application/x-shockwave-flash';
          return $attr;
      }
  }
  9025. /**
  9026. * Validates name/value pairs in param tags to be used in safe objects. This
  9027. * will only allow name values it recognizes, and pre-fill certain attributes
  9028. * with required values.
  9029. *
  9030. * @note
  9031. * This class only supports Flash. In the future, Quicktime support
  9032. * may be added.
  9033. *
  9034. * @warning
  9035. * This class expects an injector to add the necessary parameters tags.
  9036. */
  class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
  {
      public $name = "SafeParam";

      /** URI validator (constructed with the "embedded" flag). */
      private $uri;

      /**
       * Enum validator for the wmode parameter. Declared explicitly (public,
       * like the dynamic property it replaces) because PHP 8.2 deprecates
       * creating properties on the fly.
       */
      public $wmode;

      public function __construct() {
          $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
          $this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent'));
      }

      /**
       * Rewrites the name/value pair according to the parameter name;
       * unrecognized names get both name and value set to null. Missing
       * name or value attributes are read as null instead of triggering an
       * undefined-index notice.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          $name  = isset($attr['name'])  ? $attr['name']  : null;
          $value = isset($attr['value']) ? $attr['value'] : null;
          // If we add support for other objects, we'll need to alter the
          // transforms.
          switch ($name) {
              // application/x-shockwave-flash
              // Keep this synchronized with Injector/SafeObject.php
              case 'allowScriptAccess':
                  $attr['value'] = 'never';
                  break;
              case 'allowNetworking':
                  $attr['value'] = 'internal';
                  break;
              case 'allowFullScreen':
                  if ($config->get('HTML.FlashAllowFullScreen')) {
                      $attr['value'] = ($value == 'true') ? 'true' : 'false';
                  } else {
                      $attr['value'] = 'false';
                  }
                  break;
              case 'wmode':
                  $attr['value'] = $this->wmode->validate($value, $config, $context);
                  break;
              case 'movie':
              case 'src':
                  $attr['name'] = "movie";
                  $attr['value'] = $this->uri->validate($value, $config, $context);
                  break;
              case 'flashvars':
                  // we're going to allow arbitrary inputs to the SWF, on
                  // the reasoning that it could only hack the SWF, not us.
                  break;
              // add other cases to support other param name/value pairs
              default:
                  $attr['name'] = $attr['value'] = null;
          }
          return $attr;
      }
  }
  9083. /**
  9084. * Implements required attribute stipulation for <script>
  9085. */
  class HTMLPurifier_AttrTransform_ScriptRequired extends HTMLPurifier_AttrTransform
  {

      /**
       * Fills in type="text/javascript" when no type attribute is present.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          if (isset($attr['type'])) {
              return $attr;
          }
          $attr['type'] = 'text/javascript';
          return $attr;
      }
  }
  9095. /**
  9096. * Sets height/width defaults for <textarea>
  9097. */
  class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform
  {

      /**
       * Fills in cols and rows when they are absent.
       * @return The adjusted attribute array.
       */
      public function transform($attr, $config, $context) {
          // Calculated from Firefox
          $defaults = array('cols' => '22', 'rows' => '3');
          foreach ($defaults as $key => $fallback) {
              if (!isset($attr[$key])) {
                  $attr[$key] = $fallback;
              }
          }
          return $attr;
      }
  }
  9107. /**
  9108. * Definition that uses different definitions depending on context.
  9109. *
  9110. * The del and ins tags are notable because they allow different types of
  9111. * elements depending on whether or not they're in a block or inline context.
  9112. * Chameleon allows this behavior to happen by using two different
  9113. * definitions depending on context. While this is somewhat generalized,
  9114. * it is specifically intended for those two tags.
  9115. */
  class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
  {

      /**
       * Definition object consulted when the context is not block.
       */
      public $inline;

      /**
       * Definition object consulted when the context is block.
       */
      public $block;

      public $type = 'chameleon';

      /**
       * Wraps both element lists in HTMLPurifier_ChildDef_Optional objects
       * and exposes the block definition's elements as this object's own.
       * @param $inline List of elements to allow when inline.
       * @param $block List of elements to allow when block.
       */
      public function __construct($inline, $block) {
          $inline_def = new HTMLPurifier_ChildDef_Optional($inline);
          $block_def  = new HTMLPurifier_ChildDef_Optional($block);
          $this->inline   = $inline_def;
          $this->block    = $block_def;
          $this->elements = $block_def->elements;
      }

      /**
       * Delegates to the block definition only when the context variable
       * IsInline is exactly false; any other value selects the inline one.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $delegate = ($context->get('IsInline') === false)
              ? $this->block
              : $this->inline;
          return $delegate->validateChildren($tokens_of_children, $config, $context);
      }
  }
  9146. /**
  9147. * Custom validation class, accepts DTD child definitions
  9148. *
  9149. * @warning Currently this class is an all or nothing proposition, that is,
  9150. * it will only give a bool return value.
  9151. */
  class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
  {
      public $type = 'custom';
      public $allow_empty = false;

      /**
       * Allowed child pattern as defined by the DTD
       */
      public $dtd_regex;

      /**
       * PCRE regex derived from $dtd_regex
       * @private
       */
      private $_pcre_regex;

      /**
       * @param $dtd_regex Allowed child pattern from the DTD
       */
      public function __construct($dtd_regex) {
          $this->dtd_regex = $dtd_regex;
          $this->_compileRegex();
      }

      /**
       * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
       * and records every element name found in it in $this->elements.
       */
      protected function _compileRegex() {
          $raw = str_replace(' ', '', $this->dtd_regex);
          // substr() replaces the former $raw{0}: curly-brace string offsets
          // are a parse error from PHP 8 on, and substr() also copes with an
          // empty pattern without raising an offset warning.
          if (substr($raw, 0, 1) != '(') {
              $raw = "($raw)";
          }
          $el = '[#a-zA-Z0-9_.-]+';
          $reg = $raw;

          // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
          // DOING! Seriously: if there's problems, please report them.

          // collect all elements into the $elements array
          preg_match_all("/$el/", $reg, $matches);
          foreach ($matches[0] as $match) {
              $this->elements[$match] = true;
          }

          // setup all elements as parentheticals with leading commas
          $reg = preg_replace("/$el/", '(,\\0)', $reg);

          // remove commas when they were not solicited
          $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);

          // remove all non-parenthetical commas: they are handled by first regex
          $reg = preg_replace("/,\(/", '(', $reg);

          $this->_pcre_regex = $reg;
      }

      /**
       * Builds a comma-separated list of the names of the direct,
       * non-whitespace children and matches it against the compiled regex.
       * @return bool True when the list matches, false otherwise.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $list_of_children = '';
          $nesting = 0; // depth into the nest
          foreach ($tokens_of_children as $token) {
              if (!empty($token->is_whitespace)) continue;

              $is_child = ($nesting == 0); // direct

              if ($token instanceof HTMLPurifier_Token_Start) {
                  $nesting++;
              } elseif ($token instanceof HTMLPurifier_Token_End) {
                  $nesting--;
              }

              if ($is_child) {
                  $list_of_children .= $token->name . ',';
              }
          }
          // add leading comma to deal with stray comma declarations
          $list_of_children = ',' . rtrim($list_of_children, ',');
          $okay =
              preg_match(
                  '/^,?'.$this->_pcre_regex.'$/',
                  $list_of_children
              );

          return (bool) $okay;
      }
  }
  9222. /**
  9223. * Definition that disallows all elements.
  9224. * @warning validateChildren() in this class is actually never called, because
  9225. * empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
  9226. * before child definitions are parsed in earnest by
  9227. * HTMLPurifier_Strategy_FixNesting.
  9228. */
  class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
  {
      public $allow_empty = true;
      public $type = 'empty';

      /** Takes no arguments and performs no setup. */
      public function __construct() {
      }

      /**
       * Ignores its input and always answers with an empty token list.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $no_children = array();
          return $no_children;
      }
  }
  9238. /**
  9239. * Definition that allows a set of elements, but disallows empty children.
  9240. */
  class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
  {
      /**
       * Lookup table of allowed elements (element name => true).
       * @public
       */
      public $elements = array();

      /**
       * Set by validateChildren() when it rejected input that consisted of
       * whitespace only.
       */
      protected $whitespace = false;

      /**
       * @param $elements Allowed element names: a '|'-separated string
       *        (spaces are ignored), a plain list, or a ready-made lookup
       *        table, which is stored unchanged.
       */
      public function __construct($elements) {
          if (is_string($elements)) {
              $elements = explode('|', str_replace(' ', '', $elements));
          }
          $keys = array_keys($elements);
          if ($keys == array_keys($keys)) {
              // sequentially indexed list: convert it to a lookup table,
              // leaving out blank names
              $lookup = array();
              foreach (array_flip($elements) as $element => $unused) {
                  if (empty($element)) {
                      continue;
                  }
                  $lookup[$element] = true;
              }
              $elements = $lookup;
          }
          $this->elements = $elements;
      }

      public $allow_empty = false;
      public $type = 'required';

      /**
       * Filters the child tokens down to the allowed ones.
       * @return false when nothing usable remains (no tokens, nothing kept,
       *         or whitespace only), true when the input needed no change,
       *         otherwise the filtered token array.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          // reset the flag read by subclasses
          $this->whitespace = false;

          // no tokens at all
          if (empty($tokens_of_children)) {
              return false;
          }

          $kept = array();            // children that survive
          $depth = 0;                 // nesting level of the current token
          $dropping = false;          // inside a disallowed child subtree?
          // when character data is allowed, rejected markup may be kept as
          // text instead of being dropped silently
          $text_ok = isset($this->elements['#PCDATA']);
          $only_whitespace = true;
          $escape = $config->get('Core.EscapeInvalidChildren');
          $generator = new HTMLPurifier_Generator($config, $context);

          foreach ($tokens_of_children as $token) {
              if (!empty($token->is_whitespace)) {
                  $kept[] = $token;
                  continue;
              }
              $only_whitespace = false;

              $at_top = ($depth == 0);
              if ($token instanceof HTMLPurifier_Token_Start) {
                  $depth++;
              } elseif ($token instanceof HTMLPurifier_Token_End) {
                  $depth--;
              }

              // a direct child decides whether its whole subtree is kept
              if ($at_top) {
                  $dropping = !isset($this->elements[$token->name]);
              }

              if (!$dropping || ($text_ok && $token instanceof HTMLPurifier_Token_Text)) {
                  $kept[] = $token;
              } elseif ($text_ok && $escape) {
                  $kept[] = new HTMLPurifier_Token_Text(
                      $generator->generateFromToken($token)
                  );
              }
              // anything else is dropped silently
          }

          if (empty($kept)) {
              return false;
          }
          if ($only_whitespace) {
              $this->whitespace = true;
              return false;
          }
          if ($tokens_of_children == $kept) {
              return true;
          }
          return $kept;
      }
  }
  9339. /**
  9340. * Definition that allows a set of elements, and allows no children.
  9341. * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
  9342. * really, one shouldn't inherit from the other. Only altered behavior
  9343. * is to overload a returned false with an array. Thus, it will never
  9344. * return false.
  9345. */
  class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
  {
      public $allow_empty = true;
      public $type = 'optional';

      /**
       * Runs the parent validation and replaces a false verdict: true for
       * empty input, the untouched input when the parent flagged it as
       * whitespace only, an empty array otherwise.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $verdict = parent::validateChildren($tokens_of_children, $config, $context);
          if ($verdict !== false) {
              return $verdict;
          }
          // relies on $tokens_of_children not having been modified
          if (empty($tokens_of_children)) {
              return true;
          }
          if ($this->whitespace) {
              return $tokens_of_children;
          }
          return array();
      }
  }
  9361. /**
  9362. * Takes the contents of blockquote when in strict and reformats for validation.
  9363. */
  class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
  {
      /** Element lookup this definition was constructed with. */
      protected $real_elements;

      /** The 'Flow' content set plus #PCDATA, used during parent validation. */
      protected $fake_elements;

      public $allow_empty = true;
      public $type = 'strictblockquote';

      /** Whether init() has already run. */
      protected $init = false;

      /**
       * @note We don't want MakeWellFormed to auto-close inline elements since
       *       they might be allowed.
       */
      public function getAllowedElements($config) {
          $this->init($config);
          return $this->fake_elements;
      }

      /**
       * Validates with the widened element set, then wraps every top-level
       * run that starts with non-whitespace text or a non-allowed element in
       * the definition's block wrapper element.
       * @return Array of tokens (empty when the parent rejected the input).
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $this->init($config);

          // let the parent validate against the wider set, then restore
          $this->elements = $this->fake_elements;
          $validated = parent::validateChildren($tokens_of_children, $config, $context);
          $this->elements = $this->real_elements;

          if ($validated === false) {
              return array();
          }
          if ($validated === true) {
              $validated = $tokens_of_children;
          }

          $definition = $config->getHTMLDefinition();
          $wrap_open  = new HTMLPurifier_Token_Start($definition->info_block_wrapper);
          $wrap_close = new HTMLPurifier_Token_End($definition->info_block_wrapper);

          $wrapping = false; // currently inside an inserted wrapper?
          $depth = 0;
          $output = array();

          // assuming that there are no comment tokens
          foreach ($validated as $token) {
              // wrappers are opened and closed at the top level only
              if (!$depth) {
                  if ($wrapping) {
                      $opens_node = $token instanceof HTMLPurifier_Token_Start
                                 || $token instanceof HTMLPurifier_Token_Empty;
                      if ($opens_node && isset($this->elements[$token->name])) {
                          // an allowed element ends the wrapped run
                          $output[] = $wrap_close;
                          $wrapping = false;
                      }
                  } else {
                      if ($token instanceof HTMLPurifier_Token_Text) {
                          $needs_wrapper = !$token->is_whitespace;
                      } else {
                          $needs_wrapper = !isset($this->elements[$token->name]);
                      }
                      if ($needs_wrapper) {
                          $wrapping = true;
                          $output[] = $wrap_open;
                      }
                  }
              }

              $output[] = $token;

              if ($token instanceof HTMLPurifier_Token_Start) {
                  $depth++;
              } elseif ($token instanceof HTMLPurifier_Token_End) {
                  $depth--;
              }
          }

          if ($wrapping) {
              $output[] = $wrap_close;
          }
          return $output;
      }

      /**
       * One-time setup: remembers the real element set and builds the
       * widened one from the HTML definition's 'Flow' content set.
       */
      private function init($config) {
          if ($this->init) {
              return;
          }
          $definition = $config->getHTMLDefinition();
          $this->real_elements = $this->elements;
          $this->fake_elements = $definition->info_content_sets['Flow'];
          $this->fake_elements['#PCDATA'] = true;
          $this->init = true;
      }
  }
  9437. /**
  9438. * Definition for tables
  9439. */
  class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
  {
      public $allow_empty = false;
      public $type = 'table';
      public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
          'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);

      public function __construct() {}

      /**
       * Regroups the children of a table into the order caption, col and
       * colgroup, thead, tfoot, then tr/tbody content. Only the first
       * caption, thead and tfoot are kept as such; extra thead/tfoot groups
       * are renamed to tbody and treated as content, extra captions are
       * dropped.
       *
       * The section switch below used "continue" inside "switch", which PHP
       * 7.3+ reports as an E_WARNING; it now uses "break", which is what
       * "continue" meant there. The always-true "$cols !== false" check and
       * the variable-variables for thead/tfoot were replaced by equivalent
       * plain code.
       *
       * @return false when there are no tokens or no tr/tbody content, true
       *         when the tokens are already in the required order, otherwise
       *         the reordered token array.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          if (empty($tokens_of_children)) return false;

          // this ensures that the loop gets run one last time before closing
          // up. It's a little bit of a hack, but it works! Just make sure you
          // get rid of the token later.
          $tokens_of_children[] = false;

          // only one of these elements is allowed in a table
          $caption = false;
          $unique  = array('thead' => false, 'tfoot' => false);

          // as many of these as you want
          $cols    = array();
          $content = array();

          $nesting = 0;           // current depth so we can determine nodes
          $is_collecting = false; // are we globbing together tokens to package
                                  // into one of the collectors?
          $collection = array();  // collected nodes
          $tag_index = 0;         // the first node might be whitespace,
                                  // so this tells us where the start tag is

          foreach ($tokens_of_children as $token) {
              $is_child = ($nesting == 0);

              if ($token === false) {
                  // terminating sequence started
              } elseif ($token instanceof HTMLPurifier_Token_Start) {
                  $nesting++;
              } elseif ($token instanceof HTMLPurifier_Token_End) {
                  $nesting--;
              }

              // handle node collection
              if ($is_collecting) {
                  if ($is_child) {
                      // okay, let's stash the tokens away
                      // first token tells us the type of the collection
                      $kind = $collection[$tag_index]->name;
                      switch ($kind) {
                          case 'tr':
                          case 'tbody':
                              $content[] = $collection;
                              break;
                          case 'caption':
                              if ($caption !== false) break;
                              $caption = $collection;
                              break;
                          case 'thead':
                          case 'tfoot':
                              if ($unique[$kind] === false) {
                                  $unique[$kind] = $collection;
                              } else {
                                  // transmutate the first and last entries into
                                  // tbody tags, and then put into content
                                  $collection[$tag_index]->name = 'tbody';
                                  $collection[count($collection)-1]->name = 'tbody';
                                  $content[] = $collection;
                              }
                              break;
                          case 'colgroup':
                              $cols[] = $collection;
                              break;
                      }
                      $collection = array();
                      $is_collecting = false;
                      $tag_index = 0;
                  } else {
                      // add the node to the collection
                      $collection[] = $token;
                  }
              }

              // terminate
              if ($token === false) break;

              if ($is_child) {
                  // determine what we're dealing with
                  if ($token->name == 'col') {
                      // the only empty tag in the possie, we can handle it
                      // immediately
                      $cols[] = array_merge($collection, array($token));
                      $collection = array();
                      $tag_index = 0;
                      continue;
                  }
                  switch ($token->name) {
                      case 'caption':
                      case 'colgroup':
                      case 'thead':
                      case 'tfoot':
                      case 'tbody':
                      case 'tr':
                          $is_collecting = true;
                          $collection[] = $token;
                          break;
                      default:
                          // leading whitespace travels with the next group
                          if (!empty($token->is_whitespace)) {
                              $collection[] = $token;
                              $tag_index++;
                          }
                          break;
                  }
              }
          }

          if (empty($content)) return false;

          $ret = array();
          if ($caption !== false) $ret = array_merge($ret, $caption);
          foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
          if ($unique['thead'] !== false) $ret = array_merge($ret, $unique['thead']);
          if ($unique['tfoot'] !== false) $ret = array_merge($ret, $unique['tfoot']);
          foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
          if (!empty($collection) && $is_collecting == false){
              // grab the trailing space
              $ret = array_merge($ret, $collection);
          }

          array_pop($tokens_of_children); // remove phantom token

          return ($ret === $tokens_of_children) ? true : $ret;
      }
  }
  /**
   * Definition cache that forwards every operation to a wrapped cache object.
   */
  class HTMLPurifier_DefinitionCache_Decorator extends HTMLPurifier_DefinitionCache
  {

      /**
       * Cache object we are decorating
       */
      public $cache;

      public function __construct() {
      }

      /**
       * Lazy decorator function
       * @param $cache Reference to cache object to decorate
       * @return A copy of this decorator bound to $cache and carrying its type
       */
      public function decorate(&$cache) {
          $wrapper = $this->copy();
          // reference is necessary for mocks in PHP 4
          $wrapper->cache =& $cache;
          $wrapper->type = $cache->type;
          return $wrapper;
      }

      /**
       * Cross-compatible clone substitute
       */
      public function copy() {
          $fresh = new HTMLPurifier_DefinitionCache_Decorator();
          return $fresh;
      }

      /** Forwards to the wrapped cache's add(). */
      public function add($def, $config) {
          $inner = $this->cache;
          return $inner->add($def, $config);
      }

      /** Forwards to the wrapped cache's set(). */
      public function set($def, $config) {
          $inner = $this->cache;
          return $inner->set($def, $config);
      }

      /** Forwards to the wrapped cache's replace(). */
      public function replace($def, $config) {
          $inner = $this->cache;
          return $inner->replace($def, $config);
      }

      /** Forwards to the wrapped cache's get(). */
      public function get($config) {
          $inner = $this->cache;
          return $inner->get($config);
      }

      /** Forwards to the wrapped cache's remove(). */
      public function remove($config) {
          $inner = $this->cache;
          return $inner->remove($config);
      }

      /** Forwards to the wrapped cache's flush(). */
      public function flush($config) {
          $inner = $this->cache;
          return $inner->flush($config);
      }

      /** Forwards to the wrapped cache's cleanup(). */
      public function cleanup($config) {
          $inner = $this->cache;
          return $inner->cleanup($config);
      }
  }
  9607. /**
  9608. * Null cache object to use when no caching is on.
  9609. */
  class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
  {

      /** Stores nothing. @return false */
      public function add($def, $config) {
          $stored = false;
          return $stored;
      }

      /** Stores nothing. @return false */
      public function set($def, $config) {
          $stored = false;
          return $stored;
      }

      /** Stores nothing. @return false */
      public function replace($def, $config) {
          $stored = false;
          return $stored;
      }

      /** Has nothing to remove. @return false */
      public function remove($config) {
          $removed = false;
          return $removed;
      }

      /** Never has a definition. @return false */
      public function get($config) {
          $found = false;
          return $found;
      }

      /** Has nothing to flush. @return false */
      public function flush($config) {
          $flushed = false;
          return $flushed;
      }

      /** Has nothing to clean up. @return false */
      public function cleanup($config) {
          $cleaned = false;
          return $cleaned;
      }
  }
  9634. class HTMLPurifier_DefinitionCache_Serializer extends
  9635. HTMLPurifier_DefinitionCache
  9636. {
  /**
   * Writes the serialized definition only when no cache file exists yet.
   * @return null when checkDefType() rejects $def, false when the file
   *         already exists or the directory cannot be prepared, otherwise
   *         the result of _write().
   */
  public function add($def, $config) {
      if (!$this->checkDefType($def)) {
          return;
      }
      $path = $this->generateFilePath($config);
      if (file_exists($path) || !$this->_prepareDir($config)) {
          return false;
      }
      $payload = serialize($def);
      return $this->_write($path, $payload, $config);
  }
  /**
   * Writes the serialized definition whether or not a cache file exists.
   * @return null when checkDefType() rejects $def, false when the directory
   *         cannot be prepared, otherwise the result of _write().
   */
  public function set($def, $config) {
      if (!$this->checkDefType($def)) {
          return;
      }
      $path = $this->generateFilePath($config);
      if (!$this->_prepareDir($config)) {
          return false;
      }
      $payload = serialize($def);
      return $this->_write($path, $payload, $config);
  }
  /**
   * Writes the serialized definition only when a cache file already exists.
   * @return null when checkDefType() rejects $def, false when the file is
   *         missing or the directory cannot be prepared, otherwise the
   *         result of _write().
   */
  public function replace($def, $config) {
      if (!$this->checkDefType($def)) {
          return;
      }
      $path = $this->generateFilePath($config);
      if (!file_exists($path) || !$this->_prepareDir($config)) {
          return false;
      }
      $payload = serialize($def);
      return $this->_write($path, $payload, $config);
  }
  /**
   * Reads the cache file for this configuration.
   * @return false when the file does not exist, otherwise whatever
   *         unserialize() produces from the file contents.
   */
  public function get($config) {
      $path = $this->generateFilePath($config);
      if (!file_exists($path)) {
          return false;
      }
      // NOTE(review): the file contents are handed to unserialize(), so the
      // cache directory must only be writable by trusted code.
      $raw = file_get_contents($path);
      return unserialize($raw);
  }
  /**
   * Deletes the cache file for this configuration.
   * @return false when the file does not exist, otherwise the result of
   *         unlink().
   */
  public function remove($config) {
      $path = $this->generateFilePath($config);
      if (file_exists($path)) {
          return unlink($path);
      }
      return false;
  }
  /**
   * Deletes every entry of this cache's directory whose name does not start
   * with a dot.
   * The directory handle is now checked after opendir() and released with
   * closedir(); previously a failed opendir() was passed straight to
   * readdir() and the handle was never closed.
   * @return false when the directory cannot be prepared or opened; nothing
   *         (null) otherwise, as before.
   */
  public function flush($config) {
      if (!$this->_prepareDir($config)) return false;
      $dir = $this->generateDirectoryPath($config);
      $dh = opendir($dir);
      if ($dh === false) return false;
      while (false !== ($filename = readdir($dh))) {
          if (empty($filename)) continue;
          if ($filename[0] === '.') continue;
          unlink($dir . '/' . $filename);
      }
      closedir($dh);
  }
  /**
   * Deletes the entries of this cache's directory that isOld() reports as
   * outdated. The key handed to isOld() is the file name without its last
   * four characters; names starting with a dot are skipped.
   * The directory handle is now checked after opendir() and released with
   * closedir(); previously a failed opendir() was passed straight to
   * readdir() and the handle was never closed.
   * @return false when the directory cannot be prepared or opened; nothing
   *         (null) otherwise, as before.
   */
  public function cleanup($config) {
      if (!$this->_prepareDir($config)) return false;
      $dir = $this->generateDirectoryPath($config);
      $dh = opendir($dir);
      if ($dh === false) return false;
      while (false !== ($filename = readdir($dh))) {
          if (empty($filename)) continue;
          if ($filename[0] === '.') continue;
          $key = substr($filename, 0, strlen($filename) - 4);
          if ($this->isOld($key, $config)) unlink($dir . '/' . $filename);
      }
      closedir($dh);
  }
  9688. /**
  9689. * Generates the file path to the serial file corresponding to
  9690. * the configuration and definition name
  9691. * @todo Make protected
  9692. */
  public function generateFilePath($config) {
      // <cache directory>/<key>.ser; the key is computed before the directory
      $name = $this->generateKey($config) . '.ser';
      $directory = $this->generateDirectoryPath($config);
      return $directory . '/' . $name;
  }
  9697. /**
  9698. * Generates the path to the directory containing this cache's serial files
  9699. * @note No trailing slash
  9700. * @todo Make protected
  9701. */
  public function generateDirectoryPath($config) {
      // one subdirectory per definition type below the base directory
      $parts = array($this->generateBaseDirectoryPath($config), $this->type);
      return $parts[0] . '/' . $parts[1];
  }
  9706. /**
  9707. * Generates path to base directory that contains all definition type
  9708. * serials
  9709. * @todo Make protected
  9710. */
  public function generateBaseDirectoryPath($config) {
      $configured = $config->get('Cache.SerializerPath');
      if ($configured === null) {
          // nothing configured: fall back to the directory below the
          // library prefix
          return HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer';
      }
      return $configured;
  }
  9716. /**
  9717. * Convenience wrapper function for file_put_contents
  9718. * @param $file File name to write to
  9719. * @param $data Data to write into file
  9720. * @param $config Config object
  9721. * @return Number of bytes written if success, or false if failure.
  9722. */
  private function _write($file, $data, $config) {
      $written = file_put_contents($file, $data);
      if ($written === false) {
          return $written;
      }
      // set permissions of the new file; a falsy setting (invalid config
      // or simpletest) falls back to 0644
      $mode = $config->get('Cache.SerializerPermissions');
      if (!$mode) {
          $mode = 0644;
      }
      // masking with 0666 clears the execute bits
      $mode = $mode & 0666;
      chmod($file, $mode);
      return $written;
  }
  9736. /**
  9737. * Prepares the directory that this type stores the serials in
  9738. * @param $config Config object
  9739. * @return True if successful
  9740. */
  private function _prepareDir($config) {
      $directory = $this->generateDirectoryPath($config);
      $chmod = $config->get('Cache.SerializerPermissions');
      if (!$chmod) {
          $chmod = 0755; // invalid config or simpletest
      }
      if (!is_dir($directory)) {
          $base = $this->generateBaseDirectoryPath($config);
          if (!is_dir($base)) {
              // message kept on one logical line: the literal used to span
              // two source lines and so embedded a newline plus indentation
              trigger_error('Base directory '.$base.' does not exist, ' .
                  'please create or change using %Cache.SerializerPath',
                  E_USER_WARNING);
              return false;
          } elseif (!$this->_testPermissions($base, $chmod)) {
              return false;
          }
          // clear the umask so the directory gets exactly $chmod
          $old = umask(0000);
          $created = mkdir($directory, $chmod);
          umask($old);
          // a failed mkdir() used to go unnoticed and the method reported
          // success; the is_dir() re-check tolerates the directory having
          // been created by someone else in the meantime
          if (!$created && !is_dir($directory)) {
              trigger_error('Could not create directory ' . $directory,
                  E_USER_WARNING);
              return false;
          }
      } elseif (!$this->_testPermissions($directory, $chmod)) {
          return false;
      }
      return true;
  }
  9765. /**
  9766. * Tests permissions on a directory and throws out friendly
  9767. * error messages and attempts to chmod it itself if possible
  9768. * @param $dir Directory path
  9769. * @param $chmod Permissions
  9770. * @return True if directory writable
  9771. */
  9772. private function _testPermissions($dir, $chmod) {
  9773. // early abort, if it is writable, everything is hunky-dory
  9774. if (is_writable($dir)) return true;
  9775. if (!is_dir($dir)) {
  9776. // generally, you'll want to handle this beforehand
  9777. // so a more specific error message can be given
  9778. trigger_error('Directory '.$dir.' does not exist',
  9779. E_USER_WARNING);
  9780. return false;
  9781. }
  9782. if (function_exists('posix_getuid')) {
  9783. // POSIX system, we can give more specific advice
  9784. if (fileowner($dir) === posix_getuid()) {
  9785. // we can chmod it ourselves
  9786. $chmod = $chmod | 0700;
  9787. if (chmod($dir, $chmod)) return true;
  9788. } elseif (filegroup($dir) === posix_getgid()) {
  9789. $chmod = $chmod | 0070;
  9790. } else {
  9791. // PHP's probably running as nobody, so we'll
  9792. // need to give global permissions
  9793. $chmod = $chmod | 0777;
  9794. }
  9795. trigger_error('Directory '.$dir.' not writable, '.
  9796. 'please chmod to ' . decoct($chmod),
  9797. E_USER_WARNING);
  9798. } else {
  9799. // generic error message
  9800. trigger_error('Directory '.$dir.' not writable, '.
  9801. 'please alter file permissions',
  9802. E_USER_WARNING);
  9803. }
  9804. return false;
  9805. }
  9806. }
  9807. /**
  9808. * Definition cache decorator class that cleans up the cache
  9809. * whenever there is a cache miss.
  9810. */
  9811. class HTMLPurifier_DefinitionCache_Decorator_Cleanup extends
  9812. HTMLPurifier_DefinitionCache_Decorator
  9813. {
  9814. public $name = 'Cleanup';
  9815. public function copy() {
  9816. return new HTMLPurifier_DefinitionCache_Decorator_Cleanup();
  9817. }
  9818. public function add($def, $config) {
  9819. $status = parent::add($def, $config);
  9820. if (!$status) parent::cleanup($config);
  9821. return $status;
  9822. }
  9823. public function set($def, $config) {
  9824. $status = parent::set($def, $config);
  9825. if (!$status) parent::cleanup($config);
  9826. return $status;
  9827. }
  9828. public function replace($def, $config) {
  9829. $status = parent::replace($def, $config);
  9830. if (!$status) parent::cleanup($config);
  9831. return $status;
  9832. }
  9833. public function get($config) {
  9834. $ret = parent::get($config);
  9835. if (!$ret) parent::cleanup($config);
  9836. return $ret;
  9837. }
  9838. }
  9839. /**
  9840. * Definition cache decorator class that saves all cache retrievals
  9841. * to PHP's memory; good for unit tests or circumstances where
  9842. * there are lots of configuration objects floating around.
  9843. */
  9844. class HTMLPurifier_DefinitionCache_Decorator_Memory extends
  9845. HTMLPurifier_DefinitionCache_Decorator
  9846. {
  9847. protected $definitions;
  9848. public $name = 'Memory';
  9849. public function copy() {
  9850. return new HTMLPurifier_DefinitionCache_Decorator_Memory();
  9851. }
  9852. public function add($def, $config) {
  9853. $status = parent::add($def, $config);
  9854. if ($status) $this->definitions[$this->generateKey($config)] = $def;
  9855. return $status;
  9856. }
  9857. public function set($def, $config) {
  9858. $status = parent::set($def, $config);
  9859. if ($status) $this->definitions[$this->generateKey($config)] = $def;
  9860. return $status;
  9861. }
  9862. public function replace($def, $config) {
  9863. $status = parent::replace($def, $config);
  9864. if ($status) $this->definitions[$this->generateKey($config)] = $def;
  9865. return $status;
  9866. }
  9867. public function get($config) {
  9868. $key = $this->generateKey($config);
  9869. if (isset($this->definitions[$key])) return $this->definitions[$key];
  9870. $this->definitions[$key] = parent::get($config);
  9871. return $this->definitions[$key];
  9872. }
  9873. }
  9874. /**
  9875. * XHTML 1.1 Bi-directional Text Module, defines elements that
  9876. * declare directionality of content. Text Extension Module.
  9877. */
  9878. class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
  9879. {
  9880. public $name = 'Bdo';
  9881. public $attr_collections = array(
  9882. 'I18N' => array('dir' => false)
  9883. );
  9884. public function setup($config) {
  9885. $bdo = $this->addElement(
  9886. 'bdo', 'Inline', 'Inline', array('Core', 'Lang'),
  9887. array(
  9888. 'dir' => 'Enum#ltr,rtl', // required
  9889. // The Abstract Module specification has the attribute
  9890. // inclusions wrong for bdo: bdo allows Lang
  9891. )
  9892. );
  9893. $bdo->attr_transform_post['required-dir'] = new HTMLPurifier_AttrTransform_BdoDir();
  9894. $this->attr_collections['I18N']['dir'] = 'Enum#ltr,rtl';
  9895. }
  9896. }
/**
 * Module that declares the 'Core', 'Lang', 'I18N' and 'Common'
 * attribute collections. It only carries data; no elements are
 * defined here.
 */
class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
{
    public $name = 'CommonAttributes';

    // NOTE(review): the entry keyed 0 in each collection appears to name
    // other collections to pull in - confirm against the attribute
    // collection handling code, which is not visible here.
    public $attr_collections = array(
        'Core' => array(
            0 => array('Style'),
            // 'xml:space' => false,
            'class' => 'Class',
            'id' => 'ID',
            'title' => 'CDATA',
        ),
        'Lang' => array(),
        'I18N' => array(
            0 => array('Lang'), // proprietary, for xml:lang/lang
        ),
        'Common' => array(
            0 => array('Core', 'I18N')
        )
    );
}
  9917. /**
  9918. * XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
  9919. * Module.
  9920. */
  9921. class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
  9922. {
  9923. public $name = 'Edit';
  9924. public function setup($config) {
  9925. $contents = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
  9926. $attr = array(
  9927. 'cite' => 'URI',
  9928. // 'datetime' => 'Datetime', // not implemented
  9929. );
  9930. $this->addElement('del', 'Inline', $contents, 'Common', $attr);
  9931. $this->addElement('ins', 'Inline', $contents, 'Common', $attr);
  9932. }
  9933. // HTML 4.01 specifies that ins/del must not contain block
  9934. // elements when used in an inline context, chameleon is
  9935. // a complicated workaround to acheive this effect
  9936. // Inline context ! Block context (exclamation mark is
  9937. // separator, see getChildDef for parsing)
  9938. public $defines_child_def = true;
  9939. public function getChildDef($def) {
  9940. if ($def->content_model_type != 'chameleon') return false;
  9941. $value = explode('!', $def->content_model);
  9942. return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]);
  9943. }
  9944. }
  9945. /**
  9946. * XHTML 1.1 Forms module, defines all form-related elements found in HTML 4.
  9947. */
  9948. class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule
  9949. {
  9950. public $name = 'Forms';
  9951. public $safe = false;
  9952. public $content_sets = array(
  9953. 'Block' => 'Form',
  9954. 'Inline' => 'Formctrl',
  9955. );
  9956. public function setup($config) {
  9957. $form = $this->addElement('form', 'Form',
  9958. 'Required: Heading | List | Block | fieldset', 'Common', array(
  9959. 'accept' => 'ContentTypes',
  9960. 'accept-charset' => 'Charsets',
  9961. 'action*' => 'URI',
  9962. 'method' => 'Enum#get,post',
  9963. // really ContentType, but these two are the only ones used today
  9964. 'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data',
  9965. ));
  9966. $form->excludes = array('form' => true);
  9967. $input = $this->addElement('input', 'Formctrl', 'Empty', 'Common', array(
  9968. 'accept' => 'ContentTypes',
  9969. 'accesskey' => 'Character',
  9970. 'alt' => 'Text',
  9971. 'checked' => 'Bool#checked',
  9972. 'disabled' => 'Bool#disabled',
  9973. 'maxlength' => 'Number',
  9974. 'name' => 'CDATA',
  9975. 'readonly' => 'Bool#readonly',
  9976. 'size' => 'Number',
  9977. 'src' => 'URI#embeds',
  9978. 'tabindex' => 'Number',
  9979. 'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
  9980. 'value' => 'CDATA',
  9981. ));
  9982. $input->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input();
  9983. $this->addElement('select', 'Formctrl', 'Required: optgroup | option', 'Common', array(
  9984. 'disabled' => 'Bool#disabled',
  9985. 'multiple' => 'Bool#multiple',
  9986. 'name' => 'CDATA',
  9987. 'size' => 'Number',
  9988. 'tabindex' => 'Number',
  9989. ));
  9990. $this->addElement('option', false, 'Optional: #PCDATA', 'Common', array(
  9991. 'disabled' => 'Bool#disabled',
  9992. 'label' => 'Text',
  9993. 'selected' => 'Bool#selected',
  9994. 'value' => 'CDATA',
  9995. ));
  9996. // It's illegal for there to be more than one selected, but not
  9997. // be multiple. Also, no selected means undefined behavior. This might
  9998. // be difficult to implement; perhaps an injector, or a context variable.
  9999. $textarea = $this->addElement('textarea', 'Formctrl', 'Optional: #PCDATA', 'Common', array(
  10000. 'accesskey' => 'Character',
  10001. 'cols*' => 'Number',
  10002. 'disabled' => 'Bool#disabled',
  10003. 'name' => 'CDATA',
  10004. 'readonly' => 'Bool#readonly',
  10005. 'rows*' => 'Number',
  10006. 'tabindex' => 'Number',
  10007. ));
  10008. $textarea->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea();
  10009. $button = $this->addElement('button', 'Formctrl', 'Optional: #PCDATA | Heading | List | Block | Inline', 'Common', array(
  10010. 'accesskey' => 'Character',
  10011. 'disabled' => 'Bool#disabled',
  10012. 'name' => 'CDATA',
  10013. 'tabindex' => 'Number',
  10014. 'type' => 'Enum#button,submit,reset',
  10015. 'value' => 'CDATA',
  10016. ));
  10017. // For exclusions, ideally we'd specify content sets, not literal elements
  10018. $button->excludes = $this->makeLookup(
  10019. 'form', 'fieldset', // Form
  10020. 'input', 'select', 'textarea', 'label', 'button', // Formctrl
  10021. 'a' // as per HTML 4.01 spec, this is omitted by modularization
  10022. );
  10023. // Extra exclusion: img usemap="" is not permitted within this element.
  10024. // We'll omit this for now, since we don't have any good way of
  10025. // indicating it yet.
  10026. // This is HIGHLY user-unfriendly; we need a custom child-def for this
  10027. $this->addElement('fieldset', 'Form', 'Custom: (#WS?,legend,(Flow|#PCDATA)*)', 'Common');
  10028. $label = $this->addElement('label', 'Formctrl', 'Optional: #PCDATA | Inline', 'Common', array(
  10029. 'accesskey' => 'Character',
  10030. // 'for' => 'IDREF', // IDREF not implemented, cannot allow
  10031. ));
  10032. $label->excludes = array('label' => true);
  10033. $this->addElement('legend', false, 'Optional: #PCDATA | Inline', 'Common', array(
  10034. 'accesskey' => 'Character',
  10035. ));
  10036. $this->addElement('optgroup', false, 'Required: option', 'Common', array(
  10037. 'disabled' => 'Bool#disabled',
  10038. 'label*' => 'Text',
  10039. ));
  10040. // Don't forget an injector for <isindex>. This one's a little complex
  10041. // because it maps to multiple elements.
  10042. }
  10043. }
  10044. /**
  10045. * XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
  10046. */
  10047. class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
  10048. {
  10049. public $name = 'Hypertext';
  10050. public function setup($config) {
  10051. $a = $this->addElement(
  10052. 'a', 'Inline', 'Inline', 'Common',
  10053. array(
  10054. // 'accesskey' => 'Character',
  10055. // 'charset' => 'Charset',
  10056. 'href' => 'URI',
  10057. // 'hreflang' => 'LanguageCode',
  10058. 'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
  10059. 'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
  10060. // 'tabindex' => 'Number',
  10061. // 'type' => 'ContentType',
  10062. )
  10063. );
  10064. $a->formatting = true;
  10065. $a->excludes = array('a' => true);
  10066. }
  10067. }
  10068. /**
  10069. * XHTML 1.1 Image Module provides basic image embedding.
  10070. * @note There is specialized code for removing empty images in
  10071. * HTMLPurifier_Strategy_RemoveForeignElements
  10072. */
  10073. class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
  10074. {
  10075. public $name = 'Image';
  10076. public function setup($config) {
  10077. $max = $config->get('HTML.MaxImgLength');
  10078. $img = $this->addElement(
  10079. 'img', 'Inline', 'Empty', 'Common',
  10080. array(
  10081. 'alt*' => 'Text',
  10082. // According to the spec, it's Length, but percents can
  10083. // be abused, so we allow only Pixels.
  10084. 'height' => 'Pixels#' . $max,
  10085. 'width' => 'Pixels#' . $max,
  10086. 'longdesc' => 'URI',
  10087. 'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
  10088. )
  10089. );
  10090. if ($max === null || $config->get('HTML.Trusted')) {
  10091. $img->attr['height'] =
  10092. $img->attr['width'] = 'Length';
  10093. }
  10094. // kind of strange, but splitting things up would be inefficient
  10095. $img->attr_transform_pre[] =
  10096. $img->attr_transform_post[] =
  10097. new HTMLPurifier_AttrTransform_ImgRequired();
  10098. }
  10099. }
  10100. /**
  10101. * XHTML 1.1 Legacy module defines elements that were previously
  10102. * deprecated.
  10103. *
  10104. * @note Not all legacy elements have been implemented yet, which
  10105. * is a bit of a reverse problem as compared to browsers! In
  10106. * addition, this legacy module may implement a bit more than
  10107. * mandated by XHTML 1.1.
  10108. *
  10109. * This module can be used in combination with TransformToStrict in order
  10110. * to transform as many deprecated elements as possible, but retain
  10111. * questionably deprecated elements that do not have good alternatives
  10112. * as well as transform elements that don't have an implementation.
  10113. * See docs/ref-strictness.txt for more details.
  10114. */
  10115. class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
  10116. {
  10117. public $name = 'Legacy';
  10118. public function setup($config) {
  10119. $this->addElement('basefont', 'Inline', 'Empty', false, array(
  10120. 'color' => 'Color',
  10121. 'face' => 'Text', // extremely broad, we should
  10122. 'size' => 'Text', // tighten it
  10123. 'id' => 'ID'
  10124. ));
  10125. $this->addElement('center', 'Block', 'Flow', 'Common');
  10126. $this->addElement('dir', 'Block', 'Required: li', 'Common', array(
  10127. 'compact' => 'Bool#compact'
  10128. ));
  10129. $this->addElement('font', 'Inline', 'Inline', array('Core', 'I18N'), array(
  10130. 'color' => 'Color',
  10131. 'face' => 'Text', // extremely broad, we should
  10132. 'size' => 'Text', // tighten it
  10133. ));
  10134. $this->addElement('menu', 'Block', 'Required: li', 'Common', array(
  10135. 'compact' => 'Bool#compact'
  10136. ));
  10137. $s = $this->addElement('s', 'Inline', 'Inline', 'Common');
  10138. $s->formatting = true;
  10139. $strike = $this->addElement('strike', 'Inline', 'Inline', 'Common');
  10140. $strike->formatting = true;
  10141. $u = $this->addElement('u', 'Inline', 'Inline', 'Common');
  10142. $u->formatting = true;
  10143. // setup modifications to old elements
  10144. $align = 'Enum#left,right,center,justify';
  10145. $address = $this->addBlankElement('address');
  10146. $address->content_model = 'Inline | #PCDATA | p';
  10147. $address->content_model_type = 'optional';
  10148. $address->child = false;
  10149. $blockquote = $this->addBlankElement('blockquote');
  10150. $blockquote->content_model = 'Flow | #PCDATA';
  10151. $blockquote->content_model_type = 'optional';
  10152. $blockquote->child = false;
  10153. $br = $this->addBlankElement('br');
  10154. $br->attr['clear'] = 'Enum#left,all,right,none';
  10155. $caption = $this->addBlankElement('caption');
  10156. $caption->attr['align'] = 'Enum#top,bottom,left,right';
  10157. $div = $this->addBlankElement('div');
  10158. $div->attr['align'] = $align;
  10159. $dl = $this->addBlankElement('dl');
  10160. $dl->attr['compact'] = 'Bool#compact';
  10161. for ($i = 1; $i <= 6; $i++) {
  10162. $h = $this->addBlankElement("h$i");
  10163. $h->attr['align'] = $align;
  10164. }
  10165. $hr = $this->addBlankElement('hr');
  10166. $hr->attr['align'] = $align;
  10167. $hr->attr['noshade'] = 'Bool#noshade';
  10168. $hr->attr['size'] = 'Pixels';
  10169. $hr->attr['width'] = 'Length';
  10170. $img = $this->addBlankElement('img');
  10171. $img->attr['align'] = 'Enum#top,middle,bottom,left,right';
  10172. $img->attr['border'] = 'Pixels';
  10173. $img->attr['hspace'] = 'Pixels';
  10174. $img->attr['vspace'] = 'Pixels';
  10175. // figure out this integer business
  10176. $li = $this->addBlankElement('li');
  10177. $li->attr['value'] = new HTMLPurifier_AttrDef_Integer();
  10178. $li->attr['type'] = 'Enum#s:1,i,I,a,A,disc,square,circle';
  10179. $ol = $this->addBlankElement('ol');
  10180. $ol->attr['compact'] = 'Bool#compact';
  10181. $ol->attr['start'] = new HTMLPurifier_AttrDef_Integer();
  10182. $ol->attr['type'] = 'Enum#s:1,i,I,a,A';
  10183. $p = $this->addBlankElement('p');
  10184. $p->attr['align'] = $align;
  10185. $pre = $this->addBlankElement('pre');
  10186. $pre->attr['width'] = 'Number';
  10187. // script omitted
  10188. $table = $this->addBlankElement('table');
  10189. $table->attr['align'] = 'Enum#left,center,right';
  10190. $table->attr['bgcolor'] = 'Color';
  10191. $tr = $this->addBlankElement('tr');
  10192. $tr->attr['bgcolor'] = 'Color';
  10193. $th = $this->addBlankElement('th');
  10194. $th->attr['bgcolor'] = 'Color';
  10195. $th->attr['height'] = 'Length';
  10196. $th->attr['nowrap'] = 'Bool#nowrap';
  10197. $th->attr['width'] = 'Length';
  10198. $td = $this->addBlankElement('td');
  10199. $td->attr['bgcolor'] = 'Color';
  10200. $td->attr['height'] = 'Length';
  10201. $td->attr['nowrap'] = 'Bool#nowrap';
  10202. $td->attr['width'] = 'Length';
  10203. $ul = $this->addBlankElement('ul');
  10204. $ul->attr['compact'] = 'Bool#compact';
  10205. $ul->attr['type'] = 'Enum#square,disc,circle';
  10206. }
  10207. }
  10208. /**
  10209. * XHTML 1.1 List Module, defines list-oriented elements. Core Module.
  10210. */
  10211. class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
  10212. {
  10213. public $name = 'List';
  10214. // According to the abstract schema, the List content set is a fully formed
  10215. // one or more expr, but it invariably occurs in an optional declaration
  10216. // so we're not going to do that subtlety. It might cause trouble
  10217. // if a user defines "List" and expects that multiple lists are
  10218. // allowed to be specified, but then again, that's not very intuitive.
  10219. // Furthermore, the actual XML Schema may disagree. Regardless,
  10220. // we don't have support for such nested expressions without using
  10221. // the incredibly inefficient and draconic Custom ChildDef.
  10222. public $content_sets = array('Flow' => 'List');
  10223. public function setup($config) {
  10224. $ol = $this->addElement('ol', 'List', 'Required: li', 'Common');
  10225. $ol->wrap = "li";
  10226. $ul = $this->addElement('ul', 'List', 'Required: li', 'Common');
  10227. $ul->wrap = "li";
  10228. $this->addElement('dl', 'List', 'Required: dt | dd', 'Common');
  10229. $this->addElement('li', false, 'Flow', 'Common');
  10230. $this->addElement('dd', false, 'Flow', 'Common');
  10231. $this->addElement('dt', false, 'Inline', 'Common');
  10232. }
  10233. }
  10234. class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule
  10235. {
  10236. public $name = 'Name';
  10237. public function setup($config) {
  10238. $elements = array('a', 'applet', 'form', 'frame', 'iframe', 'img', 'map');
  10239. foreach ($elements as $name) {
  10240. $element = $this->addBlankElement($name);
  10241. $element->attr['name'] = 'CDATA';
  10242. if (!$config->get('HTML.Attr.Name.UseCDATA')) {
  10243. $element->attr_transform_post['NameSync'] = new HTMLPurifier_AttrTransform_NameSync();
  10244. }
  10245. }
  10246. }
  10247. }
  10248. /**
  10249. * Module adds the nofollow attribute transformation to a tags. It
  10250. * is enabled by HTML.Nofollow
  10251. */
  10252. class HTMLPurifier_HTMLModule_Nofollow extends HTMLPurifier_HTMLModule
  10253. {
  10254. public $name = 'Nofollow';
  10255. public function setup($config) {
  10256. $a = $this->addBlankElement('a');
  10257. $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_Nofollow();
  10258. }
  10259. }
/**
 * Module that adds the 'lang' attribute (a LanguageCode) to the
 * 'Lang' attribute collection. It only carries data; no elements are
 * defined here.
 */
class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
{
    public $name = 'NonXMLCommonAttributes';

    public $attr_collections = array(
        'Lang' => array(
            'lang' => 'LanguageCode',
        )
    );
}
  10269. /**
  10270. * XHTML 1.1 Object Module, defines elements for generic object inclusion
  10271. * @warning Users will commonly use <embed> to cater to legacy browsers: this
  10272. * module does not allow this sort of behavior
  10273. */
  10274. class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
  10275. {
  10276. public $name = 'Object';
  10277. public $safe = false;
  10278. public function setup($config) {
  10279. $this->addElement('object', 'Inline', 'Optional: #PCDATA | Flow | param', 'Common',
  10280. array(
  10281. 'archive' => 'URI',
  10282. 'classid' => 'URI',
  10283. 'codebase' => 'URI',
  10284. 'codetype' => 'Text',
  10285. 'data' => 'URI',
  10286. 'declare' => 'Bool#declare',
  10287. 'height' => 'Length',
  10288. 'name' => 'CDATA',
  10289. 'standby' => 'Text',
  10290. 'tabindex' => 'Number',
  10291. 'type' => 'ContentType',
  10292. 'width' => 'Length'
  10293. )
  10294. );
  10295. $this->addElement('param', false, 'Empty', false,
  10296. array(
  10297. 'id' => 'ID',
  10298. 'name*' => 'Text',
  10299. 'type' => 'Text',
  10300. 'value' => 'Text',
  10301. 'valuetype' => 'Enum#data,ref,object'
  10302. )
  10303. );
  10304. }
  10305. }
  10306. /**
  10307. * XHTML 1.1 Presentation Module, defines simple presentation-related
  10308. * markup. Text Extension Module.
  10309. * @note The official XML Schema and DTD specs further divide this into
  10310. * two modules:
  10311. * - Block Presentation (hr)
  10312. * - Inline Presentation (b, big, i, small, sub, sup, tt)
  10313. * We have chosen not to heed this distinction, as content_sets
  10314. * provides satisfactory disambiguation.
  10315. */
  10316. class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
  10317. {
  10318. public $name = 'Presentation';
  10319. public function setup($config) {
  10320. $this->addElement('hr', 'Block', 'Empty', 'Common');
  10321. $this->addElement('sub', 'Inline', 'Inline', 'Common');
  10322. $this->addElement('sup', 'Inline', 'Inline', 'Common');
  10323. $b = $this->addElement('b', 'Inline', 'Inline', 'Common');
  10324. $b->formatting = true;
  10325. $big = $this->addElement('big', 'Inline', 'Inline', 'Common');
  10326. $big->formatting = true;
  10327. $i = $this->addElement('i', 'Inline', 'Inline', 'Common');
  10328. $i->formatting = true;
  10329. $small = $this->addElement('small', 'Inline', 'Inline', 'Common');
  10330. $small->formatting = true;
  10331. $tt = $this->addElement('tt', 'Inline', 'Inline', 'Common');
  10332. $tt->formatting = true;
  10333. }
  10334. }
  10335. /**
  10336. * Module defines proprietary tags and attributes in HTML.
  10337. * @warning If this module is enabled, standards-compliance is off!
  10338. */
  10339. class HTMLPurifier_HTMLModule_Proprietary extends HTMLPurifier_HTMLModule
  10340. {
  10341. public $name = 'Proprietary';
  10342. public function setup($config) {
  10343. $this->addElement('marquee', 'Inline', 'Flow', 'Common',
  10344. array(
  10345. 'direction' => 'Enum#left,right,up,down',
  10346. 'behavior' => 'Enum#alternate',
  10347. 'width' => 'Length',
  10348. 'height' => 'Length',
  10349. 'scrolldelay' => 'Number',
  10350. 'scrollamount' => 'Number',
  10351. 'loop' => 'Number',
  10352. 'bgcolor' => 'Color',
  10353. 'hspace' => 'Pixels',
  10354. 'vspace' => 'Pixels',
  10355. )
  10356. );
  10357. }
  10358. }
  10359. /**
  10360. * XHTML 1.1 Ruby Annotation Module, defines elements that indicate
  10361. * short runs of text alongside base text for annotation or pronounciation.
  10362. */
  10363. class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
  10364. {
  10365. public $name = 'Ruby';
  10366. public function setup($config) {
  10367. $this->addElement('ruby', 'Inline',
  10368. 'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))',
  10369. 'Common');
  10370. $this->addElement('rbc', false, 'Required: rb', 'Common');
  10371. $this->addElement('rtc', false, 'Required: rt', 'Common');
  10372. $rb = $this->addElement('rb', false, 'Inline', 'Common');
  10373. $rb->excludes = array('ruby' => true);
  10374. $rt = $this->addElement('rt', false, 'Inline', 'Common', array('rbspan' => 'Number'));
  10375. $rt->excludes = array('ruby' => true);
  10376. $this->addElement('rp', false, 'Optional: #PCDATA', 'Common');
  10377. }
  10378. }
  10379. /**
  10380. * A "safe" embed module. See SafeObject. This is a proprietary element.
  10381. */
  10382. class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
  10383. {
  10384. public $name = 'SafeEmbed';
  10385. public function setup($config) {
  10386. $max = $config->get('HTML.MaxImgLength');
  10387. $embed = $this->addElement(
  10388. 'embed', 'Inline', 'Empty', 'Common',
  10389. array(
  10390. 'src*' => 'URI#embedded',
  10391. 'type' => 'Enum#application/x-shockwave-flash',
  10392. 'width' => 'Pixels#' . $max,
  10393. 'height' => 'Pixels#' . $max,
  10394. 'allowscriptaccess' => 'Enum#never',
  10395. 'allownetworking' => 'Enum#internal',
  10396. 'flashvars' => 'Text',
  10397. 'wmode' => 'Enum#window,transparent,opaque',
  10398. 'name' => 'ID',
  10399. )
  10400. );
  10401. $embed->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeEmbed();
  10402. }
  10403. }
  10404. /**
  10405. * A "safe" object module. In theory, objects permitted by this module will
  10406. * be safe, and untrusted users can be allowed to embed arbitrary flash objects
  10407. * (maybe other types too, but only Flash is supported as of right now).
  10408. * Highly experimental.
  10409. */
  10410. class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
  10411. {
  10412. public $name = 'SafeObject';
  10413. public function setup($config) {
  10414. // These definitions are not intrinsically safe: the attribute transforms
  10415. // are a vital part of ensuring safety.
  10416. $max = $config->get('HTML.MaxImgLength');
  10417. $object = $this->addElement(
  10418. 'object',
  10419. 'Inline',
  10420. 'Optional: param | Flow | #PCDATA',
  10421. 'Common',
  10422. array(
  10423. // While technically not required by the spec, we're forcing
  10424. // it to this value.
  10425. 'type' => 'Enum#application/x-shockwave-flash',
  10426. 'width' => 'Pixels#' . $max,
  10427. 'height' => 'Pixels#' . $max,
  10428. 'data' => 'URI#embedded',
  10429. 'codebase' => new HTMLPurifier_AttrDef_Enum(array(
  10430. 'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0')),
  10431. )
  10432. );
  10433. $object->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();
  10434. $param = $this->addElement('param', false, 'Empty', false,
  10435. array(
  10436. 'id' => 'ID',
  10437. 'name*' => 'Text',
  10438. 'value' => 'Text'
  10439. )
  10440. );
  10441. $param->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeParam();
  10442. $this->info_injector[] = 'SafeObject';
  10443. }
  10444. }
  10445. /*
  10446. WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
  10447. INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
  10448. */
  10449. /**
  10450. * XHTML 1.1 Scripting module, defines elements that are used to contain
  10451. * information pertaining to executable scripts or the lack of support
  10452. * for executable scripts.
  10453. * @note This module does not contain inline scripting elements
  10454. */
  10455. class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
  10456. {
  10457. public $name = 'Scripting';
  10458. public $elements = array('script', 'noscript');
  10459. public $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');
  10460. public $safe = false;
  10461. public function setup($config) {
  10462. // TODO: create custom child-definition for noscript that
  10463. // auto-wraps stray #PCDATA in a similar manner to
  10464. // blockquote's custom definition (we would use it but
  10465. // blockquote's contents are optional while noscript's contents
  10466. // are required)
  10467. // TODO: convert this to new syntax, main problem is getting
  10468. // both content sets working
  10469. // In theory, this could be safe, but I don't see any reason to
  10470. // allow it.
  10471. $this->info['noscript'] = new HTMLPurifier_ElementDef();
  10472. $this->info['noscript']->attr = array( 0 => array('Common') );
  10473. $this->info['noscript']->content_model = 'Heading | List | Block';
  10474. $this->info['noscript']->content_model_type = 'required';
  10475. $this->info['script'] = new HTMLPurifier_ElementDef();
  10476. $this->info['script']->attr = array(
  10477. 'defer' => new HTMLPurifier_AttrDef_Enum(array('defer')),
  10478. 'src' => new HTMLPurifier_AttrDef_URI(true),
  10479. 'type' => new HTMLPurifier_AttrDef_Enum(array('text/javascript'))
  10480. );
  10481. $this->info['script']->content_model = '#PCDATA';
  10482. $this->info['script']->content_model_type = 'optional';
  10483. $this->info['script']->attr_transform_pre['type'] =
  10484. $this->info['script']->attr_transform_post['type'] =
  10485. new HTMLPurifier_AttrTransform_ScriptRequired();
  10486. }
  10487. }
  10488. /**
  10489. * XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
  10490. * Module.
  10491. */
  10492. class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
  10493. {
  10494. public $name = 'StyleAttribute';
  10495. public $attr_collections = array(
  10496. // The inclusion routine differs from the Abstract Modules but
  10497. // is in line with the DTD and XML Schemas.
  10498. 'Style' => array('style' => false), // see constructor
  10499. 'Core' => array(0 => array('Style'))
  10500. );
  10501. public function setup($config) {
  10502. $this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS();
  10503. }
  10504. }
  10505. /**
  10506. * XHTML 1.1 Tables Module, fully defines accessible table elements.
  10507. */
  10508. class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
  10509. {
  10510. public $name = 'Tables';
  10511. public function setup($config) {
  10512. $this->addElement('caption', false, 'Inline', 'Common');
  10513. $this->addElement('table', 'Block',
  10514. new HTMLPurifier_ChildDef_Table(), 'Common',
  10515. array(
  10516. 'border' => 'Pixels',
  10517. 'cellpadding' => 'Length',
  10518. 'cellspacing' => 'Length',
  10519. 'frame' => 'Enum#void,above,below,hsides,lhs,rhs,vsides,box,border',
  10520. 'rules' => 'Enum#none,groups,rows,cols,all',
  10521. 'summary' => 'Text',
  10522. 'width' => 'Length'
  10523. )
  10524. );
  10525. // common attributes
  10526. $cell_align = array(
  10527. 'align' => 'Enum#left,center,right,justify,char',
  10528. 'charoff' => 'Length',
  10529. 'valign' => 'Enum#top,middle,bottom,baseline',
  10530. );
  10531. $cell_t = array_merge(
  10532. array(
  10533. 'abbr' => 'Text',
  10534. 'colspan' => 'Number',
  10535. 'rowspan' => 'Number',
  10536. ),
  10537. $cell_align
  10538. );
  10539. $this->addElement('td', false, 'Flow', 'Common', $cell_t);
  10540. $this->addElement('th', false, 'Flow', 'Common', $cell_t);
  10541. $this->addElement('tr', false, 'Required: td | th', 'Common', $cell_align);
  10542. $cell_col = array_merge(
  10543. array(
  10544. 'span' => 'Number',
  10545. 'width' => 'MultiLength',
  10546. ),
  10547. $cell_align
  10548. );
  10549. $this->addElement('col', false, 'Empty', 'Common', $cell_col);
  10550. $this->addElement('colgroup', false, 'Optional: col', 'Common', $cell_col);
  10551. $this->addElement('tbody', false, 'Required: tr', 'Common', $cell_align);
  10552. $this->addElement('thead', false, 'Required: tr', 'Common', $cell_align);
  10553. $this->addElement('tfoot', false, 'Required: tr', 'Common', $cell_align);
  10554. }
  10555. }
  10556. /**
  10557. * XHTML 1.1 Target Module, defines target attribute in link elements.
  10558. */
  10559. class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
  10560. {
  10561. public $name = 'Target';
  10562. public function setup($config) {
  10563. $elements = array('a');
  10564. foreach ($elements as $name) {
  10565. $e = $this->addBlankElement($name);
  10566. $e->attr = array(
  10567. 'target' => new HTMLPurifier_AttrDef_HTML_FrameTarget()
  10568. );
  10569. }
  10570. }
  10571. }
  10572. /**
  10573. * XHTML 1.1 Text Module, defines basic text containers. Core Module.
  10574. * @note In the normative XML Schema specification, this module
  10575. * is further abstracted into the following modules:
  10576. * - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
  10577. * - Block Structural (div, p)
  10578. * - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
  10579. * - Inline Structural (br, span)
  10580. * This module, functionally, does not distinguish between these
  10581. * sub-modules, but the code is internally structured to reflect
  10582. * these distinctions.
  10583. */
  10584. class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
  10585. {
  10586. public $name = 'Text';
  10587. public $content_sets = array(
  10588. 'Flow' => 'Heading | Block | Inline'
  10589. );
  10590. public function setup($config) {
  10591. // Inline Phrasal -------------------------------------------------
  10592. $this->addElement('abbr', 'Inline', 'Inline', 'Common');
  10593. $this->addElement('acronym', 'Inline', 'Inline', 'Common');
  10594. $this->addElement('cite', 'Inline', 'Inline', 'Common');
  10595. $this->addElement('dfn', 'Inline', 'Inline', 'Common');
  10596. $this->addElement('kbd', 'Inline', 'Inline', 'Common');
  10597. $this->addElement('q', 'Inline', 'Inline', 'Common', array('cite' => 'URI'));
  10598. $this->addElement('samp', 'Inline', 'Inline', 'Common');
  10599. $this->addElement('var', 'Inline', 'Inline', 'Common');
  10600. $em = $this->addElement('em', 'Inline', 'Inline', 'Common');
  10601. $em->formatting = true;
  10602. $strong = $this->addElement('strong', 'Inline', 'Inline', 'Common');
  10603. $strong->formatting = true;
  10604. $code = $this->addElement('code', 'Inline', 'Inline', 'Common');
  10605. $code->formatting = true;
  10606. // Inline Structural ----------------------------------------------
  10607. $this->addElement('span', 'Inline', 'Inline', 'Common');
  10608. $this->addElement('br', 'Inline', 'Empty', 'Core');
  10609. // Block Phrasal --------------------------------------------------
  10610. $this->addElement('address', 'Block', 'Inline', 'Common');
  10611. $this->addElement('blockquote', 'Block', 'Optional: Heading | Block | List', 'Common', array('cite' => 'URI') );
  10612. $pre = $this->addElement('pre', 'Block', 'Inline', 'Common');
  10613. $pre->excludes = $this->makeLookup(
  10614. 'img', 'big', 'small', 'object', 'applet', 'font', 'basefont' );
  10615. $this->addElement('h1', 'Heading', 'Inline', 'Common');
  10616. $this->addElement('h2', 'Heading', 'Inline', 'Common');
  10617. $this->addElement('h3', 'Heading', 'Inline', 'Common');
  10618. $this->addElement('h4', 'Heading', 'Inline', 'Common');
  10619. $this->addElement('h5', 'Heading', 'Inline', 'Common');
  10620. $this->addElement('h6', 'Heading', 'Inline', 'Common');
  10621. // Block Structural -----------------------------------------------
  10622. $p = $this->addElement('p', 'Block', 'Inline', 'Common');
  10623. $p->autoclose = array_flip(array("address", "blockquote", "center", "dir", "div", "dl", "fieldset", "ol", "p", "ul"));
  10624. $this->addElement('div', 'Block', 'Flow', 'Common');
  10625. }
  10626. }
  10627. /**
  10628. * Abstract class for a set of proprietary modules that clean up (tidy)
  10629. * poorly written HTML.
  10630. * @todo Figure out how to protect some of these methods/properties
  10631. */
  10632. class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
  10633. {
  10634. /**
  10635. * List of supported levels. Index zero is a special case "no fixes"
  10636. * level.
  10637. */
  10638. public $levels = array(0 => 'none', 'light', 'medium', 'heavy');
  10639. /**
  10640. * Default level to place all fixes in. Disabled by default
  10641. */
  10642. public $defaultLevel = null;
  10643. /**
  10644. * Lists of fixes used by getFixesForLevel(). Format is:
  10645. * HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2');
  10646. */
  10647. public $fixesForLevel = array(
  10648. 'light' => array(),
  10649. 'medium' => array(),
  10650. 'heavy' => array()
  10651. );
  10652. /**
  10653. * Lazy load constructs the module by determining the necessary
  10654. * fixes to create and then delegating to the populate() function.
  10655. * @todo Wildcard matching and error reporting when an added or
  10656. * subtracted fix has no effect.
  10657. */
  10658. public function setup($config) {
  10659. // create fixes, initialize fixesForLevel
  10660. $fixes = $this->makeFixes();
  10661. $this->makeFixesForLevel($fixes);
  10662. // figure out which fixes to use
  10663. $level = $config->get('HTML.TidyLevel');
  10664. $fixes_lookup = $this->getFixesForLevel($level);
  10665. // get custom fix declarations: these need namespace processing
  10666. $add_fixes = $config->get('HTML.TidyAdd');
  10667. $remove_fixes = $config->get('HTML.TidyRemove');
  10668. foreach ($fixes as $name => $fix) {
  10669. // needs to be refactored a little to implement globbing
  10670. if (
  10671. isset($remove_fixes[$name]) ||
  10672. (!isset($add_fixes[$name]) && !isset($fixes_lookup[$name]))
  10673. ) {
  10674. unset($fixes[$name]);
  10675. }
  10676. }
  10677. // populate this module with necessary fixes
  10678. $this->populate($fixes);
  10679. }
  10680. /**
  10681. * Retrieves all fixes per a level, returning fixes for that specific
  10682. * level as well as all levels below it.
  10683. * @param $level String level identifier, see $levels for valid values
  10684. * @return Lookup up table of fixes
  10685. */
  10686. public function getFixesForLevel($level) {
  10687. if ($level == $this->levels[0]) {
  10688. return array();
  10689. }
  10690. $activated_levels = array();
  10691. for ($i = 1, $c = count($this->levels); $i < $c; $i++) {
  10692. $activated_levels[] = $this->levels[$i];
  10693. if ($this->levels[$i] == $level) break;
  10694. }
  10695. if ($i == $c) {
  10696. trigger_error(
  10697. 'Tidy level ' . htmlspecialchars($level) . ' not recognized',
  10698. E_USER_WARNING
  10699. );
  10700. return array();
  10701. }
  10702. $ret = array();
  10703. foreach ($activated_levels as $level) {
  10704. foreach ($this->fixesForLevel[$level] as $fix) {
  10705. $ret[$fix] = true;
  10706. }
  10707. }
  10708. return $ret;
  10709. }
  10710. /**
  10711. * Dynamically populates the $fixesForLevel member variable using
  10712. * the fixes array. It may be custom overloaded, used in conjunction
  10713. * with $defaultLevel, or not used at all.
  10714. */
  10715. public function makeFixesForLevel($fixes) {
  10716. if (!isset($this->defaultLevel)) return;
  10717. if (!isset($this->fixesForLevel[$this->defaultLevel])) {
  10718. trigger_error(
  10719. 'Default level ' . $this->defaultLevel . ' does not exist',
  10720. E_USER_ERROR
  10721. );
  10722. return;
  10723. }
  10724. $this->fixesForLevel[$this->defaultLevel] = array_keys($fixes);
  10725. }
  10726. /**
  10727. * Populates the module with transforms and other special-case code
  10728. * based on a list of fixes passed to it
  10729. * @param $lookup Lookup table of fixes to activate
  10730. */
  10731. public function populate($fixes) {
  10732. foreach ($fixes as $name => $fix) {
  10733. // determine what the fix is for
  10734. list($type, $params) = $this->getFixType($name);
  10735. switch ($type) {
  10736. case 'attr_transform_pre':
  10737. case 'attr_transform_post':
  10738. $attr = $params['attr'];
  10739. if (isset($params['element'])) {
  10740. $element = $params['element'];
  10741. if (empty($this->info[$element])) {
  10742. $e = $this->addBlankElement($element);
  10743. } else {
  10744. $e = $this->info[$element];
  10745. }
  10746. } else {
  10747. $type = "info_$type";
  10748. $e = $this;
  10749. }
  10750. // PHP does some weird parsing when I do
  10751. // $e->$type[$attr], so I have to assign a ref.
  10752. $f =& $e->$type;
  10753. $f[$attr] = $fix;
  10754. break;
  10755. case 'tag_transform':
  10756. $this->info_tag_transform[$params['element']] = $fix;
  10757. break;
  10758. case 'child':
  10759. case 'content_model_type':
  10760. $element = $params['element'];
  10761. if (empty($this->info[$element])) {
  10762. $e = $this->addBlankElement($element);
  10763. } else {
  10764. $e = $this->info[$element];
  10765. }
  10766. $e->$type = $fix;
  10767. break;
  10768. default:
  10769. trigger_error("Fix type $type not supported", E_USER_ERROR);
  10770. break;
  10771. }
  10772. }
  10773. }
  10774. /**
  10775. * Parses a fix name and determines what kind of fix it is, as well
  10776. * as other information defined by the fix
  10777. * @param $name String name of fix
  10778. * @return array(string $fix_type, array $fix_parameters)
  10779. * @note $fix_parameters is type dependant, see populate() for usage
  10780. * of these parameters
  10781. */
  10782. public function getFixType($name) {
  10783. // parse it
  10784. $property = $attr = null;
  10785. if (strpos($name, '#') !== false) list($name, $property) = explode('#', $name);
  10786. if (strpos($name, '@') !== false) list($name, $attr) = explode('@', $name);
  10787. // figure out the parameters
  10788. $params = array();
  10789. if ($name !== '') $params['element'] = $name;
  10790. if (!is_null($attr)) $params['attr'] = $attr;
  10791. // special case: attribute transform
  10792. if (!is_null($attr)) {
  10793. if (is_null($property)) $property = 'pre';
  10794. $type = 'attr_transform_' . $property;
  10795. return array($type, $params);
  10796. }
  10797. // special case: tag transform
  10798. if (is_null($property)) {
  10799. return array('tag_transform', $params);
  10800. }
  10801. return array($property, $params);
  10802. }
  10803. /**
  10804. * Defines all fixes the module will perform in a compact
  10805. * associative array of fix name to fix implementation.
  10806. */
  10807. public function makeFixes() {}
  10808. }
  10809. class HTMLPurifier_HTMLModule_XMLCommonAttributes extends HTMLPurifier_HTMLModule
  10810. {
  10811. public $name = 'XMLCommonAttributes';
  10812. public $attr_collections = array(
  10813. 'Lang' => array(
  10814. 'xml:lang' => 'LanguageCode',
  10815. )
  10816. );
  10817. }
  10818. /**
  10819. * Name is deprecated, but allowed in strict doctypes, so onl
  10820. */
  10821. class HTMLPurifier_HTMLModule_Tidy_Name extends HTMLPurifier_HTMLModule_Tidy
  10822. {
  10823. public $name = 'Tidy_Name';
  10824. public $defaultLevel = 'heavy';
  10825. public function makeFixes() {
  10826. $r = array();
  10827. // @name for img, a -----------------------------------------------
  10828. // Technically, it's allowed even on strict, so we allow authors to use
  10829. // it. However, it's deprecated in future versions of XHTML.
  10830. $r['img@name'] =
  10831. $r['a@name'] = new HTMLPurifier_AttrTransform_Name();
  10832. return $r;
  10833. }
  10834. }
  10835. class HTMLPurifier_HTMLModule_Tidy_Proprietary extends HTMLPurifier_HTMLModule_Tidy
  10836. {
  10837. public $name = 'Tidy_Proprietary';
  10838. public $defaultLevel = 'light';
  10839. public function makeFixes() {
  10840. $r = array();
  10841. $r['table@background'] = new HTMLPurifier_AttrTransform_Background();
  10842. $r['td@background'] = new HTMLPurifier_AttrTransform_Background();
  10843. $r['th@background'] = new HTMLPurifier_AttrTransform_Background();
  10844. $r['tr@background'] = new HTMLPurifier_AttrTransform_Background();
  10845. $r['thead@background'] = new HTMLPurifier_AttrTransform_Background();
  10846. $r['tfoot@background'] = new HTMLPurifier_AttrTransform_Background();
  10847. $r['tbody@background'] = new HTMLPurifier_AttrTransform_Background();
  10848. $r['table@height'] = new HTMLPurifier_AttrTransform_Length('height');
  10849. return $r;
  10850. }
  10851. }
  10852. class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends HTMLPurifier_HTMLModule_Tidy
  10853. {
  10854. public function makeFixes() {
  10855. $r = array();
  10856. // == deprecated tag transforms ===================================
  10857. $r['font'] = new HTMLPurifier_TagTransform_Font();
  10858. $r['menu'] = new HTMLPurifier_TagTransform_Simple('ul');
  10859. $r['dir'] = new HTMLPurifier_TagTransform_Simple('ul');
  10860. $r['center'] = new HTMLPurifier_TagTransform_Simple('div', 'text-align:center;');
  10861. $r['u'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:underline;');
  10862. $r['s'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;');
  10863. $r['strike'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;');
  10864. // == deprecated attribute transforms =============================
  10865. $r['caption@align'] =
  10866. new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
  10867. // we're following IE's behavior, not Firefox's, due
  10868. // to the fact that no one supports caption-side:right,
  10869. // W3C included (with CSS 2.1). This is a slightly
  10870. // unreasonable attribute!
  10871. 'left' => 'text-align:left;',
  10872. 'right' => 'text-align:right;',
  10873. 'top' => 'caption-side:top;',
  10874. 'bottom' => 'caption-side:bottom;' // not supported by IE
  10875. ));
  10876. // @align for img -------------------------------------------------
  10877. $r['img@align'] =
  10878. new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
  10879. 'left' => 'float:left;',
  10880. 'right' => 'float:right;',
  10881. 'top' => 'vertical-align:top;',
  10882. 'middle' => 'vertical-align:middle;',
  10883. 'bottom' => 'vertical-align:baseline;',
  10884. ));
  10885. // @align for table -----------------------------------------------
  10886. $r['table@align'] =
  10887. new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
  10888. 'left' => 'float:left;',
  10889. 'center' => 'margin-left:auto;margin-right:auto;',
  10890. 'right' => 'float:right;'
  10891. ));
  10892. // @align for hr -----------------------------------------------
  10893. $r['hr@align'] =
  10894. new HTMLPurifier_AttrTransform_EnumToCSS('align', array(
  10895. // we use both text-align and margin because these work
  10896. // for different browsers (IE and Firefox, respectively)
  10897. // and the melange makes for a pretty cross-compatible
  10898. // solution
  10899. 'left' => 'margin-left:0;margin-right:auto;text-align:left;',
  10900. 'center' => 'margin-left:auto;margin-right:auto;text-align:center;',
  10901. 'right' => 'margin-left:auto;margin-right:0;text-align:right;'
  10902. ));
  10903. // @align for h1, h2, h3, h4, h5, h6, p, div ----------------------
  10904. // {{{
  10905. $align_lookup = array();
  10906. $align_values = array('left', 'right', 'center', 'justify');
  10907. foreach ($align_values as $v) $align_lookup[$v] = "text-align:$v;";
  10908. // }}}
  10909. $r['h1@align'] =
  10910. $r['h2@align'] =
  10911. $r['h3@align'] =
  10912. $r['h4@align'] =
  10913. $r['h5@align'] =
  10914. $r['h6@align'] =
  10915. $r['p@align'] =
  10916. $r['div@align'] =
  10917. new HTMLPurifier_AttrTransform_EnumToCSS('align', $align_lookup);
  10918. // @bgcolor for table, tr, td, th ---------------------------------
  10919. $r['table@bgcolor'] =
  10920. $r['td@bgcolor'] =
  10921. $r['th@bgcolor'] =
  10922. new HTMLPurifier_AttrTransform_BgColor();
  10923. // @border for img ------------------------------------------------
  10924. $r['img@border'] = new HTMLPurifier_AttrTransform_Border();
  10925. // @clear for br --------------------------------------------------
  10926. $r['br@clear'] =
  10927. new HTMLPurifier_AttrTransform_EnumToCSS('clear', array(
  10928. 'left' => 'clear:left;',
  10929. 'right' => 'clear:right;',
  10930. 'all' => 'clear:both;',
  10931. 'none' => 'clear:none;',
  10932. ));
  10933. // @height for td, th ---------------------------------------------
  10934. $r['td@height'] =
  10935. $r['th@height'] =
  10936. new HTMLPurifier_AttrTransform_Length('height');
  10937. // @hspace for img ------------------------------------------------
  10938. $r['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace');
  10939. // @noshade for hr ------------------------------------------------
  10940. // this transformation is not precise but often good enough.
  10941. // different browsers use different styles to designate noshade
  10942. $r['hr@noshade'] =
  10943. new HTMLPurifier_AttrTransform_BoolToCSS(
  10944. 'noshade',
  10945. 'color:#808080;background-color:#808080;border:0;'
  10946. );
  10947. // @nowrap for td, th ---------------------------------------------
  10948. $r['td@nowrap'] =
  10949. $r['th@nowrap'] =
  10950. new HTMLPurifier_AttrTransform_BoolToCSS(
  10951. 'nowrap',
  10952. 'white-space:nowrap;'
  10953. );
  10954. // @size for hr --------------------------------------------------
  10955. $r['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height');
  10956. // @type for li, ol, ul -------------------------------------------
  10957. // {{{
  10958. $ul_types = array(
  10959. 'disc' => 'list-style-type:disc;',
  10960. 'square' => 'list-style-type:square;',
  10961. 'circle' => 'list-style-type:circle;'
  10962. );
  10963. $ol_types = array(
  10964. '1' => 'list-style-type:decimal;',
  10965. 'i' => 'list-style-type:lower-roman;',
  10966. 'I' => 'list-style-type:upper-roman;',
  10967. 'a' => 'list-style-type:lower-alpha;',
  10968. 'A' => 'list-style-type:upper-alpha;'
  10969. );
  10970. $li_types = $ul_types + $ol_types;
  10971. // }}}
  10972. $r['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ul_types);
  10973. $r['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ol_types, true);
  10974. $r['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $li_types, true);
  10975. // @vspace for img ------------------------------------------------
  10976. $r['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace');
  10977. // @width for hr, td, th ------------------------------------------
  10978. $r['td@width'] =
  10979. $r['th@width'] =
  10980. $r['hr@width'] = new HTMLPurifier_AttrTransform_Length('width');
  10981. return $r;
  10982. }
  10983. }
  10984. class HTMLPurifier_HTMLModule_Tidy_Strict extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
  10985. {
  10986. public $name = 'Tidy_Strict';
  10987. public $defaultLevel = 'light';
  10988. public function makeFixes() {
  10989. $r = parent::makeFixes();
  10990. $r['blockquote#content_model_type'] = 'strictblockquote';
  10991. return $r;
  10992. }
  10993. public $defines_child_def = true;
  10994. public function getChildDef($def) {
  10995. if ($def->content_model_type != 'strictblockquote') return parent::getChildDef($def);
  10996. return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
  10997. }
  10998. }
  10999. class HTMLPurifier_HTMLModule_Tidy_Transitional extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
  11000. {
  11001. public $name = 'Tidy_Transitional';
  11002. public $defaultLevel = 'heavy';
  11003. }
  11004. class HTMLPurifier_HTMLModule_Tidy_XHTML extends HTMLPurifier_HTMLModule_Tidy
  11005. {
  11006. public $name = 'Tidy_XHTML';
  11007. public $defaultLevel = 'medium';
  11008. public function makeFixes() {
  11009. $r = array();
  11010. $r['@lang'] = new HTMLPurifier_AttrTransform_Lang();
  11011. return $r;
  11012. }
  11013. }
  11014. /**
  11015. * Injector that auto paragraphs text in the root node based on
  11016. * double-spacing.
  11017. * @todo Ensure all states are unit tested, including variations as well.
  11018. * @todo Make a graph of the flow control for this Injector.
  11019. */
  11020. class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
  11021. {
  11022. public $name = 'AutoParagraph';
  11023. public $needed = array('p');
  11024. private function _pStart() {
  11025. $par = new HTMLPurifier_Token_Start('p');
  11026. $par->armor['MakeWellFormed_TagClosedError'] = true;
  11027. return $par;
  11028. }
  11029. public function handleText(&$token) {
  11030. $text = $token->data;
  11031. // Does the current parent allow <p> tags?
  11032. if ($this->allowsElement('p')) {
  11033. if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
  11034. // Note that we have differing behavior when dealing with text
  11035. // in the anonymous root node, or a node inside the document.
  11036. // If the text as a double-newline, the treatment is the same;
  11037. // if it doesn't, see the next if-block if you're in the document.
  11038. $i = $nesting = null;
  11039. if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
  11040. // State 1.1: ... ^ (whitespace, then document end)
  11041. // ----
  11042. // This is a degenerate case
  11043. } else {
  11044. if (!$token->is_whitespace || $this->_isInline($current)) {
  11045. // State 1.2: PAR1
  11046. // ----
  11047. // State 1.3: PAR1\n\nPAR2
  11048. // ------------
  11049. // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
  11050. // ------------
  11051. $token = array($this->_pStart());
  11052. $this->_splitText($text, $token);
  11053. } else {
  11054. // State 1.5: \n<hr />
  11055. // --
  11056. }
  11057. }
  11058. } else {
  11059. // State 2: <div>PAR1... (similar to 1.4)
  11060. // ----
  11061. // We're in an element that allows paragraph tags, but we're not
  11062. // sure if we're going to need them.
  11063. if ($this->_pLookAhead()) {
  11064. // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
  11065. // ----
  11066. // Note: This will always be the first child, since any
  11067. // previous inline element would have triggered this very
  11068. // same routine, and found the double newline. One possible
  11069. // exception would be a comment.
  11070. $token = array($this->_pStart(), $token);
  11071. } else {
  11072. // State 2.2.1: <div>PAR1<div>
  11073. // ----
  11074. // State 2.2.2: <div>PAR1<b>PAR1</b></div>
  11075. // ----
  11076. }
  11077. }
  11078. // Is the current parent a <p> tag?
  11079. } elseif (
  11080. !empty($this->currentNesting) &&
  11081. $this->currentNesting[count($this->currentNesting)-1]->name == 'p'
  11082. ) {
  11083. // State 3.1: ...<p>PAR1
  11084. // ----
  11085. // State 3.2: ...<p>PAR1\n\nPAR2
  11086. // ------------
  11087. $token = array();
  11088. $this->_splitText($text, $token);
  11089. // Abort!
  11090. } else {
  11091. // State 4.1: ...<b>PAR1
  11092. // ----
  11093. // State 4.2: ...<b>PAR1\n\nPAR2
  11094. // ------------
  11095. }
  11096. }
  11097. public function handleElement(&$token) {
  11098. // We don't have to check if we're already in a <p> tag for block
  11099. // tokens, because the tag would have been autoclosed by MakeWellFormed.
  11100. if ($this->allowsElement('p')) {
  11101. if (!empty($this->currentNesting)) {
  11102. if ($this->_isInline($token)) {
  11103. // State 1: <div>...<b>
  11104. // ---
  11105. // Check if this token is adjacent to the parent token
  11106. // (seek backwards until token isn't whitespace)
  11107. $i = null;
  11108. $this->backward($i, $prev);
  11109. if (!$prev instanceof HTMLPurifier_Token_Start) {
  11110. // Token wasn't adjacent
  11111. if (
  11112. $prev instanceof HTMLPurifier_Token_Text &&
  11113. substr($prev->data, -2) === "\n\n"
  11114. ) {
  11115. // State 1.1.4: <div><p>PAR1</p>\n\n<b>
  11116. // ---
  11117. // Quite frankly, this should be handled by splitText
  11118. $token = array($this->_pStart(), $token);
  11119. } else {
  11120. // State 1.1.1: <div><p>PAR1</p><b>
  11121. // ---
  11122. // State 1.1.2: <div><br /><b>
  11123. // ---
  11124. // State 1.1.3: <div>PAR<b>
  11125. // ---
  11126. }
  11127. } else {
  11128. // State 1.2.1: <div><b>
  11129. // ---
  11130. // Lookahead to see if <p> is needed.
  11131. if ($this->_pLookAhead()) {
  11132. // State 1.3.1: <div><b>PAR1\n\nPAR2
  11133. // ---
  11134. $token = array($this->_pStart(), $token);
  11135. } else {
  11136. // State 1.3.2: <div><b>PAR1</b></div>
  11137. // ---
  11138. // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
  11139. // ---
  11140. }
  11141. }
  11142. } else {
  11143. // State 2.3: ...<div>
  11144. // -----
  11145. }
  11146. } else {
  11147. if ($this->_isInline($token)) {
  11148. // State 3.1: <b>
  11149. // ---
  11150. // This is where the {p} tag is inserted, not reflected in
  11151. // inputTokens yet, however.
  11152. $token = array($this->_pStart(), $token);
  11153. } else {
  11154. // State 3.2: <div>
  11155. // -----
  11156. }
  11157. $i = null;
  11158. if ($this->backward($i, $prev)) {
  11159. if (
  11160. !$prev instanceof HTMLPurifier_Token_Text
  11161. ) {
  11162. // State 3.1.1: ...</p>{p}<b>
  11163. // ---
  11164. // State 3.2.1: ...</p><div>
  11165. // -----
  11166. if (!is_array($token)) $token = array($token);
  11167. array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
  11168. } else {
  11169. // State 3.1.2: ...</p>\n\n{p}<b>
  11170. // ---
  11171. // State 3.2.2: ...</p>\n\n<div>
  11172. // -----
  11173. // Note: PAR<ELEM> cannot occur because PAR would have been
  11174. // wrapped in <p> tags.
  11175. }
  11176. }
  11177. }
  11178. } else {
  11179. // State 2.2: <ul><li>
  11180. // ----
  11181. // State 2.4: <p><b>
  11182. // ---
  11183. }
  11184. }
  11185. /**
  11186. * Splits up a text in paragraph tokens and appends them
  11187. * to the result stream that will replace the original
  11188. * @param $data String text data that will be processed
  11189. * into paragraphs
  11190. * @param $result Reference to array of tokens that the
  11191. * tags will be appended onto
  11192. * @param $config Instance of HTMLPurifier_Config
  11193. * @param $context Instance of HTMLPurifier_Context
  11194. */
  11195. private function _splitText($data, &$result) {
  11196. $raw_paragraphs = explode("\n\n", $data);
  11197. $paragraphs = array(); // without empty paragraphs
  11198. $needs_start = false;
  11199. $needs_end = false;
  11200. $c = count($raw_paragraphs);
  11201. if ($c == 1) {
  11202. // There were no double-newlines, abort quickly. In theory this
  11203. // should never happen.
  11204. $result[] = new HTMLPurifier_Token_Text($data);
  11205. return;
  11206. }
  11207. for ($i = 0; $i < $c; $i++) {
  11208. $par = $raw_paragraphs[$i];
  11209. if (trim($par) !== '') {
  11210. $paragraphs[] = $par;
  11211. } else {
  11212. if ($i == 0) {
  11213. // Double newline at the front
  11214. if (empty($result)) {
  11215. // The empty result indicates that the AutoParagraph
  11216. // injector did not add any start paragraph tokens.
  11217. // This means that we have been in a paragraph for
  11218. // a while, and the newline means we should start a new one.
  11219. $result[] = new HTMLPurifier_Token_End('p');
  11220. $result[] = new HTMLPurifier_Token_Text("\n\n");
  11221. // However, the start token should only be added if
  11222. // there is more processing to be done (i.e. there are
  11223. // real paragraphs in here). If there are none, the
  11224. // next start paragraph tag will be handled by the
  11225. // next call to the injector
  11226. $needs_start = true;
  11227. } else {
  11228. // We just started a new paragraph!
  11229. // Reinstate a double-newline for presentation's sake, since
  11230. // it was in the source code.
  11231. array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
  11232. }
  11233. } elseif ($i + 1 == $c) {
  11234. // Double newline at the end
  11235. // There should be a trailing </p> when we're finally done.
  11236. $needs_end = true;
  11237. }
  11238. }
  11239. }
  11240. // Check if this was just a giant blob of whitespace. Move this earlier,
  11241. // perhaps?
  11242. if (empty($paragraphs)) {
  11243. return;
  11244. }
  11245. // Add the start tag indicated by \n\n at the beginning of $data
  11246. if ($needs_start) {
  11247. $result[] = $this->_pStart();
  11248. }
  11249. // Append the paragraphs onto the result
  11250. foreach ($paragraphs as $par) {
  11251. $result[] = new HTMLPurifier_Token_Text($par);
  11252. $result[] = new HTMLPurifier_Token_End('p');
  11253. $result[] = new HTMLPurifier_Token_Text("\n\n");
  11254. $result[] = $this->_pStart();
  11255. }
  11256. // Remove trailing start token; Injector will handle this later if
  11257. // it was indeed needed. This prevents from needing to do a lookahead,
  11258. // at the cost of a lookbehind later.
  11259. array_pop($result);
  11260. // If there is no need for an end tag, remove all of it and let
  11261. // MakeWellFormed close it later.
  11262. if (!$needs_end) {
  11263. array_pop($result); // removes \n\n
  11264. array_pop($result); // removes </p>
  11265. }
  11266. }
  11267. /**
  11268. * Returns true if passed token is inline (and, ergo, allowed in
  11269. * paragraph tags)
  11270. */
  11271. private function _isInline($token) {
  11272. return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
  11273. }
  11274. /**
  11275. * Looks ahead in the token list and determines whether or not we need
  11276. * to insert a <p> tag.
  11277. */
  11278. private function _pLookAhead() {
  11279. $this->current($i, $current);
  11280. if ($current instanceof HTMLPurifier_Token_Start) $nesting = 1;
  11281. else $nesting = 0;
  11282. $ok = false;
  11283. while ($this->forwardUntilEndToken($i, $current, $nesting)) {
  11284. $result = $this->_checkNeedsP($current);
  11285. if ($result !== null) {
  11286. $ok = $result;
  11287. break;
  11288. }
  11289. }
  11290. return $ok;
  11291. }
  11292. /**
  11293. * Determines if a particular token requires an earlier inline token
  11294. * to get a paragraph. This should be used with _forwardUntilEndToken
  11295. */
  11296. private function _checkNeedsP($current) {
  11297. if ($current instanceof HTMLPurifier_Token_Start){
  11298. if (!$this->_isInline($current)) {
  11299. // <div>PAR1<div>
  11300. // ----
  11301. // Terminate early, since we hit a block element
  11302. return false;
  11303. }
  11304. } elseif ($current instanceof HTMLPurifier_Token_Text) {
  11305. if (strpos($current->data, "\n\n") !== false) {
  11306. // <div>PAR1<b>PAR1\n\nPAR2
  11307. // ----
  11308. return true;
  11309. } else {
  11310. // <div>PAR1<b>PAR1...
  11311. // ----
  11312. }
  11313. }
  11314. return null;
  11315. }
  11316. }
  11317. /**
  11318. * Injector that displays the URL of an anchor instead of linking to it, in addition to showing the text of the link.
  11319. */
  11320. class HTMLPurifier_Injector_DisplayLinkURI extends HTMLPurifier_Injector
  11321. {
  11322. public $name = 'DisplayLinkURI';
  11323. public $needed = array('a');
  11324. public function handleElement(&$token) {
  11325. }
  11326. public function handleEnd(&$token) {
  11327. if (isset($token->start->attr['href'])){
  11328. $url = $token->start->attr['href'];
  11329. unset($token->start->attr['href']);
  11330. $token = array($token, new HTMLPurifier_Token_Text(" ($url)"));
  11331. } else {
  11332. // nothing to display
  11333. }
  11334. }
  11335. }
  11336. /**
  11337. * Injector that converts http, https and ftp text URLs to actual links.
  11338. */
  11339. class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
  11340. {
  11341. public $name = 'Linkify';
  11342. public $needed = array('a' => array('href'));
  11343. public function handleText(&$token) {
  11344. if (!$this->allowsElement('a')) return;
  11345. if (strpos($token->data, '://') === false) {
  11346. // our really quick heuristic failed, abort
  11347. // this may not work so well if we want to match things like
  11348. // "google.com", but then again, most people don't
  11349. return;
  11350. }
  11351. // there is/are URL(s). Let's split the string:
  11352. // Note: this regex is extremely permissive
  11353. $bits = preg_split('#((?:https?|ftp)://[^\s\'"<>()]+)#S', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);
  11354. $token = array();
  11355. // $i = index
  11356. // $c = count
  11357. // $l = is link
  11358. for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
  11359. if (!$l) {
  11360. if ($bits[$i] === '') continue;
  11361. $token[] = new HTMLPurifier_Token_Text($bits[$i]);
  11362. } else {
  11363. $token[] = new HTMLPurifier_Token_Start('a', array('href' => $bits[$i]));
  11364. $token[] = new HTMLPurifier_Token_Text($bits[$i]);
  11365. $token[] = new HTMLPurifier_Token_End('a');
  11366. }
  11367. }
  11368. }
  11369. }
  11370. /**
  11371. * Injector that converts configuration directive syntax %Namespace.Directive
  11372. * to links
  11373. */
  11374. class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector
  11375. {
  11376. public $name = 'PurifierLinkify';
  11377. public $docURL;
  11378. public $needed = array('a' => array('href'));
  11379. public function prepare($config, $context) {
  11380. $this->docURL = $config->get('AutoFormat.PurifierLinkify.DocURL');
  11381. return parent::prepare($config, $context);
  11382. }
  11383. public function handleText(&$token) {
  11384. if (!$this->allowsElement('a')) return;
  11385. if (strpos($token->data, '%') === false) return;
  11386. $bits = preg_split('#%([a-z0-9]+\.[a-z0-9]+)#Si', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);
  11387. $token = array();
  11388. // $i = index
  11389. // $c = count
  11390. // $l = is link
  11391. for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
  11392. if (!$l) {
  11393. if ($bits[$i] === '') continue;
  11394. $token[] = new HTMLPurifier_Token_Text($bits[$i]);
  11395. } else {
  11396. $token[] = new HTMLPurifier_Token_Start('a',
  11397. array('href' => str_replace('%s', $bits[$i], $this->docURL)));
  11398. $token[] = new HTMLPurifier_Token_Text('%' . $bits[$i]);
  11399. $token[] = new HTMLPurifier_Token_End('a');
  11400. }
  11401. }
  11402. }
  11403. }
  11404. class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
  11405. {
  11406. private $context, $config, $attrValidator, $removeNbsp, $removeNbspExceptions;
  11407. public function prepare($config, $context) {
  11408. parent::prepare($config, $context);
  11409. $this->config = $config;
  11410. $this->context = $context;
  11411. $this->removeNbsp = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp');
  11412. $this->removeNbspExceptions = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions');
  11413. $this->attrValidator = new HTMLPurifier_AttrValidator();
  11414. }
  11415. public function handleElement(&$token) {
  11416. if (!$token instanceof HTMLPurifier_Token_Start) return;
  11417. $next = false;
  11418. for ($i = $this->inputIndex + 1, $c = count($this->inputTokens); $i < $c; $i++) {
  11419. $next = $this->inputTokens[$i];
  11420. if ($next instanceof HTMLPurifier_Token_Text) {
  11421. if ($next->is_whitespace) continue;
  11422. if ($this->removeNbsp && !isset($this->removeNbspExceptions[$token->name])) {
  11423. $plain = str_replace("\xC2\xA0", "", $next->data);
  11424. $isWsOrNbsp = $plain === '' || ctype_space($plain);
  11425. if ($isWsOrNbsp) continue;
  11426. }
  11427. }
  11428. break;
  11429. }
  11430. if (!$next || ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name)) {
  11431. if ($token->name == 'colgroup') return;
  11432. $this->attrValidator->validateToken($token, $this->config, $this->context);
  11433. $token->armor['ValidateAttributes'] = true;
  11434. if (isset($token->attr['id']) || isset($token->attr['name'])) return;
  11435. $token = $i - $this->inputIndex + 1;
  11436. for ($b = $this->inputIndex - 1; $b > 0; $b--) {
  11437. $prev = $this->inputTokens[$b];
  11438. if ($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace) continue;
  11439. break;
  11440. }
  11441. // This is safe because we removed the token that triggered this.
  11442. $this->rewind($b - 1);
  11443. return;
  11444. }
  11445. }
  11446. }
  11447. /**
  11448. * Injector that removes spans with no attributes
  11449. */
  11450. class HTMLPurifier_Injector_RemoveSpansWithoutAttributes extends HTMLPurifier_Injector
  11451. {
  11452. public $name = 'RemoveSpansWithoutAttributes';
  11453. public $needed = array('span');
  11454. private $attrValidator;
  11455. /**
  11456. * Used by AttrValidator
  11457. */
  11458. private $config;
  11459. private $context;
  11460. public function prepare($config, $context) {
  11461. $this->attrValidator = new HTMLPurifier_AttrValidator();
  11462. $this->config = $config;
  11463. $this->context = $context;
  11464. return parent::prepare($config, $context);
  11465. }
  11466. public function handleElement(&$token) {
  11467. if ($token->name !== 'span' || !$token instanceof HTMLPurifier_Token_Start) {
  11468. return;
  11469. }
  11470. // We need to validate the attributes now since this doesn't normally
  11471. // happen until after MakeWellFormed. If all the attributes are removed
  11472. // the span needs to be removed too.
  11473. $this->attrValidator->validateToken($token, $this->config, $this->context);
  11474. $token->armor['ValidateAttributes'] = true;
  11475. if (!empty($token->attr)) {
  11476. return;
  11477. }
  11478. $nesting = 0;
  11479. $spanContentTokens = array();
  11480. while ($this->forwardUntilEndToken($i, $current, $nesting)) {}
  11481. if ($current instanceof HTMLPurifier_Token_End && $current->name === 'span') {
  11482. // Mark closing span tag for deletion
  11483. $current->markForDeletion = true;
  11484. // Delete open span tag
  11485. $token = false;
  11486. }
  11487. }
  11488. public function handleEnd(&$token) {
  11489. if ($token->markForDeletion) {
  11490. $token = false;
  11491. }
  11492. }
  11493. }
  11494. /**
  11495. * Adds important param elements to inside of object in order to make
  11496. * things safe.
  11497. */
  11498. class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
  11499. {
  11500. public $name = 'SafeObject';
  11501. public $needed = array('object', 'param');
  11502. protected $objectStack = array();
  11503. protected $paramStack = array();
  11504. // Keep this synchronized with AttrTransform/SafeParam.php
  11505. protected $addParam = array(
  11506. 'allowScriptAccess' => 'never',
  11507. 'allowNetworking' => 'internal',
  11508. );
  11509. protected $allowedParam = array(
  11510. 'wmode' => true,
  11511. 'movie' => true,
  11512. 'flashvars' => true,
  11513. 'src' => true,
  11514. 'allowFullScreen' => true, // if omitted, assume to be 'false'
  11515. );
  11516. public function prepare($config, $context) {
  11517. parent::prepare($config, $context);
  11518. }
  11519. public function handleElement(&$token) {
  11520. if ($token->name == 'object') {
  11521. $this->objectStack[] = $token;
  11522. $this->paramStack[] = array();
  11523. $new = array($token);
  11524. foreach ($this->addParam as $name => $value) {
  11525. $new[] = new HTMLPurifier_Token_Empty('param', array('name' => $name, 'value' => $value));
  11526. }
  11527. $token = $new;
  11528. } elseif ($token->name == 'param') {
  11529. $nest = count($this->currentNesting) - 1;
  11530. if ($nest >= 0 && $this->currentNesting[$nest]->name === 'object') {
  11531. $i = count($this->objectStack) - 1;
  11532. if (!isset($token->attr['name'])) {
  11533. $token = false;
  11534. return;
  11535. }
  11536. $n = $token->attr['name'];
  11537. // We need this fix because YouTube doesn't supply a data
  11538. // attribute, which we need if a type is specified. This is
  11539. // *very* Flash specific.
  11540. if (!isset($this->objectStack[$i]->attr['data']) &&
  11541. ($token->attr['name'] == 'movie' || $token->attr['name'] == 'src')) {
  11542. $this->objectStack[$i]->attr['data'] = $token->attr['value'];
  11543. }
  11544. // Check if the parameter is the correct value but has not
  11545. // already been added
  11546. if (
  11547. !isset($this->paramStack[$i][$n]) &&
  11548. isset($this->addParam[$n]) &&
  11549. $token->attr['name'] === $this->addParam[$n]
  11550. ) {
  11551. // keep token, and add to param stack
  11552. $this->paramStack[$i][$n] = true;
  11553. } elseif (isset($this->allowedParam[$n])) {
  11554. // keep token, don't do anything to it
  11555. // (could possibly check for duplicates here)
  11556. } else {
  11557. $token = false;
  11558. }
  11559. } else {
  11560. // not directly inside an object, DENY!
  11561. $token = false;
  11562. }
  11563. }
  11564. }
  11565. public function handleEnd(&$token) {
  11566. // This is the WRONG way of handling the object and param stacks;
  11567. // we should be inserting them directly on the relevant object tokens
  11568. // so that the global stack handling handles it.
  11569. if ($token->name == 'object') {
  11570. array_pop($this->objectStack);
  11571. array_pop($this->paramStack);
  11572. }
  11573. }
  11574. }
  11575. /**
  11576. * Parser that uses PHP 5's DOM extension (part of the core).
  11577. *
  11578. * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
  11579. * It gives us a forgiving HTML parser, which we use to transform the HTML
  11580. * into a DOM, and then into the tokens. It is blazingly fast (for large
  11581. * documents, it performs twenty times faster than
  11582. * HTMLPurifier_Lexer_DirectLex,and is the default choice for PHP 5.
  11583. *
  11584. * @note Any empty elements will have empty tokens associated with them, even if
  11585. * this is prohibited by the spec. This is cannot be fixed until the spec
  11586. * comes into play.
  11587. *
  11588. * @note PHP's DOM extension does not actually parse any entities, we use
  11589. * our own function to do that.
  11590. *
  11591. * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
  11592. * If this is a huge problem, due to the fact that HTML is hand
  11593. * edited and you are unable to get a parser cache that caches the
  11594. * the output of HTML Purifier while keeping the original HTML lying
  11595. * around, you may want to run Tidy on the resulting output or use
  11596. * HTMLPurifier_DirectLex
  11597. */
  11598. class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
  11599. {
  11600. private $factory;
  11601. public function __construct() {
  11602. // setup the factory
  11603. parent::__construct();
  11604. $this->factory = new HTMLPurifier_TokenFactory();
  11605. }
  11606. public function tokenizeHTML($html, $config, $context) {
  11607. $html = $this->normalize($html, $config, $context);
  11608. // attempt to armor stray angled brackets that cannot possibly
  11609. // form tags and thus are probably being used as emoticons
  11610. if ($config->get('Core.AggressivelyFixLt')) {
  11611. $char = '[^a-z!\/]';
  11612. $comment = "/<!--(.*?)(-->|\z)/is";
  11613. $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
  11614. do {
  11615. $old = $html;
  11616. $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
  11617. } while ($html !== $old);
  11618. $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
  11619. }
  11620. // preprocess html, essential for UTF-8
  11621. $html = $this->wrapHTML($html, $config, $context);
  11622. $doc = new DOMDocument();
  11623. $doc->encoding = 'UTF-8'; // theoretically, the above has this covered
  11624. set_error_handler(array($this, 'muteErrorHandler'));
  11625. $doc->loadHTML($html);
  11626. restore_error_handler();
  11627. $tokens = array();
  11628. $this->tokenizeDOM(
  11629. $doc->getElementsByTagName('html')->item(0)-> // <html>
  11630. getElementsByTagName('body')->item(0)-> // <body>
  11631. getElementsByTagName('div')->item(0) // <div>
  11632. , $tokens);
  11633. return $tokens;
  11634. }
  11635. /**
  11636. * Iterative function that tokenizes a node, putting it into an accumulator.
  11637. * To iterate is human, to recurse divine - L. Peter Deutsch
  11638. * @param $node DOMNode to be tokenized.
  11639. * @param $tokens Array-list of already tokenized tokens.
  11640. * @returns Tokens of node appended to previously passed tokens.
  11641. */
  11642. protected function tokenizeDOM($node, &$tokens) {
  11643. $level = 0;
  11644. $nodes = array($level => array($node));
  11645. $closingNodes = array();
  11646. do {
  11647. while (!empty($nodes[$level])) {
  11648. $node = array_shift($nodes[$level]); // FIFO
  11649. $collect = $level > 0 ? true : false;
  11650. $needEndingTag = $this->createStartNode($node, $tokens, $collect);
  11651. if ($needEndingTag) {
  11652. $closingNodes[$level][] = $node;
  11653. }
  11654. if ($node->childNodes && $node->childNodes->length) {
  11655. $level++;
  11656. $nodes[$level] = array();
  11657. foreach ($node->childNodes as $childNode) {
  11658. array_push($nodes[$level], $childNode);
  11659. }
  11660. }
  11661. }
  11662. $level--;
  11663. if ($level && isset($closingNodes[$level])) {
  11664. while($node = array_pop($closingNodes[$level])) {
  11665. $this->createEndNode($node, $tokens);
  11666. }
  11667. }
  11668. } while ($level > 0);
  11669. }
  11670. /**
  11671. * @param $node DOMNode to be tokenized.
  11672. * @param $tokens Array-list of already tokenized tokens.
  11673. * @param $collect Says whether or start and close are collected, set to
  11674. * false at first recursion because it's the implicit DIV
  11675. * tag you're dealing with.
  11676. * @returns bool if the token needs an endtoken
  11677. */
  11678. protected function createStartNode($node, &$tokens, $collect) {
  11679. // intercept non element nodes. WE MUST catch all of them,
  11680. // but we're not getting the character reference nodes because
  11681. // those should have been preprocessed
  11682. if ($node->nodeType === XML_TEXT_NODE) {
  11683. $tokens[] = $this->factory->createText($node->data);
  11684. return false;
  11685. } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
  11686. // undo libxml's special treatment of <script> and <style> tags
  11687. $last = end($tokens);
  11688. $data = $node->data;
  11689. // (note $node->tagname is already normalized)
  11690. if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
  11691. $new_data = trim($data);
  11692. if (substr($new_data, 0, 4) === '<!--') {
  11693. $data = substr($new_data, 4);
  11694. if (substr($data, -3) === '-->') {
  11695. $data = substr($data, 0, -3);
  11696. } else {
  11697. // Highly suspicious! Not sure what to do...
  11698. }
  11699. }
  11700. }
  11701. $tokens[] = $this->factory->createText($this->parseData($data));
  11702. return false;
  11703. } elseif ($node->nodeType === XML_COMMENT_NODE) {
  11704. // this is code is only invoked for comments in script/style in versions
  11705. // of libxml pre-2.6.28 (regular comments, of course, are still
  11706. // handled regularly)
  11707. $tokens[] = $this->factory->createComment($node->data);
  11708. return false;
  11709. } elseif (
  11710. // not-well tested: there may be other nodes we have to grab
  11711. $node->nodeType !== XML_ELEMENT_NODE
  11712. ) {
  11713. return false;
  11714. }
  11715. $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
  11716. // We still have to make sure that the element actually IS empty
  11717. if (!$node->childNodes->length) {
  11718. if ($collect) {
  11719. $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
  11720. }
  11721. return false;
  11722. } else {
  11723. if ($collect) {
  11724. $tokens[] = $this->factory->createStart(
  11725. $tag_name = $node->tagName, // somehow, it get's dropped
  11726. $attr
  11727. );
  11728. }
  11729. return true;
  11730. }
  11731. }
  11732. protected function createEndNode($node, &$tokens) {
  11733. $tokens[] = $this->factory->createEnd($node->tagName);
  11734. }
  11735. /**
  11736. * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
  11737. *
  11738. * @param $attribute_list DOMNamedNodeMap of DOMAttr objects.
  11739. * @returns Associative array of attributes.
  11740. */
  11741. protected function transformAttrToAssoc($node_map) {
  11742. // NamedNodeMap is documented very well, so we're using undocumented
  11743. // features, namely, the fact that it implements Iterator and
  11744. // has a ->length attribute
  11745. if ($node_map->length === 0) return array();
  11746. $array = array();
  11747. foreach ($node_map as $attr) {
  11748. $array[$attr->name] = $attr->value;
  11749. }
  11750. return $array;
  11751. }
  11752. /**
  11753. * An error handler that mutes all errors
  11754. */
  11755. public function muteErrorHandler($errno, $errstr) {}
  11756. /**
  11757. * Callback function for undoing escaping of stray angled brackets
  11758. * in comments
  11759. */
  11760. public function callbackUndoCommentSubst($matches) {
  11761. return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
  11762. }
  11763. /**
  11764. * Callback function that entity-izes ampersands in comments so that
  11765. * callbackUndoCommentSubst doesn't clobber them
  11766. */
  11767. public function callbackArmorCommentEntities($matches) {
  11768. return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
  11769. }
  11770. /**
  11771. * Wraps an HTML fragment in the necessary HTML
  11772. */
  11773. protected function wrapHTML($html, $config, $context) {
  11774. $def = $config->getDefinition('HTML');
  11775. $ret = '';
  11776. if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
  11777. $ret .= '<!DOCTYPE html ';
  11778. if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
  11779. if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
  11780. $ret .= '>';
  11781. }
  11782. $ret .= '<html><head>';
  11783. $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
  11784. // No protection if $html contains a stray </div>!
  11785. $ret .= '</head><body><div>'.$html.'</div></body></html>';
  11786. return $ret;
  11787. }
  11788. }
  11789. /**
  11790. * Our in-house implementation of a parser.
  11791. *
  11792. * A pure PHP parser, DirectLex has absolutely no dependencies, making
  11793. * it a reasonably good default for PHP4. Written with efficiency in mind,
  11794. * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
  11795. * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  11796. *
  11797. * @todo Reread XML spec and document differences.
  11798. */
  11799. class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  11800. {
  11801. public $tracksLineNumbers = true;
  11802. /**
  11803. * Whitespace characters for str(c)spn.
  11804. */
  11805. protected $_whitespace = "\x20\x09\x0D\x0A";
  11806. /**
  11807. * Callback function for script CDATA fudge
  11808. * @param $matches, in form of array(opening tag, contents, closing tag)
  11809. */
  11810. protected function scriptCallback($matches) {
  11811. return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  11812. }
  11813. public function tokenizeHTML($html, $config, $context) {
  11814. // special normalization for script tags without any armor
  11815. // our "armor" heurstic is a < sign any number of whitespaces after
  11816. // the first script tag
  11817. if ($config->get('HTML.Trusted')) {
  11818. $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  11819. array($this, 'scriptCallback'), $html);
  11820. }
  11821. $html = $this->normalize($html, $config, $context);
  11822. $cursor = 0; // our location in the text
  11823. $inside_tag = false; // whether or not we're parsing the inside of a tag
  11824. $array = array(); // result array
  11825. // This is also treated to mean maintain *column* numbers too
  11826. $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
  11827. if ($maintain_line_numbers === null) {
  11828. // automatically determine line numbering by checking
  11829. // if error collection is on
  11830. $maintain_line_numbers = $config->get('Core.CollectErrors');
  11831. }
  11832. if ($maintain_line_numbers) {
  11833. $current_line = 1;
  11834. $current_col = 0;
  11835. $length = strlen($html);
  11836. } else {
  11837. $current_line = false;
  11838. $current_col = false;
  11839. $length = false;
  11840. }
  11841. $context->register('CurrentLine', $current_line);
  11842. $context->register('CurrentCol', $current_col);
  11843. $nl = "\n";
  11844. // how often to manually recalculate. This will ALWAYS be right,
  11845. // but it's pretty wasteful. Set to 0 to turn off
  11846. $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
  11847. $e = false;
  11848. if ($config->get('Core.CollectErrors')) {
  11849. $e =& $context->get('ErrorCollector');
  11850. }
  11851. // for testing synchronization
  11852. $loops = 0;
  11853. while(++$loops) {
  11854. // $cursor is either at the start of a token, or inside of
  11855. // a tag (i.e. there was a < immediately before it), as indicated
  11856. // by $inside_tag
  11857. if ($maintain_line_numbers) {
  11858. // $rcursor, however, is always at the start of a token.
  11859. $rcursor = $cursor - (int) $inside_tag;
  11860. // Column number is cheap, so we calculate it every round.
  11861. // We're interested at the *end* of the newline string, so
  11862. // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
  11863. // from our "rcursor" position.
  11864. $nl_pos = strrpos($html, $nl, $rcursor - $length);
  11865. $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
  11866. // recalculate lines
  11867. if (
  11868. $synchronize_interval && // synchronization is on
  11869. $cursor > 0 && // cursor is further than zero
  11870. $loops % $synchronize_interval === 0 // time to synchronize!
  11871. ) {
  11872. $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
  11873. }
  11874. }
  11875. $position_next_lt = strpos($html, '<', $cursor);
  11876. $position_next_gt = strpos($html, '>', $cursor);
  11877. // triggers on "<b>asdf</b>" but not "asdf <b></b>"
  11878. // special case to set up context
  11879. if ($position_next_lt === $cursor) {
  11880. $inside_tag = true;
  11881. $cursor++;
  11882. }
  11883. if (!$inside_tag && $position_next_lt !== false) {
  11884. // We are not inside tag and there still is another tag to parse
  11885. $token = new
  11886. HTMLPurifier_Token_Text(
  11887. $this->parseData(
  11888. substr(
  11889. $html, $cursor, $position_next_lt - $cursor
  11890. )
  11891. )
  11892. );
  11893. if ($maintain_line_numbers) {
  11894. $token->rawPosition($current_line, $current_col);
  11895. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
  11896. }
  11897. $array[] = $token;
  11898. $cursor = $position_next_lt + 1;
  11899. $inside_tag = true;
  11900. continue;
  11901. } elseif (!$inside_tag) {
  11902. // We are not inside tag but there are no more tags
  11903. // If we're already at the end, break
  11904. if ($cursor === strlen($html)) break;
  11905. // Create Text of rest of string
  11906. $token = new
  11907. HTMLPurifier_Token_Text(
  11908. $this->parseData(
  11909. substr(
  11910. $html, $cursor
  11911. )
  11912. )
  11913. );
  11914. if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
  11915. $array[] = $token;
  11916. break;
  11917. } elseif ($inside_tag && $position_next_gt !== false) {
  11918. // We are in tag and it is well formed
  11919. // Grab the internals of the tag
  11920. $strlen_segment = $position_next_gt - $cursor;
  11921. if ($strlen_segment < 1) {
  11922. // there's nothing to process!
  11923. $token = new HTMLPurifier_Token_Text('<');
  11924. $cursor++;
  11925. continue;
  11926. }
  11927. $segment = substr($html, $cursor, $strlen_segment);
  11928. if ($segment === false) {
  11929. // somehow, we attempted to access beyond the end of
  11930. // the string, defense-in-depth, reported by Nate Abele
  11931. break;
  11932. }
  11933. // Check if it's a comment
  11934. if (
  11935. substr($segment, 0, 3) === '!--'
  11936. ) {
  11937. // re-determine segment length, looking for -->
  11938. $position_comment_end = strpos($html, '-->', $cursor);
  11939. if ($position_comment_end === false) {
  11940. // uh oh, we have a comment that extends to
  11941. // infinity. Can't be helped: set comment
  11942. // end position to end of string
  11943. if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
  11944. $position_comment_end = strlen($html);
  11945. $end = true;
  11946. } else {
  11947. $end = false;
  11948. }
  11949. $strlen_segment = $position_comment_end - $cursor;
  11950. $segment = substr($html, $cursor, $strlen_segment);
  11951. $token = new
  11952. HTMLPurifier_Token_Comment(
  11953. substr(
  11954. $segment, 3, $strlen_segment - 3
  11955. )
  11956. );
  11957. if ($maintain_line_numbers) {
  11958. $token->rawPosition($current_line, $current_col);
  11959. $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
  11960. }
  11961. $array[] = $token;
  11962. $cursor = $end ? $position_comment_end : $position_comment_end + 3;
  11963. $inside_tag = false;
  11964. continue;
  11965. }
  11966. // Check if it's an end tag
  11967. $is_end_tag = (strpos($segment,'/') === 0);
  11968. if ($is_end_tag) {
  11969. $type = substr($segment, 1);
  11970. $token = new HTMLPurifier_Token_End($type);
  11971. if ($maintain_line_numbers) {
  11972. $token->rawPosition($current_line, $current_col);
  11973. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  11974. }
  11975. $array[] = $token;
  11976. $inside_tag = false;
  11977. $cursor = $position_next_gt + 1;
  11978. continue;
  11979. }
  11980. // Check leading character is alnum, if not, we may
  11981. // have accidently grabbed an emoticon. Translate into
  11982. // text and go our merry way
  11983. if (!ctype_alpha($segment[0])) {
  11984. // XML: $segment[0] !== '_' && $segment[0] !== ':'
  11985. if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
  11986. $token = new HTMLPurifier_Token_Text('<');
  11987. if ($maintain_line_numbers) {
  11988. $token->rawPosition($current_line, $current_col);
  11989. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  11990. }
  11991. $array[] = $token;
  11992. $inside_tag = false;
  11993. continue;
  11994. }
  11995. // Check if it is explicitly self closing, if so, remove
  11996. // trailing slash. Remember, we could have a tag like <br>, so
  11997. // any later token processing scripts must convert improperly
  11998. // classified EmptyTags from StartTags.
  11999. $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
  12000. if ($is_self_closing) {
  12001. $strlen_segment--;
  12002. $segment = substr($segment, 0, $strlen_segment);
  12003. }
  12004. // Check if there are any attributes
  12005. $position_first_space = strcspn($segment, $this->_whitespace);
  12006. if ($position_first_space >= $strlen_segment) {
  12007. if ($is_self_closing) {
  12008. $token = new HTMLPurifier_Token_Empty($segment);
  12009. } else {
  12010. $token = new HTMLPurifier_Token_Start($segment);
  12011. }
  12012. if ($maintain_line_numbers) {
  12013. $token->rawPosition($current_line, $current_col);
  12014. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  12015. }
  12016. $array[] = $token;
  12017. $inside_tag = false;
  12018. $cursor = $position_next_gt + 1;
  12019. continue;
  12020. }
  12021. // Grab out all the data
  12022. $type = substr($segment, 0, $position_first_space);
  12023. $attribute_string =
  12024. trim(
  12025. substr(
  12026. $segment, $position_first_space
  12027. )
  12028. );
  12029. if ($attribute_string) {
  12030. $attr = $this->parseAttributeString(
  12031. $attribute_string
  12032. , $config, $context
  12033. );
  12034. } else {
  12035. $attr = array();
  12036. }
  12037. if ($is_self_closing) {
  12038. $token = new HTMLPurifier_Token_Empty($type, $attr);
  12039. } else {
  12040. $token = new HTMLPurifier_Token_Start($type, $attr);
  12041. }
  12042. if ($maintain_line_numbers) {
  12043. $token->rawPosition($current_line, $current_col);
  12044. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  12045. }
  12046. $array[] = $token;
  12047. $cursor = $position_next_gt + 1;
  12048. $inside_tag = false;
  12049. continue;
  12050. } else {
  12051. // inside tag, but there's no ending > sign
  12052. if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
  12053. $token = new
  12054. HTMLPurifier_Token_Text(
  12055. '<' .
  12056. $this->parseData(
  12057. substr($html, $cursor)
  12058. )
  12059. );
  12060. if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
  12061. // no cursor scroll? Hmm...
  12062. $array[] = $token;
  12063. break;
  12064. }
  12065. break;
  12066. }
  12067. $context->destroy('CurrentLine');
  12068. $context->destroy('CurrentCol');
  12069. return $array;
  12070. }
  /**
   * Counts occurrences of a substring inside a window of a larger string.
   *
   * Stand-in for the four-argument form of substr_count(): on PHP older
   * than 5.1 the window is first cut out with substr(); on anything newer
   * the offset and length are handed straight to substr_count().
   *
   * @param $haystack String to search in.
   * @param $needle   Substring to count.
   * @param $offset   Position in $haystack where the window starts.
   * @param $length   Size of the window.
   * @return Number of times $needle occurs inside the window.
   */
  protected function substrCount($haystack, $needle, $offset, $length) {
      // the version probe runs once; its result is remembered across calls
      static $emulate = null;
      if (is_null($emulate)) {
          $emulate = version_compare(PHP_VERSION, '5.1', '<');
      }
      if (!$emulate) {
          return substr_count($haystack, $needle, $offset, $length);
      }
      $window = substr($haystack, $offset, $length);
      return substr_count($window, $needle);
  }
  /**
   * Takes the inside of an HTML tag and makes an assoc array of attributes.
   *
   * Two fast paths handle a lone boolean attribute (no '=' and no
   * whitespace) and a lone key=value pair (one '=' and no whitespace);
   * everything else goes through a cursor-based scan of the string.
   * Attribute values are passed through $this->parseData() before being
   * stored. Boolean attributes are stored with their own name as value.
   *
   * @note A key that PHP considers falsy (the empty string, or the literal
   *       "0") is reported as a missing attribute key and skipped.
   *
   * @param $string  Inside of tag excluding name.
   * @param $config  Configuration object; 'Core.CollectErrors' is read
   *                 to decide whether errors are reported.
   * @param $context Context object; 'ErrorCollector' is fetched from it
   *                 when error collection is enabled.
   * @return Assoc array of attributes.
   */
  public function parseAttributeString($string, $config, $context) {
      $string = (string) $string; // quick typecast

      if ($string === '') return array(); // no attributes

      $e = false;
      if ($config->get('Core.CollectErrors')) {
          $e =& $context->get('ErrorCollector');
      }

      // let's see if we can abort as quickly as possible
      // one equal sign, no whitespace => one attribute
      $num_equal = substr_count($string, '=');
      // Any separator in $this->_whitespace (not just ' ') disqualifies the
      // fast paths, and a separator at offset 0 counts too; the previous
      // truthiness test on strpos() missed both cases.
      $has_space = (strcspn($string, $this->_whitespace) < strlen($string));
      if ($num_equal === 0 && !$has_space) {
          // bool attribute
          return array($string => $string);
      } elseif ($num_equal === 1 && !$has_space) {
          // only one attribute
          list($key, $quoted_value) = explode('=', $string);
          $quoted_value = trim($quoted_value);
          if (!$key) {
              if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
              return array();
          }
          // compare against '' explicitly: the value "0" is falsy in PHP
          // and must not be discarded (e.g. border=0)
          if ($quoted_value === '') return array($key => '');
          $first_char = $quoted_value[0];
          $last_char  = $quoted_value[strlen($quoted_value)-1];

          $same_quote = ($first_char == $last_char);
          $open_quote = ($first_char == '"' || $first_char == "'");

          if ( $same_quote && $open_quote) {
              // well behaved
              $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
          } else {
              // not well behaved
              if ($open_quote) {
                  if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                  $value = substr($quoted_value, 1);
              } else {
                  $value = $quoted_value;
              }
          }
          // older PHP versions return false instead of '' for empty slices
          if ($value === false) $value = '';
          return array($key => $this->parseData($value));
      }

      // setup loop environment
      $array  = array(); // return assoc array of attributes
      $cursor = 0; // current position in string (moves forward)
      $size   = strlen($string); // size of the string (stays the same)

      // if we have unquoted attributes, the parser expects a terminating
      // space, so let's guarantee that there's always a terminating space.
      $string .= ' ';

      while ($cursor < $size) {

          // skip whitespace in front of the key
          $cursor += strspn($string, $this->_whitespace, $cursor);
          if ($cursor >= $size) {
              // nothing but trailing whitespace was left
              break;
          }

          // grab the key
          $key_begin = $cursor; // we're currently at the start of the key

          // scroll past all characters that are the key (not whitespace or =)
          $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

          $key_end = $cursor; // now at the end of the key

          $key = substr($string, $key_begin, $key_end - $key_begin);

          if (!$key) {
              if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
              // Always step over at least one character (the stray '=')
              // before skipping to the next whitespace. Without the "1 +"
              // the cursor does not move when '=' is directly followed by
              // whitespace, and the loop never terminates.
              $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1);
              continue; // empty key
          }

          // scroll past all whitespace
          $cursor += strspn($string, $this->_whitespace, $cursor);

          if ($cursor >= $size) {
              $array[$key] = $key;
              break;
          }

          // if the next character is an equal sign, we've got a regular
          // pair, otherwise, it's a bool attribute
          $first_char = $string[$cursor];

          if ($first_char == '=') {
              // key="value"

              $cursor++;
              $cursor += strspn($string, $this->_whitespace, $cursor);

              // the '=' was the last meaningful character: empty value
              if ($cursor >= $size) {
                  $array[$key] = '';
                  break;
              }

              // we might be in front of a quote right now

              $char = $string[$cursor];

              if ($char == '"' || $char == "'") {
                  // it's quoted, end bound is $char
                  $cursor++;
                  $value_begin = $cursor;
                  $cursor = strpos($string, $char, $cursor);
                  $value_end = $cursor;
              } else {
                  // it's not quoted, end bound is whitespace
                  $value_begin = $cursor;
                  $cursor += strcspn($string, $this->_whitespace, $cursor);
                  $value_end = $cursor;
              }

              // we reached a premature end (closing quote never found)
              if ($cursor === false) {
                  $cursor = $size;
                  $value_end = $cursor;
              }

              $value = substr($string, $value_begin, $value_end - $value_begin);
              if ($value === false) $value = '';
              $array[$key] = $this->parseData($value);
              $cursor++; // step over the closing quote / delimiter

          } else {
              // boolattr ($key is known to be non-empty at this point)
              $array[$key] = $key;
          }
      }
      return $array;
  }
  12209. }
  /**
   * Strategy that delegates its work to an ordered list of sub-strategies.
   */
  abstract class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
  {

      /**
       * Sub-strategies, applied by execute() in array order.
       */
      protected $strategies = array();

      /**
       * Declared abstract, so every concrete subclass has to provide its
       * own constructor.
       */
      abstract public function __construct();

      /**
       * Passes the tokens through each registered strategy in turn; the
       * output of one strategy becomes the input of the next.
       *
       * @param $tokens  Tokens to process.
       * @param $config  Forwarded unchanged to every strategy.
       * @param $context Forwarded unchanged to every strategy.
       * @return Whatever the last strategy returned, or $tokens itself if
       *         no strategies are registered.
       */
      public function execute($tokens, $config, $context) {
          $current = $tokens;
          foreach ($this->strategies as $stage) {
              $current = $stage->execute($current, $config, $context);
          }
          return $current;
      }

  }
  /**
   * Core strategy composed of the big four strategies.
   */
  class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
  {

      /**
       * Registers the four sub-strategies; they are appended in the order
       * listed below.
       */
      public function __construct() {
          $stages = array(
              'HTMLPurifier_Strategy_RemoveForeignElements',
              'HTMLPurifier_Strategy_MakeWellFormed',
              'HTMLPurifier_Strategy_FixNesting',
              'HTMLPurifier_Strategy_ValidateAttributes',
          );
          foreach ($stages as $class) {
              $this->strategies[] = new $class();
          }
      }

  }
  12239. /**
  12240. * Takes a well formed list of tokens and fixes their nesting.
  12241. *
  12242. * HTML elements dictate which elements are allowed to be their children,
  12243. * for example, you can't have a p tag in a span tag. Other elements have
  12244. * much more rigorous definitions: tables, for instance, require a specific
  12245. * order for their elements. There are also constraints not expressible by
  12246. * document type definitions, such as the chameleon nature of ins/del
  12247. * tags and global child exclusions.
  12248. *
  12249. * The first major objective of this strategy is to iterate through all the
  12250. * nodes (not tokens) of the list of tokens and determine whether or not
  12251. * their children conform to the element's definition. If they do not, the
  12252. * child definition may optionally supply an amended list of elements that
  12253. * is valid or require that the entire node be deleted (and the previous
  12254. * node rescanned).
  12255. *
  12256. * The second objective is to ensure that explicitly excluded elements of
  12257. * an element do not appear in its children. Code that accomplishes this
  12258. * task is pervasive through the strategy, though the two are distinct tasks
  12259. * and could, theoretically, be separated (although it's not recommended).
  12260. *
  12261. * @note Whether or not unrecognized children are silently dropped or
  12262. * translated into text depends on the child definitions.
  12263. *
  12264. * @todo Enable nodes to be bubbled out of the structure.
  12265. */
  12266. class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
  12267. {
  12268. public function execute($tokens, $config, $context) {
  12269. //####################################################################//
  12270. // Pre-processing
  12271. // get a copy of the HTML definition
  12272. $definition = $config->getHTMLDefinition();
  12273. // insert implicit "parent" node, will be removed at end.
  12274. // DEFINITION CALL
  12275. $parent_name = $definition->info_parent;
  12276. array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
  12277. $tokens[] = new HTMLPurifier_Token_End($parent_name);
  12278. // setup the context variable 'IsInline', for chameleon processing
  12279. // is 'false' when we are not inline, 'true' when it must always
  12280. // be inline, and an integer when it is inline for a certain
  12281. // branch of the document tree