PageRenderTime 64ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 2ms

/school/605.441.Databases/project/htmlpurifier/library/HTMLPurifier.standalone.php

https://github.com/hank/life
PHP | 14931 lines | 8007 code | 2561 blank | 4363 comment | 1531 complexity | cb3f1eb546210ed31d14f6cc6598ce5f MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.1, Apache-2.0
  1. <?php
  2. /**
  3. * @file
  4. * This file was auto-generated by generate-includes.php and includes all of
  5. * the core files required by HTML Purifier. Use this if performance is a
  6. * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
  7. * FILE, changes will be overwritten the next time the script is run.
  8. *
  9. * @version 4.2.0
  10. *
  11. * @warning
  12. * You must *not* include any other HTML Purifier files before this file,
  13. * because 'require' not 'require_once' is used.
  14. *
  15. * @warning
  16. * This file requires that the include path contains the HTML Purifier
  17. * library directory; this is not auto-set.
  18. */
  19. /*! @mainpage
  20. *
  21. * HTML Purifier is an HTML filter that will take an arbitrary snippet of
  22. * HTML and rigorously test, validate and filter it into a version that
  23. * is safe for output onto webpages. It achieves this by:
  24. *
  25. * -# Lexing (parsing into tokens) the document,
  26. * -# Executing various strategies on the tokens:
  27. * -# Removing all elements not in the whitelist,
  28. * -# Making the tokens well-formed,
  29. * -# Fixing the nesting of the nodes, and
  30. * -# Validating attributes of the nodes; and
  31. * -# Generating HTML from the purified tokens.
  32. *
  33. * However, most users will only need to interface with the HTMLPurifier
  34. * and HTMLPurifier_Config.
  35. */
  36. /*
  37. HTML Purifier 4.2.0 - Standards Compliant HTML Filtering
  38. Copyright (C) 2006-2008 Edward Z. Yang
  39. This library is free software; you can redistribute it and/or
  40. modify it under the terms of the GNU Lesser General Public
  41. License as published by the Free Software Foundation; either
  42. version 2.1 of the License, or (at your option) any later version.
  43. This library is distributed in the hope that it will be useful,
  44. but WITHOUT ANY WARRANTY; without even the implied warranty of
  45. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  46. Lesser General Public License for more details.
  47. You should have received a copy of the GNU Lesser General Public
  48. License along with this library; if not, write to the Free Software
  49. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  50. */
  51. /**
  52. * Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
  53. *
  54. * @note There are several points in which configuration can be specified
  55. * for HTML Purifier. The precedence of these (from lowest to
  56. * highest) is as follows:
  57. * -# Instance: new HTMLPurifier($config)
  58. * -# Invocation: purify($html, $config)
  59. * These configurations are entirely independent of each other and
  60. * are *not* merged (this behavior may change in the future).
  61. *
  62. * @todo We need an easier way to inject strategies using the configuration
  63. * object.
  64. */
  65. class HTMLPurifier
  66. {
  67. /** Version of HTML Purifier */
  68. public $version = '4.2.0';
  69. /** Constant with version of HTML Purifier */
  70. const VERSION = '4.2.0';
  71. /** Global configuration object */
  72. public $config;
  73. /** Array of extra HTMLPurifier_Filter objects to run on HTML, for backwards compatibility */
  74. private $filters = array();
  75. /** Single instance of HTML Purifier */
  76. private static $instance;
  77. protected $strategy, $generator;
  78. /**
  79. * Resultant HTMLPurifier_Context of last run purification. Is an array
  80. * of contexts if the last called method was purifyArray().
  81. */
  82. public $context;
  83. /**
  84. * Initializes the purifier.
  85. * @param $config Optional HTMLPurifier_Config object for all instances of
  86. * the purifier, if omitted, a default configuration is
  87. * supplied (which can be overridden on a per-use basis).
  88. * The parameter can also be any type that
  89. * HTMLPurifier_Config::create() supports.
  90. */
  91. public function __construct($config = null) {
  92. $this->config = HTMLPurifier_Config::create($config);
  93. $this->strategy = new HTMLPurifier_Strategy_Core();
  94. }
  95. /**
  96. * Adds a filter to process the output. First come first serve
  97. * @param $filter HTMLPurifier_Filter object
  98. */
  99. public function addFilter($filter) {
  100. trigger_error('HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom', E_USER_WARNING);
  101. $this->filters[] = $filter;
  102. }
  103. /**
  104. * Filters an HTML snippet/document to be XSS-free and standards-compliant.
  105. *
  106. * @param $html String of HTML to purify
  107. * @param $config HTMLPurifier_Config object for this operation, if omitted,
  108. * defaults to the config object specified during this
  109. * object's construction. The parameter can also be any type
  110. * that HTMLPurifier_Config::create() supports.
  111. * @return Purified HTML
  112. */
  113. public function purify($html, $config = null) {
  114. // :TODO: make the config merge in, instead of replace
  115. $config = $config ? HTMLPurifier_Config::create($config) : $this->config;
  116. // implementation is partially environment dependant, partially
  117. // configuration dependant
  118. $lexer = HTMLPurifier_Lexer::create($config);
  119. $context = new HTMLPurifier_Context();
  120. // setup HTML generator
  121. $this->generator = new HTMLPurifier_Generator($config, $context);
  122. $context->register('Generator', $this->generator);
  123. // set up global context variables
  124. if ($config->get('Core.CollectErrors')) {
  125. // may get moved out if other facilities use it
  126. $language_factory = HTMLPurifier_LanguageFactory::instance();
  127. $language = $language_factory->create($config, $context);
  128. $context->register('Locale', $language);
  129. $error_collector = new HTMLPurifier_ErrorCollector($context);
  130. $context->register('ErrorCollector', $error_collector);
  131. }
  132. // setup id_accumulator context, necessary due to the fact that
  133. // AttrValidator can be called from many places
  134. $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
  135. $context->register('IDAccumulator', $id_accumulator);
  136. $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
  137. // setup filters
  138. $filter_flags = $config->getBatch('Filter');
  139. $custom_filters = $filter_flags['Custom'];
  140. unset($filter_flags['Custom']);
  141. $filters = array();
  142. foreach ($filter_flags as $filter => $flag) {
  143. if (!$flag) continue;
  144. if (strpos($filter, '.') !== false) continue;
  145. $class = "HTMLPurifier_Filter_$filter";
  146. $filters[] = new $class;
  147. }
  148. foreach ($custom_filters as $filter) {
  149. // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
  150. $filters[] = $filter;
  151. }
  152. $filters = array_merge($filters, $this->filters);
  153. // maybe prepare(), but later
  154. for ($i = 0, $filter_size = count($filters); $i < $filter_size; $i++) {
  155. $html = $filters[$i]->preFilter($html, $config, $context);
  156. }
  157. // purified HTML
  158. $html =
  159. $this->generator->generateFromTokens(
  160. // list of tokens
  161. $this->strategy->execute(
  162. // list of un-purified tokens
  163. $lexer->tokenizeHTML(
  164. // un-purified HTML
  165. $html, $config, $context
  166. ),
  167. $config, $context
  168. )
  169. );
  170. for ($i = $filter_size - 1; $i >= 0; $i--) {
  171. $html = $filters[$i]->postFilter($html, $config, $context);
  172. }
  173. $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
  174. $this->context =& $context;
  175. return $html;
  176. }
  177. /**
  178. * Filters an array of HTML snippets
  179. * @param $config Optional HTMLPurifier_Config object for this operation.
  180. * See HTMLPurifier::purify() for more details.
  181. * @return Array of purified HTML
  182. */
  183. public function purifyArray($array_of_html, $config = null) {
  184. $context_array = array();
  185. foreach ($array_of_html as $key => $html) {
  186. $array_of_html[$key] = $this->purify($html, $config);
  187. $context_array[$key] = $this->context;
  188. }
  189. $this->context = $context_array;
  190. return $array_of_html;
  191. }
  192. /**
  193. * Singleton for enforcing just one HTML Purifier in your system
  194. * @param $prototype Optional prototype HTMLPurifier instance to
  195. * overload singleton with, or HTMLPurifier_Config
  196. * instance to configure the generated version with.
  197. */
  198. public static function instance($prototype = null) {
  199. if (!self::$instance || $prototype) {
  200. if ($prototype instanceof HTMLPurifier) {
  201. self::$instance = $prototype;
  202. } elseif ($prototype) {
  203. self::$instance = new HTMLPurifier($prototype);
  204. } else {
  205. self::$instance = new HTMLPurifier();
  206. }
  207. }
  208. return self::$instance;
  209. }
  210. /**
  211. * @note Backwards compatibility, see instance()
  212. */
  213. public static function getInstance($prototype = null) {
  214. return HTMLPurifier::instance($prototype);
  215. }
  216. }
  217. /**
  218. * Defines common attribute collections that modules reference
  219. */
  220. class HTMLPurifier_AttrCollections
  221. {
  222. /**
  223. * Associative array of attribute collections, indexed by name
  224. */
  225. public $info = array();
  226. /**
  227. * Performs all expansions on internal data for use by other inclusions
  228. * It also collects all attribute collection extensions from
  229. * modules
  230. * @param $attr_types HTMLPurifier_AttrTypes instance
  231. * @param $modules Hash array of HTMLPurifier_HTMLModule members
  232. */
  233. public function __construct($attr_types, $modules) {
  234. // load extensions from the modules
  235. foreach ($modules as $module) {
  236. foreach ($module->attr_collections as $coll_i => $coll) {
  237. if (!isset($this->info[$coll_i])) {
  238. $this->info[$coll_i] = array();
  239. }
  240. foreach ($coll as $attr_i => $attr) {
  241. if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) {
  242. // merge in includes
  243. $this->info[$coll_i][$attr_i] = array_merge(
  244. $this->info[$coll_i][$attr_i], $attr);
  245. continue;
  246. }
  247. $this->info[$coll_i][$attr_i] = $attr;
  248. }
  249. }
  250. }
  251. // perform internal expansions and inclusions
  252. foreach ($this->info as $name => $attr) {
  253. // merge attribute collections that include others
  254. $this->performInclusions($this->info[$name]);
  255. // replace string identifiers with actual attribute objects
  256. $this->expandIdentifiers($this->info[$name], $attr_types);
  257. }
  258. }
  259. /**
  260. * Takes a reference to an attribute associative array and performs
  261. * all inclusions specified by the zero index.
  262. * @param &$attr Reference to attribute array
  263. */
  264. public function performInclusions(&$attr) {
  265. if (!isset($attr[0])) return;
  266. $merge = $attr[0];
  267. $seen = array(); // recursion guard
  268. // loop through all the inclusions
  269. for ($i = 0; isset($merge[$i]); $i++) {
  270. if (isset($seen[$merge[$i]])) continue;
  271. $seen[$merge[$i]] = true;
  272. // foreach attribute of the inclusion, copy it over
  273. if (!isset($this->info[$merge[$i]])) continue;
  274. foreach ($this->info[$merge[$i]] as $key => $value) {
  275. if (isset($attr[$key])) continue; // also catches more inclusions
  276. $attr[$key] = $value;
  277. }
  278. if (isset($this->info[$merge[$i]][0])) {
  279. // recursion
  280. $merge = array_merge($merge, $this->info[$merge[$i]][0]);
  281. }
  282. }
  283. unset($attr[0]);
  284. }
  285. /**
  286. * Expands all string identifiers in an attribute array by replacing
  287. * them with the appropriate values inside HTMLPurifier_AttrTypes
  288. * @param &$attr Reference to attribute array
  289. * @param $attr_types HTMLPurifier_AttrTypes instance
  290. */
  291. public function expandIdentifiers(&$attr, $attr_types) {
  292. // because foreach will process new elements we add, make sure we
  293. // skip duplicates
  294. $processed = array();
  295. foreach ($attr as $def_i => $def) {
  296. // skip inclusions
  297. if ($def_i === 0) continue;
  298. if (isset($processed[$def_i])) continue;
  299. // determine whether or not attribute is required
  300. if ($required = (strpos($def_i, '*') !== false)) {
  301. // rename the definition
  302. unset($attr[$def_i]);
  303. $def_i = trim($def_i, '*');
  304. $attr[$def_i] = $def;
  305. }
  306. $processed[$def_i] = true;
  307. // if we've already got a literal object, move on
  308. if (is_object($def)) {
  309. // preserve previous required
  310. $attr[$def_i]->required = ($required || $attr[$def_i]->required);
  311. continue;
  312. }
  313. if ($def === false) {
  314. unset($attr[$def_i]);
  315. continue;
  316. }
  317. if ($t = $attr_types->get($def)) {
  318. $attr[$def_i] = $t;
  319. $attr[$def_i]->required = $required;
  320. } else {
  321. unset($attr[$def_i]);
  322. }
  323. }
  324. }
  325. }
  326. /**
  327. * Base class for all validating attribute definitions.
  328. *
  329. * This family of classes forms the core for not only HTML attribute validation,
  330. * but also any sort of string that needs to be validated or cleaned (which
  331. * means CSS properties and composite definitions are defined here too).
  332. * Besides defining (through code) what precisely makes the string valid,
  333. * subclasses are also responsible for cleaning the code if possible.
  334. */
  335. abstract class HTMLPurifier_AttrDef
  336. {
  337. /**
  338. * Tells us whether or not an HTML attribute is minimized. Has no
  339. * meaning in other contexts.
  340. */
  341. public $minimized = false;
  342. /**
  343. * Tells us whether or not an HTML attribute is required. Has no
  344. * meaning in other contexts
  345. */
  346. public $required = false;
  347. /**
  348. * Validates and cleans passed string according to a definition.
  349. *
  350. * @param $string String to be validated and cleaned.
  351. * @param $config Mandatory HTMLPurifier_Config object.
  352. * @param $context Mandatory HTMLPurifier_AttrContext object.
  353. */
  354. abstract public function validate($string, $config, $context);
  355. /**
  356. * Convenience method that parses a string as if it were CDATA.
  357. *
  358. * This method process a string in the manner specified at
  359. * <http://www.w3.org/TR/html4/types.html#h-6.2> by removing
  360. * leading and trailing whitespace, ignoring line feeds, and replacing
  361. * carriage returns and tabs with spaces. While most useful for HTML
  362. * attributes specified as CDATA, it can also be applied to most CSS
  363. * values.
  364. *
  365. * @note This method is not entirely standards compliant, as trim() removes
  366. * more types of whitespace than specified in the spec. In practice,
  367. * this is rarely a problem, as those extra characters usually have
  368. * already been removed by HTMLPurifier_Encoder.
  369. *
  370. * @warning This processing is inconsistent with XML's whitespace handling
  371. * as specified by section 3.3.3 and referenced XHTML 1.0 section
  372. * 4.7. However, note that we are NOT necessarily
  373. * parsing XML, thus, this behavior may still be correct. We
  374. * assume that newlines have been normalized.
  375. */
  376. public function parseCDATA($string) {
  377. $string = trim($string);
  378. $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
  379. return $string;
  380. }
  381. /**
  382. * Factory method for creating this class from a string.
  383. * @param $string String construction info
  384. * @return Created AttrDef object corresponding to $string
  385. */
  386. public function make($string) {
  387. // default implementation, return a flyweight of this object.
  388. // If $string has an effect on the returned object (i.e. you
  389. // need to overload this method), it is best
  390. // to clone or instantiate new copies. (Instantiation is safer.)
  391. return $this;
  392. }
  393. /**
  394. * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
  395. * properly. THIS IS A HACK!
  396. */
  397. protected function mungeRgb($string) {
  398. return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
  399. }
  400. /**
  401. * Parses a possibly escaped CSS string and returns the "pure"
  402. * version of it.
  403. */
  404. protected function expandCSSEscape($string) {
  405. // flexibly parse it
  406. $ret = '';
  407. for ($i = 0, $c = strlen($string); $i < $c; $i++) {
  408. if ($string[$i] === '\\') {
  409. $i++;
  410. if ($i >= $c) {
  411. $ret .= '\\';
  412. break;
  413. }
  414. if (ctype_xdigit($string[$i])) {
  415. $code = $string[$i];
  416. for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
  417. if (!ctype_xdigit($string[$i])) break;
  418. $code .= $string[$i];
  419. }
  420. // We have to be extremely careful when adding
  421. // new characters, to make sure we're not breaking
  422. // the encoding.
  423. $char = HTMLPurifier_Encoder::unichr(hexdec($code));
  424. if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
  425. $ret .= $char;
  426. if ($i < $c && trim($string[$i]) !== '') $i--;
  427. continue;
  428. }
  429. if ($string[$i] === "\n") continue;
  430. }
  431. $ret .= $string[$i];
  432. }
  433. return $ret;
  434. }
  435. }
  436. /**
  437. * Processes an entire attribute array for corrections needing multiple values.
  438. *
  439. * Occasionally, a certain attribute will need to be removed and popped onto
  440. * another value. Instead of creating a complex return syntax for
  441. * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
  442. * specialized object and have that do the special work. That is the
  443. * family of HTMLPurifier_AttrTransform.
  444. *
  445. * An attribute transformation can be assigned to run before or after
  446. * HTMLPurifier_AttrDef validation. See HTMLPurifier_HTMLDefinition for
  447. * more details.
  448. */
  449. abstract class HTMLPurifier_AttrTransform
  450. {
  451. /**
  452. * Abstract: makes changes to the attributes dependent on multiple values.
  453. *
  454. * @param $attr Assoc array of attributes, usually from
  455. * HTMLPurifier_Token_Tag::$attr
  456. * @param $config Mandatory HTMLPurifier_Config object.
  457. * @param $context Mandatory HTMLPurifier_Context object
  458. * @returns Processed attribute array.
  459. */
  460. abstract public function transform($attr, $config, $context);
  461. /**
  462. * Prepends CSS properties to the style attribute, creating the
  463. * attribute if it doesn't exist.
  464. * @param $attr Attribute array to process (passed by reference)
  465. * @param $css CSS to prepend
  466. */
  467. public function prependCSS(&$attr, $css) {
  468. $attr['style'] = isset($attr['style']) ? $attr['style'] : '';
  469. $attr['style'] = $css . $attr['style'];
  470. }
  471. /**
  472. * Retrieves and removes an attribute
  473. * @param $attr Attribute array to process (passed by reference)
  474. * @param $key Key of attribute to confiscate
  475. */
  476. public function confiscateAttr(&$attr, $key) {
  477. if (!isset($attr[$key])) return null;
  478. $value = $attr[$key];
  479. unset($attr[$key]);
  480. return $value;
  481. }
  482. }
  483. /**
  484. * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects
  485. */
  486. class HTMLPurifier_AttrTypes
  487. {
  488. /**
  489. * Lookup array of attribute string identifiers to concrete implementations
  490. */
  491. protected $info = array();
  492. /**
  493. * Constructs the info array, supplying default implementations for attribute
  494. * types.
  495. */
  496. public function __construct() {
  497. // pseudo-types, must be instantiated via shorthand
  498. $this->info['Enum'] = new HTMLPurifier_AttrDef_Enum();
  499. $this->info['Bool'] = new HTMLPurifier_AttrDef_HTML_Bool();
  500. $this->info['CDATA'] = new HTMLPurifier_AttrDef_Text();
  501. $this->info['ID'] = new HTMLPurifier_AttrDef_HTML_ID();
  502. $this->info['Length'] = new HTMLPurifier_AttrDef_HTML_Length();
  503. $this->info['MultiLength'] = new HTMLPurifier_AttrDef_HTML_MultiLength();
  504. $this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_HTML_Nmtokens();
  505. $this->info['Pixels'] = new HTMLPurifier_AttrDef_HTML_Pixels();
  506. $this->info['Text'] = new HTMLPurifier_AttrDef_Text();
  507. $this->info['URI'] = new HTMLPurifier_AttrDef_URI();
  508. $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang();
  509. $this->info['Color'] = new HTMLPurifier_AttrDef_HTML_Color();
  510. // unimplemented aliases
  511. $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text();
  512. $this->info['ContentTypes'] = new HTMLPurifier_AttrDef_Text();
  513. $this->info['Charsets'] = new HTMLPurifier_AttrDef_Text();
  514. $this->info['Character'] = new HTMLPurifier_AttrDef_Text();
  515. // "proprietary" types
  516. $this->info['Class'] = new HTMLPurifier_AttrDef_HTML_Class();
  517. // number is really a positive integer (one or more digits)
  518. // FIXME: ^^ not always, see start and value of list items
  519. $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true);
  520. }
  521. /**
  522. * Retrieves a type
  523. * @param $type String type name
  524. * @return Object AttrDef for type
  525. */
  526. public function get($type) {
  527. // determine if there is any extra info tacked on
  528. if (strpos($type, '#') !== false) list($type, $string) = explode('#', $type, 2);
  529. else $string = '';
  530. if (!isset($this->info[$type])) {
  531. trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
  532. return;
  533. }
  534. return $this->info[$type]->make($string);
  535. }
  536. /**
  537. * Sets a new implementation for a type
  538. * @param $type String type name
  539. * @param $impl Object AttrDef for type
  540. */
  541. public function set($type, $impl) {
  542. $this->info[$type] = $impl;
  543. }
  544. }
  545. /**
  546. * Validates the attributes of a token. Doesn't manage required attributes
  547. * very well. The only reason we factored this out was because RemoveForeignElements
  548. * also needed it besides ValidateAttributes.
  549. */
  550. class HTMLPurifier_AttrValidator
  551. {
  552. /**
  553. * Validates the attributes of a token, returning a modified token
  554. * that has valid tokens
  555. * @param $token Reference to token to validate. We require a reference
  556. * because the operation this class performs on the token are
  557. * not atomic, so the context CurrentToken to be updated
  558. * throughout
  559. * @param $config Instance of HTMLPurifier_Config
  560. * @param $context Instance of HTMLPurifier_Context
  561. */
  562. public function validateToken(&$token, &$config, $context) {
  563. $definition = $config->getHTMLDefinition();
  564. $e =& $context->get('ErrorCollector', true);
  565. // initialize IDAccumulator if necessary
  566. $ok =& $context->get('IDAccumulator', true);
  567. if (!$ok) {
  568. $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
  569. $context->register('IDAccumulator', $id_accumulator);
  570. }
  571. // initialize CurrentToken if necessary
  572. $current_token =& $context->get('CurrentToken', true);
  573. if (!$current_token) $context->register('CurrentToken', $token);
  574. if (
  575. !$token instanceof HTMLPurifier_Token_Start &&
  576. !$token instanceof HTMLPurifier_Token_Empty
  577. ) return $token;
  578. // create alias to global definition array, see also $defs
  579. // DEFINITION CALL
  580. $d_defs = $definition->info_global_attr;
  581. // don't update token until the very end, to ensure an atomic update
  582. $attr = $token->attr;
  583. // do global transformations (pre)
  584. // nothing currently utilizes this
  585. foreach ($definition->info_attr_transform_pre as $transform) {
  586. $attr = $transform->transform($o = $attr, $config, $context);
  587. if ($e) {
  588. if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
  589. }
  590. }
  591. // do local transformations only applicable to this element (pre)
  592. // ex. <p align="right"> to <p style="text-align:right;">
  593. foreach ($definition->info[$token->name]->attr_transform_pre as $transform) {
  594. $attr = $transform->transform($o = $attr, $config, $context);
  595. if ($e) {
  596. if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
  597. }
  598. }
  599. // create alias to this element's attribute definition array, see
  600. // also $d_defs (global attribute definition array)
  601. // DEFINITION CALL
  602. $defs = $definition->info[$token->name]->attr;
  603. $attr_key = false;
  604. $context->register('CurrentAttr', $attr_key);
  605. // iterate through all the attribute keypairs
  606. // Watch out for name collisions: $key has previously been used
  607. foreach ($attr as $attr_key => $value) {
  608. // call the definition
  609. if ( isset($defs[$attr_key]) ) {
  610. // there is a local definition defined
  611. if ($defs[$attr_key] === false) {
  612. // We've explicitly been told not to allow this element.
  613. // This is usually when there's a global definition
  614. // that must be overridden.
  615. // Theoretically speaking, we could have a
  616. // AttrDef_DenyAll, but this is faster!
  617. $result = false;
  618. } else {
  619. // validate according to the element's definition
  620. $result = $defs[$attr_key]->validate(
  621. $value, $config, $context
  622. );
  623. }
  624. } elseif ( isset($d_defs[$attr_key]) ) {
  625. // there is a global definition defined, validate according
  626. // to the global definition
  627. $result = $d_defs[$attr_key]->validate(
  628. $value, $config, $context
  629. );
  630. } else {
  631. // system never heard of the attribute? DELETE!
  632. $result = false;
  633. }
  634. // put the results into effect
  635. if ($result === false || $result === null) {
  636. // this is a generic error message that should replaced
  637. // with more specific ones when possible
  638. if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed');
  639. // remove the attribute
  640. unset($attr[$attr_key]);
  641. } elseif (is_string($result)) {
  642. // generally, if a substitution is happening, there
  643. // was some sort of implicit correction going on. We'll
  644. // delegate it to the attribute classes to say exactly what.
  645. // simple substitution
  646. $attr[$attr_key] = $result;
  647. } else {
  648. // nothing happens
  649. }
  650. // we'd also want slightly more complicated substitution
  651. // involving an array as the return value,
  652. // although we're not sure how colliding attributes would
  653. // resolve (certain ones would be completely overriden,
  654. // others would prepend themselves).
  655. }
  656. $context->destroy('CurrentAttr');
  657. // post transforms
  658. // global (error reporting untested)
  659. foreach ($definition->info_attr_transform_post as $transform) {
  660. $attr = $transform->transform($o = $attr, $config, $context);
  661. if ($e) {
  662. if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
  663. }
  664. }
  665. // local (error reporting untested)
  666. foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
  667. $attr = $transform->transform($o = $attr, $config, $context);
  668. if ($e) {
  669. if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
  670. }
  671. }
  672. $token->attr = $attr;
  673. // destroy CurrentToken if we made it ourselves
  674. if (!$current_token) $context->destroy('CurrentToken');
  675. }
  676. }
  677. // constants are slow, so we use as few as possible
  678. if (!defined('HTMLPURIFIER_PREFIX')) {
  679. define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
  680. set_include_path(HTMLPURIFIER_PREFIX . PATH_SEPARATOR . get_include_path());
  681. }
  682. // accomodations for versions earlier than 5.0.2
  683. // borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
  684. if (!defined('PHP_EOL')) {
  685. switch (strtoupper(substr(PHP_OS, 0, 3))) {
  686. case 'WIN':
  687. define('PHP_EOL', "\r\n");
  688. break;
  689. case 'DAR':
  690. define('PHP_EOL', "\r");
  691. break;
  692. default:
  693. define('PHP_EOL', "\n");
  694. }
  695. }
  696. /**
  697. * Bootstrap class that contains meta-functionality for HTML Purifier such as
  698. * the autoload function.
  699. *
  700. * @note
  701. * This class may be used without any other files from HTML Purifier.
  702. */
  703. class HTMLPurifier_Bootstrap
  704. {
  705. /**
  706. * Autoload function for HTML Purifier
  707. * @param $class Class to load
  708. */
  709. public static function autoload($class) {
  710. $file = HTMLPurifier_Bootstrap::getPath($class);
  711. if (!$file) return false;
  712. require HTMLPURIFIER_PREFIX . '/' . $file;
  713. return true;
  714. }
  715. /**
  716. * Returns the path for a specific class.
  717. */
  718. public static function getPath($class) {
  719. if (strncmp('HTMLPurifier', $class, 12) !== 0) return false;
  720. // Custom implementations
  721. if (strncmp('HTMLPurifier_Language_', $class, 22) === 0) {
  722. $code = str_replace('_', '-', substr($class, 22));
  723. $file = 'HTMLPurifier/Language/classes/' . $code . '.php';
  724. } else {
  725. $file = str_replace('_', '/', $class) . '.php';
  726. }
  727. if (!file_exists(HTMLPURIFIER_PREFIX . '/' . $file)) return false;
  728. return $file;
  729. }
  730. /**
  731. * "Pre-registers" our autoloader on the SPL stack.
  732. */
  733. public static function registerAutoload() {
  734. $autoload = array('HTMLPurifier_Bootstrap', 'autoload');
  735. if ( ($funcs = spl_autoload_functions()) === false ) {
  736. spl_autoload_register($autoload);
  737. } elseif (function_exists('spl_autoload_unregister')) {
  738. $compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
  739. version_compare(PHP_VERSION, '5.1.0', '>=');
  740. foreach ($funcs as $func) {
  741. if (is_array($func)) {
  742. // :TRICKY: There are some compatibility issues and some
  743. // places where we need to error out
  744. $reflector = new ReflectionMethod($func[0], $func[1]);
  745. if (!$reflector->isStatic()) {
  746. throw new Exception('
  747. HTML Purifier autoloader registrar is not compatible
  748. with non-static object methods due to PHP Bug #44144;
  749. Please do not use HTMLPurifier.autoload.php (or any
  750. file that includes this file); instead, place the code:
  751. spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
  752. after your own autoloaders.
  753. ');
  754. }
  755. // Suprisingly, spl_autoload_register supports the
  756. // Class::staticMethod callback format, although call_user_func doesn't
  757. if ($compat) $func = implode('::', $func);
  758. }
  759. spl_autoload_unregister($func);
  760. }
  761. spl_autoload_register($autoload);
  762. foreach ($funcs as $func) spl_autoload_register($func);
  763. }
  764. }
  765. }
  /**
   * Super-class for definition datatype objects. Provides the one-shot
   * setup() entry point; subclasses supply the actual work in doSetup().
   */
  abstract class HTMLPurifier_Definition
  {

      /**
       * Has setup() been called yet?
       */
      public $setup = false;

      /**
       * What type of definition is it?
       */
      public $type;

      /**
       * Sets up the definition object into the final form, something
       * not done by the constructor.
       * @param $config HTMLPurifier_Config instance
       */
      abstract protected function doSetup($config);

      /**
       * Runs doSetup() the first time it is called; every later call is a
       * no-op. The $setup flag is raised before doSetup() is invoked.
       * @param $config HTMLPurifier_Config instance
       */
      public function setup($config) {
          if (!$this->setup) {
              $this->setup = true;
              $this->doSetup($config);
          }
      }

  }
  /**
   * Defines allowed CSS attributes and what their values are.
   * @see HTMLPurifier_HTMLDefinition
   */
  class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
  {

      public $type = 'CSS';

      /**
       * Assoc array of CSS property name to the attribute definition object
       * registered for that property.
       */
      public $info = array();

      /**
       * Populates $info with one definition object per supported property,
       * adds the optional proprietary/tricky sets when the corresponding
       * CSS.* directives are on, wraps every entry in the !important
       * decorator and finally applies the allowed/forbidden property lists.
       *
       * Statement order matters: the shorthand definitions ('list-style',
       * 'background', 'font', 'border') receive $config and are created
       * only after the longhand entries they relate to.
       *
       * @param $config HTMLPurifier_Config instance
       */
      protected function doSetup($config) {

          $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
              array('left', 'right', 'center', 'justify'),
              false
          );

          // one shared definition object for all four sides
          $borderStyle =
          $this->info['border-bottom-style'] =
          $this->info['border-right-style'] =
          $this->info['border-left-style'] =
          $this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum(
              array(
                  'none', 'hidden', 'dotted', 'dashed', 'solid', 'double',
                  'groove', 'ridge', 'inset', 'outset'
              ),
              false
          );
          $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($borderStyle);

          $this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
              array('none', 'left', 'right', 'both'),
              false
          );
          $this->info['float'] = new HTMLPurifier_AttrDef_Enum(
              array('none', 'left', 'right'),
              false
          );
          $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
              array('normal', 'italic', 'oblique'),
              false
          );
          $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
              array('normal', 'small-caps'),
              false
          );

          // shared by list-style-image and background-image
          $uriOrNone = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('none')),
              new HTMLPurifier_AttrDef_CSS_URI()
          ));

          $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
              array('inside', 'outside'),
              false
          );
          $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
              array(
                  'disc', 'circle', 'square', 'decimal', 'lower-roman',
                  'upper-roman', 'lower-alpha', 'upper-alpha', 'none'
              ),
              false
          );
          $this->info['list-style-image'] = $uriOrNone;
          $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);

          $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
              array('capitalize', 'uppercase', 'lowercase', 'none'),
              false
          );
          $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();

          $this->info['background-image'] = $uriOrNone;
          $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
              array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
          );
          $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
              array('scroll', 'fixed')
          );
          $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();

          // 'transparent' or a color; shared by background-color and the
          // four border-*-color properties
          $borderColor =
          $this->info['border-top-color'] =
          $this->info['border-bottom-color'] =
          $this->info['border-left-color'] =
          $this->info['border-right-color'] =
          $this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('transparent')),
              new HTMLPurifier_AttrDef_CSS_Color()
          ));
          $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);
          $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($borderColor);

          $borderWidth =
          $this->info['border-top-width'] =
          $this->info['border-bottom-width'] =
          $this->info['border-left-width'] =
          $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
              new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative
          ));
          $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($borderWidth);

          $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('normal')),
              new HTMLPurifier_AttrDef_CSS_Length()
          ));
          $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('normal')),
              new HTMLPurifier_AttrDef_CSS_Length()
          ));

          $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array(
                  'xx-small', 'x-small', 'small', 'medium', 'large',
                  'x-large', 'xx-large', 'larger', 'smaller'
              )),
              new HTMLPurifier_AttrDef_CSS_Percentage(),
              new HTMLPurifier_AttrDef_CSS_Length()
          ));
          $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('normal')),
              new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
              new HTMLPurifier_AttrDef_CSS_Length('0'),
              new HTMLPurifier_AttrDef_CSS_Percentage(true)
          ));

          $marginSide =
          $this->info['margin-top'] =
          $this->info['margin-bottom'] =
          $this->info['margin-left'] =
          $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage(),
              new HTMLPurifier_AttrDef_Enum(array('auto'))
          ));
          $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($marginSide);

          // non-negative
          $paddingSide =
          $this->info['padding-top'] =
          $this->info['padding-bottom'] =
          $this->info['padding-left'] =
          $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_CSS_Length('0'),
              new HTMLPurifier_AttrDef_CSS_Percentage(true)
          ));
          $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($paddingSide);

          $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage()
          ));

          // width/height: when CSS.MaxImgLength is set, img elements get a
          // length definition capped at that value; everything else (and
          // every element when the directive is null) uses $trustedSize
          $trustedSize = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_CSS_Length('0'),
              new HTMLPurifier_AttrDef_CSS_Percentage(true),
              new HTMLPurifier_AttrDef_Enum(array('auto'))
          ));
          $maxImgLength = $config->get('CSS.MaxImgLength');
          if ($maxImgLength === null) {
              $sizeDef = $trustedSize;
          } else {
              $sizeDef = new HTMLPurifier_AttrDef_Switch('img',
                  // For img tags:
                  new HTMLPurifier_AttrDef_CSS_Composite(array(
                      new HTMLPurifier_AttrDef_CSS_Length('0', $maxImgLength),
                      new HTMLPurifier_AttrDef_Enum(array('auto'))
                  )),
                  // For everyone else:
                  $trustedSize
              );
          }
          $this->info['width'] =
          $this->info['height'] = $sizeDef;

          $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
          $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();

          // this could use specialized code
          $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
              array(
                  'normal', 'bold', 'bolder', 'lighter', '100', '200', '300',
                  '400', '500', '600', '700', '800', '900'
              ),
              false
          );

          // MUST be called after other font properties, as it references
          // a CSSDefinition object
          $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);

          // same here
          $this->info['border'] =
          $this->info['border-bottom'] =
          $this->info['border-top'] =
          $this->info['border-left'] =
          $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);

          $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(
              array('collapse', 'separate')
          );
          $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(
              array('top', 'bottom')
          );
          $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(
              array('auto', 'fixed')
          );
          $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array(
                  'baseline', 'sub', 'super', 'top', 'text-top',
                  'middle', 'bottom', 'text-bottom'
              )),
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage()
          ));
          $this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(
              new HTMLPurifier_AttrDef_CSS_Length(),
              2
          );

          // partial support
          $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap'));

          if ($config->get('CSS.Proprietary')) {
              $this->doSetupProprietary($config);
          }
          if ($config->get('CSS.AllowTricky')) {
              $this->doSetupTricky($config);
          }

          // wrap all attr-defs with decorator that handles !important
          $allowImportant = $config->get('CSS.AllowImportant');
          foreach (array_keys($this->info) as $property) {
              $this->info[$property] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator(
                  $this->info[$property],
                  $allowImportant
              );
          }

          $this->setupConfigStuff($config);
      }

      /**
       * Registers the properties enabled by CSS.Proprietary: the scrollbar
       * color properties, the opacity variants and 'filter'.
       * @param $config HTMLPurifier_Config instance (not read here)
       */
      protected function doSetupProprietary($config) {
          // Internet Explorer only scrollbar colors
          $this->info['scrollbar-arrow-color']      = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-base-color']       = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-darkshadow-color'] = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-face-color']       = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-highlight-color']  = new HTMLPurifier_AttrDef_CSS_Color();
          $this->info['scrollbar-shadow-color']     = new HTMLPurifier_AttrDef_CSS_Color();

          // technically not proprietary, but CSS3, and no one supports it
          $this->info['opacity']        = new HTMLPurifier_AttrDef_CSS_AlphaValue();
          $this->info['-moz-opacity']   = new HTMLPurifier_AttrDef_CSS_AlphaValue();
          $this->info['-khtml-opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();

          // only opacity, for now
          $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();
      }

      /**
       * Registers the properties enabled by CSS.AllowTricky: 'display',
       * 'visibility' and 'overflow'.
       * @param $config HTMLPurifier_Config instance (not read here)
       */
      protected function doSetupTricky($config) {
          $this->info['display'] = new HTMLPurifier_AttrDef_Enum(array(
              'inline', 'block', 'list-item', 'run-in', 'compact',
              'marker', 'table', 'inline-table', 'table-row-group',
              'table-header-group', 'table-footer-group', 'table-row',
              'table-column-group', 'table-column', 'table-cell',
              'table-caption', 'none'
          ));
          $this->info['visibility'] = new HTMLPurifier_AttrDef_Enum(array(
              'visible', 'hidden', 'collapse'
          ));
          $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum(array(
              'visible', 'hidden', 'auto', 'scroll'
          ));
      }

      /**
       * Performs extra config-based processing. Based off of
       * HTMLPurifier_HTMLDefinition.
       *
       * When CSS.AllowedProperties is not null, every property missing from
       * that lookup is dropped from $info, and a warning is raised for each
       * listed name that $info never contained. When
       * CSS.ForbiddenProperties is not null, every property it lists is
       * dropped from $info.
       *
       * @param $config HTMLPurifier_Config instance
       * @todo Refactor duplicate elements into common class (probably using
       *       composition, not inheritance).
       */
      protected function setupConfigStuff($config) {

          // setup allowed elements
          $support = "(for information on implementing this, see the support forums) ";

          $allowed = $config->get('CSS.AllowedProperties');
          if ($allowed !== null) {
              foreach (array_keys($this->info) as $property) {
                  if (!isset($allowed[$property])) {
                      unset($this->info[$property]);
                  }
                  // whatever remains in $allowed afterwards was never known
                  unset($allowed[$property]);
              }
              // emit errors
              foreach ($allowed as $name => $unused) {
                  // :TODO: Is this htmlspecialchars() call really necessary?
                  $name = htmlspecialchars($name);
                  trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING);
              }
          }

          $forbidden = $config->get('CSS.ForbiddenProperties');
          if ($forbidden !== null) {
              foreach (array_keys($this->info) as $property) {
                  if (isset($forbidden[$property])) {
                      unset($this->info[$property]);
                  }
              }
          }

      }

  }
  /**
   * Defines allowed child nodes and validates tokens against it.
   */
  abstract class HTMLPurifier_ChildDef
  {

      /**
       * Type of child definition, usually right-most part of class name
       * lowercase. Used occasionally in terms of context.
       */
      public $type;

      /**
       * Bool that indicates whether or not an empty array of children is okay
       *
       * This is necessary for redundant checking when changes affecting
       * a child node may cause a parent node to now be disallowed.
       */
      public $allow_empty;

      /**
       * Lookup array of all elements that this definition could possibly allow
       */
      public $elements = array();

      /**
       * Returns the $elements lookup of this definition. The base
       * implementation ignores $config.
       * @param $config HTMLPurifier_Config object
       * @return Lookup array stored in $elements
       */
      public function getAllowedElements($config) {
          $lookup = $this->elements;
          return $lookup;
      }

      /**
       * Validates nodes according to definition and returns modification.
       *
       * @param $tokens_of_children Array of HTMLPurifier_Token
       * @param $config HTMLPurifier_Config object
       * @param $context HTMLPurifier_Context object
       * @return bool true to leave nodes as is
       * @return bool false to remove parent node
       * @return array of replacement child tokens
       */
      abstract public function validateChildren($tokens_of_children, $config, $context);

  }
  1080. /**
  1081. * Configuration object that triggers customizable behavior.
  1082. *
  1083. * @warning This class is strongly defined: that means that the class
  1084. * will fail if an undefined directive is retrieved or set.
  1085. *
  1086. * @note Many classes that could (although many times don't) use the
  1087. * configuration object make it a mandatory parameter. This is
  1088. * because a configuration object should always be forwarded,
  1089. * otherwise, you run the risk of missing a parameter and then
  1090. * being stumped when a configuration directive doesn't work.
  1091. *
  1092. * @todo Reconsider some of the public member variables
  1093. */
  1094. class HTMLPurifier_Config
  1095. {
  1096. /**
  1097. * HTML Purifier's version
  1098. */
  1099. public $version = '4.2.0';
  1100. /**
  1101. * Bool indicator whether or not to automatically finalize
  1102. * the object if a read operation is done
  1103. */
  1104. public $autoFinalize = true;
  1105. // protected member variables
  1106. /**
  1107. * Namespace indexed array of serials for specific namespaces (see
  1108. * getSerial() for more info).
  1109. */
  1110. protected $serials = array();
  1111. /**
  1112. * Serial for entire configuration object
  1113. */
  1114. protected $serial;
  1115. /**
  1116. * Parser for variables
  1117. */
  1118. protected $parser;
  1119. /**
  1120. * Reference HTMLPurifier_ConfigSchema for value checking
  1121. * @note This is public for introspective purposes. Please don't
  1122. * abuse!
  1123. */
  1124. public $def;
  1125. /**
  1126. * Indexed array of definitions
  1127. */
  1128. protected $definitions;
  1129. /**
  1130. * Bool indicator whether or not config is finalized
  1131. */
  1132. protected $finalized = false;
  1133. /**
  1134. * Property list containing configuration directives.
  1135. */
  1136. protected $plist;
  1137. /**
  1138. * Whether or not a set is taking place due to an
  1139. * alias lookup.
  1140. */
  1141. private $aliasMode;
  1142. /**
  1143. * Set to false if you do not want line and file numbers in errors
  1144. * (useful when unit testing)
  1145. */
  1146. public $chatty = true;
  1147. /**
  1148. * Current lock; only gets to this namespace are allowed.
  1149. */
  1150. private $lock;
  /**
   * @param $definition HTMLPurifier_ConfigSchema that defines what directives
   *                    are allowed.
   * @param $parent Optional parent handed to the new HTMLPurifier_PropertyList;
   *                when empty, the schema's defaultPlist is used instead.
   */
  public function __construct($definition, $parent = null) {
      if (!$parent) {
          $parent = $definition->defaultPlist;
      }
      $this->plist = new HTMLPurifier_PropertyList($parent);
      // keep a copy around for checking
      $this->def = $definition;
      $this->parser = new HTMLPurifier_VarParser_Flexible();
  }
  /**
   * Convenience constructor that creates a config object based on a mixed var
   * @param mixed $config Variable that defines the state of the config
   *                      object. Can be: a HTMLPurifier_Config() object
   *                      (returned untouched), an array of directives based
   *                      on loadArray(), or a string filename of an ini file.
   *                      Any other value yields an unmodified new object.
   * @param HTMLPurifier_ConfigSchema Schema object; the default
   *                      configuration is used when it is empty
   * @return Configured HTMLPurifier_Config object
   */
  public static function create($config, $schema = null) {
      // pass-through
      if ($config instanceof HTMLPurifier_Config) {
          return $config;
      }
      $ret = $schema
          ? new HTMLPurifier_Config($schema)
          : HTMLPurifier_Config::createDefault();
      if (is_string($config)) {
          $ret->loadIni($config);
      } elseif (is_array($config)) {
          $ret->loadArray($config);
      }
      return $ret;
  }
  /**
   * Creates a new config object that inherits from a previous one: it is
   * built from the same schema, with the previous object's property list
   * passed as parent.
   * @param HTMLPurifier_Config $config Configuration object to inherit
   *        from.
   * @return HTMLPurifier_Config object with $config as its parent.
   */
  public static function inherit(HTMLPurifier_Config $config) {
      $schema = $config->def;
      $parentPlist = $config->plist;
      return new HTMLPurifier_Config($schema, $parentPlist);
  }
  /**
   * Convenience constructor that creates a default configuration object,
   * i.e. one built on HTMLPurifier_ConfigSchema::instance().
   * @return Default HTMLPurifier_Config object.
   */
  public static function createDefault() {
      return new HTMLPurifier_Config(HTMLPurifier_ConfigSchema::instance());
  }
  /**
   * Retrieves a value from the configuration.
   *
   * Triggers an error and returns null when the directive is undefined,
   * is an alias, or lies outside the namespace of the currently active
   * lock.
   *
   * @param $key String key
   * @param $a Deprecated: directive name when $key only holds the
   *           namespace; triggers a warning when used
   */
  public function get($key, $a = null) {
      if ($a !== null) {
          $this->triggerError("Using deprecated API: use \$config->get('$key.$a') instead", E_USER_WARNING);
          $key = "$key.$a";
      }
      if (!$this->finalized) {
          $this->autoFinalize();
      }

      $schemaInfo = $this->def->info;
      if (!isset($schemaInfo[$key])) {
          // can't add % due to SimpleTest bug
          $this->triggerError(
              'Cannot retrieve value of undefined directive ' . htmlspecialchars($key),
              E_USER_WARNING
          );
          return;
      }

      $directive = $schemaInfo[$key];
      if (isset($directive->isAlias)) {
          $this->triggerError(
              'Cannot get value from aliased directive, use real name ' . $directive->key,
              E_USER_ERROR
          );
          return;
      }

      if ($this->lock) {
          $segments = explode('.', $key);
          $ns = $segments[0];
          if ($ns !== $this->lock) {
              $this->triggerError('Cannot get value of namespace ' . $ns . ' when lock for ' . $this->lock . ' is active, this probably indicates a Definition setup method is accessing directives that are not within its namespace', E_USER_ERROR);
              return;
          }
      }

      return $this->plist->get($key);
  }
  /**
   * Retrieves an array of directives to values from a given namespace.
   * Triggers a warning and returns null if the namespace is not present
   * in getAll().
   * @param $namespace String namespace
   */
  public function getBatch($namespace) {
      if (!$this->finalized) {
          $this->autoFinalize();
      }
      $everything = $this->getAll();
      if (isset($everything[$namespace])) {
          return $everything[$namespace];
      }
      $this->triggerError(
          'Cannot retrieve undefined namespace ' . htmlspecialchars($namespace),
          E_USER_WARNING
      );
      return;
  }
  /**
   * Returns a md5 signature of a segment of the configuration object
   * that uniquely identifies that particular configuration. The result
   * is cached per namespace in $serials.
   * @note Revision is handled specially and is removed from the batch
   *       before processing!
   * @param $namespace Namespace to get serial for
   */
  public function getBatchSerial($namespace) {
      if (!empty($this->serials[$namespace])) {
          // already computed
          return $this->serials[$namespace];
      }
      $directives = $this->getBatch($namespace);
      unset($directives['DefinitionRev']);
      $this->serials[$namespace] = md5(serialize($directives));
      return $this->serials[$namespace];
  }
  /**
   * Returns a md5 signature for the entire configuration object
   * that uniquely identifies that particular configuration. The value is
   * computed from getAll() on first use and cached in $serial.
   */
  public function getSerial() {
      if (!empty($this->serial)) {
          return $this->serial;
      }
      $this->serial = md5(serialize($this->getAll()));
      return $this->serial;
  }
  /**
   * Retrieves all directives, organized by namespace: the squashed
   * property list is regrouped into array[namespace][directive] => value,
   * splitting each name at its first dot.
   * @warning This is a pretty inefficient function, avoid if you can
   */
  public function getAll() {
      if (!$this->finalized) {
          $this->autoFinalize();
      }
      $grouped = array();
      $flat = $this->plist->squash();
      foreach ($flat as $fullName => $value) {
          list($namespace, $directive) = explode('.', $fullName, 2);
          $grouped[$namespace][$directive] = $value;
      }
      return $grouped;
  }
  /**
   * Sets a value to configuration.
   *
   * The value is parsed into the type the schema declares for the
   * directive, value aliases are resolved and, where the schema restricts
   * the allowed values, the value is checked against them. Any failure
   * triggers an error and leaves the configuration untouched. On success
   * the cached serials (per-namespace and whole-config) are invalidated,
   * and for the HTML, CSS and URI namespaces the in-memory definition is
   * dropped as well.
   *
   * @param $key String key ("Namespace.Directive"). Deprecated form: the
   *             namespace only, with the directive passed as $value and
   *             the value as $a.
   * @param $value Mixed value
   * @param $a Value, when the deprecated three-argument form is used
   */
  public function set($key, $value, $a = null) {
      if (strpos($key, '.') === false) {
          // deprecated ($namespace, $directive, $value) calling convention
          $namespace = $key;
          $directive = $value;
          $value = $a;
          $key = "$key.$directive";
          $this->triggerError("Using deprecated API: use \$config->set('$key', ...) instead", E_USER_NOTICE);
      } else {
          list($namespace) = explode('.', $key);
      }
      if ($this->isFinalized('Cannot set directive after finalization')) return;
      if (!isset($this->def->info[$key])) {
          $this->triggerError('Cannot set undefined directive ' . htmlspecialchars($key) . ' to value',
              E_USER_WARNING);
          return;
      }
      $def = $this->def->info[$key];

      if (isset($def->isAlias)) {
          if ($this->aliasMode) {
              $this->triggerError('Double-aliases not allowed, please fix '.
                  'ConfigSchema bug with ' . $key, E_USER_ERROR);
              return;
          }
          // aliasMode flags the nested call so an alias pointing at another
          // alias is reported instead of being followed
          $this->aliasMode = true;
          $this->set($def->key, $value);
          $this->aliasMode = false;
          $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE);
          return;
      }

      // Raw type might be negative when using the fully optimized form
      // of stdclass, which indicates allow_null == true
      $rtype = is_int($def) ? $def : $def->type;
      if ($rtype < 0) {
          $type = -$rtype;
          $allow_null = true;
      } else {
          $type = $rtype;
          $allow_null = isset($def->allow_null);
      }

      try {
          $value = $this->parser->parse($value, $type, $allow_null);
      } catch (HTMLPurifier_VarParserException $e) {
          $this->triggerError('Value for ' . $key . ' is of invalid type, should be ' . HTMLPurifier_VarParser::getTypeName($type), E_USER_WARNING);
          return;
      }
      if (is_string($value) && is_object($def)) {
          // resolve value alias if defined
          if (isset($def->aliases[$value])) {
              $value = $def->aliases[$value];
          }
          // check to see if the value is allowed
          if (isset($def->allowed) && !isset($def->allowed[$value])) {
              $this->triggerError('Value not supported, valid values are: ' .
                  $this->_listify($def->allowed), E_USER_WARNING);
              return;
          }
      }
      $this->plist->set($key, $value);

      // reset definitions if the directives they depend on changed
      // this is a very costly process, so it's discouraged
      // with finalization
      if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') {
          $this->definitions[$namespace] = null;
      }

      // both cached signatures are stale now: the namespace one and the
      // whole-config one returned by getSerial()
      $this->serials[$namespace] = false;
      $this->serial = null;
  }
  /**
   * Convenience function for error reporting: joins the keys of a lookup
   * array into a comma-separated string.
   * @param $lookup Array whose keys are the names to list
   */
  private function _listify($lookup) {
      return implode(', ', array_keys($lookup));
  }
  /**
   * Retrieves object reference to the HTML definition; shorthand for
   * getDefinition('HTML', $raw).
   * @param $raw Return a copy that has not been setup yet. Must be
   *             called before it's been setup, otherwise won't work.
   */
  public function getHTMLDefinition($raw = false) {
      $definition = $this->getDefinition('HTML', $raw);
      return $definition;
  }
  /**
   * Retrieves object reference to the CSS definition; shorthand for
   * getDefinition('CSS', $raw).
   * @param $raw Return a copy that has not been setup yet. Must be
   *             called before it's been setup, otherwise won't work.
   */
  public function getCSSDefinition($raw = false) {
      $definition = $this->getDefinition('CSS', $raw);
      return $definition;
  }
  /**
   * Retrieves a definition.
   *
   * Non-raw requests are answered from memory first (setting the object
   * up and caching it if that has not happened yet), then from the
   * definition cache, and only then by building and setting up a fresh
   * object. Raw requests return an in-memory object only while it has not
   * been set up; otherwise a fresh, un-setup object is created.
   *
   * @param $type Type of definition: HTML, CSS, etc
   * @param $raw Whether or not definition should be returned raw
   * @throws HTMLPurifier_Exception if $type is not HTML, CSS or URI, or if
   *         a fresh raw definition is requested while the
   *         %$type.DefinitionID directive is null
   */
  public function getDefinition($type, $raw = false) {
      if (!$this->finalized) {
          $this->autoFinalize();
      }

      // temporarily suspend locks while the cache handler is created,
      // so we can handle recursive definition calls
      $savedLock = $this->lock;
      $this->lock = null;
      $cache = HTMLPurifier_DefinitionCacheFactory::instance()->create($type, $this);
      $this->lock = $savedLock;

      $inMemory = !empty($this->definitions[$type]);
      if ($raw) {
          if ($inMemory && !$this->definitions[$type]->setup) {
              // raw requested, raw in memory, quick return
              return $this->definitions[$type];
          }
      } else {
          // see if we can quickly supply a definition
          if ($inMemory) {
              if (!$this->definitions[$type]->setup) {
                  $this->definitions[$type]->setup($this);
                  $cache->set($this->definitions[$type], $this);
              }
              return $this->definitions[$type];
          }
          // memory check missed, try cache
          $this->definitions[$type] = $cache->get($this);
          if ($this->definitions[$type]) {
              // definition in cache, return it
              return $this->definitions[$type];
          }
      }

      // quick checks failed, let's create the object
      switch ($type) {
          case 'HTML':
              $this->definitions[$type] = new HTMLPurifier_HTMLDefinition();
              break;
          case 'CSS':
              $this->definitions[$type] = new HTMLPurifier_CSSDefinition();
              break;
          case 'URI':
              $this->definitions[$type] = new HTMLPurifier_URIDefinition();
              break;
          default:
              throw new HTMLPurifier_Exception("Definition of $type type not supported");
      }

      // quick abort if raw
      if ($raw) {
          if (is_null($this->get($type . '.DefinitionID'))) {
              // fatally error out if definition ID not set
              throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
          }
          return $this->definitions[$type];
      }

      // set it up; while the lock is held, get() only serves directives
      // of the $type namespace
      $this->lock = $type;
      $this->definitions[$type]->setup($this);
      $this->lock = null;

      // save in cache
      $cache->set($this->definitions[$type], $this);

      return $this->definitions[$type];
  }
  /**
   * Loads configuration values from an array with the following structure:
   * Namespace.Directive => Value
   *
   * Underscores in keys are turned into dots first. A key that still has
   * no dot is taken as a namespace whose value is an array of
   * Directive => Value pairs.
   *
   * @param $config_array Configuration associative array
   */
  public function loadArray($config_array) {
      if ($this->isFinalized('Cannot load directives after finalization')) {
          return;
      }
      foreach ($config_array as $rawKey => $entry) {
          $key = str_replace('_', '.', $rawKey);
          if (strpos($key, '.') !== false) {
              // fully qualified directive
              $this->set($key, $entry);
              continue;
          }
          // bare namespace: $entry maps directive names to values
          foreach ($entry as $directive => $directiveValue) {
              $this->set($key .'.'. $directive, $directiveValue);
          }
      }
  }
  /**
   * Returns a list of array(namespace, directive) for all directives
   * that are allowed in a web-form context as per an allowed
   * namespaces/directives list.
   *
   * Aliases and the DefinitionID / DefinitionRev directives are always
   * left out.
   *
   * @param $allowed List of allowed namespaces/directives, a single such
   *                 string, or true to allow everything. An entry
   *                 containing a dot names a directive (blacklisted when
   *                 prefixed with '-'); any other entry names a namespace.
   * @param $schema HTMLPurifier_ConfigSchema to read directives from; the
   *                global instance is used when empty
   */
  public static function getAllowedDirectivesForForm($allowed, $schema = null) {
      if (!$schema) {
          $schema = HTMLPurifier_ConfigSchema::instance();
      }

      $filtering = ($allowed !== true);
      $allowed_ns = array();
      $allowed_directives = array();
      $blacklisted_directives = array();
      if ($filtering) {
          if (is_string($allowed)) {
              $allowed = array($allowed);
          }
          foreach ($allowed as $entry) {
              if (strpos($entry, '.') === false) {
                  // namespace
                  $allowed_ns[$entry] = true;
                  continue;
              }
              // directive
              if ($entry[0] == '-') {
                  $blacklisted_directives[substr($entry, 1)] = true;
              } else {
                  $allowed_directives[$entry] = true;
              }
          }
      }

      $ret = array();
      foreach ($schema->info as $key => $def) {
          list($ns, $directive) = explode('.', $key, 2);
          if ($filtering) {
              $qualified = "$ns.$directive";
              if (isset($blacklisted_directives[$qualified])) {
                  continue;
              }
              if (!isset($allowed_directives[$qualified]) && !isset($allowed_ns[$ns])) {
                  continue;
              }
          }
          if (isset($def->isAlias)) {
              continue;
          }
          if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') {
              continue;
          }
          $ret[] = array($ns, $directive);
      }
      return $ret;
  }
  /**
   * Loads configuration values from $_GET/$_POST that were posted
   * via ConfigForm
   * @param $array $_GET or $_POST array to import
   * @param $index Index/name that the config variables are in
   * @param $allowed List of allowed namespaces/directives
   * @param $mq_fix Boolean whether or not to enable magic quotes fix
   * @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy
   * @return HTMLPurifier_Config created from the prepared form values
   */
  public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
      $prepared = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema);
      return HTMLPurifier_Config::create($prepared, $schema);
  }
  /**
   * Merges in configuration values from $_GET/$_POST to object. NOT STATIC.
   * The values are prepared against this object's own schema and then
   * fed to loadArray().
   * @note Same parameters as loadArrayFromForm, minus $schema
   */
  public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true) {
      $prepared = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $this->def);
      $this->loadArray($prepared);
  }
  /**
   * Prepares an array from a form into something usable for the more
   * strict parts of HTMLPurifier_Config
   *
   * For every directive allowed by getAllowedDirectivesForForm(), a
   * non-empty "Null_Namespace.Directive" entry yields null; otherwise the
   * "Namespace.Directive" entry is copied when present.
   *
   * @param $array $_GET or $_POST array to import
   * @param $index Index/name that the config variables are in, or false to
   *               read $array directly; a missing or non-array entry is
   *               treated as an empty array
   * @param $allowed List of allowed namespaces/directives
   * @param $mq_fix Boolean whether or not to enable magic quotes fix
   * @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy
   * @return array of Namespace => array(Directive => Value)
   */
  public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
      if ($index !== false) $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
      // get_magic_quotes_gpc() is deprecated as of PHP 7.4 and has always
      // returned false since PHP 5.4, so never call it on 7.4+
      $mq = $mq_fix
          && version_compare(PHP_VERSION, '7.4.0', '<')
          && function_exists('get_magic_quotes_gpc')
          && get_magic_quotes_gpc();

      $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema);
      $ret = array();
      foreach ($allowed as $key) {
          list($ns, $directive) = $key;
          $skey = "$ns.$directive";
          if (!empty($array["Null_$skey"])) {
              // explicit "set to null" marker wins over any submitted value
              $ret[$ns][$directive] = null;
              continue;
          }
          if (!isset($array[$skey])) continue;
          $value = $mq ? stripslashes($array[$skey]) : $array[$skey];
          $ret[$ns][$directive] = $value;
      }
      return $ret;
  }
  1547. /**
  1548. * Loads configuration values from an ini file
  1549. * @param $filename Name of ini file
  1550. */
  public function loadIni($filename)
  {
      // Refuse to touch a finalized configuration (isFinalized() reports the error).
      if ($this->isFinalized('Cannot load directives after finalization')) {
          return;
      }

      // Second argument true: sections are returned as nested arrays.
      $array = parse_ini_file($filename, true);
      if ($array === false) {
          // parse_ini_file() returns false on failure; report it instead of
          // handing a non-array to loadArray().
          $this->triggerError('Cannot load directives from ini file ' . $filename, E_USER_WARNING);
          return;
      }
      $this->loadArray($array);
  }
  1556. /**
  1557. * Checks whether or not the configuration object is finalized.
  1558. * @param $error String error message, or false for no error
  1559. */
  public function isFinalized($error = false)
  {
      if ($this->finalized) {
          // Only complain when the caller supplied a message to complain with.
          if ($error) {
              $this->triggerError($error, E_USER_ERROR);
          }
      }
      return $this->finalized;
  }
  1566. /**
  1567. * Finalizes configuration only if auto finalize is on and not
  1568. * already finalized
  1569. */
  public function autoFinalize()
  {
      if (!$this->autoFinalize) {
          // Auto-finalize is off: leave the object open and only squash the
          // property list. NOTE(review): squash(true) is defined elsewhere;
          // confirm its exact effect there.
          $this->plist->squash(true);
          return;
      }
      $this->finalize();
  }
  1577. /**
  1578. * Finalizes a configuration object, prohibiting further change
  1579. */
  public function finalize()
  {
      // From here on isFinalized() reports true.
      $this->finalized = true;

      // Drop the parser property from this object.
      unset($this->parser);
  }
  1584. /**
  1585. * Produces a nicely formatted error message by supplying the
  1586. * stack frame information from two levels up and OUTSIDE of
  1587. * HTMLPurifier_Config.
  1588. */
  protected function triggerError($msg, $no)
  {
      $extra = '';
      // The backtrace is only needed (and only paid for) in chatty mode.
      if ($this->chatty) {
          $backtrace = debug_backtrace();
          // Index 1 is the frame one level above this method. Frames do not
          // always carry 'line'/'file', so check before using them.
          if (isset($backtrace[1]['line'], $backtrace[1]['file'])) {
              $frame = $backtrace[1];
              $extra = " on line {$frame['line']} in file {$frame['file']}";
          }
      }
      trigger_error($msg . $extra, $no);
  }
  1600. /**
  1601. * Returns a serialized form of the configuration object that can
  1602. * be reconstituted.
  1603. */
  public function serialize()
  {
      // Request each definition before serializing (presumably so they are
      // built and travel with the object — confirm against getDefinition()).
      foreach (array('HTML', 'CSS', 'URI') as $definitionType) {
          $this->getDefinition($definitionType);
      }
      return serialize($this);
  }
  1610. }
  1611. /**
  1612. * Configuration definition, defines directives and their defaults.
  1613. */
  class HTMLPurifier_ConfigSchema
  {

      /**
       * Default value of every directive, indexed by directive key (the
       * $key handed to add()).
       * @note This shares the exact same structure as HTMLPurifier_Config::$conf
       */
      public $defaults = array();

      /**
       * The default property list. Do not edit this property list.
       */
      public $defaultPlist;

      /**
       * Definition of the directives, indexed by directive key:
       *
       *      array(
       *          'Directive.Key' => new stdclass(),
       *      )
       *
       * The stdclass may have the following properties:
       *
       *  - If isAlias isn't set:
       *      - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
       *      - allow_null: If set, this directive allows null values
       *      - aliases: If set, an associative array of value aliases to real values
       *      - allowed: If set, a lookup array of allowed (string) values
       *  - If isAlias is set:
       *      - key: Directive key this directive aliases to
       *
       * After postProcess() an entry may be a plain integer instead. In that
       * case, the value is equivalent to an stdclass with the type property
       * set to the integer. If the integer is negative, type is equal to the
       * absolute value of the integer, and allow_null is true.
       *
       * This class is friendly with HTMLPurifier_Config. If you need introspection
       * about the schema, you're better off using the ConfigSchema_Interchange,
       * which uses more memory but has much richer information.
       */
      public $info = array();

      /**
       * Application-wide singleton
       */
      static protected $singleton;

      public function __construct()
      {
          $this->defaultPlist = new HTMLPurifier_PropertyList();
      }

      /**
       * Unserializes the default ConfigSchema from the schema.ser file that
       * lives under HTMLPURIFIER_PREFIX.
       */
      public static function makeFromSerial()
      {
          return unserialize(file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser'));
      }

      /**
       * Retrieves an instance of the application-wide configuration definition.
       * @param $prototype null to fetch the current singleton (loading it on
       *        first use), true to force a reload from the serialized schema,
       *        or any other value to install that value as the singleton
       */
      public static function instance($prototype = null)
      {
          if ($prototype === true) {
              // Explicit reset. This must be tested before the generic
              // "not null" case, otherwise literal true is stored as the singleton.
              HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
          } elseif ($prototype !== null) {
              HTMLPurifier_ConfigSchema::$singleton = $prototype;
          } elseif (HTMLPurifier_ConfigSchema::$singleton === null) {
              HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
          }
          return HTMLPurifier_ConfigSchema::$singleton;
      }

      /**
       * Defines a directive for configuration
       * @warning This method's signature is slightly different from the legacy
       *          define() static method! Beware!
       * @param $key Key of the directive
       * @param $default Default value of directive
       * @param $type Allowed type of the directive: either the integer type
       *        itself, or a name looked up in HTMLPurifier_VarParser::$types
       * @param $allow_null Whether or not to allow null values
       */
      public function add($key, $default, $type, $allow_null)
      {
          $obj = new stdclass();
          $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
          if ($allow_null) {
              $obj->allow_null = true;
          }
          $this->info[$key] = $obj;
          $this->defaults[$key] = $default;
          $this->defaultPlist->set($key, $default);
      }

      /**
       * Defines a directive value alias.
       *
       * Directive value aliases are convenient for developers because it lets
       * them set a directive to several values and get the same result.
       * @param $key Key of the directive
       * @param $aliases Hash of aliased values to the real alias
       */
      public function addValueAliases($key, $aliases)
      {
          if (!isset($this->info[$key]->aliases)) {
              $this->info[$key]->aliases = array();
          }
          foreach ($aliases as $alias => $real) {
              $this->info[$key]->aliases[$alias] = $real;
          }
      }

      /**
       * Defines a set of allowed values for a directive.
       * @warning This is slightly different from the corresponding static
       *          method definition.
       * @param $key Key of the directive
       * @param $allowed Lookup array of allowed values
       */
      public function addAllowedValues($key, $allowed)
      {
          $this->info[$key]->allowed = $allowed;
      }

      /**
       * Defines a directive alias for backwards compatibility
       * @param $key Directive that will be aliased
       * @param $new_key Directive that the alias will point to
       */
      public function addAlias($key, $new_key)
      {
          $obj = new stdclass;
          $obj->key = $new_key;
          $obj->isAlias = true;
          $this->info[$key] = $obj;
      }

      /**
       * Replaces any stdclass that only has the type property (optionally
       * together with allow_null) with a plain integer: type, or -type when
       * allow_null is set. Entries that are already integers are left alone,
       * so calling this more than once is harmless.
       */
      public function postProcess()
      {
          foreach ($this->info as $key => $v) {
              if (!is_object($v)) {
                  // Already compacted by an earlier pass.
                  continue;
              }
              $propertyCount = count((array) $v);
              if ($propertyCount == 1) {
                  $this->info[$key] = $v->type;
              } elseif ($propertyCount == 2 && isset($v->allow_null)) {
                  $this->info[$key] = -$v->type;
              }
          }
      }

  }
  1752. /**
  1753. * @todo Unit test
  1754. */
  class HTMLPurifier_ContentSets
  {

      /**
       * List of content set strings (pipe separators) indexed by name.
       */
      public $info = array();

      /**
       * List of content set lookups (element => true) indexed by name.
       * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
       */
      public $lookup = array();

      /**
       * Synchronized list of defined content sets (keys of info)
       */
      protected $keys = array();

      /**
       * Synchronized list of defined content values (values of info)
       */
      protected $values = array();

      /**
       * Merges in module's content sets, expands identifiers in the content
       * sets and populates the keys, values and lookup member variables.
       * @param $modules List of modules (or a single module); each must
       *        expose a content_sets array of name => pipe-separated string
       */
      public function __construct($modules)
      {
          if (!is_array($modules)) {
              $modules = array($modules);
          }
          // populate content_sets based on module hints
          // sorry, no way of overloading
          foreach ($modules as $module) {
              foreach ($module->content_sets as $key => $value) {
                  $temp = $this->convertToLookup($value);
                  if (isset($this->lookup[$key])) {
                      // add it into the existing content set
                      $this->lookup[$key] = array_merge($this->lookup[$key], $temp);
                  } else {
                      $this->lookup[$key] = $temp;
                  }
              }
          }
          // Expand references to other content sets until a full pass
          // leaves the lookup table unchanged.
          $old_lookup = false;
          while ($old_lookup !== $this->lookup) {
              $old_lookup = $this->lookup;
              foreach ($this->lookup as $i => $set) {
                  $add = array();
                  foreach ($set as $element => $x) {
                      if (isset($this->lookup[$element])) {
                          // $element names another set: splice its members in
                          $add += $this->lookup[$element];
                          unset($this->lookup[$i][$element]);
                      }
                  }
                  $this->lookup[$i] += $add;
              }
          }

          foreach ($this->lookup as $key => $lookup) {
              $this->info[$key] = implode(' | ', array_keys($lookup));
          }
          $this->keys   = array_keys($this->info);
          $this->values = array_values($this->info);
      }

      /**
       * Accepts a definition; generates and assigns a ChildDef for it
       * @param $def HTMLPurifier_ElementDef reference
       * @param $module Module that defined the ElementDef
       */
      public function generateChildDef(&$def, $module)
      {
          if (!empty($def->child)) {
              return; // already done!
          }
          $content_model = $def->content_model;
          // With no content sets defined there is nothing to expand; an empty
          // alternation would match empty strings and hit an undefined index
          // in the callback, so skip the replacement entirely.
          if (is_string($content_model) && !empty($this->keys)) {
              $alternatives = array();
              foreach ($this->keys as $name) {
                  // quote the name so it is always matched literally
                  $alternatives[] = preg_quote((string) $name, '/');
              }
              $def->content_model = preg_replace_callback(
                  '/\b(' . implode('|', $alternatives) . ')\b/',
                  array($this, 'generateChildDefCallback'),
                  $content_model
              );
          }
          $def->child = $this->getChildDef($def, $module);
      }

      /**
       * preg_replace_callback() handler: swaps a content set name for its
       * expanded pipe-separated string.
       * @param $matches Match array; index 0 is the content set name
       */
      public function generateChildDefCallback($matches)
      {
          return $this->info[$matches[0]];
      }

      /**
       * Instantiates a ChildDef based on content_model and content_model_type
       * member variables in HTMLPurifier_ElementDef
       * @note This will also defer to modules for custom HTMLPurifier_ChildDef
       *       subclasses that need content set expansion
       * @param $def HTMLPurifier_ElementDef to have ChildDef extracted
       * @param $module Module consulted when the type is not a built-in one
       * @return HTMLPurifier_ChildDef corresponding to ElementDef, or false
       *         (after raising E_USER_ERROR) when none could be determined
       */
      public function getChildDef($def, $module)
      {
          $value = $def->content_model;
          if (is_object($value)) {
              trigger_error(
                  'Literal object child definitions should be stored in '.
                  'ElementDef->child not ElementDef->content_model',
                  E_USER_NOTICE
              );
              return $value;
          }
          switch ($def->content_model_type) {
              case 'required':
                  return new HTMLPurifier_ChildDef_Required($value);
              case 'optional':
                  return new HTMLPurifier_ChildDef_Optional($value);
              case 'empty':
                  return new HTMLPurifier_ChildDef_Empty();
              case 'custom':
                  return new HTMLPurifier_ChildDef_Custom($value);
          }
          // defer to its module
          $return = false;
          if ($module->defines_child_def) { // save a func call
              $return = $module->getChildDef($def);
          }
          if ($return !== false) {
              return $return;
          }
          // error-out
          trigger_error(
              'Could not determine which ChildDef class to instantiate',
              E_USER_ERROR
          );
          return false;
      }

      /**
       * Converts a string list of elements separated by pipes into
       * a lookup array. Spaces are stripped before splitting.
       * @param $string List of elements
       * @return Lookup array of elements (element => true)
       */
      protected function convertToLookup($string)
      {
          $ret = array();
          foreach (explode('|', str_replace(' ', '', $string)) as $element) {
              $ret[$element] = true;
          }
          return $ret;
      }

  }
  /**
   * Registry object that contains information about the current context.
   * @note Existence is tracked by key, so a variable whose value is null
   *       still counts as registered.
   * @note Since the variables Context deals with may not be objects,
   *       references are very important here! Do not remove!
   */
  class HTMLPurifier_Context
  {

      /**
       * Private array that stores the references.
       */
      private $_storage = array();

      /**
       * Registers a variable into the context.
       * Raises E_USER_ERROR and leaves the registry untouched when the name
       * is already taken.
       * @param $name String name
       * @param $ref Reference to variable to be registered
       */
      public function register($name, &$ref)
      {
          // array_key_exists(), not isset(): a registered null must still collide
          if (array_key_exists($name, $this->_storage)) {
              trigger_error("Name $name produces collision, cannot re-register",
                            E_USER_ERROR);
              return;
          }
          $this->_storage[$name] =& $ref;
      }

      /**
       * Retrieves a variable reference from the context.
       * @param $name String name
       * @param $ignore_error Boolean whether or not to ignore error
       * @return Reference to the stored variable, or to a fresh null when
       *         the name is not registered
       */
      public function &get($name, $ignore_error = false)
      {
          if (!array_key_exists($name, $this->_storage)) {
              if (!$ignore_error) {
                  trigger_error("Attempted to retrieve non-existent variable $name",
                                E_USER_ERROR);
              }
              $var = null; // so we can return by reference
              return $var;
          }
          return $this->_storage[$name];
      }

      /**
       * Destroys a variable in the context.
       * Raises E_USER_ERROR when the name is not registered.
       * @param $name String name
       */
      public function destroy($name)
      {
          if (!array_key_exists($name, $this->_storage)) {
              trigger_error("Attempted to destroy non-existent variable $name",
                            E_USER_ERROR);
              return;
          }
          unset($this->_storage[$name]);
      }

      /**
       * Checks whether or not the variable exists.
       * @param $name String name
       * @return Boolean true when the name is registered, whatever its value
       */
      public function exists($name)
      {
          return array_key_exists($name, $this->_storage);
      }

      /**
       * Loads a series of variables from an associative array
       * @param $context_array Assoc array of variables to load
       */
      public function loadArray($context_array)
      {
          foreach ($context_array as $key => $discard) {
              // register() takes a reference, so pass the array slot itself
              $this->register($key, $context_array[$key]);
          }
      }

  }
  1964. /**
  1965. * Abstract class representing Definition cache managers that implements
  1966. * useful common methods and is a factory.
  1967. * @todo Create a separate maintenance file advanced users can use to
  1968. * cache their custom HTMLDefinition, which can be loaded
  1969. * via a configuration directive
  1970. * @todo Implement memcached
  1971. */
  abstract class HTMLPurifier_DefinitionCache
  {

      /**
       * Type of definition objects this cache instance handles.
       */
      public $type;

      /**
       * @param $type Type of definition objects this instance of the
       *      cache will handle.
       */
      public function __construct($type)
      {
          $this->type = $type;
      }

      /**
       * Generates a unique identifier for a particular configuration, in the
       * form "version,batch serial,definition revision".
       * @param $config Instance of HTMLPurifier_Config
       */
      public function generateKey($config)
      {
          return $config->version . ',' . // possibly replace with function calls
                 $config->getBatchSerial($this->type) . ',' .
                 $config->get($this->type . '.DefinitionRev');
      }

      /**
       * Tests whether or not a key is old with respect to the configuration's
       * version and revision number.
       * @param $key Key to test
       * @param $config Instance of HTMLPurifier_Config to test against
       * @return Boolean true when the key is malformed, was generated by a
       *         different version, or has the same serial but a lower revision
       */
      public function isOld($key, $config)
      {
          // a well-formed key has at least three comma-separated parts
          if (substr_count($key, ',') < 2) {
              return true;
          }
          list($version, $hash, $revision) = explode(',', $key, 3);
          $compare = version_compare($version, $config->version);
          // version mismatch, is always old
          if ($compare != 0) {
              return true;
          }
          // versions match; only a key for the very same serial can be
          // outdated. The serials are compared as strings: a loose ==
          // would treat two different numeric-looking hashes
          // (e.g. "0e1234" and "0e5678") as equal.
          $sameSerial = (string) $hash === (string) $config->getBatchSerial($this->type);
          if ($sameSerial && $revision < $config->get($this->type . '.DefinitionRev')) {
              return true;
          }
          return false;
      }

      /**
       * Checks if a definition's type jives with the cache's type
       * @note Triggers an error (default level) on failure
       * @param $def Definition object to check
       * @return Boolean true if good, false if not
       */
      public function checkDefType($def)
      {
          if ($def->type !== $this->type) {
              trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
              return false;
          }
          return true;
      }

      /**
       * Adds a definition object to the cache
       */
      abstract public function add($def, $config);

      /**
       * Unconditionally saves a definition object to the cache
       */
      abstract public function set($def, $config);

      /**
       * Replace an object in the cache
       */
      abstract public function replace($def, $config);

      /**
       * Retrieves a definition object from the cache
       */
      abstract public function get($config);

      /**
       * Removes a definition object from the cache
       */
      abstract public function remove($config);

      /**
       * Clears all objects from cache
       */
      abstract public function flush($config);

      /**
       * Clears all expired (older version or revision) objects from cache
       * @note Be careful implementing this method as flush. Flush must
       *       not interfere with other Definition types, and cleanup()
       *       should not be repeatedly called by userland code.
       */
      abstract public function cleanup($config);

  }
  2055. /**
  2056. * Responsible for creating definition caches.
  2057. */
  class HTMLPurifier_DefinitionCacheFactory
  {

      /**
       * Cache objects already created, indexed by implementation name and
       * then by definition type.
       */
      protected $caches = array('Serializer' => array());

      /**
       * Registered implementations: short name => class name.
       */
      protected $implementations = array();

      /**
       * Decorators applied to every newly created cache, indexed by the
       * decorator's name property.
       */
      protected $decorators = array();

      /**
       * Initialize default decorators
       */
      public function setup()
      {
          $this->addDecorator('Cleanup');
      }

      /**
       * Retrieves an instance of global definition cache factory.
       * @param $prototype null to fetch the current instance (creating it on
       *        first use), true to force a brand new factory, or any other
       *        value to install that value as the global instance
       */
      public static function instance($prototype = null)
      {
          static $instance;
          if ($prototype === true || ($prototype === null && $instance === null)) {
              // Either an explicit reset or the first call. The reset case
              // must be recognised before treating $prototype as a replacement
              // object, otherwise literal true becomes the "instance".
              $instance = new HTMLPurifier_DefinitionCacheFactory();
              $instance->setup();
          } elseif ($prototype !== null) {
              $instance = $prototype;
          }
          return $instance;
      }

      /**
       * Registers a new definition cache object
       * @param $short Short name of cache object, for reference
       * @param $long Full class name of cache object, for construction
       */
      public function register($short, $long)
      {
          $this->implementations[$short] = $long;
      }

      /**
       * Factory method that creates a cache object based on configuration
       * @param $type Name of definitions handled by cache
       * @param $config Instance of HTMLPurifier_Config; its
       *        Cache.DefinitionImpl directive selects the implementation
       */
      public function create($type, $config)
      {
          $method = $config->get('Cache.DefinitionImpl');
          if ($method === null) {
              // caching disabled
              return new HTMLPurifier_DefinitionCache_Null($type);
          }
          if (!empty($this->caches[$method][$type])) {
              // already built for this implementation/type pair
              return $this->caches[$method][$type];
          }
          if (
              isset($this->implementations[$method]) &&
              class_exists($class = $this->implementations[$method], false)
          ) {
              $cache = new $class($type);
          } else {
              if ($method != 'Serializer') {
                  trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
              }
              $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
          }
          foreach ($this->decorators as $decorator) {
              $new_cache = $decorator->decorate($cache);
              // Break the binding of $cache before reusing the name. The
              // original note says this prevents infinite recursion in PHP 4;
              // presumably decorate() can hold its argument by reference, so
              // the unset must stay.
              unset($cache);
              $cache = $new_cache;
          }
          $this->caches[$method][$type] = $cache;
          return $this->caches[$method][$type];
      }

      /**
       * Registers a decorator to add to all new cache objects
       * @param $decorator Decorator object, or the short name of one; a
       *        string is expanded to the class
       *        HTMLPurifier_DefinitionCache_Decorator_<name> and instantiated
       */
      public function addDecorator($decorator)
      {
          if (is_string($decorator)) {
              $class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
              $decorator = new $class;
          }
          $this->decorators[$decorator->name] = $decorator;
      }

  }
  2135. /**
  2136. * Represents a document type, contains information on which modules
  2137. * need to be loaded.
  2138. * @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
  2139. * If structure changes, please update that function.
  2140. */
  class HTMLPurifier_Doctype
  {

      /** Full name of the doctype. */
      public $name;

      /**
       * Standard modules this doctype uses; entries are string identifiers
       * or literal module objects.
       */
      public $modules = array();

      /** Modules to use for tidying up code. */
      public $tidyModules = array();

      /** Whether the language is derived from XML (i.e. XHTML). */
      public $xml = true;

      /** Alias names for this doctype. */
      public $aliases = array();

      /** Public DTD identifier. */
      public $dtdPublic;

      /** System DTD identifier. */
      public $dtdSystem;

      /**
       * Plain value constructor: each argument is stored verbatim in the
       * property of the matching name.
       */
      public function __construct(
          $name = null,
          $xml = true,
          $modules = array(),
          $tidyModules = array(),
          $aliases = array(),
          $dtd_public = null,
          $dtd_system = null
      ) {
          // identity
          $this->name        = $name;
          $this->aliases     = $aliases;
          // language flavour and DTD identifiers
          $this->xml         = $xml;
          $this->dtdPublic   = $dtd_public;
          $this->dtdSystem   = $dtd_system;
          // module lists
          $this->modules     = $modules;
          $this->tidyModules = $tidyModules;
      }

  }
  class HTMLPurifier_DoctypeRegistry
  {

      /** Map of doctype name => doctype object. */
      protected $doctypes;

      /** Map of alias => real doctype name. */
      protected $aliases;

      /**
       * Adds a doctype to the registry.
       * @note Either pass a ready-made doctype object as $doctype, or pass a
       *       name plus the remaining arguments, which are then forwarded to
       *       the HTMLPurifier_Doctype constructor
       * @param $doctype Name of doctype or literal doctype object
       * @param $xml Forwarded to the doctype constructor
       * @param $modules Modules doctype will load (scalar is wrapped in an array)
       * @param $tidy_modules Tidy modules (scalar is wrapped in an array)
       * @param $aliases Alias names for doctype (scalar is wrapped in an array)
       * @param $dtd_public Forwarded to the doctype constructor
       * @param $dtd_system Forwarded to the doctype constructor
       * @return Editable registered doctype
       */
      public function register(
          $doctype,
          $xml = true,
          $modules = array(),
          $tidy_modules = array(),
          $aliases = array(),
          $dtd_public = null,
          $dtd_system = null
      ) {
          $modules      = is_array($modules) ? $modules : array($modules);
          $tidy_modules = is_array($tidy_modules) ? $tidy_modules : array($tidy_modules);
          $aliases      = is_array($aliases) ? $aliases : array($aliases);

          if (!is_object($doctype)) {
              $doctype = new HTMLPurifier_Doctype(
                  $doctype, $xml, $modules, $tidy_modules, $aliases, $dtd_public, $dtd_system
              );
          }

          $name = $doctype->name;
          $this->doctypes[$name] = $doctype;

          // Point every alias at this doctype, unless the alias is itself
          // the name of a registered doctype.
          foreach ($doctype->aliases as $alias) {
              if (!isset($this->doctypes[$alias])) {
                  $this->aliases[$alias] = $name;
              }
          }

          // A real doctype now owns this name, so any alias using it goes away.
          unset($this->aliases[$name]);

          return $doctype;
      }

      /**
       * Looks up a doctype by name, following aliases.
       * @note When possible, use the more fully-featured make()
       * @param $doctype Name (or alias) of doctype
       * @return Editable doctype object; for an unknown name E_USER_ERROR is
       *         raised and a new HTMLPurifier_Doctype carrying that name is
       *         returned
       */
      public function get($doctype)
      {
          $resolved = isset($this->aliases[$doctype]) ? $this->aliases[$doctype] : $doctype;
          if (isset($this->doctypes[$resolved])) {
              return $this->doctypes[$resolved];
          }
          trigger_error('Doctype ' . htmlspecialchars($resolved) . ' does not exist', E_USER_ERROR);
          return new HTMLPurifier_Doctype($resolved);
      }

      /**
       * Returns a clone of the doctype selected by a configuration object.
       * @note Use this function to get a copy of doctype that config
       *       can hold on to (this is necessary in order to tell
       *       Generator whether or not the current document is XML
       *       based or not).
       * @param $config Configuration object to read the doctype from
       */
      public function make($config)
      {
          $name = $this->getDoctypeFromConfig($config);
          return clone $this->get($name);
      }

      /**
       * Works out the doctype name from the configuration object.
       * HTML.Doctype wins, then HTML.CustomDoctype; otherwise the name is
       * assembled from the legacy HTML.XHTML and HTML.Strict flags.
       * @param $config Configuration object to read directives from
       * @return Doctype name
       */
      public function getDoctypeFromConfig($config)
      {
          // recommended directives, in order of precedence
          foreach (array('HTML.Doctype', 'HTML.CustomDoctype') as $directive) {
              $doctype = $config->get($directive);
              if (!empty($doctype)) {
                  return $doctype;
              }
          }
          // backwards-compatibility
          $family = $config->get('HTML.XHTML') ? 'XHTML 1.0' : 'HTML 4.01';
          $flavor = $config->get('HTML.Strict') ? ' Strict' : ' Transitional';
          return $family . $flavor;
      }

  }
  2276. /**
  2277. * Structure that stores an HTML element definition. Used by
  2278. * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
  2279. * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
  2280. * Please update that class too.
  2281. * @warning If you add new properties to this class, you MUST update
  2282. * the mergeIn() method.
  2283. */
  2284. class HTMLPurifier_ElementDef
  2285. {
  2286. /**
  2287. * Does the definition work by itself, or is it created solely
  2288. * for the purpose of merging into another definition?
  2289. */
  2290. public $standalone = true;
  2291. /**
  2292. * Associative array of attribute name to HTMLPurifier_AttrDef
  2293. * @note Before being processed by HTMLPurifier_AttrCollections
  2294. * when modules are finalized during
  2295. * HTMLPurifier_HTMLDefinition->setup(), this array may also
  2296. * contain an array at index 0 that indicates which attribute
  2297. * collections to load into the full array. It may also
  2298. * contain string indentifiers in lieu of HTMLPurifier_AttrDef,
  2299. * see HTMLPurifier_AttrTypes on how they are expanded during
  2300. * HTMLPurifier_HTMLDefinition->setup() processing.
  2301. */
  2302. public $attr = array();
  2303. /**
  2304. * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
  2305. */
  2306. public $attr_transform_pre = array();
  2307. /**
  2308. * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
  2309. */
  2310. public $attr_transform_post = array();
  2311. /**
  2312. * HTMLPurifier_ChildDef of this tag.
  2313. */
  2314. public $child;
  2315. /**
  2316. * Abstract string representation of internal ChildDef rules. See
  2317. * HTMLPurifier_ContentSets for how this is parsed and then transformed
  2318. * into an HTMLPurifier_ChildDef.
  2319. * @warning This is a temporary variable that is not available after
  2320. * being processed by HTMLDefinition
  2321. */
  2322. public $content_model;
  2323. /**
  2324. * Value of $child->type, used to determine which ChildDef to use,
  2325. * used in combination with $content_model.
  2326. * @warning This must be lowercase
  2327. * @warning This is a temporary variable that is not available after
  2328. * being processed by HTMLDefinition
  2329. */
  2330. public $content_model_type;
  2331. /**
  2332. * Does the element have a content model (#PCDATA | Inline)*? This
  2333. * is important for chameleon ins and del processing in
  2334. * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
  2335. * have to worry about this one.
  2336. */
  2337. public $descendants_are_inline = false;
  2338. /**
  2339. * List of the names of required attributes this element has. Dynamically
  2340. * populated by HTMLPurifier_HTMLDefinition::getElement
  2341. */
  2342. public $required_attr = array();
  2343. /**
  2344. * Lookup table of tags excluded from all descendants of this tag.
  2345. * @note SGML permits exclusions for all descendants, but this is
  2346. * not possible with DTDs or XML Schemas. W3C has elected to
  2347. * use complicated compositions of content_models to simulate
  2348. * exclusion for children, but we go the simpler, SGML-style
  2349. * route of flat-out exclusions, which correctly apply to
  2350. * all descendants and not just children. Note that the XHTML
  2351. * Modularization Abstract Modules are blithely unaware of such
  2352. * distinctions.
  2353. */
  2354. public $excludes = array();
  2355. /**
  2356. * This tag is explicitly auto-closed by the following tags.
  2357. */
  2358. public $autoclose = array();
  2359. /**
  2360. * If a foreign element is found in this element, test if it is
  2361. * allowed by this sub-element; if it is, instead of closing the
  2362. * current element, place it inside this element.
  2363. */
  2364. public $wrap;
  2365. /**
  2366. * Whether or not this is a formatting element affected by the
  2367. * "Active Formatting Elements" algorithm.
  2368. */
  2369. public $formatting;
  2370. /**
  2371. * Low-level factory constructor for creating new standalone element defs
  2372. */
  2373. public static function create($content_model, $content_model_type, $attr) {
  2374. $def = new HTMLPurifier_ElementDef();
  2375. $def->content_model = $content_model;
  2376. $def->content_model_type = $content_model_type;
  2377. $def->attr = $attr;
  2378. return $def;
  2379. }
  2380. /**
  2381. * Merges the values of another element definition into this one.
  2382. * Values from the new element def take precedence if a value is
  2383. * not mergeable.
  2384. */
  public function mergeIn($def) {
      // Attributes: entries from $def win over ours.
      foreach ($def->attr as $name => $value) {
          if ($name === 0) {
              // Key 0 holds the attribute-collection includes. They are
              // appended; an include cannot be overridden or removed.
              foreach ($value as $include) {
                  $this->attr[0][] = $include;
              }
          } elseif ($value === false) {
              // An explicit false removes the attribute from this definition.
              if (isset($this->attr[$name])) {
                  unset($this->attr[$name]);
              }
          } else {
              $this->attr[$name] = $value;
          }
      }

      // Keyed tables follow the same "later wins, false deletes" rule.
      $this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
      $this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
      $this->_mergeAssocArray($this->excludes, $def->excludes);

      // A new content model may embed the old one through the #SUPER
      // placeholder. Changing the model resets the child definition.
      if (!empty($def->content_model)) {
          $this->content_model = str_replace(
              "#SUPER",
              $this->content_model,
              $def->content_model
          );
          $this->child = false;
      }
      if (!empty($def->content_model_type)) {
          $this->content_model_type = $def->content_model_type;
          $this->child = false;
      }

      // Scalar overrides: only applied when the incoming definition sets them.
      // An explicit child on $def takes priority over the resets above.
      if ($def->child !== null) {
          $this->child = $def->child;
      }
      if ($def->formatting !== null) {
          $this->formatting = $def->formatting;
      }
      if ($def->descendants_are_inline) {
          $this->descendants_are_inline = $def->descendants_are_inline;
      }
  }
  2418. /**
  2419. * Merges one array into another, removes values which equal false
  2420. * @param $a1 Array by reference that is merged into
  2421. * @param $a2 Array that merges into $a1
  2422. */
  private function _mergeAssocArray(&$a1, $a2) {
      foreach ($a2 as $key => $value) {
          if ($value !== false) {
              // Ordinary value: add it, or overwrite the existing entry.
              $a1[$key] = $value;
              continue;
          }
          // A strict false is a deletion marker for that key.
          if (isset($a1[$key])) {
              unset($a1[$key]);
          }
      }
  }
  2432. }
  2433. /**
  2434. * A UTF-8 specific character encoder that handles cleaning and transforming.
  2435. * @note All functions in this class should be static.
  2436. */
  class HTMLPurifier_Encoder
  {

      /**
       * Constructor throws fatal error if you attempt to instantiate class
       */
      private function __construct() {
          trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
      }

      /**
       * Error-handler that mutes errors, alternative to shut-up operator.
       * It returns nothing, so PHP treats the error as handled.
       */
      public static function muteErrorHandler() {}

      /**
       * Cleans a UTF-8 string for well-formedness and SGML validity
       *
       * It will parse according to UTF-8 and return a valid UTF8 string, with
       * non-SGML codepoints excluded.
       *
       * @param $str       String to clean.
       * @param $force_php Accepted for interface compatibility; this
       *                   implementation does not consult it.
       * @return Cleaned string. Input that already matches the allowed
       *         character set is returned untouched.
       *
       * @note Just for reference, the non-SGML code points are 0 to 31 and
       *       127 to 159, inclusive.  However, we allow code points 9, 10
       *       and 13, which are the tab, line feed and carriage return
       *       respectively. 128 and above the code points map to multibyte
       *       UTF-8 representations.
       *
       * @note The byte-wise fallback omits U+FEFF (BOM); the regex fast path
       *       returns already-valid input unchanged, BOM included.
       *
       * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
       *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
       *       LGPL license.  Notes on what changed are inside, but in general,
       *       the original code transformed UTF-8 text into an array of integer
       *       Unicode codepoints. Understandably, transforming that back to
       *       a string would be somewhat expensive, so the function was modded to
       *       directly operate on the string.  However, this discourages code
       *       reuse, and the logic enumerated here would be useful for any
       *       function that needs to be able to understand UTF-8 characters.
       */
      public static function cleanUTF8($str, $force_php = false) {

          // Fast path: PCRE validates UTF-8 itself when the u modifier is
          // used, so a successful match means the string is well-formed and
          // contains only allowed characters. The class is the XML Char
          // production minus the non-SGML codepoints U+007F to U+009F.
          if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
              return $str;
          }

          $mState = 0; // octets still expected before the current sequence ends
          $mUcs4  = 0; // code point accumulated so far
          $mBytes = 1; // total octets expected in the current sequence

          // The original code collected code points in an array. Here valid
          // characters are appended straight to $out: $char accumulates the
          // raw bytes of the current sequence while $mUcs4 accumulates its
          // code point.
          $out = '';
          $char = '';

          $len = strlen($str);
          for ($i = 0; $i < $len; $i++) {
              // Square-bracket offsets: the curly-brace form is a parse
              // error on PHP 8.
              $in = ord($str[$i]);
              $char .= $str[$i]; // append byte to char
              if (0 == $mState) {
                  // When mState is zero we expect either a US-ASCII character
                  // or a multi-octet sequence.
                  if (0 == (0x80 & ($in))) {
                      // US-ASCII, pass straight through.
                      if (($in <= 31 || $in == 127) &&
                          !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
                      ) {
                          // control characters, remove
                      } else {
                          $out .= $char;
                      }
                      // reset
                      $char = '';
                      $mBytes = 1;
                  } elseif (0xC0 == (0xE0 & ($in))) {
                      // First octet of 2 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x1F) << 6;
                      $mState = 1;
                      $mBytes = 2;
                  } elseif (0xE0 == (0xF0 & ($in))) {
                      // First octet of 3 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x0F) << 12;
                      $mState = 2;
                      $mBytes = 3;
                  } elseif (0xF0 == (0xF8 & ($in))) {
                      // First octet of 4 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x07) << 18;
                      $mState = 3;
                      $mBytes = 4;
                  } elseif (0xF8 == (0xFC & ($in))) {
                      // First octet of 5 octet sequence.
                      //
                      // This is illegal because the encoded codepoint must be
                      // either:
                      // (a) not the shortest form or
                      // (b) outside the Unicode range of 0-0x10FFFF.
                      // Rather than trying to resynchronize, we will carry on
                      // until the end of the sequence and let the later error
                      // handling code catch it.
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x03) << 24;
                      $mState = 4;
                      $mBytes = 5;
                  } elseif (0xFC == (0xFE & ($in))) {
                      // First octet of 6 octet sequence, see comments for 5
                      // octet sequence.
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 1) << 30;
                      $mState = 5;
                      $mBytes = 6;
                  } else {
                      // Current octet is neither in the US-ASCII range nor a
                      // legal first octet of a multi-octet sequence.
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                      $char = '';
                  }
              } else {
                  // When mState is non-zero, we expect a continuation of the
                  // multi-octet sequence
                  if (0x80 == (0xC0 & ($in))) {
                      // Legal continuation.
                      $shift = ($mState - 1) * 6;
                      $tmp = $in;
                      $tmp = ($tmp & 0x0000003F) << $shift;
                      $mUcs4 |= $tmp;

                      if (0 == --$mState) {
                          // End of the multi-octet sequence. mUcs4 now contains
                          // the final Unicode codepoint to be output

                          // Check for illegal sequences and codepoints.

                          // From Unicode 3.1, non-shortest form is illegal
                          if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                              ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                              ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                              (4 < $mBytes) ||
                              // From Unicode 3.2, surrogate characters = illegal
                              (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                              // Codepoints outside the Unicode range are illegal
                              ($mUcs4 > 0x10FFFF)
                          ) {
                              // illegal sequence: drop it
                          } elseif (0xFEFF != $mUcs4 && // omit BOM
                              // check for valid Char unicode codepoints
                              (
                                  0x9 == $mUcs4 ||
                                  0xA == $mUcs4 ||
                                  0xD == $mUcs4 ||
                                  (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                                  // 7F-9F is not strictly prohibited by XML,
                                  // but it is non-SGML, and thus we don't allow it
                                  (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                                  // Same range the fast-path regex accepts;
                                  // without it valid characters such as
                                  // U+FF01 were stripped on this path.
                                  (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                                  (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                              )
                          ) {
                              $out .= $char;
                          }
                          // initialize UTF8 cache (reset)
                          $mState = 0;
                          $mUcs4  = 0;
                          $mBytes = 1;
                          $char = '';
                      }
                  } else {
                      // ((0xC0 & (*in) != 0x80) && (mState != 0))
                      // Incomplete multi-octet sequence.
                      // used to result in complete fail, but we'll reset
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                      $char ='';
                  }
              }
          }
          return $out;
      }

      /**
       * Translates a Unicode codepoint into its corresponding UTF-8 character.
       *
       * @param $code Integer code point.
       * @return UTF-8 encoded character, or an empty string when the code
       *         point is negative, above U+10FFFF or a surrogate
       *         (U+D800..U+DFFF).
       *
       * @note Based on Feyd's function at
       *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
       *       which is in public domain.
       * @note While we're going to do code point parsing anyway, a good
       *       optimization would be to refuse to translate code points that
       *       are non-SGML characters.  However, this could lead to duplication.
       * @note This is very similar to the unichr function in
       *       maintenance/generate-entity-file.php (although this is superior,
       *       due to its sanity checks).
       */

      // +----------+----------+----------+----------+
      // | 33222222 | 22221111 | 111111   |          |
      // | 10987654 | 32109876 | 54321098 | 76543210 | bit
      // +----------+----------+----------+----------+
      // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
      // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
      // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
      // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
      // +----------+----------+----------+----------+
      // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
      // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
      // +----------+----------+----------+----------+

      public static function unichr($code) {
          if($code > 1114111 or $code < 0 or
            ($code >= 55296 and $code <= 57343) ) {
              // bits are set outside the "valid" range as defined
              // by UNICODE 4.1.0
              return '';
          }

          // $x is the last byte, $y/$z/$w the preceding ones (see table above);
          // a zero means "byte not used".
          $x = $y = $z = $w = 0;
          if ($code < 128) {
              // regular ASCII character
              $x = $code;
          } else {
              // set up bits for UTF-8
              $x = ($code & 63) | 128;
              if ($code < 2048) {
                  $y = (($code & 2047) >> 6) | 192;
              } else {
                  $y = (($code & 4032) >> 6) | 128;
                  if($code < 65536) {
                      $z = (($code >> 12) & 15) | 224;
                  } else {
                      $z = (($code >> 12) & 63) | 128;
                      $w = (($code >> 18) & 7)  | 240;
                  }
              }
          }
          // set up the actual character, lead byte first
          $ret = '';
          if($w) $ret .= chr($w);
          if($z) $ret .= chr($z);
          if($y) $ret .= chr($y);
          $ret .= chr($x);

          return $ret;
      }

      /**
       * Converts a string to UTF-8 based on configuration.
       *
       * @param $str     String in the encoding named by Core.Encoding.
       * @param $config  Configuration object; Core.Encoding and
       *                 Test.ForceNoIconv are read through its get() method.
       * @param $context Not used by this method.
       * @return UTF-8 string; an empty string if iconv rejects the encoding.
       *         An E_USER_ERROR is raised when the encoding is invalid or
       *         cannot be handled without iconv.
       */
      public static function convertToUTF8($str, $config, $context) {
          $encoding = $config->get('Core.Encoding');
          if ($encoding === 'utf-8') return $str;
          static $iconv = null;
          if ($iconv === null) $iconv = function_exists('iconv');
          // Silence warnings from the conversion functions; every exit path
          // below must restore the previous handler.
          set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
          if ($iconv && !$config->get('Test.ForceNoIconv')) {
              $str = iconv($encoding, 'utf-8//IGNORE', $str);
              if ($str === false) {
                  // $encoding is not a valid encoding
                  restore_error_handler();
                  trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                  return '';
              }
              // If the string is bjorked by Shift_JIS or a similar encoding
              // that doesn't support all of ASCII, convert the naughty
              // characters to their true byte-wise ASCII/UTF-8 equivalents.
              $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
              restore_error_handler();
              return $str;
          } elseif ($encoding === 'iso-8859-1') {
              $str = utf8_encode($str);
              restore_error_handler();
              return $str;
          }
          // Restore first: with the mute handler still installed this error
          // would be swallowed and the handler would stay registered.
          restore_error_handler();
          trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
      }

      /**
       * Converts a string from UTF-8 based on configuration.
       *
       * @param $str     UTF-8 string.
       * @param $config  Configuration object; Core.Encoding,
       *                 Core.EscapeNonASCIICharacters and Test.ForceNoIconv
       *                 are read through its get() method.
       * @param $context Not used by this method.
       * @return String in the configured encoding (whatever iconv returns on
       *         the iconv path). An E_USER_ERROR is raised when the encoding
       *         cannot be handled.
       * @note Currently, this is a lossy conversion, with unexpressable
       *       characters being omitted.
       */
      public static function convertFromUTF8($str, $config, $context) {
          $encoding = $config->get('Core.Encoding');
          if ($encoding === 'utf-8') return $str;
          static $iconv = null;
          if ($iconv === null) $iconv = function_exists('iconv');
          if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
              $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
          }
          set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
          if ($iconv && !$config->get('Test.ForceNoIconv')) {
              // Undo our previous fix in convertToUTF8, otherwise iconv will barf
              $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
              if (!is_array($ascii_fix)) {
                  // false means iconv does not know the encoding; there is
                  // nothing to undo, and array_flip() below must not be
                  // handed a boolean.
                  $ascii_fix = array();
              }
              if (!$escape && !empty($ascii_fix)) {
                  $clear_fix = array();
                  foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
                  $str = strtr($str, $clear_fix);
              }
              $str = strtr($str, array_flip($ascii_fix));
              // Normal stuff
              $str = iconv('utf-8', $encoding . '//IGNORE', $str);
              restore_error_handler();
              return $str;
          } elseif ($encoding === 'iso-8859-1') {
              $str = utf8_decode($str);
              restore_error_handler();
              return $str;
          }
          // Restore first so the error below is not muted and the handler
          // stack is left balanced.
          restore_error_handler();
          trigger_error('Encoding not supported', E_USER_ERROR);
      }

      /**
       * Lossless (character-wise) conversion of HTML to ASCII
       * @param $str UTF-8 string to be converted to ASCII
       * @returns ASCII encoded string with non-ASCII character entity-ized
       * @warning Adapted from MediaWiki, claiming fair use: this is a common
       *       algorithm. If you disagree with this license fudgery,
       *       implement it yourself.
       * @note Uses decimal numeric entities since they are best supported.
       * @note This is a DUMB function: it has no concept of keeping
       *       character entities that the projected character encoding
       *       can allow. We could possibly implement a smart version
       *       but that would require it to also know which Unicode
       *       codepoints the charset supported (not an easy task).
       * @note Decodes UTF-8 in a similar way to cleanUTF8(), but it assumes
       *       that $str is well-formed UTF-8 and performs no validation.
       */
      public static function convertToASCIIDumbLossless($str) {
          $bytesleft = 0; // continuation bytes still expected
          $result = '';
          $working = 0;   // code point being assembled
          $len = strlen($str);
          for( $i = 0; $i < $len; $i++ ) {
              $bytevalue = ord( $str[$i] );
              if( $bytevalue <= 0x7F ) { //0xxx xxxx
                  $result .= chr( $bytevalue );
                  $bytesleft = 0;
              } elseif( $bytevalue <= 0xBF ) { //10xx xxxx
                  $working = $working << 6;
                  $working += ($bytevalue & 0x3F);
                  $bytesleft--;
                  if( $bytesleft <= 0 ) {
                      // sequence complete: emit a decimal numeric entity
                      $result .= "&#" . $working . ";";
                  }
              } elseif( $bytevalue <= 0xDF ) { //110x xxxx
                  $working = $bytevalue & 0x1F;
                  $bytesleft = 1;
              } elseif( $bytevalue <= 0xEF ) { //1110 xxxx
                  $working = $bytevalue & 0x0F;
                  $bytesleft = 2;
              } else { //1111 0xxx
                  $working = $bytevalue & 0x07;
                  $bytesleft = 3;
              }
          }
          return $result;
      }

      /**
       * This expensive function tests whether or not a given character
       * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
       * fail this test, and require special processing. Variable width
       * encodings shouldn't ever fail.
       *
       * @param string $encoding Encoding name to test, as per iconv format
       * @param bool $bypass Whether or not to bypass the precompiled arrays.
       * @return Array of UTF-8 characters to their corresponding ASCII,
       *      which can be used to "undo" any overzealous iconv action.
       *      Returns false when iconv cannot convert to $encoding at all.
       */
      public static function testEncodingSupportsASCII($encoding, $bypass = false) {
          static $encodings = array();
          if (!$bypass) {
              if (isset($encodings[$encoding])) return $encodings[$encoding];
              $lenc = strtolower($encoding);
              // Precompiled answers for known encodings.
              switch ($lenc) {
                  case 'shift_jis':
                      return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
                  case 'johab':
                      return array("\xE2\x82\xA9" => '\\');
              }
              if (strpos($lenc, 'iso-8859-') === 0) return array();
          }
          $ret = array();
          set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
          if (iconv('UTF-8', $encoding, 'a') === false) {
              // Unknown encoding: restore the handler before bailing out,
              // otherwise the mute handler stays registered.
              restore_error_handler();
              return false;
          }
          for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
              $c = chr($i); // UTF-8 char
              $r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
              if (
                  $r === '' ||
                  // This line is needed for iconv implementations that do not
                  // omit characters that do not exist in the target character set
                  ($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
              ) {
                  // Reverse engineer: what's the UTF-8 equiv of this byte
                  // sequence? This assumes that there's no variable width
                  // encoding that doesn't support ASCII.
                  $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
              }
          }
          restore_error_handler();
          $encodings[$encoding] = $ret;
          return $ret;
      }

  }
  2832. /**
  2833. * Object that provides entity lookup table from entity name to character
  2834. */
  class HTMLPurifier_EntityLookup {

      /**
       * Map of entity name => character it stands for.
       */
      public $table;

      /**
       * Loads the lookup table by unserializing the contents of a file.
       * @param $file Path of the serialized table; when falsy, the bundled
       *              EntityLookup/entities.ser under HTMLPURIFIER_PREFIX is
       *              used.
       * @note The serialized contents are versioned, but were generated
       *       using the maintenance script generate_entity_file.php
       * @warning This is not in constructor to help enforce the Singleton
       * @warning The file is passed to unserialize(), so it must come from
       *          a trusted location.
       */
      public function setup($file = false) {
          $path = $file
              ? $file
              : HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
          $serialized = file_get_contents($path);
          $this->table = unserialize($serialized);
      }

      /**
       * Returns the shared lookup instance, creating and loading it on the
       * first call.
       * @param $prototype Optional custom lookup object; when given it
       *                   replaces the shared instance and is returned.
       */
      public static function instance($prototype = false) {
          // plain assignment is enough: PHP does not copy unless modified
          static $instance = null;
          if ($prototype) {
              $instance = $prototype;
              return $instance;
          }
          if (!$instance) {
              $fresh = new HTMLPurifier_EntityLookup();
              $fresh->setup();
              $instance = $fresh;
          }
          return $instance;
      }

  }
  2868. // if want to implement error collecting here, we'll need to use some sort
  2869. // of global data (probably trigger_error) because it's impossible to pass
  2870. // $config or $context to the callback functions.
  2871. /**
  2872. * Handles referencing and dereferencing character entities
  2873. */
  class HTMLPurifier_EntityParser
  {

      /**
       * Reference to entity lookup table. Fetched lazily the first time a
       * named, non-special entity has to be resolved.
       */
      protected $_entity_lookup;

      /**
       * Callback regex string for parsing entities.
       * Capture groups: 1. hex digits  2. decimal digits (leading zeros
       * stripped)  3. entity name (XML style). The trailing semicolon is
       * optional.
       */
      protected $_substituteEntitiesRegex =
  '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
  //     1. hex             2. dec      3. string (XML style)

      /**
       * Decimal to parsed string conversion table for special entities.
       */
      protected $_special_dec2str =
              array(
                      34 => '"',
                      38 => '&',
                      39 => "'",
                      60 => '<',
                      62 => '>'
              );

      /**
       * Stripped entity names to decimal conversion table for special entities.
       */
      protected $_special_ent2dec =
              array(
                      'quot' => 34,
                      'amp'  => 38,
                      'lt'   => 60,
                      'gt'   => 62
              );

      /**
       * Substitutes non-special entities with their parsed equivalents. This
       * is run once up front, before everything else, so it does not have to
       * be repeated for every parsed piece of character data.
       *
       * @param $string String to have non-special entities parsed.
       * @returns Parsed string.
       */
      public function substituteNonSpecialEntities($string) {
          // it will try to detect missing semicolons, but don't rely on it
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'nonSpecialEntityCallback'),
              $string
              );
      }

      /**
       * Callback function for substituteNonSpecialEntities() that does the work.
       *
       * @param $matches  PCRE matches array, with 0 the entire match, and
       *                  either index 1, 2 or 3 set with a hex value, dec value,
       *                  or string (respectively).
       * @returns Replacement string. Special entities and unknown names are
       *          returned unchanged.
       */
      protected function nonSpecialEntityCallback($matches) {
          // replaces all but big five
          $entity = $matches[0];
          // A match is always '&' plus at least one more character, so
          // offset 1 exists; offset 2 exists for every numeric match.
          $is_num = ($entity[1] === '#');
          if ($is_num) {
              $is_hex = (isset($entity[2]) && $entity[2] === 'x');
              $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];

              // abort for special characters
              if (isset($this->_special_dec2str[$code]))  return $entity;

              return HTMLPurifier_Encoder::unichr($code);
          } else {
              if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
              if (!$this->_entity_lookup) {
                  $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
              }
              if (isset($this->_entity_lookup->table[$matches[3]])) {
                  return $this->_entity_lookup->table[$matches[3]];
              } else {
                  return $entity;
              }
          }
      }

      /**
       * Substitutes only special entities with their parsed equivalents.
       *
       * @notice We try to avoid calling this function because otherwise, it
       * would have to be called a lot (for every parsed section).
       *
       * @param $string String to have special entities parsed.
       * @returns Parsed string.
       */
      public function substituteSpecialEntities($string) {
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'specialEntityCallback'),
              $string);
      }

      /**
       * Callback function for substituteSpecialEntities() that does the work.
       *
       * This callback has same syntax as nonSpecialEntityCallback().
       *
       * @param $matches  PCRE-style matches array, with 0 the entire match, and
       *                  either index 1, 2 or 3 set with a hex value, dec value,
       *                  or string (respectively).
       * @returns Replacement string: the literal character for a special
       *          entity, otherwise the entity text unchanged.
       */
      protected function specialEntityCallback($matches) {
          $entity = $matches[0];
          $is_num = ($entity[1] === '#');
          if ($is_num) {
              $is_hex = (isset($entity[2]) && $entity[2] === 'x');
              $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
              return isset($this->_special_dec2str[$int]) ?
                  $this->_special_dec2str[$int] :
                  $entity;
          } else {
              if (!isset($this->_special_ent2dec[$matches[3]])) {
                  return $entity;
              }
              // Name -> decimal code -> character. Returning the decimal
              // code itself would turn "&amp;" into the text "38".
              return $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]];
          }
      }

  }
  2994. /**
  2995. * Error collection class that enables HTML Purifier to report HTML
  2996. * problems back to the user
  2997. */
  class HTMLPurifier_ErrorCollector
  {

      /**
       * Indexes into each entry of the raw error array. They are numeric on
       * purpose so that list() can unpack an entry.
       */
      const LINENO   = 0;
      const SEVERITY = 1;
      const MESSAGE  = 2;
      const CHILDREN = 3;

      protected $errors;
      protected $_current;
      protected $_stacks = array(array());
      protected $locale;
      protected $generator;
      protected $context;

      // ErrorStructs keyed by [line][column]; key -1 holds the single struct
      // used when no integer line/column pair is available.
      protected $lines = array();

      /**
       * @param $context Context object; 'Locale' is fetched from it here and
       *                 the current token/line/column/attribute are read
       *                 from it on every send().
       */
      public function __construct($context) {
          $this->locale   =& $context->get('Locale');
          $this->context  = $context;
          // Both the write cursor and the public error list alias the
          // bottom stack.
          $this->_current =& $this->_stacks[0];
          $this->errors   =& $this->_stacks[0];
      }

      /**
       * Sends an error message to the collector for later use
       * @param $severity int Error severity, PHP error style (don't use E_USER_)
       * @param $msg string Error message text
       * @param $subst1 string First substitution for $msg
       * @param $subst2 string ...
       */
      public function send($severity, $msg) {

          // Extra arguments become substitutions numbered from 1: drop
          // $severity, re-index, then discard $msg sitting at index 0.
          $args = array_slice(func_get_args(), 1);
          unset($args[0]);

          // Where did it happen? A current token wins over loose line/col.
          $token = $this->context->get('CurrentToken', true);
          if ($token) {
              $line = $token->line;
              $col  = $token->col;
          } else {
              $line = $this->context->get('CurrentLine', true);
              $col  = $this->context->get('CurrentCol', true);
          }
          $attr = $this->context->get('CurrentAttr', true);

          // perform special substitutions, also add custom parameters
          $subst = array();
          if ($token !== null) {
              $args['CurrentToken'] = $token;
          }
          if ($attr !== null) {
              $subst['$CurrentAttr.Name'] = $attr;
              if (isset($token->attr[$attr])) {
                  $subst['$CurrentAttr.Value'] = $token->attr[$attr];
              }
          }

          $msg = empty($args)
              ? $this->locale->getMessage($msg)
              : $this->locale->formatMessage($msg, $args);

          if (!empty($subst)) {
              $msg = strtr($msg, $subst);
          }

          // Flat (numerically indexed) record, as returned by getRaw().
          $this->_current[] = array(
              self::LINENO   => $line,
              self::SEVERITY => $severity,
              self::MESSAGE  => $msg,
              self::CHILDREN => array()
          );

          // Structured record, as rendered by getHTMLFormatted().
          // Top-level errors are either:
          //  TOKEN type, if $value is set appropriately, or
          //  "syntax" type, if $value is null
          $new_struct = new HTMLPurifier_ErrorStruct();
          $new_struct->type = HTMLPurifier_ErrorStruct::TOKEN;
          if ($token) {
              $new_struct->value = clone $token;
          }

          if (is_int($line) && is_int($col)) {
              if (!isset($this->lines[$line][$col])) {
                  $this->lines[$line][$col] = $new_struct;
              }
              $struct = $this->lines[$line][$col];
              // These ksorts may present a performance problem
              ksort($this->lines[$line], SORT_NUMERIC);
          } else {
              if (!isset($this->lines[-1])) {
                  $this->lines[-1] = $new_struct;
              }
              $struct = $this->lines[-1];
          }
          ksort($this->lines, SORT_NUMERIC);

          // Now, check if we need to operate on a lower structure
          if (!empty($attr)) {
              $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
              if (!$struct->value) {
                  $struct->value = array($attr, 'PUT VALUE HERE');
              }
          }
          // $cssprop is never assigned in this method, so this branch does
          // not run.
          if (!empty($cssprop)) {
              $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
              if (!$struct->value) {
                  // if we tokenize CSS this might be a little more difficult to do
                  $struct->value = array($cssprop, 'PUT VALUE HERE');
              }
          }

          // Ok, structs are all setup, now time to register the error
          $struct->addError($severity, $msg);
      }

      /**
       * Retrieves raw error data for custom formatter to use
       * @return List of arrays in format of array(line of error,
       *         error severity, error message,
       *         recursive sub-errors array)
       */
      public function getRaw() {
          return $this->errors;
      }

      /**
       * Default HTML formatting implementation for error messages
       * @param $config Configuration array, vital for HTML output nature
       * @param $errors Errors array; only consulted to decide whether the
       *                "no errors" message is returned. Defaults to the
       *                collected errors.
       */
      public function getHTMLFormatted($config, $errors = null) {
          $ret = array();

          $this->generator = new HTMLPurifier_Generator($config, $this->context);
          if ($errors === null) {
              $errors = $this->errors;
          }

          // Positioned errors first, in line/column order ...
          foreach ($this->lines as $line => $col_array) {
              if ($line == -1) {
                  continue;
              }
              foreach ($col_array as $col => $struct) {
                  $this->_renderStruct($ret, $struct, $line, $col);
              }
          }
          // ... then the unpositioned bucket.
          if (isset($this->lines[-1])) {
              $this->_renderStruct($ret, $this->lines[-1]);
          }

          if (empty($errors)) {
              return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
          }
          return '<ul><li>' . implode('</li><li>', $ret) . '</li></ul>';
      }

      /**
       * Appends one HTML fragment per error found in $struct and its
       * descendants to $ret, walking the tree with an explicit stack.
       * @param $ret    Output list, passed by reference.
       * @param $struct Root ErrorStruct to render.
       * @param $line   Line to report, or null.
       * @param $col    Column to report, or null. Unless both are given the
       *                location reads "End of Document".
       */
      private function _renderStruct(&$ret, $struct, $line = null, $col = null) {
          $pending = array($struct);
          $ancestry_stack = array(array());
          while ($node = array_pop($pending)) {
              $ancestry = array_pop($ancestry_stack);
              foreach ($node->errors as $entry) {
                  list($severity, $msg) = $entry;
                  // W3C uses an icon to indicate the severity of the error.
                  $label = $this->locale->getErrorName($severity);
                  $html  = '<div>';
                  $html .= "<span class=\"error e$severity\"><strong>$label</strong></span> ";
                  if ($line !== null && $col !== null) {
                      $html .= "<em class=\"location\">Line $line, Column $col: </em> ";
                  } else {
                      $html .= '<em class="location">End of Document: </em> ';
                  }
                  $html .= '<strong class="description">' . $this->generator->escape($msg) . '</strong> ';
                  $html .= '</div>';
                  // A marker for the offending column could be added here;
                  // extremely long lines would need to be clipped.
                  $ret[] = $html;
              }
              foreach ($node->children as $type => $array) {
                  $ancestry[] = $node;
                  $pending = array_merge($pending, array_reverse($array, true));
                  for ($i = count($array); $i > 0; $i--) {
                      $ancestry_stack[] = $ancestry;
                  }
              }
          }
      }

  }
  3174. /**
  3175. * Records errors for particular segments of an HTML document such as tokens,
  3176. * attributes or CSS properties. They can contain error structs (which apply
  3177. * to components of what they represent), but their main purpose is to hold
  3178. * errors applying to whatever struct is being used.
  3179. */
  class HTMLPurifier_ErrorStruct
  {

      /**
       * Struct types; also used as the first-level key of $children.
       * Top-level structures are automatically token-level.
       */
      const TOKEN     = 0;
      const ATTR      = 1;
      const CSSPROP   = 2;

      /**
       * Type of this struct (one of the constants above).
       */
      public $type;

      /**
       * The thing errors are being recorded for, by type:
       *  - TOKEN: Instance of HTMLPurifier_Token
       *  - ATTR: array('attr-name', 'value')
       *  - CSSPROP: array('prop-name', 'value')
       */
      public $value;

      /**
       * Errors registered for this structure, each array(severity, message).
       */
      public $errors = array();

      /**
       * Child ErrorStructs hanging off this one, e.g. a TOKEN struct holds
       * ATTR structs. Two-dimensional: [TYPE]['identifier'].
       */
      public $children = array();

      /**
       * Returns the child struct of the given type and identifier, creating
       * and registering it (with its type set) on first request.
       */
      public function getChild($type, $id) {
          if (isset($this->children[$type][$id])) {
              return $this->children[$type][$id];
          }
          $child = new HTMLPurifier_ErrorStruct();
          $child->type = $type;
          $this->children[$type][$id] = $child;
          return $child;
      }

      /**
       * Records an error against this struct.
       */
      public function addError($severity, $message) {
          array_push($this->errors, array($severity, $message));
      }

  }
  3222. /**
  3223. * Global exception class for HTML Purifier; any exceptions we throw
  3224. * are from here.
  3225. */
  class HTMLPurifier_Exception extends Exception {
      // Deliberately empty: this subclass adds nothing to Exception and
      // exists only so that callers can catch this specific type.
  }
  3229. /**
  3230. * Represents a pre or post processing filter on HTML Purifier's output
  3231. *
  3232. * Sometimes, a little ad-hoc fixing of HTML has to be done before
  3233. * it gets sent through HTML Purifier: you can use filters to achieve
  3234. * this effect. For instance, YouTube videos can be preserved using
  3235. * this manner. You could have used a decorator for this task, but
  3236. * PHP's support for them is not terribly robust, so we're going
  3237. * to just loop through the filters.
  3238. *
  3239. * Filters should be exited first in, last out. If there are three filters,
  3240. * named 1, 2 and 3, the order of execution should go 1->preFilter,
  3241. * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
  3242. * 1->postFilter.
  3243. *
  3244. * @note Methods are not declared abstract as it is perfectly legitimate
  3245. * for an implementation not to want anything to happen on a step
  3246. */
  class HTMLPurifier_Filter
  {

      /**
       * Identifying name of the filter.
       */
      public $name;

      /**
       * Hook run on the HTML before HTML Purifier processes it. This base
       * implementation hands the input back untouched.
       * @param $html HTML to pre-process
       * @param $config Configuration object (unused here)
       * @param $context Context object (unused here)
       * @return The HTML, unchanged
       */
      public function preFilter($html, $config, $context)
      {
          return $html;
      }

      /**
       * Hook run on the HTML after HTML Purifier processed it. This base
       * implementation hands the input back untouched.
       * @param $html HTML to post-process
       * @param $config Configuration object (unused here)
       * @param $context Context object (unused here)
       * @return The HTML, unchanged
       */
      public function postFilter($html, $config, $context)
      {
          return $html;
      }

  }
  3266. /**
  3267. * Generates HTML from tokens.
  3268. * @todo Refactor interface so that configuration/context is determined
  3269. * upon instantiation, no need for messy generateFromTokens() calls
  3270. * @todo Make some of the more internal functions protected, and have
  3271. * unit tests work around that
  3272. */
  class HTMLPurifier_Generator
  {

      /**
       * Whether or not generator should produce XML output
       */
      private $_xhtml = true;

      /**
       * :HACK: Whether or not generator should comment the insides of <script> tags
       */
      private $_scriptFix = false;

      /**
       * Cache of HTMLDefinition during HTML output to determine whether or
       * not attributes should be minimized.
       */
      private $_def;

      /**
       * Cache of %Output.SortAttr
       */
      private $_sortAttr;

      /**
       * Cache of %Output.FlashCompat
       */
      private $_flashCompat;

      /**
       * Stack of open <object> elements (their attributes and the <param>
       * name/value pairs seen so far), used to emit the IE compatibility
       * <embed> when the object is closed.
       */
      private $_flashStack = array();

      /**
       * Configuration for the generator
       */
      protected $config;

      /**
       * Caches the output-related directives and the HTML definition.
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context (not used here)
       */
      public function __construct($config, $context) {
          $this->config       = $config;
          $this->_scriptFix   = $config->get('Output.CommentScriptContents');
          $this->_sortAttr    = $config->get('Output.SortAttr');
          $this->_flashCompat = $config->get('Output.FlashCompat');
          $this->_def         = $config->getHTMLDefinition();
          $this->_xhtml       = $this->_def->doctype->xml;
      }

      /**
       * Generates HTML from an array of tokens.
       * @param $tokens Array of HTMLPurifier_Token
       * @return Generated HTML; empty string for an empty token list
       */
      public function generateFromTokens($tokens) {
          if (!$tokens) return '';

          // Basic algorithm
          $html = '';
          for ($i = 0, $size = count($tokens); $i < $size; $i++) {
              if ($this->_scriptFix && $tokens[$i]->name === 'script'
                  && $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
                  // script special case
                  // the contents of the script block must be ONE token
                  // for this to work.
                  $html .= $this->generateFromToken($tokens[$i++]);
                  $html .= $this->generateScriptFromToken($tokens[$i++]);
              }
              // after the special case this is the closing script tag
              $html .= $this->generateFromToken($tokens[$i]);
          }

          // Tidy cleanup
          if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
              $tidy = new Tidy;
              $tidy->parseString($html, array(
                  'indent'         => true,
                  'output-xhtml'   => $this->_xhtml,
                  'show-body-only' => true,
                  'indent-spaces'  => 2,
                  'wrap'           => 68,
              ), 'utf8');
              $tidy->cleanRepair();
              $html = (string) $tidy; // explicit cast necessary
          }

          // Normalize newlines to system defined value
          if ($this->config->get('Core.NormalizeNewlines')) {
              $nl = $this->config->get('Output.Newline');
              if ($nl === null) $nl = PHP_EOL;
              if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
          }
          return $html;
      }

      /**
       * Generates HTML from a single token.
       * @param $token HTMLPurifier_Token object.
       * @return Generated HTML; empty string (plus an E_USER_WARNING) when
       *         $token is not an HTMLPurifier_Token
       */
      public function generateFromToken($token) {
          if (!$token instanceof HTMLPurifier_Token) {
              trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
              return '';

          } elseif ($token instanceof HTMLPurifier_Token_Start) {
              $attr = $this->generateAttributes($token->attr, $token->name);
              if ($this->_flashCompat) {
                  if ($token->name == "object") {
                      // remember the object so its params can be collected
                      $flash = new stdclass();
                      $flash->attr = $token->attr;
                      $flash->param = array();
                      $this->_flashStack[] = $flash;
                  }
              }
              return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';

          } elseif ($token instanceof HTMLPurifier_Token_End) {
              $_extra = '';
              if ($this->_flashCompat) {
                  if ($token->name == "object" && !empty($this->_flashStack)) {
                      // build an <embed> mirroring the object, wrapped in an
                      // IE conditional comment
                      $flash = array_pop($this->_flashStack);
                      $compat_token = new HTMLPurifier_Token_Empty("embed");
                      foreach ($flash->attr as $name => $val) {
                          if ($name == "classid") continue;
                          if ($name == "type") continue;
                          if ($name == "data") $name = "src";
                          $compat_token->attr[$name] = $val;
                      }
                      foreach ($flash->param as $name => $val) {
                          if ($name == "movie") $name = "src";
                          $compat_token->attr[$name] = $val;
                      }
                      $_extra = "<!--[if IE]>".$this->generateFromToken($compat_token)."<![endif]-->";
                  }
              }
              return $_extra . '</' . $token->name . '>';

          } elseif ($token instanceof HTMLPurifier_Token_Empty) {
              // Only record params that actually carry a name: a nameless
              // param would otherwise become an attribute with an empty name
              // on the <embed>. A missing value is recorded as an empty
              // string instead of reading an undefined array index.
              if ($this->_flashCompat && $token->name == "param" && !empty($this->_flashStack)
                  && isset($token->attr['name'])) {
                  $value = isset($token->attr['value']) ? $token->attr['value'] : '';
                  $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $value;
              }
              $attr = $this->generateAttributes($token->attr, $token->name);
              return '<' . $token->name . ($attr ? ' ' : '') . $attr .
                  ( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
                  . '>';

          } elseif ($token instanceof HTMLPurifier_Token_Text) {
              return $this->escape($token->data, ENT_NOQUOTES);

          } elseif ($token instanceof HTMLPurifier_Token_Comment) {
              return '<!--' . $token->data . '-->';

          } else {
              return '';
          }
      }

      /**
       * Special case processor for the contents of script tags
       * @param $token Token holding the script contents; anything that is
       *        not a text token is passed on to generateFromToken()
       * @return Script contents wrapped in comment/CDATA guards
       * @warning This runs into problems if there's already a literal
       *          --> somewhere inside the script contents.
       */
      public function generateScriptFromToken($token) {
          if (!$token instanceof HTMLPurifier_Token_Text) return $this->generateFromToken($token);
          // Thanks <http://lachy.id.au/log/2005/05/script-comments>
          // drop a trailing "//" so it does not double up with the guard
          $data = preg_replace('#//\s*$#', '', $token->data);
          return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
      }

      /**
       * Generates attribute declarations from attribute array.
       * @note This does not include the leading or trailing space.
       * @param $assoc_array_of_attributes Attribute array
       * @param $element Name of element attributes are for, used to check
       *        attribute minimization.
       * @return Generated HTML fragment for insertion.
       */
      public function generateAttributes($assoc_array_of_attributes, $element = false) {
          $html = '';
          if ($this->_sortAttr) ksort($assoc_array_of_attributes);
          foreach ($assoc_array_of_attributes as $key => $value) {
              if (!$this->_xhtml) {
                  // Remove namespaced attributes
                  if (strpos($key, ':') !== false) continue;
                  // Check if we should minimize the attribute: val="val" -> val
                  if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
                      $html .= $key . ' ';
                      continue;
                  }
              }
              $html .= $key.'="'.$this->escape($value).'" ';
          }
          return rtrim($html);
      }

      /**
       * Escapes raw text data.
       * @todo This really ought to be protected, but until we have a facility
       *       for properly generating HTML here w/o using tokens, it stays
       *       public.
       * @param $string String data to escape for HTML.
       * @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
       *               permissible for non-attribute output. Null means
       *               ENT_COMPAT.
       * @return String escaped data.
       */
      public function escape($string, $quote = null) {
          // Workaround for APC bug on Mac Leopard reported by sidepodcast
          // http://htmlpurifier.org/phorum/read.php?3,4823,4846
          if ($quote === null) $quote = ENT_COMPAT;
          return htmlspecialchars($string, $quote, 'UTF-8');
      }

  }
  3468. /**
  3469. * Definition of the purified HTML that describes allowed children,
  3470. * attributes, and many other things.
  3471. *
  3472. * Conventions:
  3473. *
  3474. * All member variables that are prefixed with info
  3475. * (including the main $info array) are used by HTML Purifier internals
  3476. * and should not be directly edited when customizing the HTMLDefinition.
  3477. * They can usually be set via configuration directives or custom
  3478. * modules.
  3479. *
  3480. * On the other hand, member variables without the info prefix are used
  3481. * internally by the HTMLDefinition and MUST NOT be used by other HTML
  3482. * Purifier internals. Many of them, however, are public, and may be
  3483. * edited by userspace code to tweak the behavior of HTMLDefinition.
  3484. *
  3485. * @note This class is inspected by Printer_HTMLDefinition; please
  3486. * update that class if things here change.
  3487. *
  3488. * @warning Directives that change this object's structure must be in
  3489. * the HTML or Attr namespace!
  3490. */
  class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
  {

      // FULLY-PUBLIC VARIABLES ---------------------------------------------

      /**
       * Associative array of element names to HTMLPurifier_ElementDef
       */
      public $info = array();

      /**
       * Associative array of global attribute name to attribute definition.
       */
      public $info_global_attr = array();

      /**
       * String name of parent element HTML will be going into.
       */
      public $info_parent = 'div';

      /**
       * Definition for parent element, allows parent element to be a
       * tag that's not allowed inside the HTML fragment.
       */
      public $info_parent_def;

      /**
       * String name of element used to wrap inline elements in block context
       * @note This is rarely used except for BLOCKQUOTEs in strict mode
       */
      public $info_block_wrapper = 'p';

      /**
       * Associative array of deprecated tag name to HTMLPurifier_TagTransform
       */
      public $info_tag_transform = array();

      /**
       * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
       */
      public $info_attr_transform_pre = array();

      /**
       * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
       */
      public $info_attr_transform_post = array();

      /**
       * Nested lookup array of content set name (Block, Inline) to
       * element name to whether or not it belongs in that content set.
       */
      public $info_content_sets = array();

      /**
       * Indexed list of HTMLPurifier_Injector to be used.
       */
      public $info_injector = array();

      /**
       * Doctype object
       */
      public $doctype;


      // RAW CUSTOMIZATION STUFF --------------------------------------------

      /**
       * Adds a custom attribute to a pre-existing element
       * @note This is strictly convenience, and does not have a corresponding
       *       method in HTMLPurifier_HTMLModule
       * @param $element_name String element name to add attribute to
       * @param $attr_name String name of attribute
       * @param $def Attribute definition, can be string or object, see
       *             HTMLPurifier_AttrTypes for details
       */
      public function addAttribute($element_name, $attr_name, $def) {
          $module = $this->getAnonymousModule();
          if (!isset($module->info[$element_name])) {
              $element = $module->addBlankElement($element_name);
          } else {
              $element = $module->info[$element_name];
          }
          $element->attr[$attr_name] = $def;
      }

      /**
       * Adds a custom element to your HTML definition
       * @note See HTMLPurifier_HTMLModule::addElement for detailed
       *       parameter and return value descriptions.
       */
      public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array()) {
          $module = $this->getAnonymousModule();
          // assume that if the user is calling this, the element
          // is safe. This may not be a good idea
          $element = $module->addElement($element_name, $type, $contents, $attr_collections, $attributes);
          return $element;
      }

      /**
       * Adds a blank element to your HTML definition, for overriding
       * existing behavior
       * @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
       *       parameter and return value descriptions.
       */
      public function addBlankElement($element_name) {
          $module  = $this->getAnonymousModule();
          $element = $module->addBlankElement($element_name);
          return $element;
      }

      /**
       * Retrieves a reference to the anonymous module, so you can
       * bust out advanced features without having to make your own
       * module. The module is created on first use.
       */
      public function getAnonymousModule() {
          if (!$this->_anonModule) {
              $this->_anonModule = new HTMLPurifier_HTMLModule();
              $this->_anonModule->name = 'Anonymous';
          }
          return $this->_anonModule;
      }

      private $_anonModule;


      // PUBLIC BUT INTERNAL VARIABLES --------------------------------------

      public $type = 'HTML';
      public $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */

      /**
       * Performs low-cost, preliminary initialization.
       */
      public function __construct() {
          $this->manager = new HTMLPurifier_HTMLModuleManager();
      }

      /**
       * Builds the definition from the module manager and the configuration,
       * then discards the manager and the per-element content model fields.
       * @param $config Instance of HTMLPurifier_Config
       */
      protected function doSetup($config) {
          $this->processModules($config);
          $this->setupConfigStuff($config);

          unset($this->manager);

          // cleanup some of the element definitions
          foreach ($this->info as $k => $v) {
              unset($this->info[$k]->content_model);
              unset($this->info[$k]->content_model_type);
          }
      }

      /**
       * Extract out the information from the manager
       * @param $config Instance of HTMLPurifier_Config
       */
      protected function processModules($config) {

          if ($this->_anonModule) {
              // for user specific changes
              // this is late-loaded so we don't have to deal with PHP4
              // reference wonky-ness
              $this->manager->addModule($this->_anonModule);
              unset($this->_anonModule);
          }

          $this->manager->setup($config);
          $this->doctype = $this->manager->doctype;

          // Each of these module arrays is merged into the property of the
          // same name; a value of false removes an entry set by an earlier
          // module.
          $merged = array(
              'info_tag_transform',
              'info_attr_transform_pre',
              'info_attr_transform_post',
              'info_injector',
          );
          foreach ($this->manager->modules as $module) {
              foreach ($merged as $prop) {
                  foreach ($module->$prop as $k => $v) {
                      if ($v === false) unset($this->{$prop}[$k]);
                      else $this->{$prop}[$k] = $v;
                  }
              }
          }

          $this->info = $this->manager->getElements();
          $this->info_content_sets = $this->manager->contentSets->lookup;

      }

      /**
       * Sets up stuff based on config. We need a better way of doing this.
       *
       * Applies %HTML.BlockWrapper, %HTML.Parent, the allowed
       * elements/attributes lists (%HTML.AllowedElements,
       * %HTML.AllowedAttributes or %HTML.Allowed), the forbidden lists
       * (%HTML.ForbiddenElements, %HTML.ForbiddenAttributes) and finally
       * prunes injectors whose checkNeeded() does not return false.
       * Problems are reported through trigger_error().
       * @param $config Instance of HTMLPurifier_Config
       */
      protected function setupConfigStuff($config) {

          $block_wrapper = $config->get('HTML.BlockWrapper');
          if (isset($this->info_content_sets['Block'][$block_wrapper])) {
              $this->info_block_wrapper = $block_wrapper;
          } else {
              trigger_error('Cannot use non-block element as block wrapper',
                  E_USER_ERROR);
          }

          $parent = $config->get('HTML.Parent');
          $def = $this->manager->getElement($parent, true);
          if ($def) {
              $this->info_parent = $parent;
              $this->info_parent_def = $def;
          } else {
              trigger_error('Cannot use unrecognized element as parent',
                  E_USER_ERROR);
              // only reached when the error handler lets execution continue
              $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
          }

          // support template text
          $support = "(for information on implementing this, see the ".
                     "support forums) ";

          // setup allowed elements -----------------------------------------

          $allowed_elements = $config->get('HTML.AllowedElements');
          $allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early

          if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
              $allowed = $config->get('HTML.Allowed');
              if (is_string($allowed)) {
                  list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
              }
          }

          if (is_array($allowed_elements)) {
              foreach ($this->info as $name => $d) {
                  if(!isset($allowed_elements[$name])) unset($this->info[$name]);
                  unset($allowed_elements[$name]);
              }
              // emit errors: whatever is left was never a known element
              foreach ($allowed_elements as $element => $d) {
                  $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
                  trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
              }
          }

          // setup allowed attributes ---------------------------------------

          $allowed_attributes_mutable = $allowed_attributes; // by copy!
          if (is_array($allowed_attributes)) {

              // This actually doesn't do anything, since we went away from
              // global attributes. It's possible that userland code uses
              // it, but HTMLModuleManager doesn't!
              foreach ($this->info_global_attr as $attr => $x) {
                  $keys = array($attr, "*@$attr", "*.$attr");
                  $delete = true;
                  foreach ($keys as $key) {
                      if ($delete && isset($allowed_attributes[$key])) {
                          $delete = false;
                      }
                      if (isset($allowed_attributes_mutable[$key])) {
                          unset($allowed_attributes_mutable[$key]);
                      }
                  }
                  if ($delete) unset($this->info_global_attr[$attr]);
              }

              foreach ($this->info as $tag => $info) {
                  foreach ($info->attr as $attr => $x) {
                      $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
                      $delete = true;
                      foreach ($keys as $key) {
                          if ($delete && isset($allowed_attributes[$key])) {
                              $delete = false;
                          }
                          if (isset($allowed_attributes_mutable[$key])) {
                              unset($allowed_attributes_mutable[$key]);
                          }
                      }
                      if ($delete) {
                          if ($this->info[$tag]->attr[$attr]->required) {
                              trigger_error("Required attribute '$attr' in element '$tag' was not allowed, which means '$tag' will not be allowed either", E_USER_WARNING);
                          }
                          unset($this->info[$tag]->attr[$attr]);
                      }
                  }
              }

              // emit errors: entries still in the mutable copy matched nothing
              foreach ($allowed_attributes_mutable as $elattr => $d) {
                  $bits = preg_split('/[.@]/', $elattr, 2);
                  if (count($bits) === 2 && $bits[0] !== '*') {
                      // look the element up by its raw name; escape only
                      // what goes into the message
                      $known     = isset($this->info[$bits[0]]);
                      $element   = htmlspecialchars($bits[0]);
                      $attribute = htmlspecialchars($bits[1]);
                      if (!$known) {
                          trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support",
                              E_USER_WARNING);
                      } else {
                          trigger_error("Attribute '$attribute' in element '$element' not supported $support",
                              E_USER_WARNING);
                      }
                  } else {
                      // "attr", "*.attr" or "*@attr": the attribute name is
                      // always the last piece, never the "*" wildcard
                      $attribute = htmlspecialchars($bits[count($bits) - 1]);
                      trigger_error("Global attribute '$attribute' is not ".
                          "supported in any elements $support",
                          E_USER_WARNING);
                  }
              }

          }

          // setup forbidden elements ---------------------------------------

          $forbidden_elements   = $config->get('HTML.ForbiddenElements');
          $forbidden_attributes = $config->get('HTML.ForbiddenAttributes');

          foreach ($this->info as $tag => $info) {
              if (isset($forbidden_elements[$tag])) {
                  unset($this->info[$tag]);
                  continue;
              }
              foreach ($info->attr as $attr => $x) {
                  if (
                      isset($forbidden_attributes["$tag@$attr"]) ||
                      isset($forbidden_attributes["*@$attr"]) ||
                      isset($forbidden_attributes[$attr])
                  ) {
                      unset($this->info[$tag]->attr[$attr]);
                      continue;
                  } // this segment might get removed eventually
                  elseif (isset($forbidden_attributes["$tag.$attr"])) {
                      // $tag.$attr are not user supplied, so no worries!
                      trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
                  }
              }
          }
          foreach ($forbidden_attributes as $key => $v) {
              // integer-like keys come back from PHP arrays as ints
              $key = (string) $key;
              if (strlen($key) < 2) continue;
              if ($key[0] != '*') continue;
              if ($key[1] == '.') {
                  trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
              }
          }

          // setup injectors -----------------------------------------------------
          foreach ($this->info_injector as $i => $injector) {
              if ($injector->checkNeeded($config) !== false) {
                  // remove injector that does not have its required
                  // elements/attributes present, and is thus not needed.
                  unset($this->info_injector[$i]);
              }
          }
      }

      /**
       * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
       * separate lists for processing. Format is element[attr1|attr2],element2...
       * Spaces and tabs are ignored; entries are separated by commas or
       * newlines. A "*" element contributes attributes only. Empty attribute
       * names (as in "a[]") are skipped, and the closing "]" is only removed
       * when it is actually present.
       * @warning Although it's largely drawn from TinyMCE's implementation,
       *      it is different, and you'll probably have to modify your lists
       * @param $list String list to parse
       * @return array($allowed_elements, $allowed_attributes), both lookup
       *         arrays with true values; attribute keys are "element.attr"
       * @todo Give this its own class, probably static interface
       */
      public function parseTinyMCEAllowedList($list) {

          $list = str_replace(array(' ', "\t"), '', $list);

          $elements = array();
          $attributes = array();

          $chunks = preg_split('/(,|[\n\r]+)/', $list);
          foreach ($chunks as $chunk) {
              if (empty($chunk)) continue;
              // A chunk without "[" (or one starting with it) is taken as a
              // bare element name.
              if (!strpos($chunk, '[')) {
                  $element = $chunk;
                  $attr = false;
              } else {
                  list($element, $attr) = explode('[', $chunk);
              }
              if ($element !== '*') $elements[$element] = true;
              if (!$attr) continue;
              // remove trailing ], but never eat a real character when the
              // bracket was left unclosed
              if (substr($attr, -1) === ']') {
                  $attr = substr($attr, 0, -1);
              }
              foreach (explode('|', $attr) as $key) {
                  if ($key === '') continue;
                  $attributes["$element.$key"] = true;
              }
          }

          return array($elements, $attributes);

      }

  }
  3832. /**
  3833. * Represents an XHTML 1.1 module, with information on elements, tags
  3834. * and attributes.
  3835. * @note Even though this is technically XHTML 1.1, it is also used for
  3836. * regular HTML parsing. We are using modularization as a convenient
  3837. * way to represent the internals of HTMLDefinition, and our
  3838. * implementation is by no means conforming and does not directly
  3839. * use the normative DTDs or XML schemas.
  3840. * @note The public variables in a module should almost directly
  3841. * correspond to the variables in HTMLPurifier_HTMLDefinition.
  3842. * However, the prefix info carries no special meaning in these
  3843. * objects (include it anyway if that's the correspondence though).
  3844. * @todo Consider making some member functions protected
  3845. */
  3846. class HTMLPurifier_HTMLModule
  3847. {
  3848. // -- Overloadable ----------------------------------------------------
  3849. /**
  3850. * Short unique string identifier of the module
  3851. */
  3852. public $name;
  3853. /**
  3854. * Informally, a list of elements this module changes. Not used in
  3855. * any significant way.
  3856. */
  3857. public $elements = array();
  3858. /**
  3859. * Associative array of element names to element definitions.
  3860. * Some definitions may be incomplete, to be merged in later
  3861. * with the full definition.
  3862. */
  3863. public $info = array();
  3864. /**
  3865. * Associative array of content set names to content set additions.
  3866. * This is commonly used to, say, add an A element to the Inline
  3867. * content set. This corresponds to an internal variable $content_sets
  3868. * and NOT info_content_sets member variable of HTMLDefinition.
  3869. */
  3870. public $content_sets = array();
  3871. /**
  3872. * Associative array of attribute collection names to attribute
  3873. * collection additions. More rarely used for adding attributes to
  3874. * the global collections. Example is the StyleAttribute module adding
  3875. * the style attribute to the Core. Corresponds to HTMLDefinition's
  3876. * attr_collections->info, since the object's data is only info,
  3877. * with extra behavior associated with it.
  3878. */
  3879. public $attr_collections = array();
  3880. /**
  3881. * Associative array of deprecated tag name to HTMLPurifier_TagTransform
  3882. */
  3883. public $info_tag_transform = array();
  3884. /**
  3885. * List of HTMLPurifier_AttrTransform to be performed before validation.
  3886. */
  3887. public $info_attr_transform_pre = array();
  3888. /**
  3889. * List of HTMLPurifier_AttrTransform to be performed after validation.
  3890. */
  3891. public $info_attr_transform_post = array();
  3892. /**
  3893. * List of HTMLPurifier_Injector to be performed during well-formedness fixing.
  3894. * An injector will only be invoked if all of its prerequisites are met;
  3895. * if an injector fails setup, there will be no error; it will simply be
  3896. * silently disabled.
  3897. */
  3898. public $info_injector = array();
  3899. /**
  3900. * Boolean flag that indicates whether or not getChildDef is implemented.
  3901. * For optimization reasons: may save a call to a function. Be sure
  3902. * to set it if you do implement getChildDef(), otherwise it will have
  3903. * no effect!
  3904. */
  3905. public $defines_child_def = false;
  3906. /**
  3907. * Boolean flag whether or not this module is safe. If it is not safe, all
  3908. * of its members are unsafe. Modules are safe by default (this might be
  3909. * slightly dangerous, but it doesn't make much sense to force HTML Purifier,
  3910. * which is based off of safe HTML, to explicitly say, "This is safe," even
  3911. * though there are modules which are "unsafe")
  3912. *
  3913. * @note Previously, safety could be applied at an element level granularity.
  3914. * We've removed this ability, so in order to add "unsafe" elements
  3915. * or attributes, a dedicated module with this property set to false
  3916. * must be used.
  3917. */
  3918. public $safe = true;
  3919. /**
  3920. * Retrieves a proper HTMLPurifier_ChildDef subclass based on
  3921. * content_model and content_model_type member variables of
  3922. * the HTMLPurifier_ElementDef class. There is a similar function
  3923. * in HTMLPurifier_HTMLDefinition.
  3924. * @param $def HTMLPurifier_ElementDef instance
  3925. * @return HTMLPurifier_ChildDef subclass
  3926. */
  public function getChildDef($def) {
      // This implementation builds nothing and always reports false,
      // whatever $def is.
      return false;
  }
  3928. // -- Convenience -----------------------------------------------------
  3929. /**
  3930. * Convenience function that sets up a new element
  3931. * @param $element Name of element to add
  3932. * @param $type What content set should element be registered to?
  3933. * Set as false to skip this step.
  3934. * @param $contents Allowed children in form of:
  3935. * "$content_model_type: $content_model"
  3936. * @param $attr_includes What attribute collections to register to
  3937. * element?
  3938. * @param $attr What unique attributes does the element define?
  3939. * @note See ElementDef for in-depth descriptions of these parameters.
  3940. * @return Created element definition object, so you
  3941. * can set advanced parameters
  3942. */
  public function addElement($element, $type, $contents, $attr_includes = array(), $attr = array()) {
      // Registers $element with this module and returns its freshly created
      // definition object.
      $this->elements[] = $element;

      // split "$content_model_type: $content_model" into its two halves
      $parsed             = $this->parseContents($contents);
      $content_model_type = $parsed[0];
      $content_model      = $parsed[1];

      // fold the attribute collection inclusions into $attr
      $this->mergeInAttrIncludes($attr, $attr_includes);

      // register in a content set, unless the caller opted out
      if ($type) {
          $this->addElementToContentSet($element, $type);
      }

      $definition = HTMLPurifier_ElementDef::create(
          $content_model, $content_model_type, $attr
      );
      $this->info[$element] = $definition;

      // non-string $contents is used directly as the child definition
      if (!is_string($contents)) {
          $definition->child = $contents;
      }
      return $definition;
  }
  /**
   * Convenience function that creates a totally blank, non-standalone
   * element definition. If the element is already defined in this module,
   * an error is triggered and the existing definition is returned untouched.
   *
   * @param $element Name of element to create
   * @return Element definition stored for $element (new or pre-existing)
   */
  public function addBlankElement($element)
  {
      if (isset($this->info[$element])) {
          trigger_error("Definition for $element already exists in module, cannot redefine");
          return $this->info[$element];
      }

      $this->elements[] = $element;
      $blank = new HTMLPurifier_ElementDef();
      $blank->standalone = false;
      $this->info[$element] = $blank;

      return $this->info[$element];
  }
  /**
   * Convenience function that registers an element to a content set.
   * Content sets are stored as ' | '-separated strings of element names;
   * the set is created on first use.
   *
   * @param $element Element to register
   * @param $type Name of content set (warning: case sensitive, usually
   *        upper-case first letter)
   */
  public function addElementToContentSet($element, $type)
  {
      $this->content_sets[$type] = isset($this->content_sets[$type])
          ? $this->content_sets[$type] . ' | ' . $element
          : '' . $element;
  }
  /**
   * Convenience function that transforms single-string contents
   * into separate content model type and content model.
   *
   * The shorthands 'Empty', 'Inline' and 'Flow' are expanded to their
   * full forms. Any other string is split at its FIRST colon only, so a
   * content model that itself contains a colon is preserved intact. A
   * string without a colon is treated as a bare content model type with
   * an empty content model.
   *
   * @param $contents Allowed children in form of:
   *        "$content_model_type: $content_model"
   * @return Array of (lower-cased content model type, content model)
   * @note If contents is not a string (e.g. an object), an array of two
   *       nulls is returned, and the caller needs to take the original
   *       $contents and use it directly.
   */
  public function parseContents($contents)
  {
      if (!is_string($contents)) {
          return array(null, null); // defer
      }

      switch ($contents) {
          // check for shorthand content model forms
          case 'Empty':
              return array('empty', '');
          case 'Inline':
              return array('optional', 'Inline | #PCDATA');
          case 'Flow':
              return array('optional', 'Flow | #PCDATA');
      }

      // Limit of 2 keeps any further colons inside the content model;
      // array_pad() guarantees both parts exist when no colon is present.
      $parts = array_pad(explode(':', $contents, 2), 2, '');
      $content_model_type = strtolower(trim($parts[0]));
      $content_model = trim($parts[1]);

      return array($content_model_type, $content_model);
  }
  /**
   * Convenience function that stores a list of attribute includes in
   * an attribute array, under key 0.
   *
   * @param $attr Reference to attr array to modify
   * @param $attr_includes Array of includes, or a single include; an
   *        empty non-array value results in an empty include list
   */
  public function mergeInAttrIncludes(&$attr, $attr_includes)
  {
      if (is_array($attr_includes)) {
          $includes = $attr_includes;
      } elseif (empty($attr_includes)) {
          $includes = array();
      } else {
          $includes = array($attr_includes);
      }

      $attr[0] = $includes;
  }
  /**
   * Convenience function that generates a lookup table whose keys are
   * the given values and whose values are all boolean true.
   *
   * @param $list List of values to turn into a lookup
   * @note When the first argument is a string, all arguments passed to
   *       the call are used as the list instead.
   * @note Null entries are skipped.
   * @return Lookup array equivalent of list
   */
  public function makeLookup($list)
  {
      $values = is_string($list) ? func_get_args() : $list;

      $lookup = array();
      foreach ($values as $value) {
          if (!is_null($value)) {
              $lookup[$value] = true;
          }
      }

      return $lookup;
  }
  /**
   * Lazy-load hook for constructing the module once it is known to be
   * needed and a finalized configuration object is available. The base
   * implementation intentionally does nothing.
   *
   * @param $config Instance of HTMLPurifier_Config
   */
  public function setup($config)
  {
  }
  4048. }
  /**
   * Keeps track of the registered HTML modules and doctypes, activates the
   * modules that belong to the configured doctype, and produces merged
   * element definitions from the active modules.
   */
  class HTMLPurifier_HTMLModuleManager
  {

      /**
       * Instance of HTMLPurifier_DoctypeRegistry
       */
      public $doctypes;

      /**
       * Instance of current doctype
       */
      public $doctype;

      /**
       * Instance of HTMLPurifier_AttrTypes
       */
      public $attrTypes;

      /**
       * Active instances of modules for the specified doctype are
       * indexed, by name, in this array.
       */
      public $modules = array();

      /**
       * Array of recognized HTMLPurifier_Module instances, indexed by
       * module's name. This array is usually lazy loaded, but a
       * user can overload a module by pre-emptively registering it.
       */
      public $registeredModules = array();

      /**
       * List of extra modules that were added by the user using addModule().
       * These get unconditionally merged into the current doctype, whatever
       * it may be.
       */
      public $userModules = array();

      /**
       * Associative array of element name to list of modules that have
       * definitions for the element; this array is dynamically filled.
       */
      public $elementLookup = array();

      /** List of prefixes we should use for registering small names */
      public $prefixes = array('HTMLPurifier_HTMLModule_');

      public $contentSets;     /**< Instance of HTMLPurifier_ContentSets */
      public $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */

      /** If set to true, unsafe elements and attributes will be allowed */
      public $trusted = false;

      /**
       * Creates the editable internal registries and registers the
       * built-in doctypes together with their module lists.
       */
      public function __construct() {

          // editable internal objects
          $this->attrTypes = new HTMLPurifier_AttrTypes();
          $this->doctypes  = new HTMLPurifier_DoctypeRegistry();

          // setup basic modules
          $common = array(
              'CommonAttributes', 'Text', 'Hypertext', 'List',
              'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
              'StyleAttribute',
              // Unsafe:
              'Scripting', 'Object', 'Forms',
              // Sorta legacy, but present in strict:
              'Name',
          );
          $transitional = array('Legacy', 'Target');
          $xml = array('XMLCommonAttributes');
          $non_xml = array('NonXMLCommonAttributes');

          // setup basic doctypes
          $this->doctypes->register(
              'HTML 4.01 Transitional', false,
              array_merge($common, $transitional, $non_xml),
              array('Tidy_Transitional', 'Tidy_Proprietary'),
              array(),
              '-//W3C//DTD HTML 4.01 Transitional//EN',
              'http://www.w3.org/TR/html4/loose.dtd'
          );

          $this->doctypes->register(
              'HTML 4.01 Strict', false,
              array_merge($common, $non_xml),
              array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD HTML 4.01//EN',
              'http://www.w3.org/TR/html4/strict.dtd'
          );

          $this->doctypes->register(
              'XHTML 1.0 Transitional', true,
              array_merge($common, $transitional, $xml, $non_xml),
              array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD XHTML 1.0 Transitional//EN',
              'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
          );

          // Each tidy module is listed once: a repeated name would only make
          // setup() process and set up the very same module a second time.
          $this->doctypes->register(
              'XHTML 1.0 Strict', true,
              array_merge($common, $xml, $non_xml),
              array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD XHTML 1.0 Strict//EN',
              'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
          );

          $this->doctypes->register(
              'XHTML 1.1', true,
              array_merge($common, $xml, array('Ruby')),
              array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'), // Tidy_XHTML1_1
              array(),
              '-//W3C//DTD XHTML 1.1//EN',
              'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
          );

      }

      /**
       * Registers a module to the recognized module list, useful for
       * overloading pre-existing modules.
       * @param $module Mixed: string module name, with or without
       *                HTMLPurifier_HTMLModule prefix, or instance of
       *                subclass of HTMLPurifier_HTMLModule.
       * @param $overload Boolean whether or not to overload previous modules.
       *                  If this is not set, and you do overload a module,
       *                  HTML Purifier will complain with a warning.
       * @note This function will not call autoload, you must instantiate
       *       (and thus invoke) autoload outside the method.
       * @note If a string is passed as a module name, different variants
       *       will be tested in this order:
       *          - Check all prefixes with $name in order they were added
       *            (HTMLPurifier_HTMLModule_ is the first prefix)
       *          - Check for literal object name
       *          - Trigger an E_USER_ERROR
       *       If your object name collides with an internal class, specify
       *       your module manually. All modules must have been included
       *       externally: registerModule will not perform inclusions for you!
       */
      public function registerModule($module, $overload = false) {
          if (is_string($module)) {
              // attempt to load the module
              $original_module = $module;
              $ok = false;
              foreach ($this->prefixes as $prefix) {
                  $module = $prefix . $original_module;
                  if (class_exists($module)) {
                      $ok = true;
                      break;
                  }
              }
              if (!$ok) {
                  $module = $original_module;
                  if (!class_exists($module)) {
                      trigger_error($original_module . ' module does not exist',
                          E_USER_ERROR);
                      return;
                  }
              }
              $module = new $module();
          }
          if (empty($module->name)) {
              trigger_error('Module instance of ' . get_class($module) . ' must have name');
              return;
          }
          if (!$overload && isset($this->registeredModules[$module->name])) {
              trigger_error('Overloading ' . $module->name . ' without explicit overload parameter', E_USER_WARNING);
          }
          $this->registeredModules[$module->name] = $module;
      }

      /**
       * Adds a module to the current doctype by first registering it,
       * and then tacking it on to the active doctype
       * @param $module Module name or module instance; instances are
       *                recorded in $userModules by their name
       */
      public function addModule($module) {
          $this->registerModule($module);
          if (is_object($module)) $module = $module->name;
          $this->userModules[] = $module;
      }

      /**
       * Adds a class prefix that registerModule() will use to resolve a
       * string name to a concrete class
       * @param $prefix Class name prefix to append to the prefix list
       */
      public function addPrefix($prefix) {
          $this->prefixes[] = $prefix;
      }

      /**
       * Performs processing on modules, after being called you may
       * use getElement() and getElements()
       * @param $config Instance of HTMLPurifier_Config
       */
      public function setup($config) {

          $this->trusted = $config->get('HTML.Trusted');

          // generate
          $this->doctype = $this->doctypes->make($config);
          $modules = $this->doctype->modules;

          // take out the default modules that aren't allowed
          $lookup = $config->get('HTML.AllowedModules');
          $special_cases = $config->get('HTML.CoreModules');

          if (is_array($lookup)) {
              foreach ($modules as $k => $m) {
                  if (isset($special_cases[$m])) continue;
                  if (!isset($lookup[$m])) unset($modules[$k]);
              }
          }

          // add proprietary module (this gets special treatment because
          // it is completely removed from doctypes, etc.)
          if ($config->get('HTML.Proprietary')) {
              $modules[] = 'Proprietary';
          }

          // add SafeObject/Safeembed modules
          if ($config->get('HTML.SafeObject')) {
              $modules[] = 'SafeObject';
          }
          if ($config->get('HTML.SafeEmbed')) {
              $modules[] = 'SafeEmbed';
          }

          // merge in custom modules
          $modules = array_merge($modules, $this->userModules);

          foreach ($modules as $module) {
              $this->processModule($module);
              $this->modules[$module]->setup($config);
          }

          foreach ($this->doctype->tidyModules as $module) {
              $this->processModule($module);
              $this->modules[$module]->setup($config);
          }

          // prepare any injectors: instantiate those given by short name
          // and re-index every module's injector list by injector name
          foreach ($this->modules as $module) {
              $n = array();
              foreach ($module->info_injector as $i => $injector) {
                  if (!is_object($injector)) {
                      $class = "HTMLPurifier_Injector_$injector";
                      $injector = new $class;
                  }
                  $n[$injector->name] = $injector;
              }
              $module->info_injector = $n;
          }

          // setup lookup table based on all valid modules
          foreach ($this->modules as $module) {
              foreach ($module->info as $name => $def) {
                  if (!isset($this->elementLookup[$name])) {
                      $this->elementLookup[$name] = array();
                  }
                  $this->elementLookup[$name][] = $module->name;
              }
          }

          // note the different choice
          $this->contentSets = new HTMLPurifier_ContentSets(
              // content set assembly deals with all possible modules,
              // not just ones deemed to be "safe"
              $this->modules
          );
          $this->attrCollections = new HTMLPurifier_AttrCollections(
              $this->attrTypes,
              // there is no way to directly disable a global attribute,
              // but using AllowedAttributes or simply not including
              // the module in your custom doctype should be sufficient
              $this->modules
          );
      }

      /**
       * Takes a module and adds it to the active module collection,
       * registering it if necessary.
       * @param $module Module name, or module instance. An instance is
       *                always (re-)registered and then activated under its
       *                own name; a name is registered only when it is not
       *                already known.
       */
      public function processModule($module) {
          if (is_object($module)) {
              $this->registerModule($module);
              // registerModule() has already complained about a nameless
              // module and registered nothing, so there is nothing to activate
              if (empty($module->name)) return;
              // arrays cannot be indexed by an object: continue with its name
              $module = $module->name;
          } elseif (!isset($this->registeredModules[$module])) {
              $this->registerModule($module);
          }
          $this->modules[$module] = $this->registeredModules[$module];
      }

      /**
       * Retrieves merged element definitions.
       * @return Array of HTMLPurifier_ElementDef
       */
      public function getElements() {

          $elements = array();
          foreach ($this->modules as $module) {
              if (!$this->trusted && !$module->safe) continue;
              foreach ($module->info as $name => $v) {
                  if (isset($elements[$name])) continue;
                  $elements[$name] = $this->getElement($name);
              }
          }

          // remove dud elements, this happens when an element that
          // appeared to be safe actually wasn't
          foreach ($elements as $n => $v) {
              if ($v === false) unset($elements[$n]);
          }

          return $elements;

      }

      /**
       * Retrieves a single merged element definition
       * @param $name Name of element
       * @param $trusted Boolean trusted overriding parameter: set to true
       *                 if you want the full version of an element
       * @return Merged HTMLPurifier_ElementDef, or false when the element is
       *         unknown or no usable (standalone) definition could be built
       * @note Modules are iterated over twice: getElements() walks every
       *       active module to collect element names, while this method
       *       walks only the modules listed in $elementLookup for $name in
       *       order to merge their definitions.
       */
      public function getElement($name, $trusted = null) {

          if (!isset($this->elementLookup[$name])) {
              return false;
          }

          // setup global state variables
          $def = false;
          if ($trusted === null) $trusted = $this->trusted;

          // iterate through each module that has registered itself to this
          // element
          foreach($this->elementLookup[$name] as $module_name) {

              $module = $this->modules[$module_name];

              // refuse to create/merge from a module that is deemed unsafe--
              // pretend the module doesn't exist--when trusted mode is not on.
              if (!$trusted && !$module->safe) {
                  continue;
              }

              // clone is used because, ideally speaking, the original
              // definition should not be modified. Usually, this will
              // make no difference, but for consistency's sake
              $new_def = clone $module->info[$name];

              if (!$def && $new_def->standalone) {
                  $def = $new_def;
              } elseif ($def) {
                  // This will occur even if $new_def is standalone. In practice,
                  // this will usually result in a full replacement.
                  $def->mergeIn($new_def);
              } else {
                  // :TODO:
                  // non-standalone definitions that don't have a standalone
                  // to merge into could be deferred to the end
                  continue;
              }

              // attribute value expansions
              $this->attrCollections->performInclusions($def->attr);
              $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);

              // descendants_are_inline, for ChildDef_Chameleon
              if (is_string($def->content_model) &&
                  strpos($def->content_model, 'Inline') !== false) {
                  if ($name != 'del' && $name != 'ins') {
                      // this is for you, ins/del
                      $def->descendants_are_inline = true;
                  }
              }

              $this->contentSets->generateChildDef($def, $module);
          }

          // This can occur if there is a blank definition, but no base to
          // mix it in with
          if (!$def) return false;

          // add information on required attributes
          foreach ($def->attr as $attr_name => $attr_def) {
              if ($attr_def->required) {
                  $def->required_attr[] = $attr_name;
              }
          }

          return $def;

      }

  }
  /**
   * Component of HTMLPurifier_AttrContext that accumulates IDs so that
   * duplicates ("dupes") can be detected.
   * @note The default constructor does not accept $config or $context objects:
   *       you must use the static build() factory method to perform
   *       initialization.
   */
  class HTMLPurifier_IDAccumulator
  {

      /**
       * Lookup table of IDs we've accumulated: ID => true.
       * @public
       */
      public $ids = array();

      /**
       * Builds an IDAccumulator and preloads it with the IDs from the
       * Attr.IDBlacklist configuration directive.
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context (not used here)
       * @return Fully initialized HTMLPurifier_IDAccumulator
       */
      public static function build($config, $context)
      {
          $accumulator = new HTMLPurifier_IDAccumulator();
          $blacklist = $config->get('Attr.IDBlacklist');
          $accumulator->load($blacklist);
          return $accumulator;
      }

      /**
       * Add an ID to the lookup table.
       * @param $id ID to be added.
       * @return Bool status, true if success, false if there's a dupe
       */
      public function add($id)
      {
          if (!isset($this->ids[$id])) {
              $this->ids[$id] = true;
              return true;
          }
          return false;
      }

      /**
       * Load a list of IDs into the lookup table
       * @param $array_of_ids Array of IDs to load
       * @note This function doesn't care about duplicates
       */
      public function load($array_of_ids)
      {
          foreach ($array_of_ids as $known_id) {
              $this->ids[$known_id] = true;
          }
      }

  }
  /**
   * Injects tokens into the document while parsing for well-formedness.
   * This enables "formatter-like" functionality such as auto-paragraphing,
   * smiley-ification and linkification to take place.
   *
   * Handlers make changes by assigning a new value to the $token
   * reference they receive. The accepted forms of that value are described
   * in the HTMLPurifier_Strategy_MakeWellFormed->processToken()
   * documentation.
   *
   * @todo Allow injectors to request a re-run on their output. This
   *       would help if an operation is recursive.
   */
  abstract class HTMLPurifier_Injector
  {

      /**
       * Advisory name of injector, this is for friendly error messages
       */
      public $name;

      /**
       * Instance of HTMLPurifier_HTMLDefinition
       */
      protected $htmlDefinition;

      /**
       * Reference to CurrentNesting variable in Context. This is an array
       * list of tokens that we are currently "inside"
       */
      protected $currentNesting;

      /**
       * Reference to InputTokens variable in Context. This is an array
       * list of the input tokens that are being processed.
       */
      protected $inputTokens;

      /**
       * Reference to InputIndex variable in Context. This is an integer
       * array index for $this->inputTokens that indicates what token
       * is currently being processed.
       */
      protected $inputIndex;

      /**
       * Array of elements and attributes this injector creates and therefore
       * need to be allowed by the definition. Takes form of
       * array('element' => array('attr', 'attr2'), 'element2')
       */
      public $needed = array();

      /**
       * Index of inputTokens to rewind to, or false when no rewind is pending.
       */
      protected $rewind = false;

      /**
       * Requests a rewind to an earlier spot so processing is re-performed
       * from there. This is useful if you deleted a node, and now need to
       * see if this change affected any earlier nodes. Rewinding does not
       * affect other injectors, and can result in infinite loops if not
       * used carefully.
       * @param $index Index of inputTokens to rewind to
       * @warning HTML Purifier will prevent you from fast-forwarding with this
       *          function.
       */
      public function rewind($index)
      {
          $this->rewind = $index;
      }

      /**
       * Retrieves the pending rewind index and clears it.
       * @return Pending rewind index, or false if none was requested
       */
      public function getRewind()
      {
          $pending = $this->rewind;
          $this->rewind = false;
          return $pending;
      }

      /**
       * Prepares the injector by giving it the config and context objects:
       * this allows references to important variables to be made within
       * the injector. This function also checks if the HTML environment
       * will work with the Injector (see checkNeeded()); when it will not,
       * the context references are not set up.
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return Boolean false if success, string of missing needed element/attribute if failure
       */
      public function prepare($config, $context)
      {
          $this->htmlDefinition = $config->getHTMLDefinition();
          // Even though this might fail, some unit tests ignore this and
          // still test checkNeeded, so be careful. Maybe get rid of that
          // dependency.
          $missing = $this->checkNeeded($config);
          if ($missing !== false) {
              return $missing;
          }
          $this->currentNesting =& $context->get('CurrentNesting');
          $this->inputTokens    =& $context->get('InputTokens');
          $this->inputIndex     =& $context->get('InputIndex');
          return false;
      }

      /**
       * Checks whether the HTML environment will work with the Injector,
       * i.e. whether every element and attribute listed in $needed is
       * present in the HTML definition. For example, if p tags are not
       * allowed, the Auto-Paragraphing injector should not be enabled.
       * @param $config Instance of HTMLPurifier_Config
       * @return Boolean false if success, string of missing needed element/attribute if failure
       */
      public function checkNeeded($config)
      {
          $definition = $config->getHTMLDefinition();
          foreach ($this->needed as $key => $value) {
              // integer keys carry a bare element name as their value
              $element = is_int($key) ? $value : $key;
              if (!isset($definition->info[$element])) {
                  return $element;
              }
              if (is_array($value)) {
                  foreach ($value as $name) {
                      if (!isset($definition->info[$element]->attr[$name])) {
                          return "$element.$name";
                      }
                  }
              }
          }
          return false;
      }

      /**
       * Tests if the context node allows a certain element
       * @param $name Name of element to test for
       * @return True if element is allowed, false if it is not
       */
      public function allowsElement($name)
      {
          if (empty($this->currentNesting)) {
              // not inside any element: the configured parent decides
              $parent = $this->htmlDefinition->info_parent_def;
          } else {
              $parent_token = end($this->currentNesting);
              $parent = $this->htmlDefinition->info[$parent_token->name];
          }

          $is_child = isset($parent->child->elements[$name]);
          if (!$is_child || isset($parent->excludes[$name])) {
              return false;
          }

          // check the remaining ancestors, innermost first, for exclusion
          $i = count($this->currentNesting) - 2;
          while ($i >= 0) {
              $ancestor = $this->currentNesting[$i];
              $ancestor_def = $this->htmlDefinition->info[$ancestor->name];
              if (isset($ancestor_def->excludes[$name])) {
                  return false;
              }
              $i--;
          }
          return true;
      }

      /**
       * Iterator function, which starts with the next token and continues until
       * you reach the end of the input tokens.
       * @warning Please prevent previous references from interfering with this
       *          functions by setting $i = null beforehand!
       * @param &$i Current integer index variable for inputTokens
       * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
       * @return Boolean true if a token was stored in $current, false if
       *         there is no token at the new index
       */
      protected function forward(&$i, &$current)
      {
          if ($i === null) {
              $i = $this->inputIndex + 1;
          } else {
              $i++;
          }
          if (isset($this->inputTokens[$i])) {
              $current = $this->inputTokens[$i];
              return true;
          }
          return false;
      }

      /**
       * Similar to forward(), but accepts a third parameter $nesting (which
       * should be initialized at 0) and stops when we hit the end tag
       * for the node $this->inputIndex starts in.
       * @param &$i Current integer index variable for inputTokens
       * @param &$current Current token variable
       * @param &$nesting Depth counter maintained across calls; null is
       *        treated as 0
       * @return Boolean true while iteration may continue, false otherwise
       */
      protected function forwardUntilEndToken(&$i, &$current, &$nesting)
      {
          if (!$this->forward($i, $current)) {
              return false;
          }
          if ($nesting === null) {
              $nesting = 0;
          }
          if ($current instanceof HTMLPurifier_Token_Start) {
              $nesting++;
              return true;
          }
          if ($current instanceof HTMLPurifier_Token_End) {
              // an end tag at depth zero closes the node we started in
              if ($nesting <= 0) {
                  return false;
              }
              $nesting--;
          }
          return true;
      }

      /**
       * Iterator function, starts with the previous token and continues until
       * you reach the beginning of input tokens.
       * @warning Please prevent previous references from interfering with this
       *          functions by setting $i = null beforehand!
       * @param &$i Current integer index variable for inputTokens
       * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
       * @return Boolean true if a token was stored in $current, false once
       *         the index has moved before the first token
       */
      protected function backward(&$i, &$current)
      {
          if ($i === null) {
              $i = $this->inputIndex - 1;
          } else {
              $i--;
          }
          if ($i >= 0) {
              $current = $this->inputTokens[$i];
              return true;
          }
          return false;
      }

      /**
       * Initializes the iterator at the current position. Use in a do {} while;
       * loop to force the forward() and backward() functions to start at the
       * current location.
       * @warning Please prevent previous references from interfering with this
       *          functions by setting $i = null beforehand!
       * @param &$i Current integer index variable for inputTokens
       * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
       */
      protected function current(&$i, &$current)
      {
          if ($i === null) {
              $i = $this->inputIndex;
          }
          $current = $this->inputTokens[$i];
      }

      /**
       * Handler that is called when a text token is processed
       */
      public function handleText(&$token)
      {
      }

      /**
       * Handler that is called when a start or empty token is processed
       */
      public function handleElement(&$token)
      {
      }

      /**
       * Handler that is called when an end token is processed; by default
       * it forwards the token to notifyEnd().
       */
      public function handleEnd(&$token)
      {
          $this->notifyEnd($token);
      }

      /**
       * Notifier that is called when an end token is processed
       * @note This differs from handlers in that the token is read-only
       * @deprecated
       */
      public function notifyEnd($token)
      {
      }

  }
  /**
   * Represents a language and defines localizable string formatting and
   * other functions, as well as the localized messages for HTML Purifier.
   */
  class HTMLPurifier_Language
  {

      /**
       * ISO 639 language code of language. Prefers shortest possible version
       */
      public $code = 'en';

      /**
       * Fallback language code
       */
      public $fallback = false;

      /**
       * Array of localizable messages
       */
      public $messages = array();

      /**
       * Array of localizable error codes
       */
      public $errorNames = array();

      /**
       * True if no message file was found for this language, so English
       * is being used instead. Check this if you'd like to notify the
       * user that they've used a non-supported language.
       */
      public $error = false;

      /**
       * Has the language object been loaded yet?
       * @todo Make it private, fix usage in HTMLPurifier_LanguageTest
       */
      public $_loaded = false;

      /**
       * Instances of HTMLPurifier_Config and HTMLPurifier_Context
       */
      protected $config, $context;

      /**
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       */
      public function __construct($config, $context) {
          $this->config  = $config;
          $this->context = $context;
      }

      /**
       * Loads language object with necessary info from factory cache:
       * every key listed by the language factory is copied from the
       * factory's cache for this language code into the property of the
       * same name.
       * @note This is a lazy loader; repeated calls do nothing
       */
      public function load() {
          if ($this->_loaded) return;
          $factory = HTMLPurifier_LanguageFactory::instance();
          $factory->loadLanguage($this->code);
          foreach ($factory->keys as $key) {
              $this->$key = $factory->cache[$this->code][$key];
          }
          $this->_loaded = true;
      }

      /**
       * Retrieves a localised message.
       * @param $key string identifier of message
       * @return string localised message, or "[$key]" if it is not defined
       */
      public function getMessage($key) {
          if (!$this->_loaded) $this->load();
          if (!isset($this->messages[$key])) return "[$key]";
          return $this->messages[$key];
      }

      /**
       * Retrieves a localised error name.
       * @param $int integer error number, corresponding to PHP's error
       *             reporting
       * @return string localised message, or "[Error: $int]" if it is not
       *         defined
       */
      public function getErrorName($int) {
          if (!$this->_loaded) $this->load();
          if (!isset($this->errorNames[$int])) return "[Error: $int]";
          return $this->errorNames[$int];
      }

      /**
       * Converts an array into a string readable representation, joining
       * the values with the 'Item separator' message and using the
       * 'Item separator last' message before the final value.
       * @param $array Array of values to join; only the values are used,
       *               so sparse and associative arrays are accepted too
       * @return string readable list, empty string for an empty array
       */
      public function listify($array) {
          $sep      = $this->getMessage('Item separator');
          $sep_last = $this->getMessage('Item separator last');
          // Re-index so that positions 0..count-1 are guaranteed to exist;
          // indexing the caller's array directly breaks on non-list keys.
          $items = array_values($array);
          $ret = '';
          for ($i = 0, $c = count($items); $i < $c; $i++) {
              if ($i > 0) {
                  $ret .= ($i + 1 < $c) ? $sep : $sep_last;
              }
              $ret .= $items[$i];
          }
          return $ret;
      }

      /**
       * Formats a localised message with passed parameters.
       *
       * Placeholders are built from the keys of $args:
       *  - scalar value:       $N
       *  - list array:         $N (listified)
       *  - associative array:  $N.Keys and $N.Values (both listified)
       *  - HTMLPurifier_Token: $N.Name, $N.Data (when set), $N.Serialized,
       *                        $N.Compact and $N.Line
       * Objects that are not tokens produce no placeholder.
       *
       * @param $key string identifier of message
       * @param $args Parameters to substitute in
       * @return string localised message, or "[$key]" if it is not defined
       * @todo Implement conditionals? Right now, some messages make
       *     reference to line numbers, but those aren't always available
       */
      public function formatMessage($key, $args = array()) {
          if (!$this->_loaded) $this->load();
          if (!isset($this->messages[$key])) return "[$key]";
          $raw = $this->messages[$key];
          $subst = array();
          $generator = false;
          foreach ($args as $i => $value) {
              if (is_object($value)) {
                  if ($value instanceof HTMLPurifier_Token) {
                      // factor this out some time
                      if (!$generator) $generator = $this->context->get('Generator');
                      if (isset($value->name)) $subst['$'.$i.'.Name'] = $value->name;
                      if (isset($value->data)) $subst['$'.$i.'.Data'] = $value->data;
                      $subst['$'.$i.'.Compact'] =
                      $subst['$'.$i.'.Serialized'] = $generator->generateFromToken($value);
                      // a more complex algorithm for compact representation
                      // could be introduced for all types of tokens. This
                      // may need to be factored out into a dedicated class
                      if (!empty($value->attr)) {
                          // compact form: the same token without attributes
                          $stripped_token = clone $value;
                          $stripped_token->attr = array();
                          $subst['$'.$i.'.Compact'] = $generator->generateFromToken($stripped_token);
                      }
                      $subst['$'.$i.'.Line'] = $value->line ? $value->line : 'unknown';
                  }
                  continue;
              } elseif (is_array($value)) {
                  $keys = array_keys($value);
                  // keys 0..n-1 in order means the array is a plain list
                  if (array_keys($keys) === $keys) {
                      // list
                      $subst['$'.$i] = $this->listify($value);
                  } else {
                      // associative array
                      // no $i implementation yet, sorry
                      $subst['$'.$i.'.Keys'] = $this->listify($keys);
                      $subst['$'.$i.'.Values'] = $this->listify(array_values($value));
                  }
                  continue;
              }
              $subst['$' . $i] = $value;
          }
          return strtr($raw, $subst);
      }

  }
  4793. /**
  4794. * Class responsible for generating HTMLPurifier_Language objects, managing
  4795. * caching and fallbacks.
  4796. * @note Thanks to MediaWiki for the general logic, although this version
  4797. * has been entirely rewritten
  4798. * @todo Serialized cache for languages
  4799. */
  class HTMLPurifier_LanguageFactory
  {

      /**
       * Per-language data loaded from message files, after fallback merging.
       * Structure is: $factory->cache[$language_code][$key] = $value
       * @value array map
       */
      public $cache;

      /**
       * Names of the variables slurped out of a message file; the same
       * names are copied onto HTMLPurifier_Language objects.
       * @value array list
       */
      public $keys = array('fallback', 'messages', 'errorNames');

      /**
       * Instance of HTMLPurifier_AttrDef_Lang to validate language codes
       * @value object HTMLPurifier_AttrDef_Lang
       */
      protected $validator;

      /**
       * Base directory of the library's HTMLPurifier folder, without
       * trailing slash. Set by setup().
       * @value string filename
       */
      protected $dir;

      /**
       * Keys whose contents are a hash map and can be merged
       * @value array lookup
       */
      protected $mergeable_keys_map = array('messages' => true, 'errorNames' => true);

      /**
       * Keys whose contents are a list and can be merged
       * @value array lookup
       */
      protected $mergeable_keys_list = array();

      /**
       * Retrieve sole instance of the factory.
       * @param $prototype Optional prototype to overload sole instance with,
       *                   or bool true to reset to default factory.
       * @return The shared factory (or the prototype that replaced it).
       */
      public static function instance($prototype = null) {
          static $instance = null;
          if ($prototype === true) {
              // Explicit reset: drop whatever is stored so that a default
              // factory is rebuilt below. (Previously `true` itself was
              // stored as the "instance".)
              $instance = null;
          } elseif ($prototype !== null) {
              $instance = $prototype;
          }
          if ($instance === null) {
              $instance = new HTMLPurifier_LanguageFactory();
              $instance->setup();
          }
          return $instance;
      }

      /**
       * Sets up the singleton, much like a constructor
       * @note Prevents people from getting this outside of the singleton
       */
      public function setup() {
          $this->validator = new HTMLPurifier_AttrDef_Lang();
          $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
      }

      /**
       * Creates a language object, handles class fallbacks
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @param $code Code to override configuration with. Private parameter.
       * @return Language object whose ->code is the validated code; ->error
       *         is set to true when no fallback was declared and English
       *         had to be used.
       */
      public function create($config, $context, $code = false) {
          // validate language code
          if ($code === false) {
              $code = $this->validator->validate(
                  $config->get('Core.Language'), $config, $context
              );
          } else {
              $code = $this->validator->validate($code, $config, $context);
          }
          if ($code === false) $code = 'en'; // malformed code becomes English

          $pcode = str_replace('-', '_', $code); // make valid PHP classname

          if ($code == 'en') {
              $lang = new HTMLPurifier_Language($config, $context);
          } else {
              $class = 'HTMLPurifier_Language_' . $pcode;
              $file  = $this->dir . '/Language/classes/' . $code . '.php';
              if (file_exists($file) || class_exists($class, false)) {
                  // NOTE(review): when only the file exists, instantiation
                  // relies on the class being autoloadable -- confirm.
                  $lang = new $class($config, $context);
              } else {
                  // No dedicated class: build the fallback language instead.
                  // Circular fallback chains are reported by loadLanguage(),
                  // which getFallbackFor() invokes.
                  $raw_fallback = $this->getFallbackFor($code);
                  $fallback = $raw_fallback ? $raw_fallback : 'en';
                  $lang = $this->create($config, $context, $fallback);
                  if (!$raw_fallback) {
                      $lang->error = true;
                  }
              }
          }

          $lang->code = $code;
          return $lang;
      }

      /**
       * Returns the fallback language for language
       * @note Loads the original language into cache
       * @param $code string language code
       */
      public function getFallbackFor($code) {
          $this->loadLanguage($code);
          return $this->cache[$code]['fallback'];
      }

      /**
       * Loads language into the cache, handles message file and fallbacks
       * @param $code string language code
       */
      public function loadLanguage($code) {
          // Codes whose fallback chain is currently being resolved. Entries
          // are removed once the chain has been loaded, so the guard only
          // fires on genuine cycles and never on a later, independent load
          // (for example from a second factory instance).
          static $languages_seen = array();

          // abort if we've already loaded it
          if (isset($this->cache[$code])) return;

          // generate filename
          $filename = $this->dir . '/Language/messages/' . $code . '.php';

          // default fallback : may be overwritten by the ensuing include
          $fallback = ($code != 'en') ? 'en' : false;

          // load primary localisation
          if (!file_exists($filename)) {
              // skip the include: will rely solely on fallback
              $cache = array();
          } else {
              // The message file runs in this scope and is expected to
              // assign the variables named in $this->keys.
              include $filename;
              $cache = compact($this->keys);
          }

          // load fallback localisation
          if (!empty($fallback)) {

              // infinite recursion guard
              if (isset($languages_seen[$code])) {
                  trigger_error('Circular fallback reference in language ' .
                      $code, E_USER_ERROR);
                  $fallback = 'en';
              }
              $languages_seen[$code] = true;

              // load the fallback recursively
              $this->loadLanguage($fallback);
              unset($languages_seen[$code]);
              $fallback_cache = $this->cache[$fallback];

              // merge fallback with current language
              foreach ($this->keys as $key) {
                  if (isset($cache[$key]) && isset($fallback_cache[$key])) {
                      if (isset($this->mergeable_keys_map[$key])) {
                          // own entries win over fallback entries
                          $cache[$key] = $cache[$key] + $fallback_cache[$key];
                      } elseif (isset($this->mergeable_keys_list[$key])) {
                          $cache[$key] = array_merge($fallback_cache[$key], $cache[$key]);
                      }
                  } elseif (!isset($cache[$key])) {
                      // Inherit only what this language lacks; never replace
                      // an own value with a missing fallback entry.
                      $cache[$key] = isset($fallback_cache[$key]) ? $fallback_cache[$key] : null;
                  }
              }

          }

          // save to cache for later retrieval
          $this->cache[$code] = $cache;
      }

  }
  4958. /**
  4959. * Represents a measurable length, with a string numeric magnitude
  4960. * and a unit. This object is immutable.
  4961. */
  class HTMLPurifier_Length
  {

      /**
       * String numeric magnitude.
       */
      protected $n;

      /**
       * String unit. False is permitted if $n = 0.
       */
      protected $unit;

      /**
       * Whether or not this length is valid. Null if not calculated yet.
       */
      protected $isValid;

      /**
       * Lookup array of units recognized by CSS 2.1
       */
      protected static $allowedUnits = array(
          'em' => true, 'ex' => true, 'px' => true, 'in' => true,
          'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true
      );

      /**
       * @param number $n Magnitude
       * @param string $u Unit, or false for "no unit"
       */
      public function __construct($n = '0', $u = false) {
          $this->n = (string) $n;
          $this->unit = $u !== false ? (string) $u : false;
      }

      /**
       * @param string $s Unit string, like '2em' or '3.4in'; an existing
       *        HTMLPurifier_Length is returned unchanged.
       * @warning Does not perform validation.
       */
      public static function make($s) {
          if ($s instanceof HTMLPurifier_Length) return $s;
          // Leading run of digits, dots and signs is the magnitude, the
          // remainder (if any) is the unit.
          $n_length = strspn($s, '1234567890.+-');
          $n = substr($s, 0, $n_length);
          $unit = substr($s, $n_length);
          if ($unit === '') $unit = false;
          return new HTMLPurifier_Length($n, $unit);
      }

      /**
       * Validates the number and unit, normalising both in place.
       * @return bool
       */
      protected function validate() {
          // Special case:
          if ($this->n === '+0' || $this->n === '-0') $this->n = '0';
          if ($this->n === '0' && $this->unit === false) return true;
          // A missing unit is only acceptable for zero (handled above).
          // Bail out before handing a non-string to the ctype/str
          // functions, which is deprecated as of PHP 8.1.
          if ($this->unit === false) return false;
          if (!ctype_lower($this->unit)) $this->unit = strtolower($this->unit);
          if (!isset(self::$allowedUnits[$this->unit])) return false;
          // Hack: reuse the CSS number validator for the magnitude
          $def = new HTMLPurifier_AttrDef_CSS_Number();
          $result = $def->validate($this->n, false, false);
          if ($result === false) return false;
          $this->n = $result;
          return true;
      }

      /**
       * Returns string representation of number, or false if invalid.
       */
      public function toString() {
          if (!$this->isValid()) return false;
          return $this->n . $this->unit;
      }

      /**
       * Retrieves string numeric magnitude.
       */
      public function getN() {return $this->n;}

      /**
       * Retrieves string unit.
       */
      public function getUnit() {return $this->unit;}

      /**
       * Returns true if this length unit is valid. The result of the first
       * validation is remembered.
       */
      public function isValid() {
          if ($this->isValid === null) $this->isValid = $this->validate();
          return $this->isValid;
      }

      /**
       * Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
       * @param $l HTMLPurifier_Length to compare against, or false
       * @return int 1, -1 or 0; false if $l is false or cannot be converted
       *         to this length's unit.
       * @warning If both values are too large or small, this calculation will
       *          not work properly
       */
      public function compareTo($l) {
          if ($l === false) return false;
          if ($l->unit !== $this->unit) {
              $converter = new HTMLPurifier_UnitConverter();
              $l = $converter->convert($l, $this->unit);
              if ($l === false) return false;
          }
          // Reduce the numeric difference to its sign, as documented.
          $delta = $this->n - $l->n;
          if ($delta > 0) return 1;
          if ($delta < 0) return -1;
          return 0;
      }

  }
  5056. /**
  5057. * Forgivingly lexes HTML (SGML-style) markup into tokens.
  5058. *
  5059. * A lexer parses a string of SGML-style markup and converts them into
  5060. * corresponding tokens. It doesn't check for well-formedness, although its
  5061. * internal mechanism may make this automatic (such as the case of
  5062. * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
  5063. * from.
  5064. *
  5065. * A lexer is HTML-oriented: it might work with XML, but it's not
  5066. * recommended, as we adhere to a subset of the specification for optimization
  5067. * reasons. This might change in the future. Also, most tokenizers are not
  5068. * expected to handle DTDs or PIs.
  5069. *
  5070. * This class should not be directly instantiated, but you may use create() to
  5071. * retrieve a default copy of the lexer. Being a supertype, this class
  5072. * does not actually define any implementation, but offers commonly used
  5073. * convenience functions for subclasses.
  5074. *
  5075. * @note The unit tests will instantiate this class for testing purposes, as
  5076. * many of the utility functions require a class to be instantiated.
  5077. * This means that, even though this class is not runnable, it will
  5078. * not be declared abstract.
  5079. *
  5080. * @par
  5081. *
  5082. * @note
  5083. * We use tokens rather than create a DOM representation because DOM would:
  5084. *
  5085. * @par
  5086. * -# Require more processing and memory to create,
  5087. * -# Is not streamable, and
  5088. * -# Has the entire document structure (html and body not needed).
  5089. *
  5090. * @par
  5091. * However, DOM is helpful in that it makes it easy to move around nodes
  5092. * without a lot of lookaheads to see when a tag is closed. This is a
  5093. * limitation of the token system and some workarounds would be nice.
  5094. */
  class HTMLPurifier_Lexer
  {

      /**
       * Whether or not this lexer implements line-number/column-number tracking.
       * If it does, set to true.
       */
      public $tracksLineNumbers = false;

      /**
       * Entity parser created by the constructor. Declared explicitly so no
       * dynamic property is created (deprecated as of PHP 8.2); public
       * because the previously dynamic property was publicly accessible.
       */
      public $_entity_parser;

      // -- STATIC ----------------------------------------------------------

      /**
       * Retrieves or sets the default Lexer as a Prototype Factory.
       *
       * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
       * a few exceptions involving special features that only DirectLex
       * implements.
       *
       * @note The behavior of this class has changed, rather than accepting
       *       a prototype object, it now accepts a configuration object.
       *       To specify your own prototype, set %Core.LexerImpl to it.
       *       This change in behavior de-singletonizes the lexer object.
       *
       * @param $config Instance of HTMLPurifier_Config. Passing a lexer
       *        prototype (or lexer name) instead is deprecated: it raises
       *        E_USER_WARNING and, since no configuration is available,
       *        skips the line-number tracking check.
       * @return Concrete lexer.
       * @throws HTMLPurifier_Exception for an unrecognized lexer name, or
       *         when tracking is required but the lexer does not support it.
       */
      public static function create($config) {

          if (!($config instanceof HTMLPurifier_Config)) {
              $lexer = $config;
              trigger_error("Passing a prototype to\nHTMLPurifier_Lexer::create() is deprecated, please instead\nuse %Core.LexerImpl", E_USER_WARNING);
              // There is no configuration object to query on this path;
              // calling ->get() on the prototype would be a fatal error.
              $needs_tracking = false;
          } else {
              $lexer = $config->get('Core.LexerImpl');
              $needs_tracking =
                  $config->get('Core.MaintainLineNumbers') ||
                  $config->get('Core.CollectErrors');
          }

          $inst = null;
          if (is_object($lexer)) {
              $inst = $lexer;
          } else {

              if (is_null($lexer)) {
                  // auto-detection algorithm
                  if ($needs_tracking) {
                      $lexer = 'DirectLex';
                  } elseif (
                      class_exists('DOMDocument') &&
                      method_exists('DOMDocument', 'loadHTML') &&
                      !extension_loaded('domxml')
                  ) {
                      // check for DOM support, because while it's part of the
                      // core, it can be disabled compile time. Also, the PECL
                      // domxml extension overrides the default DOM, and is evil
                      // and nasty and we shan't bother to support it
                      $lexer = 'DOMLex';
                  } else {
                      $lexer = 'DirectLex';
                  }
              }

              // instantiate recognized string names
              switch ($lexer) {
                  case 'DOMLex':
                      $inst = new HTMLPurifier_Lexer_DOMLex();
                      break;
                  case 'DirectLex':
                      $inst = new HTMLPurifier_Lexer_DirectLex();
                      break;
                  case 'PH5P':
                      $inst = new HTMLPurifier_Lexer_PH5P();
                      break;
                  default:
                      throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
              }
          }

          if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');

          // once PHP DOM implements native line numbers, or we
          // hack out something using XSLT, remove this stipulation
          if ($needs_tracking && !$inst->tracksLineNumbers) {
              throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
          }

          return $inst;

      }

      // -- CONVENIENCE MEMBERS ---------------------------------------------

      public function __construct() {
          $this->_entity_parser = new HTMLPurifier_EntityParser();
      }

      /**
       * Most common entity to raw value conversion table for special entities.
       */
      protected $_special_entity2str =
          array(
              '&quot;' => '"',
              '&amp;' => '&',
              '&lt;' => '<',
              '&gt;' => '>',
              '&#39;' => "'",
              '&#039;' => "'",
              '&#x27;' => "'"
          );

      /**
       * Parses special entities into the proper characters.
       *
       * This string will translate escaped versions of the special characters
       * into the correct ones.
       *
       * @warning
       * You should be able to treat the output of this function as
       * completely parsed, but that's only because all other entities should
       * have been handled previously in substituteNonSpecialEntities()
       *
       * @param $string String character data to be parsed.
       * @returns Parsed character data.
       */
      public function parseData($string) {

          // following functions require at least one character
          if ($string === '') return '';

          // subtracts amps that cannot possibly be escaped
          $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
              ($string[strlen($string)-1] === '&' ? 1 : 0);

          if (!$num_amp) return $string; // abort if no entities
          $num_esc_amp = substr_count($string, '&amp;');
          $string = strtr($string, $this->_special_entity2str);

          // code duplication for sake of optimization, see above
          $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
              ($string[strlen($string)-1] === '&' ? 1 : 0);

          // every remaining ampersand is accounted for by a decoded &amp;
          if ($num_amp_2 <= $num_esc_amp) return $string;

          // hmm... now we have some uncommon entities. Use the callback.
          $string = $this->_entity_parser->substituteSpecialEntities($string);
          return $string;
      }

      /**
       * Lexes an HTML string into tokens. Placeholder: this base version
       * only raises E_USER_ERROR.
       *
       * @param $string String HTML.
       * @return HTMLPurifier_Token array representation of HTML.
       */
      public function tokenizeHTML($string, $config, $context) {
          trigger_error('Call to abstract class', E_USER_ERROR);
      }

      /**
       * Translates CDATA sections into regular sections (through escaping).
       *
       * @param $string HTML string to process.
       * @returns HTML with CDATA sections escaped.
       */
      protected static function escapeCDATA($string) {
          return preg_replace_callback(
              '/<!\[CDATA\[(.+?)\]\]>/s',
              array('HTMLPurifier_Lexer', 'CDATACallback'),
              $string
          );
      }

      /**
       * Special CDATA case that is especially convoluted for <script>
       */
      protected static function escapeCommentedCDATA($string) {
          return preg_replace_callback(
              '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
              array('HTMLPurifier_Lexer', 'CDATACallback'),
              $string
          );
      }

      /**
       * Special Internet Explorer conditional comments should be removed.
       * Each conditional block is matched lazily, so ordinary content that
       * sits between two separate conditional comments is preserved.
       */
      protected static function removeIEConditional($string) {
          return preg_replace(
              '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
              '',
              $string
          );
      }

      /**
       * Callback function for escapeCDATA() that does the work.
       *
       * @warning This method is protected; it is used as a callback by the
       *          escape helpers of this class and is not meant to be
       *          called directly.
       * @params $matches PCRE matches array, with index 0 the entire match
       *         and 1 the inside of the CDATA section.
       * @returns Escaped internals of the CDATA section.
       */
      protected static function CDATACallback($matches) {
          // not exactly sure why the character set is needed, but whatever
          return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
      }

      /**
       * Takes a piece of HTML and normalizes it by converting entities, fixing
       * encoding, extracting bits, and other good stuff.
       * @todo Consider making protected
       */
      public function normalize($html, $config, $context) {

          // normalize newlines to \n
          if ($config->get('Core.NormalizeNewlines')) {
              $html = str_replace("\r\n", "\n", $html);
              $html = str_replace("\r", "\n", $html);
          }

          if ($config->get('HTML.Trusted')) {
              // escape convoluted CDATA
              $html = $this->escapeCommentedCDATA($html);
          }

          $html = $this->removeIEConditional($html);

          // escape CDATA
          $html = $this->escapeCDATA($html);

          // extract body from document if applicable
          if ($config->get('Core.ConvertDocumentToFragment')) {
              $e = false;
              if ($config->get('Core.CollectErrors')) {
                  $e =& $context->get('ErrorCollector');
              }
              $new_html = $this->extractBody($html);
              if ($e && $new_html != $html) {
                  $e->send(E_WARNING, 'Lexer: Extracted body');
              }
              $html = $new_html;
          }

          // expand entities that aren't the big five
          $html = $this->_entity_parser->substituteNonSpecialEntities($html);

          // clean into wellformed UTF-8 string for an SGML context: this has
          // to be done after entity expansion because the entities sometimes
          // represent non-SGML characters (horror, horror!)
          $html = HTMLPurifier_Encoder::cleanUTF8($html);

          // if processing instructions are to be removed, remove them now
          if ($config->get('Core.RemoveProcessingInstructions')) {
              $html = preg_replace('#<\?.+?\?>#s', '', $html);
          }

          return $html;
      }

      /**
       * Takes a string of HTML (fragment or document) and returns the content
       * of its body element, or the input unchanged when no body is found.
       * @todo Consider making protected
       */
      public function extractBody($html) {
          $matches = array();
          $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
          if ($result) {
              return $matches[1];
          } else {
              return $html;
          }
      }

  }
  5336. /**
  5337. * Class that handles operations involving percent-encoding in URIs.
  5338. *
  5339. * @warning
  5340. * Be careful when reusing instances of PercentEncoder. The object
  5341. * you use for normalize() SHOULD NOT be used for encode(), or
  5342. * vice-versa.
  5343. */
  class HTMLPurifier_PercentEncoder
  {

      /**
       * Lookup of byte values (ord => true) that are left alone by encode()
       * and decoded by normalize().
       */
      protected $preserve = array();

      /**
       * @param $preserve Optional string of extra characters to keep
       *        unescaped, on top of the unreserved set; false for none.
       */
      public function __construct($preserve = false) {
          // unreserved set: digits, upper-case, lower-case, then - . _ ~
          $unreserved = array_merge(
              range(48, 57),
              range(65, 90),
              range(97, 122),
              array(45, 46, 95, 126)
          );
          foreach ($unreserved as $code) {
              $this->preserve[$code] = true;
          }

          // extra letters not to escape
          if ($preserve !== false) {
              $count = strlen($preserve);
              for ($pos = 0; $pos < $count; $pos++) {
                  $this->preserve[ord($preserve[$pos])] = true;
              }
          }
      }

      /**
       * Replacement for urlencode: percent-encodes every byte that is not
       * in the preserve table.
       * @note
       *      Assumes that the string has already been normalized, making any
       *      and all percent escape sequences valid. Percent signs are always
       *      copied through untouched, whatever $preserve says.
       * @param $string String to be encoded
       * @return Encoded string.
       */
      public function encode($string) {
          $encoded = '';
          $count = strlen($string);
          for ($pos = 0; $pos < $count; $pos++) {
              $char = $string[$pos];
              if ($char === '%' || isset($this->preserve[ord($char)])) {
                  $encoded .= $char;
              } else {
                  $encoded .= sprintf('%%%02X', ord($char));
              }
          }
          return $encoded;
      }

      /**
       * Fix up percent-encoding by decoding unreserved characters and normalizing.
       * @warning This function is affected by $preserve, even though the
       *          usual desired behavior is for this not to preserve those
       *          characters. Be careful when reusing instances of PercentEncoder!
       * @param $string String to normalize
       */
      public function normalize($string) {
          if ($string == '') return '';
          $segments = explode('%', $string);
          // everything before the first percent sign is passed through
          $out = array_shift($segments);
          foreach ($segments as $segment) {
              // A percent sign not followed by two hex digits is a stray
              // literal: escape the sign itself.
              if (strlen($segment) < 2 || !ctype_xdigit(substr($segment, 0, 2))) {
                  $out .= '%25' . $segment;
                  continue;
              }
              $hex  = substr($segment, 0, 2);
              $rest = substr($segment, 2);
              $byte = hexdec($hex);
              if (isset($this->preserve[$byte])) {
                  // needlessly escaped character: decode it
                  $out .= chr($byte) . $rest;
              } else {
                  // keep the escape, with upper-case hex digits
                  $out .= '%' . strtoupper($hex) . $rest;
              }
          }
          return $out;
      }

  }
  5424. /**
  5425. * Generic property list implementation
  5426. */
  class HTMLPurifier_PropertyList
  {
      /**
       * Internal data-structure for properties
       */
      protected $data = array();

      /**
       * Parent plist
       */
      protected $parent;

      /**
       * Result of the last squash(), or null when it must be rebuilt.
       */
      protected $cache;

      /**
       * @param $parent Optional parent plist consulted for missing keys.
       */
      public function __construct($parent = null) {
          $this->parent = $parent;
      }

      /**
       * Recursively retrieves the value for a key
       * @throws HTMLPurifier_Exception if neither this plist nor any
       *         ancestor has the key.
       */
      public function get($name) {
          if ($this->has($name)) return $this->data[$name];
          // possible performance bottleneck, convert to iterative if necessary
          if ($this->parent) return $this->parent->get($name);
          throw new HTMLPurifier_Exception("Key '$name' not found");
      }

      /**
       * Sets the value of a key, for this plist
       */
      public function set($name, $value) {
          $this->data[$name] = $value;
          $this->cache = null; // squashed view is now out of date
      }

      /**
       * Returns true if a given key exists in this plist (parents are not
       * consulted; keys holding null count as existing).
       */
      public function has($name) {
          return array_key_exists($name, $this->data);
      }

      /**
       * Resets a value to the value of its parent, usually the default. If
       * no value is specified, the entire plist is reset.
       */
      public function reset($name = null) {
          // Loose comparison kept on purpose: besides null, names such as
          // '' or 0 also clear the whole list, as they always have.
          if ($name == null) $this->data = array();
          else unset($this->data[$name]);
          $this->cache = null; // squashed view is now out of date
      }

      /**
       * Squashes this property list and all of its property lists into a single
       * array, and returns the array. This value is cached by default.
       * @note The cache is dropped when this plist itself is modified;
       *       changes made to a parent afterwards are only picked up
       *       with $force.
       * @param $force If true, ignores the cache and regenerates the array.
       */
      public function squash($force = false) {
          if ($this->cache !== null && !$force) return $this->cache;
          if ($this->parent) {
              // own values override inherited ones
              return $this->cache = array_merge($this->parent->squash($force), $this->data);
          } else {
              return $this->cache = $this->data;
          }
      }

      /**
       * Returns the parent plist.
       */
      public function getParent() {
          return $this->parent;
      }

      /**
       * Sets the parent plist.
       */
      public function setParent($plist) {
          $this->parent = $plist;
          $this->cache = null; // inherited values may differ now
      }
  }
  5496. /**
  5497. * Property list iterator. Do not instantiate this class directly.
  5498. */
  class HTMLPurifier_PropertyListIterator extends FilterIterator
  {

      /**
       * Length of the prefix filter; 0 when no filter is in effect.
       */
      protected $l;

      /**
       * Key prefix to accept, or null for no filtering.
       */
      protected $filter;

      /**
       * @param $iterator Iterator over the data to filter
       * @param $filter Optional prefix; only entries whose key starts with
       *        it are let through
       */
      public function __construct(Iterator $iterator, $filter = null) {
          parent::__construct($iterator);
          // strlen(null) is deprecated as of PHP 8.1
          $this->l = ($filter === null) ? 0 : strlen($filter);
          $this->filter = $filter;
      }

      /**
       * Accepts the current element if its key begins with the filter.
       * @return bool
       */
      #[\ReturnTypeWillChange]
      public function accept() {
          // An empty prefix matches every key; this also avoids handing a
          // null filter to strncmp().
          if ($this->l === 0) {
              return true;
          }
          $key = $this->getInnerIterator()->key();
          return strncmp($key, $this->filter, $this->l) === 0;
      }

  }
  5520. /**
  5521. * Supertype for classes that define a strategy for modifying/purifying tokens.
  5522. *
  5523. * While HTMLPurifier's core purpose is fixing HTML into something proper,
  5524. * strategies provide plug points for extra configuration or even extra
  5525. * features, such as custom tags, custom parsing of text, etc.
  5526. */
  abstract class HTMLPurifier_Strategy
  {

      /**
       * Runs this strategy over a token stream. Concrete strategies must
       * provide the implementation.
       *
       * @param $tokens Array of HTMLPurifier_Token objects to be operated on.
       * @param $config Configuration options
       * @param $context Context object passed along with the configuration
       * @returns Processed array of token objects.
       */
      abstract public function execute($tokens, $config, $context);

  }
  5538. /**
  5539. * This is in almost every respect equivalent to an array except
  5540. * that it keeps track of which keys were accessed.
  5541. *
  5542. * @warning For the sake of backwards compatibility with early versions
  5543. * of PHP 5, you must not use the $hash[$key] syntax; if you do
  5544. * our version of offsetGet is never called.
  5545. */
  class HTMLPurifier_StringHash extends ArrayObject
  {
      /**
       * Lookup of the indexes read through offsetGet(), in the form
       * array($index => true).
       */
      protected $accessed = array();

      /**
       * Retrieves a value, and logs the access.
       * @note The attribute below silences the PHP 8.1+ deprecation for
       *       overriding ArrayObject::offsetGet() without a return type;
       *       older PHP versions read that line as a comment.
       */
      #[\ReturnTypeWillChange]
      public function offsetGet($index) {
          $this->accessed[$index] = true;
          return parent::offsetGet($index);
      }

      /**
       * Returns a lookup array of all array indexes that have been accessed.
       * @return Array in form array($index => true).
       */
      public function getAccessed() {
          return $this->accessed;
      }

      /**
       * Resets the access array.
       */
      public function resetAccessed() {
          $this->accessed = array();
      }
  }
  5570. /**
  5571. * Parses string hash files. File format is as such:
  5572. *
  5573. * DefaultKeyValue
  5574. * KEY: Value
  5575. * KEY2: Value2
  5576. * --MULTILINE-KEY--
  5577. * Multiline
  5578. * value.
  5579. *
  5580. * Which would output something similar to:
  5581. *
  5582. * array(
  5583. * 'ID' => 'DefaultKeyValue',
  5584. * 'KEY' => 'Value',
  5585. * 'KEY2' => 'Value2',
  5586. * 'MULTILINE-KEY' => "Multiline\nvalue.\n",
  5587. * )
  5588. *
  5589. * We use this as an easy to use file-format for configuration schema
  5590. * files, but the class itself is usage agnostic.
  5591. *
  5592. * You can use ---- to forcibly terminate parsing of a single string-hash;
  5593. * this marker is used in multi string-hashes to delimit boundaries.
  5594. */
  5595. class HTMLPurifier_StringHashParser
  5596. {
  5597. public $default = 'ID';
  5598. /**
  5599. * Parses a file that contains a single string-hash.
  5600. */
  public function parseFile($file) {
      // A missing file and a failed open are both reported as false.
      $handle = file_exists($file) ? fopen($file, 'r') : false;
      if (!$handle) {
          return false;
      }
      $parsed = $this->parseHandle($handle);
      fclose($handle);
      return $parsed;
  }
  5609. /**
  5610. * Parses a file that contains multiple string-hashes delimited by '----'
  5611. */
  5612. public function parseMultiFile($file) {
  5613. if (!file_exists($file)) return false;
  5614. $ret = array();
  5615. $fh = fopen($file, 'r');
  5616. if (!$fh) return false;
  5617. while (!feof($fh)) {
  5618. $ret[] = $this->parseHandle($fh);
  5619. }
  5620. fclose($fh);
  5621. return $ret;
  5622. }
  5623. /**
  5624. * Internal parser that acepts a file handle.
  5625. * @note While it's possible to simulate in-memory parsing by using
  5626. * custom stream wrappers, if such a use-case arises we should
  5627. * factor out the file handle into its own class.
  5628. * @param $fh File handle with pointer at start of valid string-hash
  5629. * block.
  5630. */
  5631. protected function parseHandle($fh) {
  5632. $state = false;
  5633. $single = false;
  5634. $ret = array();
  5635. do {
  5636. $line = fgets($fh);
  5637. if ($line === false) break;
  5638. $line = rtrim($line, "\n\r");
  5639. if (!$state && $line === '') continue;
  5640. if ($line === '----') break;
  5641. if (strncmp('--#', $line, 3) === 0) {
  5642. // Comment
  5643. continue;
  5644. } elseif (strncmp('--', $line, 2) === 0) {
  5645. // Multiline declaration
  5646. $state = trim($line, '- ');
  5647. if (!isset($ret[$state])) $ret[$state] = '';
  5648. continue;
  5649. } elseif (!$state) {
  5650. $single = true;
  5651. if (strpos($line, ':') !== false) {
  5652. // Single-line declaration
  5653. list($state, $line) = explode(':', $line, 2);
  5654. $line = trim($line);
  5655. } else {
  5656. // Use default declaration
  5657. $state = $this->default;
  5658. }
  5659. }
  5660. if ($single) {
  5661. $ret[$state] = $line;
  5662. $single = false;
  5663. $state = false;
  5664. } else {
  5665. $ret[$state] .= "$line\n";
  5666. }
  5667. } while (!feof($fh));
  5668. return $ret;
  5669. }
  5670. }
  5671. /**
  5672. * Defines a mutation of an obsolete tag into a valid tag.
  5673. */
  5674. abstract class HTMLPurifier_TagTransform
  5675. {
  5676. /**
  5677. * Tag name to transform the tag to.
  5678. */
  5679. public $transform_to;
  5680. /**
  5681. * Transforms the obsolete tag into the valid tag.
  5682. * @param $tag Tag to be transformed.
  5683. * @param $config Mandatory HTMLPurifier_Config object
  5684. * @param $context Mandatory HTMLPurifier_Context object
  5685. */
  5686. abstract public function transform($tag, $config, $context);
  5687. /**
  5688. * Prepends CSS properties to the style attribute, creating the
  5689. * attribute if it doesn't exist.
  5690. * @warning Copied over from AttrTransform, be sure to keep in sync
  5691. * @param $attr Attribute array to process (passed by reference)
  5692. * @param $css CSS to prepend
  5693. */
  5694. protected function prependCSS(&$attr, $css) {
  5695. $attr['style'] = isset($attr['style']) ? $attr['style'] : '';
  5696. $attr['style'] = $css . $attr['style'];
  5697. }
  5698. }
  5699. /**
  5700. * Abstract base token class that all others inherit from.
  5701. */
  5702. class HTMLPurifier_Token {
  5703. public $line; /**< Line number node was on in source document. Null if unknown. */
  5704. public $col; /**< Column of line node was on in source document. Null if unknown. */
  5705. /**
  5706. * Lookup array of processing that this token is exempt from.
  5707. * Currently, valid values are "ValidateAttributes" and
  5708. * "MakeWellFormed_TagClosedError"
  5709. */
  5710. public $armor = array();
  5711. /**
  5712. * Used during MakeWellFormed.
  5713. */
  5714. public $skip;
  5715. public $rewind;
  5716. public $carryover;
  5717. public function __get($n) {
  5718. if ($n === 'type') {
  5719. trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
  5720. switch (get_class($this)) {
  5721. case 'HTMLPurifier_Token_Start': return 'start';
  5722. case 'HTMLPurifier_Token_Empty': return 'empty';
  5723. case 'HTMLPurifier_Token_End': return 'end';
  5724. case 'HTMLPurifier_Token_Text': return 'text';
  5725. case 'HTMLPurifier_Token_Comment': return 'comment';
  5726. default: return null;
  5727. }
  5728. }
  5729. }
  5730. /**
  5731. * Sets the position of the token in the source document.
  5732. */
  5733. public function position($l = null, $c = null) {
  5734. $this->line = $l;
  5735. $this->col = $c;
  5736. }
  5737. /**
  5738. * Convenience function for DirectLex settings line/col position.
  5739. */
  5740. public function rawPosition($l, $c) {
  5741. if ($c === -1) $l++;
  5742. $this->line = $l;
  5743. $this->col = $c;
  5744. }
  5745. }
  5746. /**
  5747. * Factory for token generation.
  5748. *
  5749. * @note Doing some benchmarking indicates that the new operator is much
  5750. * slower than the clone operator (even discounting the cost of the
  5751. * constructor). This class is for that optimization.
  5752. * Other then that, there's not much point as we don't
  5753. * maintain parallel HTMLPurifier_Token hierarchies (the main reason why
  5754. * you'd want to use an abstract factory).
  5755. * @todo Port DirectLex to use this
  5756. */
  5757. class HTMLPurifier_TokenFactory
  5758. {
  5759. /**
  5760. * Prototypes that will be cloned.
  5761. * @private
  5762. */
  5763. // p stands for prototype
  5764. private $p_start, $p_end, $p_empty, $p_text, $p_comment;
  5765. /**
  5766. * Generates blank prototypes for cloning.
  5767. */
  5768. public function __construct() {
  5769. $this->p_start = new HTMLPurifier_Token_Start('', array());
  5770. $this->p_end = new HTMLPurifier_Token_End('');
  5771. $this->p_empty = new HTMLPurifier_Token_Empty('', array());
  5772. $this->p_text = new HTMLPurifier_Token_Text('');
  5773. $this->p_comment= new HTMLPurifier_Token_Comment('');
  5774. }
  5775. /**
  5776. * Creates a HTMLPurifier_Token_Start.
  5777. * @param $name Tag name
  5778. * @param $attr Associative array of attributes
  5779. * @return Generated HTMLPurifier_Token_Start
  5780. */
  5781. public function createStart($name, $attr = array()) {
  5782. $p = clone $this->p_start;
  5783. $p->__construct($name, $attr);
  5784. return $p;
  5785. }
  5786. /**
  5787. * Creates a HTMLPurifier_Token_End.
  5788. * @param $name Tag name
  5789. * @return Generated HTMLPurifier_Token_End
  5790. */
  5791. public function createEnd($name) {
  5792. $p = clone $this->p_end;
  5793. $p->__construct($name);
  5794. return $p;
  5795. }
  5796. /**
  5797. * Creates a HTMLPurifier_Token_Empty.
  5798. * @param $name Tag name
  5799. * @param $attr Associative array of attributes
  5800. * @return Generated HTMLPurifier_Token_Empty
  5801. */
  5802. public function createEmpty($name, $attr = array()) {
  5803. $p = clone $this->p_empty;
  5804. $p->__construct($name, $attr);
  5805. return $p;
  5806. }
  5807. /**
  5808. * Creates a HTMLPurifier_Token_Text.
  5809. * @param $data Data of text token
  5810. * @return Generated HTMLPurifier_Token_Text
  5811. */
  5812. public function createText($data) {
  5813. $p = clone $this->p_text;
  5814. $p->__construct($data);
  5815. return $p;
  5816. }
  5817. /**
  5818. * Creates a HTMLPurifier_Token_Comment.
  5819. * @param $data Data of comment token
  5820. * @return Generated HTMLPurifier_Token_Comment
  5821. */
  5822. public function createComment($data) {
  5823. $p = clone $this->p_comment;
  5824. $p->__construct($data);
  5825. return $p;
  5826. }
  5827. }
  5828. /**
  5829. * HTML Purifier's internal representation of a URI.
  5830. * @note
  5831. * Internal data-structures are completely escaped. If the data needs
  5832. * to be used in a non-URI context (which is very unlikely), be sure
  5833. * to decode it first. The URI may not necessarily be well-formed until
  5834. * validate() is called.
  5835. */
  5836. class HTMLPurifier_URI
  5837. {
  5838. public $scheme, $userinfo, $host, $port, $path, $query, $fragment;
  5839. /**
  5840. * @note Automatically normalizes scheme and port
  5841. */
  5842. public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
  5843. $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
  5844. $this->userinfo = $userinfo;
  5845. $this->host = $host;
  5846. $this->port = is_null($port) ? $port : (int) $port;
  5847. $this->path = $path;
  5848. $this->query = $query;
  5849. $this->fragment = $fragment;
  5850. }
  5851. /**
  5852. * Retrieves a scheme object corresponding to the URI's scheme/default
  5853. * @param $config Instance of HTMLPurifier_Config
  5854. * @param $context Instance of HTMLPurifier_Context
  5855. * @return Scheme object appropriate for validating this URI
  5856. */
  5857. public function getSchemeObj($config, $context) {
  5858. $registry = HTMLPurifier_URISchemeRegistry::instance();
  5859. if ($this->scheme !== null) {
  5860. $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
  5861. if (!$scheme_obj) return false; // invalid scheme, clean it out
  5862. } else {
  5863. // no scheme: retrieve the default one
  5864. $def = $config->getDefinition('URI');
  5865. $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
  5866. if (!$scheme_obj) {
  5867. // something funky happened to the default scheme object
  5868. trigger_error(
  5869. 'Default scheme object "' . $def->defaultScheme . '" was not readable',
  5870. E_USER_WARNING
  5871. );
  5872. return false;
  5873. }
  5874. }
  5875. return $scheme_obj;
  5876. }
  5877. /**
  5878. * Generic validation method applicable for all schemes. May modify
  5879. * this URI in order to get it into a compliant form.
  5880. * @param $config Instance of HTMLPurifier_Config
  5881. * @param $context Instance of HTMLPurifier_Context
  5882. * @return True if validation/filtering succeeds, false if failure
  5883. */
  5884. public function validate($config, $context) {
  5885. // ABNF definitions from RFC 3986
  5886. $chars_sub_delims = '!$&\'()*+,;=';
  5887. $chars_gen_delims = ':/?#[]@';
  5888. $chars_pchar = $chars_sub_delims . ':@';
  5889. // validate scheme (MUST BE FIRST!)
  5890. if (!is_null($this->scheme) && is_null($this->host)) {
  5891. $def = $config->getDefinition('URI');
  5892. if ($def->defaultScheme === $this->scheme) {
  5893. $this->scheme = null;
  5894. }
  5895. }
  5896. // validate host
  5897. if (!is_null($this->host)) {
  5898. $host_def = new HTMLPurifier_AttrDef_URI_Host();
  5899. $this->host = $host_def->validate($this->host, $config, $context);
  5900. if ($this->host === false) $this->host = null;
  5901. }
  5902. // validate username
  5903. if (!is_null($this->userinfo)) {
  5904. $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
  5905. $this->userinfo = $encoder->encode($this->userinfo);
  5906. }
  5907. // validate port
  5908. if (!is_null($this->port)) {
  5909. if ($this->port < 1 || $this->port > 65535) $this->port = null;
  5910. }
  5911. // validate path
  5912. $path_parts = array();
  5913. $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
  5914. if (!is_null($this->host)) {
  5915. // path-abempty (hier and relative)
  5916. $this->path = $segments_encoder->encode($this->path);
  5917. } elseif ($this->path !== '' && $this->path[0] === '/') {
  5918. // path-absolute (hier and relative)
  5919. if (strlen($this->path) >= 2 && $this->path[1] === '/') {
  5920. // This shouldn't ever happen!
  5921. $this->path = '';
  5922. } else {
  5923. $this->path = $segments_encoder->encode($this->path);
  5924. }
  5925. } elseif (!is_null($this->scheme) && $this->path !== '') {
  5926. // path-rootless (hier)
  5927. // Short circuit evaluation means we don't need to check nz
  5928. $this->path = $segments_encoder->encode($this->path);
  5929. } elseif (is_null($this->scheme) && $this->path !== '') {
  5930. // path-noscheme (relative)
  5931. // (once again, not checking nz)
  5932. $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
  5933. $c = strpos($this->path, '/');
  5934. if ($c !== false) {
  5935. $this->path =
  5936. $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
  5937. $segments_encoder->encode(substr($this->path, $c));
  5938. } else {
  5939. $this->path = $segment_nc_encoder->encode($this->path);
  5940. }
  5941. } else {
  5942. // path-empty (hier and relative)
  5943. $this->path = ''; // just to be safe
  5944. }
  5945. // qf = query and fragment
  5946. $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');
  5947. if (!is_null($this->query)) {
  5948. $this->query = $qf_encoder->encode($this->query);
  5949. }
  5950. if (!is_null($this->fragment)) {
  5951. $this->fragment = $qf_encoder->encode($this->fragment);
  5952. }
  5953. return true;
  5954. }
  5955. /**
  5956. * Convert URI back to string
  5957. * @return String URI appropriate for output
  5958. */
  5959. public function toString() {
  5960. // reconstruct authority
  5961. $authority = null;
  5962. if (!is_null($this->host)) {
  5963. $authority = '';
  5964. if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
  5965. $authority .= $this->host;
  5966. if(!is_null($this->port)) $authority .= ':' . $this->port;
  5967. }
  5968. // reconstruct the result
  5969. $result = '';
  5970. if (!is_null($this->scheme)) $result .= $this->scheme . ':';
  5971. if (!is_null($authority)) $result .= '//' . $authority;
  5972. $result .= $this->path;
  5973. if (!is_null($this->query)) $result .= '?' . $this->query;
  5974. if (!is_null($this->fragment)) $result .= '#' . $this->fragment;
  5975. return $result;
  5976. }
  5977. }
  5978. class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
  5979. {
  5980. public $type = 'URI';
  5981. protected $filters = array();
  5982. protected $postFilters = array();
  5983. protected $registeredFilters = array();
  5984. /**
  5985. * HTMLPurifier_URI object of the base specified at %URI.Base
  5986. */
  5987. public $base;
  5988. /**
  5989. * String host to consider "home" base, derived off of $base
  5990. */
  5991. public $host;
  5992. /**
  5993. * Name of default scheme based on %URI.DefaultScheme and %URI.Base
  5994. */
  5995. public $defaultScheme;
  5996. public function __construct() {
  5997. $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
  5998. $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
  5999. $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
  6000. $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
  6001. $this->registerFilter(new HTMLPurifier_URIFilter_Munge());
  6002. }
  6003. public function registerFilter($filter) {
  6004. $this->registeredFilters[$filter->name] = $filter;
  6005. }
  6006. public function addFilter($filter, $config) {
  6007. $r = $filter->prepare($config);
  6008. if ($r === false) return; // null is ok, for backwards compat
  6009. if ($filter->post) {
  6010. $this->postFilters[$filter->name] = $filter;
  6011. } else {
  6012. $this->filters[$filter->name] = $filter;
  6013. }
  6014. }
  6015. protected function doSetup($config) {
  6016. $this->setupMemberVariables($config);
  6017. $this->setupFilters($config);
  6018. }
  6019. protected function setupFilters($config) {
  6020. foreach ($this->registeredFilters as $name => $filter) {
  6021. $conf = $config->get('URI.' . $name);
  6022. if ($conf !== false && $conf !== null) {
  6023. $this->addFilter($filter, $config);
  6024. }
  6025. }
  6026. unset($this->registeredFilters);
  6027. }
  6028. protected function setupMemberVariables($config) {
  6029. $this->host = $config->get('URI.Host');
  6030. $base_uri = $config->get('URI.Base');
  6031. if (!is_null($base_uri)) {
  6032. $parser = new HTMLPurifier_URIParser();
  6033. $this->base = $parser->parse($base_uri);
  6034. $this->defaultScheme = $this->base->scheme;
  6035. if (is_null($this->host)) $this->host = $this->base->host;
  6036. }
  6037. if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI.DefaultScheme');
  6038. }
  6039. public function filter(&$uri, $config, $context) {
  6040. foreach ($this->filters as $name => $f) {
  6041. $result = $f->filter($uri, $config, $context);
  6042. if (!$result) return false;
  6043. }
  6044. return true;
  6045. }
  6046. public function postFilter(&$uri, $config, $context) {
  6047. foreach ($this->postFilters as $name => $f) {
  6048. $result = $f->filter($uri, $config, $context);
  6049. if (!$result) return false;
  6050. }
  6051. return true;
  6052. }
  6053. }
  6054. /**
  6055. * Chainable filters for custom URI processing.
  6056. *
  6057. * These filters can perform custom actions on a URI filter object,
  6058. * including transformation or blacklisting.
  6059. *
  6060. * @warning This filter is called before scheme object validation occurs.
  6061. * Make sure, if you require a specific scheme object, you
  6062. * you check that it exists. This allows filters to convert
  6063. * proprietary URI schemes into regular ones.
  6064. */
  6065. abstract class HTMLPurifier_URIFilter
  6066. {
  6067. /**
  6068. * Unique identifier of filter
  6069. */
  6070. public $name;
  6071. /**
  6072. * True if this filter should be run after scheme validation.
  6073. */
  6074. public $post = false;
  6075. /**
  6076. * Performs initialization for the filter
  6077. */
  6078. public function prepare($config) {return true;}
  6079. /**
  6080. * Filter a URI object
  6081. * @param $uri Reference to URI object variable
  6082. * @param $config Instance of HTMLPurifier_Config
  6083. * @param $context Instance of HTMLPurifier_Context
  6084. * @return bool Whether or not to continue processing: false indicates
  6085. * URL is no good, true indicates continue processing. Note that
  6086. * all changes are committed directly on the URI object
  6087. */
  6088. abstract public function filter(&$uri, $config, $context);
  6089. }
  6090. /**
  6091. * Parses a URI into the components and fragment identifier as specified
  6092. * by RFC 3986.
  6093. */
  6094. class HTMLPurifier_URIParser
  6095. {
  6096. /**
  6097. * Instance of HTMLPurifier_PercentEncoder to do normalization with.
  6098. */
  6099. protected $percentEncoder;
  6100. public function __construct() {
  6101. $this->percentEncoder = new HTMLPurifier_PercentEncoder();
  6102. }
  6103. /**
  6104. * Parses a URI.
  6105. * @param $uri string URI to parse
  6106. * @return HTMLPurifier_URI representation of URI. This representation has
  6107. * not been validated yet and may not conform to RFC.
  6108. */
  6109. public function parse($uri) {
  6110. $uri = $this->percentEncoder->normalize($uri);
  6111. // Regexp is as per Appendix B.
  6112. // Note that ["<>] are an addition to the RFC's recommended
  6113. // characters, because they represent external delimeters.
  6114. $r_URI = '!'.
  6115. '(([^:/?#"<>]+):)?'. // 2. Scheme
  6116. '(//([^/?#"<>]*))?'. // 4. Authority
  6117. '([^?#"<>]*)'. // 5. Path
  6118. '(\?([^#"<>]*))?'. // 7. Query
  6119. '(#([^"<>]*))?'. // 8. Fragment
  6120. '!';
  6121. $matches = array();
  6122. $result = preg_match($r_URI, $uri, $matches);
  6123. if (!$result) return false; // *really* invalid URI
  6124. // seperate out parts
  6125. $scheme = !empty($matches[1]) ? $matches[2] : null;
  6126. $authority = !empty($matches[3]) ? $matches[4] : null;
  6127. $path = $matches[5]; // always present, can be empty
  6128. $query = !empty($matches[6]) ? $matches[7] : null;
  6129. $fragment = !empty($matches[8]) ? $matches[9] : null;
  6130. // further parse authority
  6131. if ($authority !== null) {
  6132. $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
  6133. $matches = array();
  6134. preg_match($r_authority, $authority, $matches);
  6135. $userinfo = !empty($matches[1]) ? $matches[2] : null;
  6136. $host = !empty($matches[3]) ? $matches[3] : '';
  6137. $port = !empty($matches[4]) ? (int) $matches[5] : null;
  6138. } else {
  6139. $port = $host = $userinfo = null;
  6140. }
  6141. return new HTMLPurifier_URI(
  6142. $scheme, $userinfo, $host, $port, $path, $query, $fragment);
  6143. }
  6144. }
  6145. /**
  6146. * Validator for the components of a URI for a specific scheme
  6147. */
  6148. class HTMLPurifier_URIScheme
  6149. {
  6150. /**
  6151. * Scheme's default port (integer)
  6152. */
  6153. public $default_port = null;
  6154. /**
  6155. * Whether or not URIs of this schem are locatable by a browser
  6156. * http and ftp are accessible, while mailto and news are not.
  6157. */
  6158. public $browsable = false;
  6159. /**
  6160. * Whether or not the URI always uses <hier_part>, resolves edge cases
  6161. * with making relative URIs absolute
  6162. */
  6163. public $hierarchical = false;
  6164. /**
  6165. * Validates the components of a URI
  6166. * @note This implementation should be called by children if they define
  6167. * a default port, as it does port processing.
  6168. * @param $uri Instance of HTMLPurifier_URI
  6169. * @param $config HTMLPurifier_Config object
  6170. * @param $context HTMLPurifier_Context object
  6171. * @return Bool success or failure
  6172. */
  6173. public function validate(&$uri, $config, $context) {
  6174. if ($this->default_port == $uri->port) $uri->port = null;
  6175. return true;
  6176. }
  6177. }
  6178. /**
  6179. * Registry for retrieving specific URI scheme validator objects.
  6180. */
  6181. class HTMLPurifier_URISchemeRegistry
  6182. {
  6183. /**
  6184. * Retrieve sole instance of the registry.
  6185. * @param $prototype Optional prototype to overload sole instance with,
  6186. * or bool true to reset to default registry.
  6187. * @note Pass a registry object $prototype with a compatible interface and
  6188. * the function will copy it and return it all further times.
  6189. */
  6190. public static function instance($prototype = null) {
  6191. static $instance = null;
  6192. if ($prototype !== null) {
  6193. $instance = $prototype;
  6194. } elseif ($instance === null || $prototype == true) {
  6195. $instance = new HTMLPurifier_URISchemeRegistry();
  6196. }
  6197. return $instance;
  6198. }
  6199. /**
  6200. * Cache of retrieved schemes.
  6201. */
  6202. protected $schemes = array();
  6203. /**
  6204. * Retrieves a scheme validator object
  6205. * @param $scheme String scheme name like http or mailto
  6206. * @param $config HTMLPurifier_Config object
  6207. * @param $config HTMLPurifier_Context object
  6208. */
  6209. public function getScheme($scheme, $config, $context) {
  6210. if (!$config) $config = HTMLPurifier_Config::createDefault();
  6211. // important, otherwise attacker could include arbitrary file
  6212. $allowed_schemes = $config->get('URI.AllowedSchemes');
  6213. if (!$config->get('URI.OverrideAllowedSchemes') &&
  6214. !isset($allowed_schemes[$scheme])
  6215. ) {
  6216. return;
  6217. }
  6218. if (isset($this->schemes[$scheme])) return $this->schemes[$scheme];
  6219. if (!isset($allowed_schemes[$scheme])) return;
  6220. $class = 'HTMLPurifier_URIScheme_' . $scheme;
  6221. if (!class_exists($class)) return;
  6222. $this->schemes[$scheme] = new $class();
  6223. return $this->schemes[$scheme];
  6224. }
  6225. /**
  6226. * Registers a custom scheme to the cache, bypassing reflection.
  6227. * @param $scheme Scheme name
  6228. * @param $scheme_obj HTMLPurifier_URIScheme object
  6229. */
  6230. public function register($scheme, $scheme_obj) {
  6231. $this->schemes[$scheme] = $scheme_obj;
  6232. }
  6233. }
  6234. /**
  6235. * Class for converting between different unit-lengths as specified by
  6236. * CSS.
  6237. */
  6238. class HTMLPurifier_UnitConverter
  6239. {
  6240. const ENGLISH = 1;
  6241. const METRIC = 2;
  6242. const DIGITAL = 3;
  6243. /**
  6244. * Units information array. Units are grouped into measuring systems
  6245. * (English, Metric), and are assigned an integer representing
  6246. * the conversion factor between that unit and the smallest unit in
  6247. * the system. Numeric indexes are actually magical constants that
  6248. * encode conversion data from one system to the next, with a O(n^2)
  6249. * constraint on memory (this is generally not a problem, since
  6250. * the number of measuring systems is small.)
  6251. */
  6252. protected static $units = array(
  6253. self::ENGLISH => array(
  6254. 'px' => 3, // This is as per CSS 2.1 and Firefox. Your mileage may vary
  6255. 'pt' => 4,
  6256. 'pc' => 48,
  6257. 'in' => 288,
  6258. self::METRIC => array('pt', '0.352777778', 'mm'),
  6259. ),
  6260. self::METRIC => array(
  6261. 'mm' => 1,
  6262. 'cm' => 10,
  6263. self::ENGLISH => array('mm', '2.83464567', 'pt'),
  6264. ),
  6265. );
  6266. /**
  6267. * Minimum bcmath precision for output.
  6268. */
  6269. protected $outputPrecision;
  6270. /**
  6271. * Bcmath precision for internal calculations.
  6272. */
  6273. protected $internalPrecision;
  6274. /**
  6275. * Whether or not BCMath is available
  6276. */
  6277. private $bcmath;
  6278. public function __construct($output_precision = 4, $internal_precision = 10, $force_no_bcmath = false) {
  6279. $this->outputPrecision = $output_precision;
  6280. $this->internalPrecision = $internal_precision;
  6281. $this->bcmath = !$force_no_bcmath && function_exists('bcmul');
  6282. }
  6283. /**
  6284. * Converts a length object of one unit into another unit.
  6285. * @param HTMLPurifier_Length $length
  6286. * Instance of HTMLPurifier_Length to convert. You must validate()
  6287. * it before passing it here!
  6288. * @param string $to_unit
  6289. * Unit to convert to.
  6290. * @note
  6291. * About precision: This conversion function pays very special
  6292. * attention to the incoming precision of values and attempts
  6293. * to maintain a number of significant figure. Results are
  6294. * fairly accurate up to nine digits. Some caveats:
  6295. * - If a number is zero-padded as a result of this significant
  6296. * figure tracking, the zeroes will be eliminated.
  6297. * - If a number contains less than four sigfigs ($outputPrecision)
  6298. * and this causes some decimals to be excluded, those
  6299. * decimals will be added on.
  6300. */
  6301. public function convert($length, $to_unit) {
  6302. if (!$length->isValid()) return false;
  6303. $n = $length->getN();
  6304. $unit = $length->getUnit();
  6305. if ($n === '0' || $unit === false) {
  6306. return new HTMLPurifier_Length('0', false);
  6307. }
  6308. $state = $dest_state = false;
  6309. foreach (self::$units as $k => $x) {
  6310. if (isset($x[$unit])) $state = $k;
  6311. if (isset($x[$to_unit])) $dest_state = $k;
  6312. }
  6313. if (!$state || !$dest_state) return false;
  6314. // Some calculations about the initial precision of the number;
  6315. // this will be useful when we need to do final rounding.
  6316. $sigfigs = $this->getSigFigs($n);
  6317. if ($sigfigs < $this->outputPrecision) $sigfigs = $this->outputPrecision;
  6318. // BCMath's internal precision deals only with decimals. Use
  6319. // our default if the initial number has no decimals, or increase
  6320. // it by how ever many decimals, thus, the number of guard digits
  6321. // will always be greater than or equal to internalPrecision.
  6322. $log = (int) floor(log(abs($n), 10));
  6323. $cp = ($log < 0) ? $this->internalPrecision - $log : $this->internalPrecision; // internal precision
  6324. for ($i = 0; $i < 2; $i++) {
  6325. // Determine what unit IN THIS SYSTEM we need to convert to
  6326. if ($dest_state === $state) {
  6327. // Simple conversion
  6328. $dest_unit = $to_unit;
  6329. } else {
  6330. // Convert to the smallest unit, pending a system shift
  6331. $dest_unit = self::$units[$state][$dest_state][0];
  6332. }
  6333. // Do the conversion if necessary
  6334. if ($dest_unit !== $unit) {
  6335. $factor = $this->div(self::$units[$state][$unit], self::$units[$state][$dest_unit], $cp);
  6336. $n = $this->mul($n, $factor, $cp);
  6337. $unit = $dest_unit;
  6338. }
  6339. // Output was zero, so bail out early. Shouldn't ever happen.
  6340. if ($n === '') {
  6341. $n = '0';
  6342. $unit = $to_unit;
  6343. break;
  6344. }
  6345. // It was a simple conversion, so bail out
  6346. if ($dest_state === $state) {
  6347. break;
  6348. }
  6349. if ($i !== 0) {
  6350. // Conversion failed! Apparently, the system we forwarded
  6351. // to didn't have this unit. This should never happen!
  6352. return false;
  6353. }
  6354. // Pre-condition: $i == 0
  6355. // Perform conversion to next system of units
  6356. $n = $this->mul($n, self::$units[$state][$dest_state][1], $cp);
  6357. $unit = self::$units[$state][$dest_state][2];
  6358. $state = $dest_state;
  6359. // One more loop around to convert the unit in the new system.
  6360. }
  6361. // Post-condition: $unit == $to_unit
  6362. if ($unit !== $to_unit) return false;
  6363. // Useful for debugging:
  6364. //echo "<pre>n";
  6365. //echo "$n\nsigfigs = $sigfigs\nnew_log = $new_log\nlog = $log\nrp = $rp\n</pre>\n";
  6366. $n = $this->round($n, $sigfigs);
  6367. if (strpos($n, '.') !== false) $n = rtrim($n, '0');
  6368. $n = rtrim($n, '.');
  6369. return new HTMLPurifier_Length($n, $unit);
  6370. }
  6371. /**
  6372. * Returns the number of significant figures in a string number.
  6373. * @param string $n Decimal number
  6374. * @return int number of sigfigs
  6375. */
  6376. public function getSigFigs($n) {
  6377. $n = ltrim($n, '0+-');
  6378. $dp = strpos($n, '.'); // decimal position
  6379. if ($dp === false) {
  6380. $sigfigs = strlen(rtrim($n, '0'));
  6381. } else {
  6382. $sigfigs = strlen(ltrim($n, '0.')); // eliminate extra decimal character
  6383. if ($dp !== 0) $sigfigs--;
  6384. }
  6385. return $sigfigs;
  6386. }
  6387. /**
  6388. * Adds two numbers, using arbitrary precision when available.
  6389. */
  6390. private function add($s1, $s2, $scale) {
  6391. if ($this->bcmath) return bcadd($s1, $s2, $scale);
  6392. else return $this->scale($s1 + $s2, $scale);
  6393. }
  6394. /**
  6395. * Multiples two numbers, using arbitrary precision when available.
  6396. */
  6397. private function mul($s1, $s2, $scale) {
  6398. if ($this->bcmath) return bcmul($s1, $s2, $scale);
  6399. else return $this->scale($s1 * $s2, $scale);
  6400. }
  6401. /**
  6402. * Divides two numbers, using arbitrary precision when available.
  6403. */
  6404. private function div($s1, $s2, $scale) {
  6405. if ($this->bcmath) return bcdiv($s1, $s2, $scale);
  6406. else return $this->scale($s1 / $s2, $scale);
  6407. }
  6408. /**
  6409. * Rounds a number according to the number of sigfigs it should have,
  6410. * using arbitrary precision when available.
  6411. */
  private function round($n, $sigfigs) {
      // Number of digits left of the decimal point, minus one.
      $exponent = (int) floor(log(abs($n), 10));
      // Decimal places to keep; negative means rounding left of the point.
      $places = $sigfigs - $exponent - 1;
      // Sign prefix, captured before $n is rewritten below.
      $sign = ($n < 0) ? '-' : '';

      // Native fallback: round as a float, then format via scale().
      if (!$this->bcmath) {
          return $this->scale(round($n, $places), $places + 1);
      }

      if ($places >= 0) {
          // Add half a unit of the last kept place (carrying the sign),
          // then divide by one at the target scale to cut off the excess.
          $half = $sign . '0.' . str_repeat('0', $places) . '5';
          $n = bcadd($n, $half, $places + 1);
          return bcdiv($n, '1', $places);
      }

      // Rounding left of the decimal point. This partially depends on the
      // standardized form of numbers that comes out of bcmath: add half a
      // unit, keep the leading significant digits, and pad with zeros.
      $zeros = $exponent - $sigfigs;
      $n = bcadd($n, $sign . '5' . str_repeat('0', $zeros), 0);
      return substr($n, 0, $sigfigs + strlen($sign)) . str_repeat('0', $zeros + 1);
  }
  6431. /**
  6432. * Scales a float to $scale digits right of decimal point, like BCMath.
  6433. */
  private function scale($r, $scale) {
      // Common case: sprintf can format a non-negative precision directly.
      if ($scale >= 0) {
          return sprintf('%.' . $scale . 'f', (float) $r);
      }

      // The f sprintf type doesn't support a negative precision, so the
      // result is assembled by hand. Start from the integer rendering.
      $whole = sprintf('%.0f', (float) $r);

      // Floating point precision loss means the digits probably look like
      // 4652999999999 rather than a clean value. Take one digit more than
      // is needed and use it to round the digit before it.
      $kept = substr($whole, 0, strlen($whole) + $scale);
      $precise = (string) round($kept, -1);

      // Drop the zero produced by the rounding, then pad back out.
      return substr($precise, 0, -1) . str_repeat('0', -$scale + 1);
  }
  6449. }
  6450. /**
  6451. * Parses string representations into their corresponding native PHP
  6452. * variable type. The base implementation does a simple type-check.
  6453. */
  class HTMLPurifier_VarParser
  {

      // Integer identifiers for the supported variable types.
      const STRING  = 1;
      const ISTRING = 2;
      const TEXT    = 3;
      const ITEXT   = 4;
      const INT     = 5;
      const FLOAT   = 6;
      const BOOL    = 7;
      const LOOKUP  = 8;
      const ALIST   = 9;
      const HASH    = 10;
      const MIXED   = 11;

      /**
       * Maps string type names onto the integer constants above. Kept
       * mainly for backwards compatibility, but parse() also uses it to
       * accept a type given by name.
       */
      public static $types = array(
          'string'  => self::STRING,
          'istring' => self::ISTRING,
          'text'    => self::TEXT,
          'itext'   => self::ITEXT,
          'int'     => self::INT,
          'float'   => self::FLOAT,
          'bool'    => self::BOOL,
          'lookup'  => self::LOOKUP,
          'list'    => self::ALIST,
          'hash'    => self::HASH,
          'mixed'   => self::MIXED
      );

      /**
       * Set of the types that are strings, and so can have aliases or
       * allowed value lists.
       */
      public static $stringTypes = array(
          self::STRING  => true,
          self::ISTRING => true,
          self::TEXT    => true,
          self::ITEXT   => true,
      );

      /**
       * Validates a variable against a type, throwing
       * HTMLPurifier_VarParserException when it does not conform.
       *
       * @param $var Variable to validate
       * @param $type Type constant, or a type name found in
       *        HTMLPurifier_VarParser::$types
       * @param $allow_null Whether or not to permit null as a value
       * @return Validated and type-coerced variable; null is returned when
       *         $allow_null is true and the parsed value is null
       */
      final public function parse($var, $type, $allow_null = false) {
          // Translate a type name into its integer constant.
          if (is_string($type)) {
              if (isset(self::$types[$type])) {
                  $type = self::$types[$type];
              } else {
                  throw new HTMLPurifier_VarParserException("Invalid type '$type'");
              }
          }

          $var = $this->parseImplementation($var, $type, $allow_null);
          if ($allow_null && $var === null) {
              return null;
          }

          // Basic sanity checks on whatever the implementation produced.
          // Any arm that does not return falls out to errorGeneric().
          switch ($type) {
              case self::STRING:
              case self::ISTRING:
              case self::TEXT:
              case self::ITEXT:
                  if (is_string($var)) {
                      // The case-insensitive variants are normalised to lowercase.
                      if ($type == self::ISTRING || $type == self::ITEXT) {
                          $var = strtolower($var);
                      }
                      return $var;
                  }
                  break;
              case self::INT:
                  if (is_int($var)) {
                      return $var;
                  }
                  break;
              case self::FLOAT:
                  if (is_float($var)) {
                      return $var;
                  }
                  break;
              case self::BOOL:
                  if (is_bool($var)) {
                      return $var;
                  }
                  break;
              case self::LOOKUP:
              case self::ALIST:
              case self::HASH:
                  if (is_array($var)) {
                      if ($type === self::LOOKUP) {
                          // Every value in a lookup table must be exactly true.
                          foreach ($var as $flag) {
                              if ($flag !== true) {
                                  $this->error('Lookup table contains value other than true');
                              }
                          }
                      } elseif ($type === self::ALIST) {
                          // A list must be keyed 0..n-1 in order.
                          $keys = array_keys($var);
                          if (array_keys($keys) !== $keys) {
                              $this->error('Indices for list are not uniform');
                          }
                      }
                      return $var;
                  }
                  break;
              case self::MIXED:
                  return $var;
              default:
                  $this->errorInconsistent(get_class($this), $type);
          }

          $this->errorGeneric($var, $type);
      }

      /**
       * Performs the actual parsing. The base implementation returns $var
       * untouched; subclasses should overload this.
       */
      protected function parseImplementation($var, $type, $allow_null) {
          return $var;
      }

      /**
       * Throws a HTMLPurifier_VarParserException carrying $msg.
       */
      protected function error($msg) {
          throw new HTMLPurifier_VarParserException($msg);
      }

      /**
       * Throws an inconsistency exception.
       * @note This should not ever be called. It would be called if the
       *       allowed values of HTMLPurifier_VarParser were extended
       *       without updating subclasses.
       */
      protected function errorInconsistent($class, $type) {
          throw new HTMLPurifier_Exception("Inconsistency in $class: " . self::getTypeName($type) . " not implemented");
      }

      /**
       * Generic error raised when a value did not match its type.
       */
      protected function errorGeneric($var, $type) {
          $vtype = gettype($var);
          $this->error("Expected type " . self::getTypeName($type) . ", got $vtype");
      }

      /**
       * Returns the string name for a type constant, or 'unknown' when the
       * constant is not present in $types.
       */
      public static function getTypeName($type) {
          static $names;
          if (!$names) {
              // Build the reverse lookup table lazily, on first use.
              $names = array_flip(self::$types);
          }
          return isset($names[$type]) ? $names[$type] : 'unknown';
      }

  }
  6590. /**
  6591. * Exception type for HTMLPurifier_VarParser
  6592. */
  class HTMLPurifier_VarParserException extends HTMLPurifier_Exception
  {
      // Marker type: declares no members of its own beyond HTMLPurifier_Exception.
  }
  6596. /**
  6597. * Validates the HTML attribute style, otherwise known as CSS.
  6598. * @note We don't implement the whole CSS specification, so it might be
  6599. * difficult to reuse this component in the context of validating
  6600. * actual stylesheet declarations.
  6601. * @note If we were really serious about validating the CSS, we would
  6602. * tokenize the styles and then parse the tokens. Obviously, we
  6603. * are not doing that. Doing that could seriously harm performance,
  6604. * but would make these components a lot more viable for a CSS
  6605. * filtering solution.
  6606. */
  class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
  {

      /**
       * Validates the declarations of an inline style attribute.
       *
       * @param $css Raw value of the style attribute
       * @param $config Configuration object; supplies the CSS definition
       * @param $context Context object; 'CurrentCSSProperty' is registered
       *        on it while declarations are being processed
       * @return Rebuilt "prop:value;" string, or false if nothing survived
       */
      public function validate($css, $config, $context) {
          $css = $this->parseCDATA($css);
          $definition = $config->getCSSDefinition();

          // This deliberately breaks the spec by splitting on semicolons,
          // because a semicolon rarely appears in escaped form. Doing so is
          // generally flaky but fast. IT MIGHT APPEAR IN URIs, see
          // HTMLPurifier_AttrDef_CSSURI for details.
          $declarations = explode(';', $css);
          $propvalues = array();

          // Name of the CSS property currently being validated. The
          // variable is handed to the context before the loop and then
          // reassigned for each declaration.
          $property = false;
          $context->register('CurrentCSSProperty', $property);

          foreach ($declarations as $declaration) {
              if (!$declaration) {
                  continue;
              }
              // Skips declarations with no colon, or with a leading colon.
              if (!strpos($declaration, ':')) {
                  continue;
              }

              list($property, $value) = explode(':', $declaration, 2);
              $property = trim($property);
              $value = trim($value);

              // Look the property up as written; retry in lowercase only
              // when the name is not already all-lowercase.
              if (!isset($definition->info[$property])) {
                  if (ctype_lower($property)) {
                      continue;
                  }
                  $property = strtolower($property);
                  if (!isset($definition->info[$property])) {
                      continue;
                  }
              }

              // 'inherit' works for everything (but only on the base
              // property). The lowercase/trim here is repeated by the
              // validator, so it is slightly inefficient.
              if (strtolower(trim($value)) === 'inherit') {
                  $result = 'inherit';
              } else {
                  $result = $definition->info[$property]->validate($value, $config, $context);
              }

              if ($result === false) {
                  continue;
              }
              // Keying by property makes a later duplicate overwrite an earlier one.
              $propvalues[$property] = $result;
          }

          $context->destroy('CurrentCSSProperty');

          // The output is assembled in a second pass; slightly inefficient,
          // but it's the only way of getting rid of duplicates.
          $pieces = array();
          foreach ($propvalues as $prop => $value) {
              $pieces[] = "$prop:$value;";
          }
          $new_declarations = implode('', $pieces);

          return $new_declarations ? $new_declarations : false;
      }

  }
  6666. // Enum = Enumerated
  6667. /**
  6668. * Validates a keyword against a list of valid values.
  6669. * @warning The case-insensitive compare of this function uses PHP's
  6670. * built-in strtolower and ctype_lower functions, which may
  6671. * cause problems with international comparisons
  6672. */
  class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
  {

      /**
       * Lookup table of valid values (the values are the array keys).
       * @todo Make protected
       */
      public $valid_values = array();

      /**
       * Bool indicating whether or not the enumeration is case sensitive.
       * @note In general this is always case insensitive.
       */
      protected $case_sensitive = false; // values according to W3C spec

      /**
       * @param $valid_values List of valid values
       * @param $case_sensitive Bool indicating whether or not case sensitive
       */
      public function __construct($valid_values = array(), $case_sensitive = false) {
          // Flip the list so that membership becomes an isset() lookup.
          $this->valid_values   = array_flip($valid_values);
          $this->case_sensitive = $case_sensitive;
      }

      /**
       * Returns the trimmed (and, unless case sensitive, lowercased) value
       * when it is one of the valid values, false otherwise.
       */
      public function validate($string, $config, $context) {
          $string = trim($string);
          // Full case-insensitive libraries might be wanted here eventually.
          if (!$this->case_sensitive && !ctype_lower($string)) {
              $string = strtolower($string);
          }
          return isset($this->valid_values[$string]) ? $string : false;
      }

      /**
       * @param $string In form of comma-delimited list of case-insensitive
       *        valid values. Example: "foo,bar,baz". Prepend "s:" to make
       *        case sensitive
       */
      public function make($string) {
          $sensitive = strlen($string) > 2 && $string[0] == 's' && $string[1] == ':';
          if ($sensitive) {
              // Strip the "s:" marker before splitting the list.
              $string = substr($string, 2);
          }
          return new HTMLPurifier_AttrDef_Enum(explode(',', $string), $sensitive);
      }

  }
  6720. /**
  6721. * Validates an integer.
  6722. * @note While this class was modeled off the CSS definition, no currently
  6723. * allowed CSS uses this type. The properties that do are: widows,
  6724. * orphans, z-index, counter-increment, counter-reset. Some of the
  6725. * HTML attributes, however, find use for a non-negative version of this.
  6726. */
  class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
  {

      /**
       * Bool indicating whether or not negative values are allowed
       */
      protected $negative = true;

      /**
       * Bool indicating whether or not zero is allowed
       */
      protected $zero = true;

      /**
       * Bool indicating whether or not positive values are allowed
       */
      protected $positive = true;

      /**
       * @param $negative Bool indicating whether or not negative values are allowed
       * @param $zero Bool indicating whether or not zero is allowed
       * @param $positive Bool indicating whether or not positive values are allowed
       */
      public function __construct($negative = true, $zero = true, $positive = true) {
          $this->negative = $negative;
          $this->zero     = $zero;
          $this->positive = $positive;
      }

      /**
       * Returns the integer as a string (a leading plus removed, "-0"
       * reduced to "0"), or false when it is not an integer or falls
       * outside the permitted sign classes.
       */
      public function validate($integer, $config, $context) {
          $integer = $this->parseCDATA($integer);
          if ($integer === '') {
              return false;
          }

          // Simply typecasting to integer is not enough: certain fringe
          // cases must not come back as an integer. Separate the sign
          // from the digits first.
          $first = $integer[0];
          if ($first === '-' && $this->negative) {
              $digits = substr($integer, 1);
              if ($digits === '0') {
                  $integer = '0'; // rm minus sign for zero
              }
          } elseif ($first === '+' && $this->positive) {
              $integer = substr($integer, 1); // rm unnecessary plus
              $digits  = $integer;
          } else {
              $digits = $integer;
          }

          // Everything after the optional sign has to be a decimal digit.
          if (!ctype_digit($digits)) {
              return false;
          }

          // Scope tests against the configured sign classes.
          $rejected = (!$this->zero && $integer == 0)
              || (!$this->positive && $integer > 0)
              || (!$this->negative && $integer < 0);

          return $rejected ? false : $integer;
      }

  }
  6776. /**
  6777. * Validates the HTML attribute lang, effectively a language code.
  6778. * @note Built according to RFC 3066, which obsoleted RFC 1766
  6779. */
  class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
  {

      /**
       * Validates a language code made of hyphen-separated subtags.
       *
       * Returns false when the primary subtag is invalid. Otherwise the
       * lowercased code is returned, cut off just before the first
       * invalid subtag.
       */
      public function validate($string, $config, $context) {
          $string = trim($string);
          if (!$string) {
              return false;
          }

          $subtags = explode('-', $string);
          $num_subtags = count($subtags);
          if ($num_subtags == 0) {
              return false; // sanity check
          }

          // Primary subtag: a lone 'x' or 'i', or two to three letters.
          $primary = $subtags[0];
          $length = strlen($primary);
          if ($length === 1) {
              if ($primary != 'x' && $primary != 'i') {
                  return false;
              }
          } elseif ($length === 2 || $length === 3) {
              if (!ctype_alpha($primary)) {
                  return false;
              }
              $primary = strtolower($primary);
          } else {
              return false;
          }

          // Remaining subtags: one to eight alphanumerics each. The second
          // subtag has an extra rule: a single character must be 'x'.
          $new_string = $primary;
          for ($i = 1; $i < $num_subtags; $i++) {
              $subtag = $subtags[$i];
              $length = strlen($subtag);
              $invalid = $length == 0
                  || ($i == 1 && $length == 1 && $subtag != 'x')
                  || $length > 8
                  || !ctype_alnum($subtag);
              if ($invalid) {
                  // Keep what has been accepted so far.
                  return $new_string;
              }
              $new_string .= '-' . strtolower($subtag);
          }

          return $new_string;
      }

  }
  6833. /**
  6834. * Decorator that, depending on a token, switches between two definitions.
  6835. */
  class HTMLPurifier_AttrDef_Switch
  {

      /** Tag name to switch upon. */
      protected $tag;

      /** Definition used when the current token matches $tag. */
      protected $withTag;

      /** Definition used when it does not match, or there is no token. */
      protected $withoutTag;

      /**
       * @param string $tag Tag name to switch upon
       * @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
       * @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
       */
      public function __construct($tag, $with_tag, $without_tag) {
          $this->tag        = $tag;
          $this->withTag    = $with_tag;
          $this->withoutTag = $without_tag;
      }

      /**
       * Delegates validation to one of the two wrapped definitions,
       * chosen by comparing the name of the context's 'CurrentToken'
       * against the configured tag.
       */
      public function validate($string, $config, $context) {
          $token = $context->get('CurrentToken', true);
          $matches = $token && $token->name === $this->tag;
          $delegate = $matches ? $this->withTag : $this->withoutTag;
          return $delegate->validate($string, $config, $context);
      }

  }
  6859. /**
  6860. * Validates arbitrary text according to the HTML spec.
  6861. */
  class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
  {

      /**
       * Returns the result of running the string through parseCDATA();
       * no further checks are applied here.
       */
      public function validate($string, $config, $context) {
          $text = $this->parseCDATA($string);
          return $text;
      }

  }
  6868. /**
  6869. * Validates a URI as defined by RFC 3986.
  6870. * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
  6871. */
  class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
  {

      /** HTMLPurifier_URIParser instance used to parse incoming strings. */
      protected $parser;

      /** Bool flag set from the constructor's $embeds_resource argument. */
      protected $embedsResource;

      /**
       * @param $embeds_resource Does the URI here result in an extra HTTP request?
       */
      public function __construct($embeds_resource = false) {
          $this->parser = new HTMLPurifier_URIParser();
          $this->embedsResource = (bool) $embeds_resource;
      }

      /**
       * Builds a new instance; $string is cast to bool and used as the
       * embeds-resource flag.
       */
      public function make($string) {
          return new HTMLPurifier_AttrDef_URI((bool) $string);
      }

      /**
       * Parses the URI and runs it through the validation stages below.
       * Returns the URI converted back to a string, or false as soon as
       * any stage rejects it (or when URI.Disable is set).
       */
      public function validate($uri, $config, $context) {
          if ($config->get('URI.Disable')) {
              return false;
          }

          // parse the URI
          $uri = $this->parser->parse($this->parseCDATA($uri));
          if ($uri === false) {
              return false;
          }

          // add embedded flag to context for validators
          $context->register('EmbeddedURI', $this->embedsResource);

          // Each nesting level is one stage; $ok only becomes true when
          // the innermost stage is reached and passes.
          $ok = false;
          // generic validation
          if ($uri->validate($config, $context)) {
              // chained filtering
              $uri_def = $config->getDefinition('URI');
              if ($uri_def->filter($uri, $config, $context)) {
                  // scheme-specific validation; an embedded resource
                  // additionally requires a browsable scheme
                  $scheme_obj = $uri->getSchemeObj($config, $context);
                  if ($scheme_obj && (!$this->embedsResource || $scheme_obj->browsable)) {
                      if ($scheme_obj->validate($uri, $config, $context)) {
                          // post chained filtering; passing it means the
                          // URI survived the gauntlet
                          if ($uri_def->postFilter($uri, $config, $context)) {
                              $ok = true;
                          }
                      }
                  }
              }
          }

          $context->destroy('EmbeddedURI');

          // back to string
          return $ok ? $uri->toString() : false;
      }

  }
  6922. /**
  6923. * Validates a number as defined by the CSS spec.
  6924. */
  class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
  {

      /**
       * Bool indicating whether negative values are forbidden.
       */
      protected $non_negative = false;

      /**
       * @param $non_negative Bool indicating whether negatives are forbidden
       */
      public function __construct($non_negative = false) {
          $this->non_negative = $non_negative;
      }

      /**
       * Returns the number with a leading plus, leading zeros and trailing
       * fractional zeros removed, or false when it is not a number.
       *
       * @warning Some contexts do not pass $config, $context. These
       *          variables should not be used without checking HTMLPurifier_Length
       */
      public function validate($number, $config, $context) {
          $number = $this->parseCDATA($number);
          if ($number === '') {
              return false;
          }
          if ($number === '0') {
              return '0';
          }

          // Peel off an optional sign. Only a minus is kept; a plus is dropped.
          $sign = '';
          $first = $number[0];
          if ($first === '-') {
              if ($this->non_negative) {
                  return false;
              }
              $sign = '-';
          }
          if ($first === '-' || $first === '+') {
              $number = substr($number, 1);
          }

          // Plain integer: strip leading zeros; all zeros collapses to '0'.
          if (ctype_digit($number)) {
              $number = ltrim($number, '0');
              return $number ? $sign . $number : '0';
          }

          // Period is the only non-numeric character allowed
          if (strpos($number, '.') === false) {
              return false;
          }

          list($left, $right) = explode('.', $number, 2);

          // A lone period is not a number; the integer part may be empty
          // but must otherwise be all digits.
          if ($left === '' && $right === '') {
              return false;
          }
          if ($left !== '' && !ctype_digit($left)) {
              return false;
          }

          $left  = ltrim($left, '0');
          $right = rtrim($right, '0');

          // No fractional digits remain: return the integer part on its own.
          if ($right === '') {
              return $left ? $sign . $left : '0';
          }
          if (!ctype_digit($right)) {
              return false;
          }

          return $sign . $left . '.' . $right;
      }

  }
  class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
  {

      public function __construct() {
          // opacity is non-negative, but negatives are accepted here so
          // that validate() can clamp them instead of rejecting them
          parent::__construct(false);
      }

      /**
       * Validates as a CSS number, then clamps the result: values below
       * zero become '0' and values above one become '1'.
       */
      public function validate($number, $config, $context) {
          $result = parent::validate($number, $config, $context);
          if ($result === false) {
              return $result;
          }

          $float = (float) $result;
          if ($float > 1.0) {
              return '1';
          }
          if ($float < 0.0) {
              return '0';
          }
          return $result;
      }

  }
  6986. /**
  6987. * Validates shorthand CSS property background.
  6988. * @warning Does not support url tokens that have internal spaces.
  6989. */
  class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of component validators, keyed by property name.
       * @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
       */
      protected $info;

      public function __construct($config) {
          $def = $config->getCSSDefinition();
          // Copy the validators for the five component properties.
          $parts = array('color', 'image', 'repeat', 'attachment', 'position');
          foreach ($parts as $part) {
              $name = 'background-' . $part;
              $this->info[$name] = $def->info[$name];
          }
      }

      /**
       * Validates the shorthand by offering each space-separated token to
       * the component validators in turn. Tokens no earlier component
       * accepts are collected and validated together as the position.
       *
       * @return Space-joined validated components, or false if none
       */
      public function validate($string, $config, $context) {
          // regular pre-processing
          $string = $this->parseCDATA($string);
          if ($string === '') {
              return false;
          }

          // munge rgb() decl if necessary
          $string = $this->mungeRgb($string);

          // assumes URI doesn't have spaces in it
          $bits = explode(' ', strtolower($string));

          // Result slot for each component; false means "not seen yet".
          $caught = array(
              'color'      => false,
              'image'      => false,
              'repeat'     => false,
              'attachment' => false,
              'position'   => false,
          );
          $catches = 0; // number of catches

          foreach ($bits as $bit) {
              if ($bit === '') {
                  continue;
              }
              foreach ($caught as $key => $status) {
                  if ($key == 'position') {
                      // Position comes last and takes any token, building
                      // up a space-separated string to validate afterwards.
                      if ($caught['position'] === false) {
                          $caught['position'] = '';
                      }
                      $caught['position'] .= $bit . ' ';
                      $catches++;
                      break;
                  }
                  // A component that is already filled is not offered again.
                  if ($status !== false) {
                      continue;
                  }
                  $r = $this->info['background-' . $key]->validate($bit, $config, $context);
                  if ($r === false) {
                      continue;
                  }
                  $caught[$key] = $r;
                  $catches++;
                  break;
              }
          }

          if (!$catches) {
              return false;
          }

          if ($caught['position'] !== false) {
              $caught['position'] = $this->info['background-position']->
                  validate($caught['position'], $config, $context);
          }

          // Keep only the components that produced a value, in slot order.
          $ret = array();
          foreach ($caught as $value) {
              if ($value !== false) {
                  $ret[] = $value;
              }
          }

          return empty($ret) ? false : implode(' ', $ret);
      }

  }
  7055. /* W3C says:
  7056. [ // adjective and number must be in correct order, even if
  7057. // you could switch them without introducing ambiguity.
  7058. // some browsers support that syntax
  7059. [
  7060. <percentage> | <length> | left | center | right
  7061. ]
  7062. [
  7063. <percentage> | <length> | top | center | bottom
  7064. ]?
  7065. ] |
  7066. [ // this signifies that the vertical and horizontal adjectives
  7067. // can be arbitrarily ordered, however, there can only be two,
  7068. // one of each, or none at all
  7069. [
  7070. left | center | right
  7071. ] ||
  7072. [
  7073. top | center | bottom
  7074. ]
  7075. ]
  7076. top, left = 0%
  7077. center, (none) = 50%
  7078. bottom, right = 100%
  7079. */
  7080. /* QuirksMode says:
  7081. keyword + length/percentage must be ordered correctly, as per W3C
  7082. Internet Explorer and Opera, however, support arbitrary ordering. We
  7083. should fix it up.
  7084. Minor issue though, not strictly necessary.
  7085. */
  7086. // control freaks may appreciate the ability to convert these to
  7087. // percentages or something, but it's not necessary
  7088. /**
  7089. * Validates the value of background-position.
  7090. */
  class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
  {

      /** HTMLPurifier_AttrDef_CSS_Length validator for length tokens. */
      protected $length;

      /** HTMLPurifier_AttrDef_CSS_Percentage validator for percentage tokens. */
      protected $percentage;

      public function __construct() {
          $this->length     = new HTMLPurifier_AttrDef_CSS_Length();
          $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
      }

      /**
       * Sorts the tokens into keywords and measures, then emits at most
       * two values: the first slot and then the second slot.
       *
       * @return Space-joined position, or false if nothing was recognised
       */
      public function validate($string, $config, $context) {
          $string = $this->parseCDATA($string);
          $bits = explode(' ', $string);

          $keywords = array(
              'h'  => false, // left, right
              'v'  => false, // top, bottom
              'ch' => false, // center (first word)
              'cv' => false, // center (second word)
          );
          $measures = array();
          $seen = 0; // how many values have been recognised so far

          $lookup = array(
              'top'    => 'v',
              'bottom' => 'v',
              'left'   => 'h',
              'right'  => 'h',
              'center' => 'c'
          );

          foreach ($bits as $bit) {
              if ($bit === '') {
                  continue;
              }

              // test for keyword
              $lbit = strtolower($bit);
              if (isset($lookup[$lbit])) {
                  $status = $lookup[$lbit];
                  if ($status == 'c') {
                      // 'center' takes the first slot only when nothing
                      // has been recognised before it.
                      $status = ($seen == 0) ? 'ch' : 'cv';
                  }
                  $keywords[$status] = $lbit;
                  $seen++;
              }

              // test for length, then for percentage
              foreach (array($this->length, $this->percentage) as $validator) {
                  $r = $validator->validate($bit, $config, $context);
                  if ($r !== false) {
                      $measures[] = $r;
                      $seen++;
                  }
              }
          }

          if (!$seen) {
              return false; // no valid values were caught
          }

          $ret = array();

          // first slot: h keyword, else leading center, else a measure
          if ($keywords['h']) {
              $ret[] = $keywords['h'];
          } elseif ($keywords['ch']) {
              $ret[] = $keywords['ch'];
              $keywords['cv'] = false; // prevent re-use: center = center center
          } elseif (count($measures)) {
              $ret[] = array_shift($measures);
          }

          // second slot: v keyword, else trailing center, else a measure
          if ($keywords['v']) {
              $ret[] = $keywords['v'];
          } elseif ($keywords['cv']) {
              $ret[] = $keywords['cv'];
          } elseif (count($measures)) {
              $ret[] = array_shift($measures);
          }

          return empty($ret) ? false : implode(' ', $ret);
      }

  }
  7161. /**
  7162. * Validates the border property as defined by CSS.
  7163. */
  class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of properties this property is shorthand for.
       */
      protected $info = array();

      public function __construct($config) {
          $def = $config->getCSSDefinition();
          $names = array('border-width', 'border-style', 'border-top-color');
          foreach ($names as $name) {
              $this->info[$name] = $def->info[$name];
          }
      }

      /**
       * Offers each space-separated token to the component validators
       * that have not accepted a token yet, and joins the accepted
       * results with single spaces.
       */
      public function validate($string, $config, $context) {
          $string = $this->parseCDATA($string);
          $string = $this->mungeRgb($string);
          $bits = explode(' ', $string);

          $pending = $this->info; // validators that have not matched yet
          $accepted = array();    // validated segments, in input order

          foreach ($bits as $bit) {
              foreach ($pending as $propname => $validator) {
                  $r = $validator->validate($bit, $config, $context);
                  if ($r === false) {
                      continue;
                  }
                  $accepted[] = $r;
                  // Each component may be supplied at most once.
                  unset($pending[$propname]);
                  break;
              }
          }

          return rtrim(implode(' ', $accepted));
      }

  }
  7196. /**
  7197. * Validates Color as defined by CSS.
  7198. */
  class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
  {

      /**
       * Validates a color keyword, an rgb() literal or a hexadecimal color.
       *
       * @param $color Color string to validate
       * @param $config Configuration object; Core.ColorKeywords is read
       *        from it once and cached for later calls
       * @param $context Context object (not used here)
       * @return The keyword's mapped value, a normalised "rgb(a,b,c)"
       *         literal with clamped components, a "#"-prefixed hex
       *         color, or false when the input is none of these
       */
      public function validate($color, $config, $context) {

          // Keyword table, fetched on the first call only.
          static $colors = null;
          if ($colors === null) $colors = $config->get('Core.ColorKeywords');

          $color = trim($color);
          if ($color === '') return false;

          $lower = strtolower($color);
          if (isset($colors[$lower])) return $colors[$lower];

          // The literal must START with "rgb(": the triad is cut out at a
          // fixed offset of 4 below. Merely containing "rgb(" used to let
          // input such as "xrgb(1,2,3)" through, mis-sliced and rewritten
          // into a different but valid-looking color. Such input now falls
          // to the hexadecimal branch, which rejects it.
          if (strpos($color, 'rgb(') === 0) {
              // rgb literal handling
              $length = strlen($color);
              // The first closing parenthesis must be the last character.
              if (strpos($color, ')') !== $length - 1) return false;
              $triad = substr($color, 4, $length - 4 - 1);
              $parts = explode(',', $triad);
              if (count($parts) !== 3) return false;
              $type = false; // to ensure that they're all the same type
              $new_parts = array();
              foreach ($parts as $part) {
                  $part = trim($part);
                  if ($part === '') return false;
                  $length = strlen($part);
                  if ($part[$length - 1] === '%') {
                      // handle percents; mixing with integers is rejected
                      if (!$type) {
                          $type = 'percentage';
                      } elseif ($type !== 'percentage') {
                          return false;
                      }
                      $num = (float) substr($part, 0, $length - 1);
                      // clamp to 0..100
                      if ($num < 0) $num = 0;
                      if ($num > 100) $num = 100;
                      $new_parts[] = "$num%";
                  } else {
                      // handle integers; mixing with percents is rejected
                      if (!$type) {
                          $type = 'integer';
                      } elseif ($type !== 'integer') {
                          return false;
                      }
                      $num = (int) $part;
                      // clamp to 0..255
                      if ($num < 0) $num = 0;
                      if ($num > 255) $num = 255;
                      $new_parts[] = (string) $num;
                  }
              }
              $new_triad = implode(',', $new_parts);
              $color = "rgb($new_triad)";
          } else {
              // hexadecimal handling; a missing leading '#' is added
              if ($color[0] === '#') {
                  $hex = substr($color, 1);
              } else {
                  $hex = $color;
                  $color = '#' . $color;
              }
              $length = strlen($hex);
              if ($length !== 3 && $length !== 6) return false;
              if (!ctype_xdigit($hex)) return false;
          }

          return $color;
      }

  }
  7262. /**
  7263. * Allows multiple validators to attempt to validate attribute.
  7264. *
  7265. * Composite is just what it sounds like: a composite of many validators.
  7266. * This means that multiple HTMLPurifier_AttrDef objects will have a whack
  7267. * at the string. If one of them passes, that's what is returned. This is
  7268. * especially useful for CSS values, which often are a choice between
  7269. * an enumerated set of predefined values or a flexible data type.
  7270. */
  class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
  {

      /**
       * List of HTMLPurifier_AttrDef objects that may process strings
       * @todo Make protected
       */
      public $defs;

      /**
       * @param $defs List of HTMLPurifier_AttrDef objects
       */
      public function __construct($defs) {
          $this->defs = $defs;
      }

      /**
       * Tries each definition in order and returns the first result that
       * is not false; returns false when every definition rejects.
       */
      public function validate($string, $config, $context) {
          foreach ($this->defs as $def) {
              $result = $def->validate($string, $config, $context);
              if ($result !== false) {
                  return $result;
              }
          }
          return false;
      }

  }
  7292. /**
  7293. * Decorator which enables CSS properties to be disabled for specific elements.
  7294. */
  7295. class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
  7296. {
  7297. public $def, $element;
  7298. /**
  7299. * @param $def Definition to wrap
  7300. * @param $element Element to deny
  7301. */
  7302. public function __construct($def, $element) {
  7303. $this->def = $def;
  7304. $this->element = $element;
  7305. }
  7306. /**
  7307. * Checks if CurrentToken is set and equal to $this->element
  7308. */
  7309. public function validate($string, $config, $context) {
  7310. $token = $context->get('CurrentToken', true);
  7311. if ($token && $token->name == $this->element) return false;
  7312. return $this->def->validate($string, $config, $context);
  7313. }
  7314. }
  7315. /**
  7316. * Microsoft's proprietary filter: CSS property
  7317. * @note Currently supports the alpha filter. In the future, this will
  7318. * probably need an extensible framework
  7319. */
  7320. class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef
  7321. {
  7322. protected $intValidator;
  7323. public function __construct() {
  7324. $this->intValidator = new HTMLPurifier_AttrDef_Integer();
  7325. }
  7326. public function validate($value, $config, $context) {
  7327. $value = $this->parseCDATA($value);
  7328. if ($value === 'none') return $value;
  7329. // if we looped this we could support multiple filters
  7330. $function_length = strcspn($value, '(');
  7331. $function = trim(substr($value, 0, $function_length));
  7332. if ($function !== 'alpha' &&
  7333. $function !== 'Alpha' &&
  7334. $function !== 'progid:DXImageTransform.Microsoft.Alpha'
  7335. ) return false;
  7336. $cursor = $function_length + 1;
  7337. $parameters_length = strcspn($value, ')', $cursor);
  7338. $parameters = substr($value, $cursor, $parameters_length);
  7339. $params = explode(',', $parameters);
  7340. $ret_params = array();
  7341. $lookup = array();
  7342. foreach ($params as $param) {
  7343. list($key, $value) = explode('=', $param);
  7344. $key = trim($key);
  7345. $value = trim($value);
  7346. if (isset($lookup[$key])) continue;
  7347. if ($key !== 'opacity') continue;
  7348. $value = $this->intValidator->validate($value, $config, $context);
  7349. if ($value === false) continue;
  7350. $int = (int) $value;
  7351. if ($int > 100) $value = '100';
  7352. if ($int < 0) $value = '0';
  7353. $ret_params[] = "$key=$value";
  7354. $lookup[$key] = true;
  7355. }
  7356. $ret_parameters = implode(',', $ret_params);
  7357. $ret_function = "$function($ret_parameters)";
  7358. return $ret_function;
  7359. }
  7360. }
  7361. /**
  7362. * Validates shorthand CSS property font.
  7363. */
  7364. class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
  7365. {
  7366. /**
  7367. * Local copy of component validators.
  7368. *
  7369. * @note If we moved specific CSS property definitions to their own
  7370. * classes instead of having them be assembled at run time by
  7371. * CSSDefinition, this wouldn't be necessary. We'd instantiate
  7372. * our own copies.
  7373. */
  7374. protected $info = array();
  7375. public function __construct($config) {
  7376. $def = $config->getCSSDefinition();
  7377. $this->info['font-style'] = $def->info['font-style'];
  7378. $this->info['font-variant'] = $def->info['font-variant'];
  7379. $this->info['font-weight'] = $def->info['font-weight'];
  7380. $this->info['font-size'] = $def->info['font-size'];
  7381. $this->info['line-height'] = $def->info['line-height'];
  7382. $this->info['font-family'] = $def->info['font-family'];
  7383. }
  7384. public function validate($string, $config, $context) {
  7385. static $system_fonts = array(
  7386. 'caption' => true,
  7387. 'icon' => true,
  7388. 'menu' => true,
  7389. 'message-box' => true,
  7390. 'small-caption' => true,
  7391. 'status-bar' => true
  7392. );
  7393. // regular pre-processing
  7394. $string = $this->parseCDATA($string);
  7395. if ($string === '') return false;
  7396. // check if it's one of the keywords
  7397. $lowercase_string = strtolower($string);
  7398. if (isset($system_fonts[$lowercase_string])) {
  7399. return $lowercase_string;
  7400. }
  7401. $bits = explode(' ', $string); // bits to process
  7402. $stage = 0; // this indicates what we're looking for
  7403. $caught = array(); // which stage 0 properties have we caught?
  7404. $stage_1 = array('font-style', 'font-variant', 'font-weight');
  7405. $final = ''; // output
  7406. for ($i = 0, $size = count($bits); $i < $size; $i++) {
  7407. if ($bits[$i] === '') continue;
  7408. switch ($stage) {
  7409. // attempting to catch font-style, font-variant or font-weight
  7410. case 0:
  7411. foreach ($stage_1 as $validator_name) {
  7412. if (isset($caught[$validator_name])) continue;
  7413. $r = $this->info[$validator_name]->validate(
  7414. $bits[$i], $config, $context);
  7415. if ($r !== false) {
  7416. $final .= $r . ' ';
  7417. $caught[$validator_name] = true;
  7418. break;
  7419. }
  7420. }
  7421. // all three caught, continue on
  7422. if (count($caught) >= 3) $stage = 1;
  7423. if ($r !== false) break;
  7424. // attempting to catch font-size and perhaps line-height
  7425. case 1:
  7426. $found_slash = false;
  7427. if (strpos($bits[$i], '/') !== false) {
  7428. list($font_size, $line_height) =
  7429. explode('/', $bits[$i]);
  7430. if ($line_height === '') {
  7431. // ooh, there's a space after the slash!
  7432. $line_height = false;
  7433. $found_slash = true;
  7434. }
  7435. } else {
  7436. $font_size = $bits[$i];
  7437. $line_height = false;
  7438. }
  7439. $r = $this->info['font-size']->validate(
  7440. $font_size, $config, $context);
  7441. if ($r !== false) {
  7442. $final .= $r;
  7443. // attempt to catch line-height
  7444. if ($line_height === false) {
  7445. // we need to scroll forward
  7446. for ($j = $i + 1; $j < $size; $j++) {
  7447. if ($bits[$j] === '') continue;
  7448. if ($bits[$j] === '/') {
  7449. if ($found_slash) {
  7450. return false;
  7451. } else {
  7452. $found_slash = true;
  7453. continue;
  7454. }
  7455. }
  7456. $line_height = $bits[$j];
  7457. break;
  7458. }
  7459. } else {
  7460. // slash already found
  7461. $found_slash = true;
  7462. $j = $i;
  7463. }
  7464. if ($found_slash) {
  7465. $i = $j;
  7466. $r = $this->info['line-height']->validate(
  7467. $line_height, $config, $context);
  7468. if ($r !== false) {
  7469. $final .= '/' . $r;
  7470. }
  7471. }
  7472. $final .= ' ';
  7473. $stage = 2;
  7474. break;
  7475. }
  7476. return false;
  7477. // attempting to catch font-family
  7478. case 2:
  7479. $font_family =
  7480. implode(' ', array_slice($bits, $i, $size - $i));
  7481. $r = $this->info['font-family']->validate(
  7482. $font_family, $config, $context);
  7483. if ($r !== false) {
  7484. $final .= $r . ' ';
  7485. // processing completed successfully
  7486. return rtrim($final);
  7487. }
  7488. return false;
  7489. }
  7490. }
  7491. return false;
  7492. }
  7493. }
  7494. /**
  7495. * Validates a font family list according to CSS spec
  7496. * @todo whitelisting allowed fonts would be nice
  7497. */
  7498. class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
  7499. {
  7500. public function validate($string, $config, $context) {
  7501. static $generic_names = array(
  7502. 'serif' => true,
  7503. 'sans-serif' => true,
  7504. 'monospace' => true,
  7505. 'fantasy' => true,
  7506. 'cursive' => true
  7507. );
  7508. // assume that no font names contain commas in them
  7509. $fonts = explode(',', $string);
  7510. $final = '';
  7511. foreach($fonts as $font) {
  7512. $font = trim($font);
  7513. if ($font === '') continue;
  7514. // match a generic name
  7515. if (isset($generic_names[$font])) {
  7516. $final .= $font . ', ';
  7517. continue;
  7518. }
  7519. // match a quoted name
  7520. if ($font[0] === '"' || $font[0] === "'") {
  7521. $length = strlen($font);
  7522. if ($length <= 2) continue;
  7523. $quote = $font[0];
  7524. if ($font[$length - 1] !== $quote) continue;
  7525. $font = substr($font, 1, $length - 2);
  7526. }
  7527. $font = $this->expandCSSEscape($font);
  7528. // $font is a pure representation of the font name
  7529. if (ctype_alnum($font) && $font !== '') {
  7530. // very simple font, allow it in unharmed
  7531. $final .= $font . ', ';
  7532. continue;
  7533. }
  7534. // bugger out on whitespace. form feed (0C) really
  7535. // shouldn't show up regardless
  7536. $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
  7537. // These ugly transforms don't pose a security
  7538. // risk (as \\ and \" might). We could try to be clever and
  7539. // use single-quote wrapping when there is a double quote
  7540. // present, but I have choosen not to implement that.
  7541. // (warning: this code relies on the selection of quotation
  7542. // mark below)
  7543. $font = str_replace('\\', '\\5C ', $font);
  7544. $font = str_replace('"', '\\22 ', $font);
  7545. // complicated font, requires quoting
  7546. $final .= "\"$font\", "; // note that this will later get turned into &quot;
  7547. }
  7548. $final = rtrim($final, ', ');
  7549. if ($final === '') return false;
  7550. return $final;
  7551. }
  7552. }
  7553. /**
  7554. * Decorator which enables !important to be used in CSS values.
  7555. */
  7556. class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef
  7557. {
  7558. public $def, $allow;
  7559. /**
  7560. * @param $def Definition to wrap
  7561. * @param $allow Whether or not to allow !important
  7562. */
  7563. public function __construct($def, $allow = false) {
  7564. $this->def = $def;
  7565. $this->allow = $allow;
  7566. }
  7567. /**
  7568. * Intercepts and removes !important if necessary
  7569. */
  7570. public function validate($string, $config, $context) {
  7571. // test for ! and important tokens
  7572. $string = trim($string);
  7573. $is_important = false;
  7574. // :TODO: optimization: test directly for !important and ! important
  7575. if (strlen($string) >= 9 && substr($string, -9) === 'important') {
  7576. $temp = rtrim(substr($string, 0, -9));
  7577. // use a temp, because we might want to restore important
  7578. if (strlen($temp) >= 1 && substr($temp, -1) === '!') {
  7579. $string = rtrim(substr($temp, 0, -1));
  7580. $is_important = true;
  7581. }
  7582. }
  7583. $string = $this->def->validate($string, $config, $context);
  7584. if ($this->allow && $is_important) $string .= ' !important';
  7585. return $string;
  7586. }
  7587. }
  7588. /**
  7589. * Represents a Length as defined by CSS.
  7590. */
  7591. class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
  7592. {
  7593. protected $min, $max;
  7594. /**
  7595. * @param HTMLPurifier_Length $max Minimum length, or null for no bound. String is also acceptable.
  7596. * @param HTMLPurifier_Length $max Maximum length, or null for no bound. String is also acceptable.
  7597. */
  7598. public function __construct($min = null, $max = null) {
  7599. $this->min = $min !== null ? HTMLPurifier_Length::make($min) : null;
  7600. $this->max = $max !== null ? HTMLPurifier_Length::make($max) : null;
  7601. }
  7602. public function validate($string, $config, $context) {
  7603. $string = $this->parseCDATA($string);
  7604. // Optimizations
  7605. if ($string === '') return false;
  7606. if ($string === '0') return '0';
  7607. if (strlen($string) === 1) return false;
  7608. $length = HTMLPurifier_Length::make($string);
  7609. if (!$length->isValid()) return false;
  7610. if ($this->min) {
  7611. $c = $length->compareTo($this->min);
  7612. if ($c === false) return false;
  7613. if ($c < 0) return false;
  7614. }
  7615. if ($this->max) {
  7616. $c = $length->compareTo($this->max);
  7617. if ($c === false) return false;
  7618. if ($c > 0) return false;
  7619. }
  7620. return $length->toString();
  7621. }
  7622. }
  7623. /**
  7624. * Validates shorthand CSS property list-style.
  7625. * @warning Does not support url tokens that have internal spaces.
  7626. */
  7627. class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
  7628. {
  7629. /**
  7630. * Local copy of component validators.
  7631. * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
  7632. */
  7633. protected $info;
  7634. public function __construct($config) {
  7635. $def = $config->getCSSDefinition();
  7636. $this->info['list-style-type'] = $def->info['list-style-type'];
  7637. $this->info['list-style-position'] = $def->info['list-style-position'];
  7638. $this->info['list-style-image'] = $def->info['list-style-image'];
  7639. }
  7640. public function validate($string, $config, $context) {
  7641. // regular pre-processing
  7642. $string = $this->parseCDATA($string);
  7643. if ($string === '') return false;
  7644. // assumes URI doesn't have spaces in it
  7645. $bits = explode(' ', strtolower($string)); // bits to process
  7646. $caught = array();
  7647. $caught['type'] = false;
  7648. $caught['position'] = false;
  7649. $caught['image'] = false;
  7650. $i = 0; // number of catches
  7651. $none = false;
  7652. foreach ($bits as $bit) {
  7653. if ($i >= 3) return; // optimization bit
  7654. if ($bit === '') continue;
  7655. foreach ($caught as $key => $status) {
  7656. if ($status !== false) continue;
  7657. $r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
  7658. if ($r === false) continue;
  7659. if ($r === 'none') {
  7660. if ($none) continue;
  7661. else $none = true;
  7662. if ($key == 'image') continue;
  7663. }
  7664. $caught[$key] = $r;
  7665. $i++;
  7666. break;
  7667. }
  7668. }
  7669. if (!$i) return false;
  7670. $ret = array();
  7671. // construct type
  7672. if ($caught['type']) $ret[] = $caught['type'];
  7673. // construct image
  7674. if ($caught['image']) $ret[] = $caught['image'];
  7675. // construct position
  7676. if ($caught['position']) $ret[] = $caught['position'];
  7677. if (empty($ret)) return false;
  7678. return implode(' ', $ret);
  7679. }
  7680. }
  7681. /**
  7682. * Framework class for strings that involve multiple values.
  7683. *
  7684. * Certain CSS properties such as border-width and margin allow multiple
  7685. * lengths to be specified. This class can take a vanilla border-width
  7686. * definition and multiply it, usually into a max of four.
  7687. *
  7688. * @note Even though the CSS specification isn't clear about it, inherit
  7689. * can only be used alone: it will never manifest as part of a multi
  7690. * shorthand declaration. Thus, this class does not allow inherit.
  7691. */
  7692. class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
  7693. {
  7694. /**
  7695. * Instance of component definition to defer validation to.
  7696. * @todo Make protected
  7697. */
  7698. public $single;
  7699. /**
  7700. * Max number of values allowed.
  7701. * @todo Make protected
  7702. */
  7703. public $max;
  7704. /**
  7705. * @param $single HTMLPurifier_AttrDef to multiply
  7706. * @param $max Max number of values allowed (usually four)
  7707. */
  7708. public function __construct($single, $max = 4) {
  7709. $this->single = $single;
  7710. $this->max = $max;
  7711. }
  7712. public function validate($string, $config, $context) {
  7713. $string = $this->parseCDATA($string);
  7714. if ($string === '') return false;
  7715. $parts = explode(' ', $string); // parseCDATA replaced \r, \t and \n
  7716. $length = count($parts);
  7717. $final = '';
  7718. for ($i = 0, $num = 0; $i < $length && $num < $this->max; $i++) {
  7719. if (ctype_space($parts[$i])) continue;
  7720. $result = $this->single->validate($parts[$i], $config, $context);
  7721. if ($result !== false) {
  7722. $final .= $result . ' ';
  7723. $num++;
  7724. }
  7725. }
  7726. if ($final === '') return false;
  7727. return rtrim($final);
  7728. }
  7729. }
  7730. /**
  7731. * Validates a Percentage as defined by the CSS spec.
  7732. */
  7733. class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
  7734. {
  7735. /**
  7736. * Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
  7737. */
  7738. protected $number_def;
  7739. /**
  7740. * @param Bool indicating whether to forbid negative values
  7741. */
  7742. public function __construct($non_negative = false) {
  7743. $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
  7744. }
  7745. public function validate($string, $config, $context) {
  7746. $string = $this->parseCDATA($string);
  7747. if ($string === '') return false;
  7748. $length = strlen($string);
  7749. if ($length === 1) return false;
  7750. if ($string[$length - 1] !== '%') return false;
  7751. $number = substr($string, 0, $length - 1);
  7752. $number = $this->number_def->validate($number, $config, $context);
  7753. if ($number === false) return false;
  7754. return "$number%";
  7755. }
  7756. }
  7757. /**
  7758. * Validates the value for the CSS property text-decoration
  7759. * @note This class could be generalized into a version that acts sort of
  7760. * like Enum except you can compound the allowed values.
  7761. */
  7762. class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
  7763. {
  7764. public function validate($string, $config, $context) {
  7765. static $allowed_values = array(
  7766. 'line-through' => true,
  7767. 'overline' => true,
  7768. 'underline' => true,
  7769. );
  7770. $string = strtolower($this->parseCDATA($string));
  7771. if ($string === 'none') return $string;
  7772. $parts = explode(' ', $string);
  7773. $final = '';
  7774. foreach ($parts as $part) {
  7775. if (isset($allowed_values[$part])) {
  7776. $final .= $part . ' ';
  7777. }
  7778. }
  7779. $final = rtrim($final);
  7780. if ($final === '') return false;
  7781. return $final;
  7782. }
  7783. }
  7784. /**
  7785. * Validates a URI in CSS syntax, which uses url('http://example.com')
  7786. * @note While theoretically speaking a URI in a CSS document could
  7787. * be non-embedded, as of CSS2 there is no such usage so we're
  7788. * generalizing it. This may need to be changed in the future.
  7789. * @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
  7790. * the separator, you cannot put a literal semicolon in
  7791. * in the URI. Try percent encoding it, in that case.
  7792. */
  7793. class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
  7794. {
  7795. public function __construct() {
  7796. parent::__construct(true); // always embedded
  7797. }
  7798. public function validate($uri_string, $config, $context) {
  7799. // parse the URI out of the string and then pass it onto
  7800. // the parent object
  7801. $uri_string = $this->parseCDATA($uri_string);
  7802. if (strpos($uri_string, 'url(') !== 0) return false;
  7803. $uri_string = substr($uri_string, 4);
  7804. $new_length = strlen($uri_string) - 1;
  7805. if ($uri_string[$new_length] != ')') return false;
  7806. $uri = trim(substr($uri_string, 0, $new_length));
  7807. if (!empty($uri) && ($uri[0] == "'" || $uri[0] == '"')) {
  7808. $quote = $uri[0];
  7809. $new_length = strlen($uri) - 1;
  7810. if ($uri[$new_length] !== $quote) return false;
  7811. $uri = substr($uri, 1, $new_length - 1);
  7812. }
  7813. $uri = $this->expandCSSEscape($uri);
  7814. $result = parent::validate($uri, $config, $context);
  7815. if ($result === false) return false;
  7816. // extra sanity check; should have been done by URI
  7817. $result = str_replace(array('"', "\\", "\n", "\x0c", "\r"), "", $result);
  7818. return "url(\"$result\")";
  7819. }
  7820. }
  7821. /**
  7822. * Validates a boolean attribute
  7823. */
  7824. class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
  7825. {
  7826. protected $name;
  7827. public $minimized = true;
  7828. public function __construct($name = false) {$this->name = $name;}
  7829. public function validate($string, $config, $context) {
  7830. if (empty($string)) return false;
  7831. return $this->name;
  7832. }
  7833. /**
  7834. * @param $string Name of attribute
  7835. */
  7836. public function make($string) {
  7837. return new HTMLPurifier_AttrDef_HTML_Bool($string);
  7838. }
  7839. }
  7840. /**
  7841. * Validates contents based on NMTOKENS attribute type.
  7842. */
  7843. class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
  7844. {
  7845. public function validate($string, $config, $context) {
  7846. $string = trim($string);
  7847. // early abort: '' and '0' (strings that convert to false) are invalid
  7848. if (!$string) return false;
  7849. $tokens = $this->split($string, $config, $context);
  7850. $tokens = $this->filter($tokens, $config, $context);
  7851. if (empty($tokens)) return false;
  7852. return implode(' ', $tokens);
  7853. }
  7854. /**
  7855. * Splits a space separated list of tokens into its constituent parts.
  7856. */
  7857. protected function split($string, $config, $context) {
  7858. // OPTIMIZABLE!
  7859. // do the preg_match, capture all subpatterns for reformulation
  7860. // we don't support U+00A1 and up codepoints or
  7861. // escaping because I don't know how to do that with regexps
  7862. // and plus it would complicate optimization efforts (you never
  7863. // see that anyway).
  7864. $pattern = '/(?:(?<=\s)|\A)'. // look behind for space or string start
  7865. '((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)'.
  7866. '(?:(?=\s)|\z)/'; // look ahead for space or string end
  7867. preg_match_all($pattern, $string, $matches);
  7868. return $matches[1];
  7869. }
  7870. /**
  7871. * Template method for removing certain tokens based on arbitrary criteria.
  7872. * @note If we wanted to be really functional, we'd do an array_filter
  7873. * with a callback. But... we're not.
  7874. */
  7875. protected function filter($tokens, $config, $context) {
  7876. return $tokens;
  7877. }
  7878. }
  7879. /**
  7880. * Implements special behavior for class attribute (normally NMTOKENS)
  7881. */
  7882. class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
  7883. {
  7884. protected function split($string, $config, $context) {
  7885. // really, this twiddle should be lazy loaded
  7886. $name = $config->getDefinition('HTML')->doctype->name;
  7887. if ($name == "XHTML 1.1" || $name == "XHTML 2.0") {
  7888. return parent::split($string, $config, $context);
  7889. } else {
  7890. return preg_split('/\s+/', $string);
  7891. }
  7892. }
  7893. protected function filter($tokens, $config, $context) {
  7894. $allowed = $config->get('Attr.AllowedClasses');
  7895. $forbidden = $config->get('Attr.ForbiddenClasses');
  7896. $ret = array();
  7897. foreach ($tokens as $token) {
  7898. if (
  7899. ($allowed === null || isset($allowed[$token])) &&
  7900. !isset($forbidden[$token]) &&
  7901. // We need this O(n) check because of PHP's array
  7902. // implementation that casts -0 to 0.
  7903. !in_array($token, $ret, true)
  7904. ) {
  7905. $ret[] = $token;
  7906. }
  7907. }
  7908. return $ret;
  7909. }
  7910. }
  7911. /**
  7912. * Validates a color according to the HTML spec.
  7913. */
  7914. class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
  7915. {
  7916. public function validate($string, $config, $context) {
  7917. static $colors = null;
  7918. if ($colors === null) $colors = $config->get('Core.ColorKeywords');
  7919. $string = trim($string);
  7920. if (empty($string)) return false;
  7921. if (isset($colors[$string])) return $colors[$string];
  7922. if ($string[0] === '#') $hex = substr($string, 1);
  7923. else $hex = $string;
  7924. $length = strlen($hex);
  7925. if ($length !== 3 && $length !== 6) return false;
  7926. if (!ctype_xdigit($hex)) return false;
  7927. if ($length === 3) $hex = $hex[0].$hex[0].$hex[1].$hex[1].$hex[2].$hex[2];
  7928. return "#$hex";
  7929. }
  7930. }
  7931. /**
  7932. * Special-case enum attribute definition that lazy loads allowed frame targets
  7933. */
  7934. class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
  7935. {
  7936. public $valid_values = false; // uninitialized value
  7937. protected $case_sensitive = false;
  7938. public function __construct() {}
  7939. public function validate($string, $config, $context) {
  7940. if ($this->valid_values === false) $this->valid_values = $config->get('Attr.AllowedFrameTargets');
  7941. return parent::validate($string, $config, $context);
  7942. }
  7943. }
  7944. /**
  7945. * Validates the HTML attribute ID.
  7946. * @warning Even though this is the id processor, it
  7947. * will ignore the directive Attr:IDBlacklist, since it will only
  7948. * go according to the ID accumulator. Since the accumulator is
  7949. * automatically generated, it will have already absorbed the
  7950. * blacklist. If you're hacking around, make sure you use load()!
  7951. */
  7952. class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
  7953. {
  7954. // ref functionality disabled, since we also have to verify
  7955. // whether or not the ID it refers to exists
  7956. public function validate($id, $config, $context) {
  7957. if (!$config->get('Attr.EnableID')) return false;
  7958. $id = trim($id); // trim it first
  7959. if ($id === '') return false;
  7960. $prefix = $config->get('Attr.IDPrefix');
  7961. if ($prefix !== '') {
  7962. $prefix .= $config->get('Attr.IDPrefixLocal');
  7963. // prevent re-appending the prefix
  7964. if (strpos($id, $prefix) !== 0) $id = $prefix . $id;
  7965. } elseif ($config->get('Attr.IDPrefixLocal') !== '') {
  7966. trigger_error('%Attr.IDPrefixLocal cannot be used unless '.
  7967. '%Attr.IDPrefix is set', E_USER_WARNING);
  7968. }
  7969. //if (!$this->ref) {
  7970. $id_accumulator =& $context->get('IDAccumulator');
  7971. if (isset($id_accumulator->ids[$id])) return false;
  7972. //}
  7973. // we purposely avoid using regex, hopefully this is faster
  7974. if (ctype_alpha($id)) {
  7975. $result = true;
  7976. } else {
  7977. if (!ctype_alpha(@$id[0])) return false;
  7978. $trim = trim( // primitive style of regexps, I suppose
  7979. $id,
  7980. 'A..Za..z0..9:-._'
  7981. );
  7982. $result = ($trim === '');
  7983. }
  7984. $regexp = $config->get('Attr.IDBlacklistRegexp');
  7985. if ($regexp && preg_match($regexp, $id)) {
  7986. return false;
  7987. }
  7988. if (/*!$this->ref && */$result) $id_accumulator->add($id);
  7989. // if no change was made to the ID, return the result
  7990. // else, return the new id if stripping whitespace made it
  7991. // valid, or return false.
  7992. return $result ? $id : false;
  7993. }
  7994. }
  7995. /**
  7996. * Validates an integer representation of pixels according to the HTML spec.
  7997. */
  7998. class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
  7999. {
  8000. protected $max;
  8001. public function __construct($max = null) {
  8002. $this->max = $max;
  8003. }
  8004. public function validate($string, $config, $context) {
  8005. $string = trim($string);
  8006. if ($string === '0') return $string;
  8007. if ($string === '') return false;
  8008. $length = strlen($string);
  8009. if (substr($string, $length - 2) == 'px') {
  8010. $string = substr($string, 0, $length - 2);
  8011. }
  8012. if (!is_numeric($string)) return false;
  8013. $int = (int) $string;
  8014. if ($int < 0) return '0';
  8015. // upper-bound value, extremely high values can
  8016. // crash operating systems, see <http://ha.ckers.org/imagecrash.html>
  8017. // WARNING, above link WILL crash you if you're using Windows
  8018. if ($this->max !== null && $int > $this->max) return (string) $this->max;
  8019. return (string) $int;
  8020. }
  8021. public function make($string) {
  8022. if ($string === '') $max = null;
  8023. else $max = (int) $string;
  8024. $class = get_class($this);
  8025. return new $class($max);
  8026. }
  8027. }
  8028. /**
  8029. * Validates the HTML type length (not to be confused with CSS's length).
  8030. *
  8031. * This accepts integer pixels or percentages as lengths for certain
  8032. * HTML attributes.
  8033. */
  8034. class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
  8035. {
  8036. public function validate($string, $config, $context) {
  8037. $string = trim($string);
  8038. if ($string === '') return false;
  8039. $parent_result = parent::validate($string, $config, $context);
  8040. if ($parent_result !== false) return $parent_result;
  8041. $length = strlen($string);
  8042. $last_char = $string[$length - 1];
  8043. if ($last_char !== '%') return false;
  8044. $points = substr($string, 0, $length - 1);
  8045. if (!is_numeric($points)) return false;
  8046. $points = (int) $points;
  8047. if ($points < 0) return '0%';
  8048. if ($points > 100) return '100%';
  8049. return ((string) $points) . '%';
  8050. }
  8051. }
  8052. /**
  8053. * Validates a rel/rev link attribute against a directive of allowed values
  8054. * @note We cannot use Enum because link types allow multiple
  8055. * values.
  8056. * @note Assumes link types are ASCII text
  8057. */
  8058. class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
  8059. {
  8060. /** Name config attribute to pull. */
  8061. protected $name;
  8062. public function __construct($name) {
  8063. $configLookup = array(
  8064. 'rel' => 'AllowedRel',
  8065. 'rev' => 'AllowedRev'
  8066. );
  8067. if (!isset($configLookup[$name])) {
  8068. trigger_error('Unrecognized attribute name for link '.
  8069. 'relationship.', E_USER_ERROR);
  8070. return;
  8071. }
  8072. $this->name = $configLookup[$name];
  8073. }
  8074. public function validate($string, $config, $context) {
  8075. $allowed = $config->get('Attr.' . $this->name);
  8076. if (empty($allowed)) return false;
  8077. $string = $this->parseCDATA($string);
  8078. $parts = explode(' ', $string);
  8079. // lookup to prevent duplicates
  8080. $ret_lookup = array();
  8081. foreach ($parts as $part) {
  8082. $part = strtolower(trim($part));
  8083. if (!isset($allowed[$part])) continue;
  8084. $ret_lookup[$part] = true;
  8085. }
  8086. if (empty($ret_lookup)) return false;
  8087. $string = implode(' ', array_keys($ret_lookup));
  8088. return $string;
  8089. }
  8090. }
  8091. /**
  8092. * Validates a MultiLength as defined by the HTML spec.
  8093. *
  8094. * A multilength is either a integer (pixel count), a percentage, or
  8095. * a relative number.
  8096. */
  8097. class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
  8098. {
  8099. public function validate($string, $config, $context) {
  8100. $string = trim($string);
  8101. if ($string === '') return false;
  8102. $parent_result = parent::validate($string, $config, $context);
  8103. if ($parent_result !== false) return $parent_result;
  8104. $length = strlen($string);
  8105. $last_char = $string[$length - 1];
  8106. if ($last_char !== '*') return false;
  8107. $int = substr($string, 0, $length - 1);
  8108. if ($int == '') return '*';
  8109. if (!is_numeric($int)) return false;
  8110. $int = (int) $int;
  8111. if ($int < 0) return false;
  8112. if ($int == 0) return '0';
  8113. if ($int == 1) return '*';
  8114. return ((string) $int) . '*';
  8115. }
  8116. }
  /**
   * Abstract base for e-mail address validators; concrete subclasses
   * supply validate().
   */
  abstract class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
  {

      /**
       * Unpacks a mailbox into its display-name and address
       *
       * @note Placeholder only: the body is empty, so nothing is returned.
       */
      public function unpack($string) {
          // needs to be implemented
      }

  }
  8126. // sub-implementations
  /**
   * Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
   */
  class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
  {

      /**
       * Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
       */
      protected $ipv4;

      /**
       * Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
       */
      protected $ipv6;

      public function __construct() {
          $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
          $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
      }

      /**
       * @param $string Host component of a URI
       * @param $config Configuration object, passed on to the sub-validators
       * @param $context Context object, passed on to the sub-validators
       * @return '' for an empty host, the bracketed IPv6 literal, the IPv4
       *         address, or the hostname when valid; false otherwise.
       */
      public function validate($string, $config, $context) {
          $length = strlen($string);
          if ($string === '') return '';
          if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
              //IPv6: validate what is between the brackets, then re-wrap it
              $ip = substr($string, 1, $length - 2);
              $valid = $this->ipv6->validate($ip, $config, $context);
              if ($valid === false) return false;
              return '['. $valid . ']';
          }

          // need to do checks on unusual encodings too
          $ipv4 = $this->ipv4->validate($string, $config, $context);
          if ($ipv4 !== false) return $ipv4;

          // A regular domain name.

          // This breaks I18N domain names, but we don't have proper IRI support,
          // so force users to insert Punycode. If there's complaining we'll
          // try to fix things into an international friendly form.

          // The productions describing this are:
          $a   = '[a-z]';     // alpha
          $an  = '[a-z0-9]';  // alphanum
          $and = '[a-z0-9-]'; // alphanum | "-"
          // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
          $domainlabel = "$an($and*$an)?";
          // toplabel    = alpha | alpha *( alphanum | "-" ) alphanum
          $toplabel    = "$a($and*$an)?";
          // hostname    = *( domainlabel "." ) toplabel [ "." ]
          // The D modifier (PCRE_DOLLAR_ENDONLY) is required: without it "$"
          // also matches just before a trailing newline, so "example.com\n"
          // would be accepted and returned with the newline intact.
          $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/iD", $string);
          if (!$match) return false;

          return $string;
      }

  }
  /**
   * Validates an IPv4 address
   * @author Feyd @ forums.devnetwork.net (public domain)
   */
  class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
  {

      /**
       * IPv4 regex, protected so that IPv6 can reuse it
       */
      protected $ip4;

      /**
       * @param $aIP Candidate dotted-quad address
       * @param $config Configuration object (not used here)
       * @param $context Context object (not used here)
       * @return The address unchanged if it matches, false otherwise.
       */
      public function validate($aIP, $config, $context) {

          if (!$this->ip4) $this->_loadRegex();

          // D (PCRE_DOLLAR_ENDONLY) anchors "$" at the very end of the
          // subject; without it "1.2.3.4\n" matches and is returned with
          // the newline still attached.
          if (preg_match('#^' . $this->ip4 . '$#sD', $aIP))
          {
              return $aIP;
          }

          return false;

      }

      /**
       * Lazy load function to prevent regex from being stuffed in
       * cache.
       */
      protected function _loadRegex() {
          $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
          $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
      }

  }
  /**
   * Validates an IPv6 address.
   * @author Feyd @ forums.devnetwork.net (public domain)
   * @note This function requires brackets to have been removed from address
   *       in URI.
   */
  class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
  {

      /**
       * @param $aIP Candidate IPv6 address without brackets; may carry a
       *             "/0".."/128" prefix length and an IPv4 tail.
       * @param $config Configuration object (not used here)
       * @param $context Context object (not used here)
       * @return The address exactly as it was passed in if valid,
       *         false otherwise.
       *
       * @note All regexes use the D modifier (PCRE_DOLLAR_ENDONLY) so that
       *       "$" cannot match before a trailing newline; otherwise an
       *       address ending in "\n" could pass and be returned as-is.
       */
      public function validate($aIP, $config, $context) {

          if (!$this->ip4) $this->_loadRegex();

          $original = $aIP;

          $pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))'; // /0 - /128

          //      prefix check
          if (strpos($aIP, '/') !== false)
          {
              if (preg_match('#' . $pre . '$#sD', $aIP, $find))
              {
                  // strip the prefix length; only the address is checked below
                  $aIP = substr($aIP, 0, 0-strlen($find[0]));
                  unset($find);
              }
              else
              {
                  return false;
              }
          }

          //      IPv4-compatiblity check
          if (preg_match('#(?<=:)' . $this->ip4 . '$#sD', $aIP, $find))
          {
              // rewrite the dotted-quad tail as two hex groups so the
              // group-count check below sees eight pieces
              $aIP = substr($aIP, 0, 0-strlen($find[0]));
              $ip = explode('.', $find[0]);
              $ip = array_map('dechex', $ip);
              $aIP .= $ip[0] . $ip[1] . ':' . $ip[2] . $ip[3];
              unset($find, $ip);
          }

          //      compression check
          $aIP = explode('::', $aIP);
          $c = count($aIP);
          if ($c > 2)
          {
              return false;
          }
          elseif ($c == 2)
          {
              list($first, $second) = $aIP;
              $first = explode(':', $first);
              $second = explode(':', $second);

              if (count($first) + count($second) > 8)
              {
                  return false;
              }

              // expand "::" by zero-filling up to eight groups, then
              // overlaying the groups that followed it at the end
              while(count($first) < 8)
              {
                  array_push($first, '0');
              }

              array_splice($first, 8 - count($second), 8, $second);
              $aIP = $first;
              unset($first,$second);
          }
          else
          {
              $aIP = explode(':', $aIP[0]);
          }
          $c = count($aIP);

          if ($c != 8)
          {
              return false;
          }

          //      All the pieces should be 16-bit hex strings. Are they?
          foreach ($aIP as $piece)
          {
              // sprintf left-pads short pieces with zeros before matching
              if (!preg_match('#^[0-9a-fA-F]{4}$#sD', sprintf('%04s', $piece)))
              {
                  return false;
              }
          }

          return $original;

      }

  }
  /**
   * Primitive email validation class based on the regexp found at
   * http://www.regular-expressions.info/email.html
   *
   * @note The top-level domain may be 2 to 63 letters long (63 is the
   *       maximum length of a DNS label); the original 2-4 limit rejected
   *       valid addresses on longer TLDs such as .museum or .travel.
   */
  class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
  {

      /**
       * @param $string Candidate e-mail address
       * @param $config Configuration object (not used here)
       * @param $context Context object (not used here)
       * @return The trimmed address if it matches the pattern, false otherwise.
       */
      public function validate($string, $config, $context) {
          // no support for named mailboxes i.e. "Bob <bob@example.com>"
          // that needs more percent encoding to be done
          if ($string == '') return false;
          $string = trim($string);
          $result = preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,63}$/i', $string);
          return $result ? $string : false;
      }

  }
  /**
   * Pre-transform that changes proprietary background attribute to CSS.
   */
  class HTMLPurifier_AttrTransform_Background extends HTMLPurifier_AttrTransform
  {

      /**
       * Moves the value of the background attribute into a
       * background-image declaration handed to prependCSS().
       *
       * @return The attribute array, untouched when no background is set.
       */
      public function transform($attr, $config, $context) {
          if (isset($attr['background'])) {
              $url = $this->confiscateAttr($attr, 'background');
              // some validation should happen here
              $this->prependCSS($attr, 'background-image:url(' . $url . ');');
          }
          return $attr;
      }

  }
  // this MUST be placed in post, as it assumes that any value in dir is valid

  /**
   * Post-transform that ensures that bdo tags have the dir attribute set.
   */
  class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
  {

      /**
       * Fills in dir from the Attr.DefaultTextDir directive when the
       * attribute is absent; an existing dir is left as it is.
       */
      public function transform($attr, $config, $context) {
          if (!isset($attr['dir'])) {
              $attr['dir'] = $config->get('Attr.DefaultTextDir');
          }
          return $attr;
      }

  }
  /**
   * Pre-transform that changes deprecated bgcolor attribute to CSS.
   */
  class HTMLPurifier_AttrTransform_BgColor extends HTMLPurifier_AttrTransform
  {

      /**
       * Moves the value of the bgcolor attribute into a background-color
       * declaration handed to prependCSS().
       *
       * @return The attribute array, untouched when no bgcolor is set.
       */
      public function transform($attr, $config, $context) {
          if (isset($attr['bgcolor'])) {
              $colour = $this->confiscateAttr($attr, 'bgcolor');
              // some validation should happen here
              $this->prependCSS($attr, 'background-color:' . $colour . ';');
          }
          return $attr;
      }

  }
  /**
   * Pre-transform that converts a boolean attribute to fixed CSS
   */
  class HTMLPurifier_AttrTransform_BoolToCSS extends HTMLPurifier_AttrTransform
  {

      /**
       * Name of boolean attribute that is trigger
       */
      protected $attr;

      /**
       * CSS declarations to add to style, needs trailing semicolon
       */
      protected $css;

      /**
       * @param $attr string attribute name to convert from
       * @param $css string CSS declarations to add to style (needs semicolon)
       */
      public function __construct($attr, $css) {
          $this->css  = $css;
          $this->attr = $attr;
      }

      /**
       * Drops the trigger attribute if present and hands the fixed CSS
       * to prependCSS(); the attribute's own value is ignored.
       */
      public function transform($attr, $config, $context) {
          $trigger = $this->attr;
          if (isset($attr[$trigger])) {
              unset($attr[$trigger]);
              $this->prependCSS($attr, $this->css);
          }
          return $attr;
      }

  }
  /**
   * Pre-transform that changes deprecated border attribute to CSS.
   */
  class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform
  {

      /**
       * Moves the value of the border attribute into a
       * "border:<value>px solid;" declaration handed to prependCSS().
       *
       * @return The attribute array, untouched when no border is set.
       */
      public function transform($attr, $config, $context) {
          if (isset($attr['border'])) {
              $thickness = $this->confiscateAttr($attr, 'border');
              // some validation should happen here
              $this->prependCSS($attr, 'border:' . $thickness . 'px solid;');
          }
          return $attr;
      }

  }
  /**
   * Generic pre-transform that converts an attribute with a fixed number of
   * values (enumerated) to CSS.
   */
  class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform
  {

      /**
       * Name of attribute to transform from
       */
      protected $attr;

      /**
       * Lookup array of attribute values to CSS
       */
      protected $enumToCSS = array();

      /**
       * Case sensitivity of the matching
       * @warning Currently can only be guaranteed to work with ASCII
       *          values.
       */
      protected $caseSensitive = false;

      /**
       * @param $attr String attribute name to transform from
       * @param $enum_to_css Lookup array of attribute values to CSS
       * @param $case_sensitive Boolean case sensitivity indicator, default false
       */
      public function __construct($attr, $enum_to_css, $case_sensitive = false) {
          $this->caseSensitive = (bool) $case_sensitive;
          $this->enumToCSS     = $enum_to_css;
          $this->attr          = $attr;
      }

      /**
       * Removes the source attribute and, when its trimmed value is a key
       * of the lookup, hands the mapped CSS to prependCSS(). A value that
       * is not in the lookup is dropped without a replacement.
       */
      public function transform($attr, $config, $context) {
          $source = $this->attr;
          if (!isset($attr[$source])) {
              return $attr;
          }

          $key = trim($attr[$source]);
          unset($attr[$source]);
          if (!$this->caseSensitive) {
              $key = strtolower($key);
          }

          if (isset($this->enumToCSS[$key])) {
              $this->prependCSS($attr, $this->enumToCSS[$key]);
          }
          return $attr;
      }

  }
  // must be called POST validation

  /**
   * Transform that supplies default values for the src and alt attributes
   * in img tags, as well as prevents the img tag from being removed
   * because of a missing alt tag. This needs to be registered as both
   * a pre and post attribute transform.
   */
  class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
  {

      /**
       * Defaults used, all read from $config:
       *  - missing src: Attr.DefaultInvalidImage, unless
       *    Core.RemoveInvalidImg is set, in which case nothing is changed;
       *  - missing alt with an original src: Attr.DefaultImageAlt, or the
       *    first 40 bytes of the src basename when that directive is null;
       *  - missing alt without an original src: Attr.DefaultInvalidImageAlt.
       */
      public function transform($attr, $config, $context) {

          $had_src = isset($attr['src']);
          if (!$had_src) {
              if ($config->get('Core.RemoveInvalidImg')) {
                  return $attr;
              }
              $attr['src'] = $config->get('Attr.DefaultInvalidImage');
          }

          if (isset($attr['alt'])) {
              return $attr;
          }

          if (!$had_src) {
              $attr['alt'] = $config->get('Attr.DefaultInvalidImageAlt');
              return $attr;
          }

          $default_alt = $config->get('Attr.DefaultImageAlt');
          if ($default_alt !== null) {
              $attr['alt'] = $default_alt;
          } else {
              // truncate if the alt is too long
              $attr['alt'] = substr(basename($attr['src']), 0, 40);
          }
          return $attr;

      }

  }
  /**
   * Pre-transform that changes deprecated hspace and vspace attributes to CSS
   */
  class HTMLPurifier_AttrTransform_ImgSpace extends HTMLPurifier_AttrTransform
  {

      /**
       * Name of the attribute this instance converts (hspace or vspace)
       */
      protected $attr;

      /**
       * Margin sides affected by each supported attribute
       */
      protected $css = array(
          'hspace' => array('left', 'right'),
          'vspace' => array('top', 'bottom')
      );

      /**
       * @param $attr Attribute name; an error is triggered when it is not
       *              a key of $css, but the instance is still created.
       */
      public function __construct($attr) {
          $this->attr = $attr;
          if (isset($this->css[$attr])) {
              return;
          }
          trigger_error(htmlspecialchars($attr) . ' is not valid space attribute');
      }

      /**
       * Replaces the attribute with one "margin-<side>:<value>px;"
       * declaration per affected side, handed to prependCSS().
       */
      public function transform($attr, $config, $context) {
          $source = $this->attr;
          if (!isset($attr[$source])) {
              return $attr;
          }

          $width = $this->confiscateAttr($attr, $source);
          // some validation could happen here

          // unsupported attribute name: it has been removed, nothing is added
          if (!isset($this->css[$source])) {
              return $attr;
          }

          $declarations = '';
          foreach ($this->css[$source] as $side) {
              $declarations .= 'margin-' . $side . ':' . $width . 'px;';
          }
          $this->prependCSS($attr, $declarations);

          return $attr;
      }

  }
  /**
   * Performs miscellaneous cross attribute validation and filtering for
   * input elements. This is meant to be a post-transform.
   */
  class HTMLPurifier_AttrTransform_Input extends HTMLPurifier_AttrTransform
  {

      /**
       * Instance of HTMLPurifier_AttrDef_HTML_Pixels used to check size
       */
      protected $pixels;

      public function __construct() {
          $this->pixels = new HTMLPurifier_AttrDef_HTML_Pixels();
      }

      /**
       * Removes or normalises attributes that do not fit the input's type
       * (a missing type is treated as 'text'):
       *  - checked is kept only for radio and checkbox;
       *  - maxlength is kept only for text and password;
       *  - size on any other type must pass the pixels validator;
       *  - src is kept only for image;
       *  - radio and checkbox get an empty value when none is set.
       */
      public function transform($attr, $config, $context) {
          $t = isset($attr['type']) ? strtolower($attr['type']) : 'text';

          $is_toggle  = ($t === 'radio' || $t === 'checkbox');
          $is_textual = ($t === 'text' || $t === 'password');

          if (!$is_toggle && isset($attr['checked'])) {
              unset($attr['checked']);
          }
          if (!$is_textual && isset($attr['maxlength'])) {
              unset($attr['maxlength']);
          }
          if (!$is_textual && isset($attr['size'])) {
              $checked_size = $this->pixels->validate($attr['size'], $config, $context);
              if ($checked_size === false) {
                  unset($attr['size']);
              } else {
                  $attr['size'] = $checked_size;
              }
          }
          if ($t !== 'image' && isset($attr['src'])) {
              unset($attr['src']);
          }
          if ($is_toggle && !isset($attr['value'])) {
              $attr['value'] = '';
          }
          return $attr;
      }

  }
  /**
   * Post-transform that copies lang's value to xml:lang (and vice-versa)
   * @note Theoretically speaking, this could be a pre-transform, but putting
   *       post is more efficient.
   */
  class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
  {

      /**
       * When xml:lang is present it wins and overwrites lang; otherwise
       * an existing lang is copied into xml:lang.
       */
      public function transform($attr, $config, $context) {

          $plain = false;
          if (isset($attr['lang'])) {
              $plain = $attr['lang'];
          }
          $xml = false;
          if (isset($attr['xml:lang'])) {
              $xml = $attr['xml:lang'];
          }

          if ($xml !== false) {
              $attr['lang'] = $xml;
          } elseif ($plain !== false) {
              $attr['xml:lang'] = $plain;
          }

          return $attr;

      }

  }
  /**
   * Class for handling width/height length attribute transformations to CSS
   */
  class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform
  {

      /**
       * Name of the attribute to transform from
       */
      protected $name;

      /**
       * Name of the CSS property to transform to
       */
      protected $cssName;

      /**
       * @param $name Attribute name
       * @param $css_name CSS property name; falls back to $name when empty
       */
      public function __construct($name, $css_name = null) {
          $this->name    = $name;
          $this->cssName = $css_name;
          if (!$css_name) {
              $this->cssName = $name;
          }
      }

      /**
       * Moves the attribute value into a "<property>:<value>;" declaration
       * handed to prependCSS(); a value made only of digits gets 'px'
       * appended.
       */
      public function transform($attr, $config, $context) {
          if (isset($attr[$this->name])) {
              $value = $this->confiscateAttr($attr, $this->name);
              if (ctype_digit($value)) {
                  $value .= 'px';
              }
              $this->prependCSS($attr, $this->cssName . ':' . $value . ';');
          }
          return $attr;
      }

  }
  /**
   * Pre-transform that changes deprecated name attribute to ID if necessary
   */
  class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform
  {

      /**
       * Takes the name attribute out via confiscateAttr() and stores its
       * value as id, unless an id already exists. Nothing happens when
       * HTML.Attr.Name.UseCDATA is enabled.
       */
      public function transform($attr, $config, $context) {
          // Abort early if we're using relaxed definition of name
          $relaxed = $config->get('HTML.Attr.Name.UseCDATA');
          if ($relaxed || !isset($attr['name'])) {
              return $attr;
          }

          $former_name = $this->confiscateAttr($attr, 'name');
          if (!isset($attr['id'])) {
              $attr['id'] = $former_name;
          }
          return $attr;
      }

  }
  /**
   * Post-transform that performs validation to the name attribute; if
   * it is present with an equivalent id attribute, it is passed through;
   * otherwise validation is performed.
   */
  class HTMLPurifier_AttrTransform_NameSync extends HTMLPurifier_AttrTransform
  {

      /**
       * Instance of HTMLPurifier_AttrDef_HTML_ID used to validate name.
       * Declared explicitly (and public, matching the visibility the
       * previously undeclared property had) so the constructor no longer
       * creates a dynamic property, which is deprecated as of PHP 8.2.
       */
      public $idDef;

      public function __construct() {
          $this->idDef = new HTMLPurifier_AttrDef_HTML_ID();
      }

      /**
       * @return The attribute array; name is left alone when it is
       *         identical to id, replaced by the validator's result when
       *         that succeeds, and removed when validation fails.
       */
      public function transform($attr, $config, $context) {
          if (!isset($attr['name'])) return $attr;
          $name = $attr['name'];
          // an id identical to the name is passed through without re-validation
          if (isset($attr['id']) && $attr['id'] === $name) return $attr;
          $result = $this->idDef->validate($name, $config, $context);
          if ($result === false) unset($attr['name']);
          else $attr['name'] = $result;
          return $attr;
      }

  }
  /**
   * Transform that forces fixed allowscriptaccess, allownetworking and
   * type attributes, overwriting whatever values were supplied.
   */
  class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform
  {

      public $name = "SafeEmbed";

      public function transform($attr, $config, $context) {
          $forced = array(
              'allowscriptaccess' => 'never',
              'allownetworking'   => 'internal',
              'type'              => 'application/x-shockwave-flash',
          );
          // assign key by key so keys already present keep their position
          foreach ($forced as $key => $value) {
              $attr[$key] = $value;
          }
          return $attr;
      }

  }
  /**
   * Writes default type for all objects. Currently only supports flash.
   */
  class HTMLPurifier_AttrTransform_SafeObject extends HTMLPurifier_AttrTransform
  {

      public $name = "SafeObject";

      /**
       * Sets type to the Flash MIME type only when no type is present.
       */
      public function transform($attr, $config, $context) {
          if (isset($attr['type'])) {
              return $attr;
          }
          $attr['type'] = 'application/x-shockwave-flash';
          return $attr;
      }

  }
  /**
   * Validates name/value pairs in param tags to be used in safe objects. This
   * will only allow name values it recognizes, and pre-fill certain attributes
   * with required values.
   *
   * @note
   *      This class only supports Flash. In the future, Quicktime support
   *      may be added.
   *
   * @warning
   *      This class expects an injector to add the necessary parameters tags.
   */
  class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
  {

      public $name = "SafeParam";

      /**
       * Instance of HTMLPurifier_AttrDef_URI (constructed with true) used
       * to validate the movie URL
       */
      private $uri;

      public function __construct() {
          $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
      }

      /**
       * @return The attribute array with value forced or validated
       *         according to name; for an unrecognised or missing name
       *         both name and value are set to null.
       */
      public function transform($attr, $config, $context) {
          // A param may lack name and/or value. Read both defensively so
          // no undefined-index notice is raised; null stands in for a
          // missing attribute, which is what the unguarded reads produced.
          $name  = isset($attr['name'])  ? $attr['name']  : null;
          $value = isset($attr['value']) ? $attr['value'] : null;

          // If we add support for other objects, we'll need to alter the
          // transforms.
          switch ($name) {
              // application/x-shockwave-flash
              // Keep this synchronized with Injector/SafeObject.php
              case 'allowScriptAccess':
                  $attr['value'] = 'never';
                  break;
              case 'allowNetworking':
                  $attr['value'] = 'internal';
                  break;
              case 'allowFullScreen':
                  if ($config->get('HTML.FlashAllowFullScreen')) {
                      $attr['value'] = ($value == 'true') ? 'true' : 'false';
                  } else {
                      $attr['value'] = 'false';
                  }
                  break;
              case 'wmode':
                  $attr['value'] = 'window';
                  break;
              case 'movie':
              case 'src':
                  $attr['name'] = "movie";
                  $attr['value'] = $this->uri->validate($value, $config, $context);
                  break;
              case 'flashvars':
                  // we're going to allow arbitrary inputs to the SWF, on
                  // the reasoning that it could only hack the SWF, not us.
                  break;
              // add other cases to support other param name/value pairs
              default:
                  $attr['name'] = $attr['value'] = null;
          }
          return $attr;
      }

  }
  /**
   * Implements required attribute stipulation for <script>
   */
  class HTMLPurifier_AttrTransform_ScriptRequired extends HTMLPurifier_AttrTransform
  {

      /**
       * Supplies type="text/javascript" when no type is present.
       */
      public function transform($attr, $config, $context) {
          if (isset($attr['type'])) {
              return $attr;
          }
          $attr['type'] = 'text/javascript';
          return $attr;
      }

  }
  /**
   * Sets height/width defaults for <textarea>
   */
  class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform
  {

      /**
       * Supplies cols="22" and rows="3" for whichever of the two is missing.
       */
      public function transform($attr, $config, $context) {
          // Calculated from Firefox
          $defaults = array('cols' => '22', 'rows' => '3');
          foreach ($defaults as $dimension => $fallback) {
              if (!isset($attr[$dimension])) {
                  $attr[$dimension] = $fallback;
              }
          }
          return $attr;
      }

  }
  /**
   * Definition that uses different definitions depending on context.
   *
   * The del and ins tags are notable because they allow different types of
   * elements depending on whether or not they're in a block or inline context.
   * Chameleon allows this behavior to happen by using two different
   * definitions depending on context.  While this somewhat generalized,
   * it is specifically intended for those two tags.
   */
  class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
  {

      /**
       * Instance of the definition object to use when inline. Usually stricter.
       */
      public $inline;

      /**
       * Instance of the definition object to use when block.
       */
      public $block;

      public $type = 'chameleon';

      /**
       * @param $inline List of elements to allow when inline.
       * @param $block List of elements to allow when block.
       */
      public function __construct($inline, $block) {
          $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
          $this->block  = new HTMLPurifier_ChildDef_Optional($block);
          // the advertised element list is the block definition's list
          $this->elements = $this->block->elements;
      }

      /**
       * Delegates to the block definition only when the context variable
       * 'IsInline' is exactly false; any other value selects the inline
       * definition.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $delegate = ($context->get('IsInline') === false)
              ? $this->block
              : $this->inline;
          return $delegate->validateChildren($tokens_of_children, $config, $context);
      }

  }
  /**
   * Custom validation class, accepts DTD child definitions
   *
   * @warning Currently this class is an all or nothing proposition, that is,
   *          it will only give a bool return value.
   */
  class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
  {

      public $type = 'custom';
      public $allow_empty = false;

      /**
       * Allowed child pattern as defined by the DTD
       */
      public $dtd_regex;

      /**
       * PCRE regex derived from $dtd_regex
       * @private
       */
      private $_pcre_regex;

      /**
       * @param $dtd_regex Allowed child pattern from the DTD
       */
      public function __construct($dtd_regex) {
          $this->dtd_regex = $dtd_regex;
          $this->_compileRegex();
      }

      /**
       * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
       */
      protected function _compileRegex() {
          $raw = str_replace(' ', '', $this->dtd_regex);
          // Square-bracket offset: the curly-brace form $raw{0} is deprecated
          // in PHP 7.4 and a parse error in PHP 8. The emptiness check avoids
          // an uninitialized-offset notice; an empty pattern is still wrapped.
          if ($raw === '' || $raw[0] != '(') {
              $raw = "($raw)";
          }
          $el = '[#a-zA-Z0-9_.-]+';
          $reg = $raw;

          // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
          // DOING! Seriously: if there's problems, please report them.

          // collect all elements into the $elements array
          preg_match_all("/$el/", $reg, $matches);
          foreach ($matches[0] as $match) {
              $this->elements[$match] = true;
          }

          // setup all elements as parentheticals with leading commas
          $reg = preg_replace("/$el/", '(,\\0)', $reg);

          // remove commas when they were not solicited
          $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);

          // remove all non-paranthetical commas: they are handled by first regex
          $reg = preg_replace("/,\(/", '(', $reg);

          $this->_pcre_regex = $reg;
      }

      /**
       * Builds a comma-separated list of the names of the direct
       * (depth 0) non-whitespace child tokens and matches it against the
       * compiled regex.
       *
       * @return bool true if the list matches, false otherwise.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $list_of_children = '';
          $nesting = 0; // depth into the nest
          foreach ($tokens_of_children as $token) {
              if (!empty($token->is_whitespace)) continue;

              $is_child = ($nesting == 0); // direct

              if ($token instanceof HTMLPurifier_Token_Start) {
                  $nesting++;
              } elseif ($token instanceof HTMLPurifier_Token_End) {
                  $nesting--;
              }

              if ($is_child) {
                  $list_of_children .= $token->name . ',';
              }
          }
          // add leading comma to deal with stray comma declarations
          $list_of_children = ',' . rtrim($list_of_children, ',');
          $okay =
              preg_match(
                  '/^,?'.$this->_pcre_regex.'$/',
                  $list_of_children
              );

          return (bool) $okay;
      }

  }
  /**
   * Definition that disallows all elements.
   * @warning validateChildren() in this class is actually never called, because
   *          empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
   *          before child definitions are parsed in earnest by
   *          HTMLPurifier_Strategy_FixNesting.
   */
  class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
  {

      public $type = 'empty';
      public $allow_empty = true;

      public function __construct() {
      }

      /**
       * @return Always an empty array, whatever children are passed in.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $no_children = array();
          return $no_children;
      }

  }
  /**
   * Definition that allows a set of elements, but disallows empty children.
   */
  class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
  {

      public $allow_empty = false;
      public $type = 'required';

      /**
       * Lookup table of allowed elements.
       * @public
       */
      public $elements = array();

      /**
       * Whether or not the last passed node was all whitespace.
       */
      protected $whitespace = false;

      /**
       * @param $elements List of allowed element names (lowercase). May be
       *                  a '|'-separated string, a plain list of names, or
       *                  an array that is already keyed (stored as given).
       */
      public function __construct($elements) {
          if (is_string($elements)) {
              $elements = explode('|', str_replace(' ', '', $elements));
          }

          // keys 0..n-1 mean a plain list: turn it into a name => true lookup
          $keys = array_keys($elements);
          if ($keys == array_keys($keys)) {
              $flipped  = array_flip($elements);
              $elements = array();
              foreach ($flipped as $element_name => $unused) {
                  if (empty($element_name)) continue; // remove blank
                  $elements[$element_name] = true;
              }
          }

          $this->elements = $elements;
      }

      /**
       * Filters the children down to what this definition allows.
       *
       * @return false if there are no children, nothing survives, or only
       *         whitespace was seen (which also sets $this->whitespace);
       *         true if the children are unchanged; otherwise the new
       *         array of child tokens.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          // Flag for subclasses
          $this->whitespace = false;

          // if there are no tokens, delete parent node
          if (empty($tokens_of_children)) {
              return false;
          }

          // the new set of children
          $kept = array();
          // current depth into the nest
          $depth = 0;
          // whether or not we're deleting a node
          $dropping = false;
          // whether or not parsed character data is allowed
          // this controls whether or not we silently drop a tag
          // or generate escaped HTML from it
          $text_ok = isset($this->elements['#PCDATA']);
          // a little sanity check to make sure it's not ALL whitespace
          $only_whitespace = true;
          // some configuration
          $escape_invalid = $config->get('Core.EscapeInvalidChildren');
          // generator
          $generator = new HTMLPurifier_Generator($config, $context);

          foreach ($tokens_of_children as $token) {
              if (!empty($token->is_whitespace)) {
                  $kept[] = $token;
                  continue;
              }
              $only_whitespace = false; // phew, we're not talking about whitespace

              // depth is sampled before this token adjusts it
              $at_top_level = ($depth == 0);
              if ($token instanceof HTMLPurifier_Token_Start) {
                  $depth++;
              } elseif ($token instanceof HTMLPurifier_Token_End) {
                  $depth--;
              }

              if ($at_top_level) {
                  // each direct child decides afresh whether its subtree goes
                  $dropping = !isset($this->elements[$token->name]);
                  if ($dropping) {
                      if ($text_ok && $token instanceof HTMLPurifier_Token_Text) {
                          $kept[] = $token;
                      } elseif ($text_ok && $escape_invalid) {
                          $kept[] = new HTMLPurifier_Token_Text(
                              $generator->generateFromToken($token)
                          );
                      }
                      continue;
                  }
              }

              $keep_as_is = !$dropping
                  || ($text_ok && $token instanceof HTMLPurifier_Token_Text);
              if ($keep_as_is) {
                  $kept[] = $token;
              } elseif ($text_ok && $escape_invalid) {
                  $kept[] = new HTMLPurifier_Token_Text(
                      $generator->generateFromToken($token)
                  );
              }
              // anything else is dropped silently
          }

          if (empty($kept)) {
              return false;
          }
          if ($only_whitespace) {
              $this->whitespace = true;
              return false;
          }
          if ($tokens_of_children == $kept) {
              return true;
          }
          return $kept;
      }

  }
  /**
   * Definition that allows a set of elements, and allows no children.
   * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
   *       really, one shouldn't inherit from the other.  Only altered behavior
   *       is to overload a returned false with an array.  Thus, it will never
   *       return false.
   */
  class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
  {

      public $type = 'optional';
      public $allow_empty = true;

      /**
       * @return The parent's result unless that is false; then true for
       *         no children at all, the original tokens when they were
       *         all whitespace, and an empty array otherwise.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $verdict = parent::validateChildren($tokens_of_children, $config, $context);
          if ($verdict !== false) {
              return $verdict;
          }

          // we assume that $tokens_of_children is not modified
          if (empty($tokens_of_children)) {
              return true;
          }
          if ($this->whitespace) {
              return $tokens_of_children;
          }
          return array();
      }

  }
  /**
   * Takes the contents of blockquote when in strict and reformats for validation.
   */
  class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
  {

      /**
       * Element lookup this definition was constructed with
       */
      protected $real_elements;

      /**
       * The 'Flow' content set plus #PCDATA, filled in by init()
       */
      protected $fake_elements;

      public $allow_empty = true;
      public $type = 'strictblockquote';

      /**
       * Whether init() has already run
       */
      protected $init = false;

      /**
       * @note We don't want MakeWellFormed to auto-close inline elements since
       *       they might be allowed.
       */
      public function getAllowedElements($config) {
          $this->init($config);
          return $this->fake_elements;
      }

      /**
       * Validates against the wider element set, then wraps every run of
       * top-level tokens that are not in the real element set in the
       * definition's block wrapper element.
       *
       * @return Array of tokens (empty if the parent validation failed).
       */
      public function validateChildren($tokens_of_children, $config, $context) {

          $this->init($config);

          // trick the parent class into thinking it allows more
          $this->elements = $this->fake_elements;
          $validated = parent::validateChildren($tokens_of_children, $config, $context);
          $this->elements = $this->real_elements;

          if ($validated === false) {
              return array();
          }
          if ($validated === true) {
              $validated = $tokens_of_children;
          }

          $definition = $config->getHTMLDefinition();
          $wrap_open  = new HTMLPurifier_Token_Start($definition->info_block_wrapper);
          $wrap_close = new HTMLPurifier_Token_End(  $definition->info_block_wrapper);

          $wrapping = false;
          $depth    = 0;
          $output   = array();

          // assuming that there are no comment tokens
          foreach ($validated as $token) {
              // wrapper decisions are only made between top-level nodes
              if (!$depth) {
                  if (!$wrapping) {
                      if (
                          ($token instanceof HTMLPurifier_Token_Text && !$token->is_whitespace) ||
                          (!$token instanceof HTMLPurifier_Token_Text && !isset($this->elements[$token->name]))
                      ) {
                          $wrapping = true;
                          $output[] = $wrap_open;
                      }
                  } else {
                      // starting tokens have been inline text / empty
                      $opens_node = $token instanceof HTMLPurifier_Token_Start
                          || $token instanceof HTMLPurifier_Token_Empty;
                      if ($opens_node && isset($this->elements[$token->name])) {
                          // ended
                          $output[] = $wrap_close;
                          $wrapping = false;
                      }
                  }
              }

              $output[] = $token;
              if ($token instanceof HTMLPurifier_Token_Start) $depth++;
              if ($token instanceof HTMLPurifier_Token_End)   $depth--;
          }

          // close a wrapper that is still open at the end
          if ($wrapping) {
              $output[] = $wrap_close;
          }
          return $output;
      }

      /**
       * Captures the real element set and builds the fake one; the work
       * is done on the first call only.
       */
      private function init($config) {
          if ($this->init) {
              return;
          }
          $definition = $config->getHTMLDefinition();
          // allow all inline elements
          $this->real_elements = $this->elements;
          $this->fake_elements = $definition->info_content_sets['Flow'];
          $this->fake_elements['#PCDATA'] = true;
          $this->init = true;
      }

  }
  9010. /**
  9011. * Definition for tables
  9012. */
  9013. class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
  9014. {
  9015. public $allow_empty = false;
  9016. public $type = 'table';
  9017. public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
  9018. 'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);
  9019. public function __construct() {}
  9020. public function validateChildren($tokens_of_children, $config, $context) {
  9021. if (empty($tokens_of_children)) return false;
  9022. // this ensures that the loop gets run one last time before closing
  9023. // up. It's a little bit of a hack, but it works! Just make sure you
  9024. // get rid of the token later.
  9025. $tokens_of_children[] = false;
  9026. // only one of these elements is allowed in a table
  9027. $caption = false;
  9028. $thead = false;
  9029. $tfoot = false;
  9030. // as many of these as you want
  9031. $cols = array();
  9032. $content = array();
  9033. $nesting = 0; // current depth so we can determine nodes
  9034. $is_collecting = false; // are we globbing together tokens to package
  9035. // into one of the collectors?
  9036. $collection = array(); // collected nodes
  9037. $tag_index = 0; // the first node might be whitespace,
  9038. // so this tells us where the start tag is
  9039. foreach ($tokens_of_children as $token) {
  9040. $is_child = ($nesting == 0);
  9041. if ($token === false) {
  9042. // terminating sequence started
  9043. } elseif ($token instanceof HTMLPurifier_Token_Start) {
  9044. $nesting++;
  9045. } elseif ($token instanceof HTMLPurifier_Token_End) {
  9046. $nesting--;
  9047. }
  9048. // handle node collection
  9049. if ($is_collecting) {
  9050. if ($is_child) {
  9051. // okay, let's stash the tokens away
  9052. // first token tells us the type of the collection
  9053. switch ($collection[$tag_index]->name) {
  9054. case 'tr':
  9055. case 'tbody':
  9056. $content[] = $collection;
  9057. break;
  9058. case 'caption':
  9059. if ($caption !== false) break;
  9060. $caption = $collection;
  9061. break;
  9062. case 'thead':
  9063. case 'tfoot':
  9064. // access the appropriate variable, $thead or $tfoot
  9065. $var = $collection[$tag_index]->name;
  9066. if ($$var === false) {
  9067. $$var = $collection;
  9068. } else {
  9069. // transmutate the first and less entries into
  9070. // tbody tags, and then put into content
  9071. $collection[$tag_index]->name = 'tbody';
  9072. $collection[count($collection)-1]->name = 'tbody';
  9073. $content[] = $collection;
  9074. }
  9075. break;
  9076. case 'colgroup':
  9077. $cols[] = $collection;
  9078. break;
  9079. }
  9080. $collection = array();
  9081. $is_collecting = false;
  9082. $tag_index = 0;
  9083. } else {
  9084. // add the node to the collection
  9085. $collection[] = $token;
  9086. }
  9087. }
  9088. // terminate
  9089. if ($token === false) break;
  9090. if ($is_child) {
  9091. // determine what we're dealing with
  9092. if ($token->name == 'col') {
  9093. // the only empty tag in the possie, we can handle it
  9094. // immediately
  9095. $cols[] = array_merge($collection, array($token));
  9096. $collection = array();
  9097. $tag_index = 0;
  9098. continue;
  9099. }
  9100. switch($token->name) {
  9101. case 'caption':
  9102. case 'colgroup':
  9103. case 'thead':
  9104. case 'tfoot':
  9105. case 'tbody':
  9106. case 'tr':
  9107. $is_collecting = true;
  9108. $collection[] = $token;
  9109. continue;
  9110. default:
  9111. if (!empty($token->is_whitespace)) {
  9112. $collection[] = $token;
  9113. $tag_index++;
  9114. }
  9115. continue;
  9116. }
  9117. }
  9118. }
  9119. if (empty($content)) return false;
  9120. $ret = array();
  9121. if ($caption !== false) $ret = array_merge($ret, $caption);
  9122. if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
  9123. if ($thead !== false) $ret = array_merge($ret, $thead);
  9124. if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
  9125. foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
  9126. if (!empty($collection) && $is_collecting == false){
  9127. // grab the trailing space
  9128. $ret = array_merge($ret, $collection);
  9129. }
  9130. array_pop($tokens_of_children); // remove phantom token
  9131. return ($ret === $tokens_of_children) ? true : $ret;
  9132. }
  9133. }
  9134. class HTMLPurifier_DefinitionCache_Decorator extends HTMLPurifier_DefinitionCache
  9135. {
  9136. /**
  9137. * Cache object we are decorating
  9138. */
  9139. public $cache;
  9140. public function __construct() {}
  9141. /**
  9142. * Lazy decorator function
  9143. * @param $cache Reference to cache object to decorate
  9144. */
  9145. public function decorate(&$cache) {
  9146. $decorator = $this->copy();
  9147. // reference is necessary for mocks in PHP 4
  9148. $decorator->cache =& $cache;
  9149. $decorator->type = $cache->type;
  9150. return $decorator;
  9151. }
  9152. /**
  9153. * Cross-compatible clone substitute
  9154. */
  9155. public function copy() {
  9156. return new HTMLPurifier_DefinitionCache_Decorator();
  9157. }
  9158. public function add($def, $config) {
  9159. return $this->cache->add($def, $config);
  9160. }
  9161. public function set($def, $config) {
  9162. return $this->cache->set($def, $config);
  9163. }
  9164. public function replace($def, $config) {
  9165. return $this->cache->replace($def, $config);
  9166. }
  9167. public function get($config) {
  9168. return $this->cache->get($config);
  9169. }
  9170. public function remove($config) {
  9171. return $this->cache->remove($config);
  9172. }
  9173. public function flush($config) {
  9174. return $this->cache->flush($config);
  9175. }
  9176. public function cleanup($config) {
  9177. return $this->cache->cleanup($config);
  9178. }
  9179. }
  9180. /**
  9181. * Null cache object to use when no caching is on.
  9182. */
  9183. class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
  9184. {
  9185. public function add($def, $config) {
  9186. return false;
  9187. }
  9188. public function set($def, $config) {
  9189. return false;
  9190. }
  9191. public function replace($def, $config) {
  9192. return false;
  9193. }
  9194. public function remove($config) {
  9195. return false;
  9196. }
  9197. public function get($config) {
  9198. return false;
  9199. }
  9200. public function flush($config) {
  9201. return false;
  9202. }
  9203. public function cleanup($config) {
  9204. return false;
  9205. }
  9206. }
  9207. class HTMLPurifier_DefinitionCache_Serializer extends
  9208. HTMLPurifier_DefinitionCache
  9209. {
  9210. public function add($def, $config) {
  9211. if (!$this->checkDefType($def)) return;
  9212. $file = $this->generateFilePath($config);
  9213. if (file_exists($file)) return false;
  9214. if (!$this->_prepareDir($config)) return false;
  9215. return $this->_write($file, serialize($def));
  9216. }
  9217. public function set($def, $config) {
  9218. if (!$this->checkDefType($def)) return;
  9219. $file = $this->generateFilePath($config);
  9220. if (!$this->_prepareDir($config)) return false;
  9221. return $this->_write($file, serialize($def));
  9222. }
  9223. public function replace($def, $config) {
  9224. if (!$this->checkDefType($def)) return;
  9225. $file = $this->generateFilePath($config);
  9226. if (!file_exists($file)) return false;
  9227. if (!$this->_prepareDir($config)) return false;
  9228. return $this->_write($file, serialize($def));
  9229. }
  9230. public function get($config) {
  9231. $file = $this->generateFilePath($config);
  9232. if (!file_exists($file)) return false;
  9233. return unserialize(file_get_contents($file));
  9234. }
  9235. public function remove($config) {
  9236. $file = $this->generateFilePath($config);
  9237. if (!file_exists($file)) return false;
  9238. return unlink($file);
  9239. }
  9240. public function flush($config) {
  9241. if (!$this->_prepareDir($config)) return false;
  9242. $dir = $this->generateDirectoryPath($config);
  9243. $dh = opendir($dir);
  9244. while (false !== ($filename = readdir($dh))) {
  9245. if (empty($filename)) continue;
  9246. if ($filename[0] === '.') continue;
  9247. unlink($dir . '/' . $filename);
  9248. }
  9249. }
  9250. public function cleanup($config) {
  9251. if (!$this->_prepareDir($config)) return false;
  9252. $dir = $this->generateDirectoryPath($config);
  9253. $dh = opendir($dir);
  9254. while (false !== ($filename = readdir($dh))) {
  9255. if (empty($filename)) continue;
  9256. if ($filename[0] === '.') continue;
  9257. $key = substr($filename, 0, strlen($filename) - 4);
  9258. if ($this->isOld($key, $config)) unlink($dir . '/' . $filename);
  9259. }
  9260. }
  9261. /**
  9262. * Generates the file path to the serial file corresponding to
  9263. * the configuration and definition name
  9264. * @todo Make protected
  9265. */
  9266. public function generateFilePath($config) {
  9267. $key = $this->generateKey($config);
  9268. return $this->generateDirectoryPath($config) . '/' . $key . '.ser';
  9269. }
  9270. /**
  9271. * Generates the path to the directory contain this cache's serial files
  9272. * @note No trailing slash
  9273. * @todo Make protected
  9274. */
  9275. public function generateDirectoryPath($config) {
  9276. $base = $this->generateBaseDirectoryPath($config);
  9277. return $base . '/' . $this->type;
  9278. }
  9279. /**
  9280. * Generates path to base directory that contains all definition type
  9281. * serials
  9282. * @todo Make protected
  9283. */
  9284. public function generateBaseDirectoryPath($config) {
  9285. $base = $config->get('Cache.SerializerPath');
  9286. $base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
  9287. return $base;
  9288. }
  9289. /**
  9290. * Convenience wrapper function for file_put_contents
  9291. * @param $file File name to write to
  9292. * @param $data Data to write into file
  9293. * @return Number of bytes written if success, or false if failure.
  9294. */
  9295. private function _write($file, $data) {
  9296. return file_put_contents($file, $data);
  9297. }
  9298. /**
  9299. * Prepares the directory that this type stores the serials in
  9300. * @return True if successful
  9301. */
  9302. private function _prepareDir($config) {
  9303. $directory = $this->generateDirectoryPath($config);
  9304. if (!is_dir($directory)) {
  9305. $base = $this->generateBaseDirectoryPath($config);
  9306. if (!is_dir($base)) {
  9307. trigger_error('Base directory '.$base.' does not exist,
  9308. please create or change using %Cache.SerializerPath',
  9309. E_USER_WARNING);
  9310. return false;
  9311. } elseif (!$this->_testPermissions($base)) {
  9312. return false;
  9313. }
  9314. $old = umask(0022); // disable group and world writes
  9315. mkdir($directory);
  9316. umask($old);
  9317. } elseif (!$this->_testPermissions($directory)) {
  9318. return false;
  9319. }
  9320. return true;
  9321. }
  9322. /**
  9323. * Tests permissions on a directory and throws out friendly
  9324. * error messages and attempts to chmod it itself if possible
  9325. */
  9326. private function _testPermissions($dir) {
  9327. // early abort, if it is writable, everything is hunky-dory
  9328. if (is_writable($dir)) return true;
  9329. if (!is_dir($dir)) {
  9330. // generally, you'll want to handle this beforehand
  9331. // so a more specific error message can be given
  9332. trigger_error('Directory '.$dir.' does not exist',
  9333. E_USER_WARNING);
  9334. return false;
  9335. }
  9336. if (function_exists('posix_getuid')) {
  9337. // POSIX system, we can give more specific advice
  9338. if (fileowner($dir) === posix_getuid()) {
  9339. // we can chmod it ourselves
  9340. chmod($dir, 0755);
  9341. return true;
  9342. } elseif (filegroup($dir) === posix_getgid()) {
  9343. $chmod = '775';
  9344. } else {
  9345. // PHP's probably running as nobody, so we'll
  9346. // need to give global permissions
  9347. $chmod = '777';
  9348. }
  9349. trigger_error('Directory '.$dir.' not writable, '.
  9350. 'please chmod to ' . $chmod,
  9351. E_USER_WARNING);
  9352. } else {
  9353. // generic error message
  9354. trigger_error('Directory '.$dir.' not writable, '.
  9355. 'please alter file permissions',
  9356. E_USER_WARNING);
  9357. }
  9358. return false;
  9359. }
  9360. }
  9361. /**
  9362. * Definition cache decorator class that cleans up the cache
  9363. * whenever there is a cache miss.
  9364. */
  9365. class HTMLPurifier_DefinitionCache_Decorator_Cleanup extends
  9366. HTMLPurifier_DefinitionCache_Decorator
  9367. {
  9368. public $name = 'Cleanup';
  9369. public function copy() {
  9370. return new HTMLPurifier_DefinitionCache_Decorator_Cleanup();
  9371. }
  9372. public function add($def, $config) {
  9373. $status = parent::add($def, $config);
  9374. if (!$status) parent::cleanup($config);
  9375. return $status;
  9376. }
  9377. public function set($def, $config) {
  9378. $status = parent::set($def, $config);
  9379. if (!$status) parent::cleanup($config);
  9380. return $status;
  9381. }
  9382. public function replace($def, $config) {
  9383. $status = parent::replace($def, $config);
  9384. if (!$status) parent::cleanup($config);
  9385. return $status;
  9386. }
  9387. public function get($config) {
  9388. $ret = parent::get($config);
  9389. if (!$ret) parent::cleanup($config);
  9390. return $ret;
  9391. }
  9392. }
  9393. /**
  9394. * Definition cache decorator class that saves all cache retrievals
  9395. * to PHP's memory; good for unit tests or circumstances where
  9396. * there are lots of configuration objects floating around.
  9397. */
  9398. class HTMLPurifier_DefinitionCache_Decorator_Memory extends
  9399. HTMLPurifier_DefinitionCache_Decorator
  9400. {
  9401. protected $definitions;
  9402. public $name = 'Memory';
  9403. public function copy() {
  9404. return new HTMLPurifier_DefinitionCache_Decorator_Memory();
  9405. }
  9406. public function add($def, $config) {
  9407. $status = parent::add($def, $config);
  9408. if ($status) $this->definitions[$this->generateKey($config)] = $def;
  9409. return $status;
  9410. }
  9411. public function set($def, $config) {
  9412. $status = parent::set($def, $config);
  9413. if ($status) $this->definitions[$this->generateKey($config)] = $def;
  9414. return $status;
  9415. }
  9416. public function replace($def, $config) {
  9417. $status = parent::replace($def, $config);
  9418. if ($status) $this->definitions[$this->generateKey($config)] = $def;
  9419. return $status;
  9420. }
  9421. public function get($config) {
  9422. $key = $this->generateKey($config);
  9423. if (isset($this->definitions[$key])) return $this->definitions[$key];
  9424. $this->definitions[$key] = parent::get($config);
  9425. return $this->definitions[$key];
  9426. }
  9427. }
  9428. /**
  9429. * XHTML 1.1 Bi-directional Text Module, defines elements that
  9430. * declare directionality of content. Text Extension Module.
  9431. */
  9432. class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
  9433. {
  9434. public $name = 'Bdo';
  9435. public $attr_collections = array(
  9436. 'I18N' => array('dir' => false)
  9437. );
  9438. public function setup($config) {
  9439. $bdo = $this->addElement(
  9440. 'bdo', 'Inline', 'Inline', array('Core', 'Lang'),
  9441. array(
  9442. 'dir' => 'Enum#ltr,rtl', // required
  9443. // The Abstract Module specification has the attribute
  9444. // inclusions wrong for bdo: bdo allows Lang
  9445. )
  9446. );
  9447. $bdo->attr_transform_post['required-dir'] = new HTMLPurifier_AttrTransform_BdoDir();
  9448. $this->attr_collections['I18N']['dir'] = 'Enum#ltr,rtl';
  9449. }
  9450. }
  9451. class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
  9452. {
  9453. public $name = 'CommonAttributes';
  9454. public $attr_collections = array(
  9455. 'Core' => array(
  9456. 0 => array('Style'),
  9457. // 'xml:space' => false,
  9458. 'class' => 'Class',
  9459. 'id' => 'ID',
  9460. 'title' => 'CDATA',
  9461. ),
  9462. 'Lang' => array(),
  9463. 'I18N' => array(
  9464. 0 => array('Lang'), // proprietary, for xml:lang/lang
  9465. ),
  9466. 'Common' => array(
  9467. 0 => array('Core', 'I18N')
  9468. )
  9469. );
  9470. }
  9471. /**
  9472. * XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
  9473. * Module.
  9474. */
  9475. class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
  9476. {
  9477. public $name = 'Edit';
  9478. public function setup($config) {
  9479. $contents = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
  9480. $attr = array(
  9481. 'cite' => 'URI',
  9482. // 'datetime' => 'Datetime', // not implemented
  9483. );
  9484. $this->addElement('del', 'Inline', $contents, 'Common', $attr);
  9485. $this->addElement('ins', 'Inline', $contents, 'Common', $attr);
  9486. }
  9487. // HTML 4.01 specifies that ins/del must not contain block
  9488. // elements when used in an inline context, chameleon is
  9489. // a complicated workaround to acheive this effect
  9490. // Inline context ! Block context (exclamation mark is
  9491. // separator, see getChildDef for parsing)
  9492. public $defines_child_def = true;
  9493. public function getChildDef($def) {
  9494. if ($def->content_model_type != 'chameleon') return false;
  9495. $value = explode('!', $def->content_model);
  9496. return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]);
  9497. }
  9498. }
  9499. /**
  9500. * XHTML 1.1 Forms module, defines all form-related elements found in HTML 4.
  9501. */
  9502. class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule
  9503. {
  9504. public $name = 'Forms';
  9505. public $safe = false;
  9506. public $content_sets = array(
  9507. 'Block' => 'Form',
  9508. 'Inline' => 'Formctrl',
  9509. );
  9510. public function setup($config) {
  9511. $form = $this->addElement('form', 'Form',
  9512. 'Required: Heading | List | Block | fieldset', 'Common', array(
  9513. 'accept' => 'ContentTypes',
  9514. 'accept-charset' => 'Charsets',
  9515. 'action*' => 'URI',
  9516. 'method' => 'Enum#get,post',
  9517. // really ContentType, but these two are the only ones used today
  9518. 'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data',
  9519. ));
  9520. $form->excludes = array('form' => true);
  9521. $input = $this->addElement('input', 'Formctrl', 'Empty', 'Common', array(
  9522. 'accept' => 'ContentTypes',
  9523. 'accesskey' => 'Character',
  9524. 'alt' => 'Text',
  9525. 'checked' => 'Bool#checked',
  9526. 'disabled' => 'Bool#disabled',
  9527. 'maxlength' => 'Number',
  9528. 'name' => 'CDATA',
  9529. 'readonly' => 'Bool#readonly',
  9530. 'size' => 'Number',
  9531. 'src' => 'URI#embeds',
  9532. 'tabindex' => 'Number',
  9533. 'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
  9534. 'value' => 'CDATA',
  9535. ));
  9536. $input->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input();
  9537. $this->addElement('select', 'Formctrl', 'Required: optgroup | option', 'Common', array(
  9538. 'disabled' => 'Bool#disabled',
  9539. 'multiple' => 'Bool#multiple',
  9540. 'name' => 'CDATA',
  9541. 'size' => 'Number',
  9542. 'tabindex' => 'Number',
  9543. ));
  9544. $this->addElement('option', false, 'Optional: #PCDATA', 'Common', array(
  9545. 'disabled' => 'Bool#disabled',
  9546. 'label' => 'Text',
  9547. 'selected' => 'Bool#selected',
  9548. 'value' => 'CDATA',
  9549. ));
  9550. // It's illegal for there to be more than one selected, but not
  9551. // be multiple. Also, no selected means undefined behavior. This might
  9552. // be difficult to implement; perhaps an injector, or a context variable.
  9553. $textarea = $this->addElement('textarea', 'Formctrl', 'Optional: #PCDATA', 'Common', array(
  9554. 'accesskey' => 'Character',
  9555. 'cols*' => 'Number',
  9556. 'disabled' => 'Bool#disabled',
  9557. 'name' => 'CDATA',
  9558. 'readonly' => 'Bool#readonly',
  9559. 'rows*' => 'Number',
  9560. 'tabindex' => 'Number',
  9561. ));
  9562. $textarea->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea();
  9563. $button = $this->addElement('button', 'Formctrl', 'Optional: #PCDATA | Heading | List | Block | Inline', 'Common', array(
  9564. 'accesskey' => 'Character',
  9565. 'disabled' => 'Bool#disabled',
  9566. 'name' => 'CDATA',
  9567. 'tabindex' => 'Number',
  9568. 'type' => 'Enum#button,submit,reset',
  9569. 'value' => 'CDATA',
  9570. ));
  9571. // For exclusions, ideally we'd specify content sets, not literal elements
  9572. $button->excludes = $this->makeLookup(
  9573. 'form', 'fieldset', // Form
  9574. 'input', 'select', 'textarea', 'label', 'button', // Formctrl
  9575. 'a' // as per HTML 4.01 spec, this is omitted by modularization
  9576. );
  9577. // Extra exclusion: img usemap="" is not permitted within this element.
  9578. // We'll omit this for now, since we don't have any good way of
  9579. // indicating it yet.
  9580. // This is HIGHLY user-unfriendly; we need a custom child-def for this
  9581. $this->addElement('fieldset', 'Form', 'Custom: (#WS?,legend,(Flow|#PCDATA)*)', 'Common');
  9582. $label = $this->addElement('label', 'Formctrl', 'Optional: #PCDATA | Inline', 'Common', array(
  9583. 'accesskey' => 'Character',
  9584. // 'for' => 'IDREF', // IDREF not implemented, cannot allow
  9585. ));
  9586. $label->excludes = array('label' => true);
  9587. $this->addElement('legend', false, 'Optional: #PCDATA | Inline', 'Common', array(
  9588. 'accesskey' => 'Character',
  9589. ));
  9590. $this->addElement('optgroup', false, 'Required: option', 'Common', array(
  9591. 'disabled' => 'Bool#disabled',
  9592. 'label*' => 'Text',
  9593. ));
  9594. // Don't forget an injector for <isindex>. This one's a little complex
  9595. // because it maps to multiple elements.
  9596. }
  9597. }
  9598. /**
  9599. * XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
  9600. */
  9601. class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
  9602. {
  9603. public $name = 'Hypertext';
  9604. public function setup($config) {
  9605. $a = $this->addElement(
  9606. 'a', 'Inline', 'Inline', 'Common',
  9607. array(
  9608. // 'accesskey' => 'Character',
  9609. // 'charset' => 'Charset',
  9610. 'href' => 'URI',
  9611. // 'hreflang' => 'LanguageCode',
  9612. 'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
  9613. 'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
  9614. // 'tabindex' => 'Number',
  9615. // 'type' => 'ContentType',
  9616. )
  9617. );
  9618. $a->formatting = true;
  9619. $a->excludes = array('a' => true);
  9620. }
  9621. }
  9622. /**
  9623. * XHTML 1.1 Image Module provides basic image embedding.
  9624. * @note There is specialized code for removing empty images in
  9625. * HTMLPurifier_Strategy_RemoveForeignElements
  9626. */
  9627. class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
  9628. {
  9629. public $name = 'Image';
  9630. public function setup($config) {
  9631. $max = $config->get('HTML.MaxImgLength');
  9632. $img = $this->addElement(
  9633. 'img', 'Inline', 'Empty', 'Common',
  9634. array(
  9635. 'alt*' => 'Text',
  9636. // According to the spec, it's Length, but percents can
  9637. // be abused, so we allow only Pixels.
  9638. 'height' => 'Pixels#' . $max,
  9639. 'width' => 'Pixels#' . $max,
  9640. 'longdesc' => 'URI',
  9641. 'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
  9642. )
  9643. );
  9644. if ($max === null || $config->get('HTML.Trusted')) {
  9645. $img->attr['height'] =
  9646. $img->attr['width'] = 'Length';
  9647. }
  9648. // kind of strange, but splitting things up would be inefficient
  9649. $img->attr_transform_pre[] =
  9650. $img->attr_transform_post[] =
  9651. new HTMLPurifier_AttrTransform_ImgRequired();
  9652. }
  9653. }
  9654. /**
  9655. * XHTML 1.1 Legacy module defines elements that were previously
  9656. * deprecated.
  9657. *
  9658. * @note Not all legacy elements have been implemented yet, which
  9659. * is a bit of a reverse problem as compared to browsers! In
  9660. * addition, this legacy module may implement a bit more than
  9661. * mandated by XHTML 1.1.
  9662. *
  9663. * This module can be used in combination with TransformToStrict in order
  9664. * to transform as many deprecated elements as possible, but retain
  9665. * questionably deprecated elements that do not have good alternatives
  9666. * as well as transform elements that don't have an implementation.
  9667. * See docs/ref-strictness.txt for more details.
  9668. */
  9669. class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
  9670. {
  9671. public $name = 'Legacy';
  9672. public function setup($config) {
  9673. $this->addElement('basefont', 'Inline', 'Empty', false, array(
  9674. 'color' => 'Color',
  9675. 'face' => 'Text', // extremely broad, we should
  9676. 'size' => 'Text', // tighten it
  9677. 'id' => 'ID'
  9678. ));
  9679. $this->addElement('center', 'Block', 'Flow', 'Common');
  9680. $this->addElement('dir', 'Block', 'Required: li', 'Common', array(
  9681. 'compact' => 'Bool#compact'
  9682. ));
  9683. $this->addElement('font', 'Inline', 'Inline', array('Core', 'I18N'), array(
  9684. 'color' => 'Color',
  9685. 'face' => 'Text', // extremely broad, we should
  9686. 'size' => 'Text', // tighten it
  9687. ));
  9688. $this->addElement('menu', 'Block', 'Required: li', 'Common', array(
  9689. 'compact' => 'Bool#compact'
  9690. ));
  9691. $s = $this->addElement('s', 'Inline', 'Inline', 'Common');
  9692. $s->formatting = true;
  9693. $strike = $this->addElement('strike', 'Inline', 'Inline', 'Common');
  9694. $strike->formatting = true;
  9695. $u = $this->addElement('u', 'Inline', 'Inline', 'Common');
  9696. $u->formatting = true;
  9697. // setup modifications to old elements
  9698. $align = 'Enum#left,right,center,justify';
  9699. $address = $this->addBlankElement('address');
  9700. $address->content_model = 'Inline | #PCDATA | p';
  9701. $address->content_model_type = 'optional';
  9702. $address->child = false;
  9703. $blockquote = $this->addBlankElement('blockquote');
  9704. $blockquote->content_model = 'Flow | #PCDATA';
  9705. $blockquote->content_model_type = 'optional';
  9706. $blockquote->child = false;
  9707. $br = $this->addBlankElement('br');
  9708. $br->attr['clear'] = 'Enum#left,all,right,none';
  9709. $caption = $this->addBlankElement('caption');
  9710. $caption->attr['align'] = 'Enum#top,bottom,left,right';
  9711. $div = $this->addBlankElement('div');
  9712. $div->attr['align'] = $align;
  9713. $dl = $this->addBlankElement('dl');
  9714. $dl->attr['compact'] = 'Bool#compact';
  9715. for ($i = 1; $i <= 6; $i++) {
  9716. $h = $this->addBlankElement("h$i");
  9717. $h->attr['align'] = $align;
  9718. }
  9719. $hr = $this->addBlankElement('hr');
  9720. $hr->attr['align'] = $align;
  9721. $hr->attr['noshade'] = 'Bool#noshade';
  9722. $hr->attr['size'] = 'Pixels';
  9723. $hr->attr['width'] = 'Length';
  9724. $img = $this->addBlankElement('img');
  9725. $img->attr['align'] = 'Enum#top,middle,bottom,left,right';
  9726. $img->attr['border'] = 'Pixels';
  9727. $img->attr['hspace'] = 'Pixels';
  9728. $img->attr['vspace'] = 'Pixels';
  9729. // figure out this integer business
  9730. $li = $this->addBlankElement('li');
  9731. $li->attr['value'] = new HTMLPurifier_AttrDef_Integer();
  9732. $li->attr['type'] = 'Enum#s:1,i,I,a,A,disc,square,circle';
  9733. $ol = $this->addBlankElement('ol');
  9734. $ol->attr['compact'] = 'Bool#compact';
  9735. $ol->attr['start'] = new HTMLPurifier_AttrDef_Integer();
  9736. $ol->attr['type'] = 'Enum#s:1,i,I,a,A';
  9737. $p = $this->addBlankElement('p');
  9738. $p->attr['align'] = $align;
  9739. $pre = $this->addBlankElement('pre');
  9740. $pre->attr['width'] = 'Number';
  9741. // script omitted
  9742. $table = $this->addBlankElement('table');
  9743. $table->attr['align'] = 'Enum#left,center,right';
  9744. $table->attr['bgcolor'] = 'Color';
  9745. $tr = $this->addBlankElement('tr');
  9746. $tr->attr['bgcolor'] = 'Color';
  9747. $th = $this->addBlankElement('th');
  9748. $th->attr['bgcolor'] = 'Color';
  9749. $th->attr['height'] = 'Length';
  9750. $th->attr['nowrap'] = 'Bool#nowrap';
  9751. $th->attr['width'] = 'Length';
  9752. $td = $this->addBlankElement('td');
  9753. $td->attr['bgcolor'] = 'Color';
  9754. $td->attr['height'] = 'Length';
  9755. $td->attr['nowrap'] = 'Bool#nowrap';
  9756. $td->attr['width'] = 'Length';
  9757. $ul = $this->addBlankElement('ul');
  9758. $ul->attr['compact'] = 'Bool#compact';
  9759. $ul->attr['type'] = 'Enum#square,disc,circle';
  9760. }
  9761. }
  9762. /**
  9763. * XHTML 1.1 List Module, defines list-oriented elements. Core Module.
  9764. */
  9765. class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
  9766. {
  9767. public $name = 'List';
  9768. // According to the abstract schema, the List content set is a fully formed
  9769. // one or more expr, but it invariably occurs in an optional declaration
  9770. // so we're not going to do that subtlety. It might cause trouble
  9771. // if a user defines "List" and expects that multiple lists are
  9772. // allowed to be specified, but then again, that's not very intuitive.
  9773. // Furthermore, the actual XML Schema may disagree. Regardless,
  9774. // we don't have support for such nested expressions without using
  9775. // the incredibly inefficient and draconic Custom ChildDef.
  9776. public $content_sets = array('Flow' => 'List');
  9777. public function setup($config) {
  9778. $ol = $this->addElement('ol', 'List', 'Required: li', 'Common');
  9779. $ol->wrap = "li";
  9780. $ul = $this->addElement('ul', 'List', 'Required: li', 'Common');
  9781. $ul->wrap = "li";
  9782. $this->addElement('dl', 'List', 'Required: dt | dd', 'Common');
  9783. $this->addElement('li', false, 'Flow', 'Common');
  9784. $this->addElement('dd', false, 'Flow', 'Common');
  9785. $this->addElement('dt', false, 'Inline', 'Common');
  9786. }
  9787. }
  /**
   * Module that adds the "name" attribute to the elements listed in setup().
   */
  class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule
  {

      public $name = 'Name';

      /**
       * Adds a CDATA "name" attribute to each supported element and, unless
       * the HTML.Attr.Name.UseCDATA directive is enabled, attaches a NameSync
       * post-transform to it.
       * @param $config Configuration object, queried for HTML.Attr.Name.UseCDATA
       */
      public function setup($config) {
          foreach (array('a', 'applet', 'form', 'frame', 'iframe', 'img', 'map') as $tag) {
              $def = $this->addBlankElement($tag);
              $def->attr['name'] = 'CDATA';
              if ($config->get('HTML.Attr.Name.UseCDATA')) {
                  // plain CDATA requested: no synchronising transform needed
                  continue;
              }
              // every element receives its own transform instance
              $def->attr_transform_post['NameSync'] = new HTMLPurifier_AttrTransform_NameSync();
          }
      }

  }
  /**
   * Module contributing the non-XML language attribute ("lang") to the
   * "Lang" attribute collection.
   */
  class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
  {

      public $name = 'NonXMLCommonAttributes';

      // "lang" is validated as a LanguageCode
      public $attr_collections = array(
          'Lang' => array(
              'lang' => 'LanguageCode',
          ),
      );

  }
  /**
   * XHTML 1.1 Object Module: declares the elements used for generic object
   * inclusion (object and param).
   * @warning Users commonly add <embed> to cater to legacy browsers; this
   *          module does not allow that.
   */
  class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
  {

      public $name = 'Object';

      // This module is flagged as not safe.
      public $safe = false;

      /**
       * Registers the object and param elements.
       * @param $config Configuration object (not consulted by this module)
       */
      public function setup($config) {
          $object_attr = array(
              'archive'  => 'URI',
              'classid'  => 'URI',
              'codebase' => 'URI',
              'codetype' => 'Text',
              'data'     => 'URI',
              'declare'  => 'Bool#declare',
              'height'   => 'Length',
              'name'     => 'CDATA',
              'standby'  => 'Text',
              'tabindex' => 'Number',
              'type'     => 'ContentType',
              'width'    => 'Length'
          );
          $this->addElement(
              'object',
              'Inline',
              'Optional: #PCDATA | Flow | param',
              'Common',
              $object_attr
          );

          // param is an empty element outside every content set and takes
          // no attribute collections; "name*" carries a trailing asterisk
          // (presumably marking the attribute as required -- confirm against
          // addElement's attribute syntax).
          $param_attr = array(
              'id'        => 'ID',
              'name*'     => 'Text',
              'type'      => 'Text',
              'value'     => 'Text',
              'valuetype' => 'Enum#data,ref,object'
          );
          $this->addElement('param', false, 'Empty', false, $param_attr);
      }

  }
  /**
   * XHTML 1.1 Presentation Module: declares simple presentation-related
   * markup. Text Extension Module.
   * @note The official XML Schema and DTD specifications split this into
   *       two modules:
   *          - Block Presentation (hr)
   *          - Inline Presentation (b, big, i, small, sub, sup, tt)
   *       That split is not reproduced here, because content_sets already
   *       disambiguates the two groups.
   */
  class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
  {

      public $name = 'Presentation';

      /**
       * Registers the presentation elements.
       * @param $config Configuration object (not consulted by this module)
       */
      public function setup($config) {
          // the only block-level member
          $this->addElement('hr', 'Block', 'Empty', 'Common');

          // inline elements that are not flagged as formatting
          foreach (array('sub', 'sup') as $tag) {
              $this->addElement($tag, 'Inline', 'Inline', 'Common');
          }

          // inline elements whose definition gets formatting = true
          foreach (array('b', 'big', 'i', 'small', 'tt') as $tag) {
              $def = $this->addElement($tag, 'Inline', 'Inline', 'Common');
              $def->formatting = true;
          }
      }

  }
  /**
   * Module that declares proprietary tags and attributes in HTML.
   * @warning Enabling this module turns standards-compliance off!
   */
  class HTMLPurifier_HTMLModule_Proprietary extends HTMLPurifier_HTMLModule
  {

      public $name = 'Proprietary';

      /**
       * Registers the marquee element together with its attributes.
       * @param $config Configuration object (not consulted by this module)
       */
      public function setup($config) {
          $marquee_attr = array(
              'direction'    => 'Enum#left,right,up,down',
              'behavior'     => 'Enum#alternate',
              'width'        => 'Length',
              'height'       => 'Length',
              'scrolldelay'  => 'Number',
              'scrollamount' => 'Number',
              'loop'         => 'Number',
              'bgcolor'      => 'Color',
              'hspace'       => 'Pixels',
              'vspace'       => 'Pixels',
          );
          $this->addElement('marquee', 'Inline', 'Flow', 'Common', $marquee_attr);
      }

  }
  /**
   * XHTML 1.1 Ruby Annotation Module: declares the elements that mark short
   * runs of text placed alongside base text for annotation or pronunciation.
   */
  class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
  {

      public $name = 'Ruby';

      /**
       * Registers ruby and its component elements (rbc, rtc, rb, rt, rp).
       * @param $config Configuration object (not consulted by this module)
       */
      public function setup($config) {
          // ruby uses a custom content model: either the simple form
          // (rb followed by rt, optionally wrapped in rp) or the complex
          // form (rbc followed by one or two rtc).
          $ruby_model = 'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))';
          $this->addElement('ruby', 'Inline', $ruby_model, 'Common');

          // containers used by the complex form
          $this->addElement('rbc', false, 'Required: rb', 'Common');
          $this->addElement('rtc', false, 'Required: rt', 'Common');

          // both rb and rt list ruby in their excludes
          $no_ruby = array('ruby' => true);

          $base = $this->addElement('rb', false, 'Inline', 'Common');
          $base->excludes = $no_ruby;

          $text = $this->addElement('rt', false, 'Inline', 'Common', array('rbspan' => 'Number'));
          $text->excludes = $no_ruby;

          $this->addElement('rp', false, 'Optional: #PCDATA', 'Common');
      }

  }
  /**
   * A "safe" embed module; see SafeObject. embed is a proprietary element.
   */
  class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
  {

      public $name = 'SafeEmbed';

      /**
       * Registers a restricted embed element and attaches the SafeEmbed
       * attribute post-transform to it.
       * @param $config Configuration object, queried for HTML.MaxImgLength
       */
      public function setup($config) {
          // width and height share the pixel limit configured for images
          $max = $config->get('HTML.MaxImgLength');
          $pixels = 'Pixels#' . $max;

          $embed_attr = array(
              'src*'              => 'URI#embedded',
              'type'              => 'Enum#application/x-shockwave-flash',
              'width'             => $pixels,
              'height'            => $pixels,
              'allowscriptaccess' => 'Enum#never',
              'allownetworking'   => 'Enum#internal',
              'flashvars'         => 'Text',
              'wmode'             => 'Enum#window',
              'name'              => 'ID',
          );
          $embed = $this->addElement('embed', 'Inline', 'Empty', 'Common', $embed_attr);
          $embed->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeEmbed();
      }

  }
  /**
   * A "safe" object module. In theory, objects permitted by this module are
   * safe, so untrusted users can be allowed to embed arbitrary Flash objects
   * (possibly other types as well, but only Flash is supported right now).
   * Highly experimental.
   */
  class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
  {

      public $name = 'SafeObject';

      /**
       * Registers restricted object and param elements, their attribute
       * post-transforms, and the SafeObject injector.
       * @param $config Configuration object, queried for HTML.MaxImgLength
       * @note The element definitions are not intrinsically safe: the
       *       attribute transforms are a vital part of ensuring safety.
       */
      public function setup($config) {
          $max = $config->get('HTML.MaxImgLength');
          $pixels = 'Pixels#' . $max;

          // codebase is restricted to exactly one permitted URL
          $codebase = new HTMLPurifier_AttrDef_Enum(array(
              'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0'
          ));

          $object_attr = array(
              // While technically not required by the spec, type is forced
              // to this value.
              'type'     => 'Enum#application/x-shockwave-flash',
              'width'    => $pixels,
              'height'   => $pixels,
              'data'     => 'URI#embedded',
              'classid'  => 'Enum#clsid:d27cdb6e-ae6d-11cf-96b8-444553540000',
              'codebase' => $codebase,
          );
          $object = $this->addElement(
              'object',
              'Inline',
              'Optional: param | Flow | #PCDATA',
              'Common',
              $object_attr
          );
          $object->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();

          $param_attr = array(
              'id'    => 'ID',
              'name*' => 'Text',
              'value' => 'Text'
          );
          $param = $this->addElement('param', false, 'Empty', false, $param_attr);
          $param->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeParam();

          $this->info_injector[] = 'SafeObject';
      }

  }
  /*

  WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
  INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!

  */

  /**
   * XHTML 1.1 Scripting module: declares the elements that contain
   * information about executable scripts, or about the lack of support for
   * executable scripts (script and noscript).
   * @note This module does not contain inline scripting elements
   */
  class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
  {

      public $name = 'Scripting';
      public $elements = array('script', 'noscript');
      public $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');

      // This module is flagged as not safe.
      public $safe = false;

      /**
       * Builds the noscript and script element definitions by hand and
       * stores them in $this->info.
       * @param $config Configuration object (not consulted by this module)
       * @todo Create a custom child-definition for noscript that auto-wraps
       *       stray #PCDATA the way blockquote's custom definition does (that
       *       one cannot be reused because blockquote's contents are optional
       *       while noscript's contents are required).
       * @todo Convert this to the new syntax; the main obstacle is getting
       *       both content sets working.
       */
      public function setup($config) {
          // In theory noscript could be safe, but there is no reason to
          // allow it.
          $noscript = $this->info['noscript'] = new HTMLPurifier_ElementDef();
          $noscript->attr = array( 0 => array('Common') );
          $noscript->content_model = 'Heading | List | Block';
          $noscript->content_model_type = 'required';

          $script = $this->info['script'] = new HTMLPurifier_ElementDef();
          $script->attr = array(
              'defer' => new HTMLPurifier_AttrDef_Enum(array('defer')),
              'src'   => new HTMLPurifier_AttrDef_URI(true),
              'type'  => new HTMLPurifier_AttrDef_Enum(array('text/javascript'))
          );
          $script->content_model = '#PCDATA';
          $script->content_model_type = 'optional';

          // a single ScriptRequired instance serves as both the pre- and the
          // post-transform registered under the "type" key
          $script_required = new HTMLPurifier_AttrTransform_ScriptRequired();
          $script->attr_transform_post['type'] = $script_required;
          $script->attr_transform_pre['type'] = $script_required;
      }

  }
  /**
   * XHTML 1.1 Style Attribute Module, defines the style attribute and
   * includes it in the Core attribute collection.
   */
  class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
  {

      public $name = 'StyleAttribute';

      // The inclusion routine differs from the Abstract Modules but is in
      // line with the DTD and XML Schemas: the "Style" collection holds the
      // style attribute, and "Core" pulls in "Style".
      public $attr_collections = array(
          // placeholder only: the real definition is assigned in setup()
          'Style' => array('style' => false),
          'Core'  => array(0 => array('Style'))
      );

      /**
       * Replaces the placeholder for the style attribute with a CSS
       * attribute definition object.
       * @param $config Configuration object (not consulted by this module)
       */
      public function setup($config) {
          $css = new HTMLPurifier_AttrDef_CSS();
          $this->attr_collections['Style']['style'] = $css;
      }

  }
  /**
   * XHTML 1.1 Tables Module: fully declares the accessible table elements.
   */
  class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
  {

      public $name = 'Tables';

      /**
       * Registers caption, table, the cell and row elements, the column
       * elements and the row-group elements.
       * @param $config Configuration object (not consulted by this module)
       */
      public function setup($config) {
          $this->addElement('caption', false, 'Inline', 'Common');

          // table's children are validated by a dedicated ChildDef object
          $table_attr = array(
              'border'      => 'Pixels',
              'cellpadding' => 'Length',
              'cellspacing' => 'Length',
              'frame'       => 'Enum#void,above,below,hsides,lhs,rhs,vsides,box,border',
              'rules'       => 'Enum#none,groups,rows,cols,all',
              'summary'     => 'Text',
              'width'       => 'Length'
          );
          $this->addElement('table', 'Block', new HTMLPurifier_ChildDef_Table(), 'Common', $table_attr);

          // alignment attributes shared by cells, rows, columns and row groups
          $align_attr = array(
              'align'   => 'Enum#left,center,right,justify,char',
              'charoff' => 'Length',
              'valign'  => 'Enum#top,middle,bottom,baseline',
          );

          // cells: cell-specific attributes first, then the shared alignment set
          $cell_attr = array_merge(
              array(
                  'abbr'    => 'Text',
                  'colspan' => 'Number',
                  'rowspan' => 'Number',
              ),
              $align_attr
          );
          foreach (array('td', 'th') as $tag) {
              $this->addElement($tag, false, 'Flow', 'Common', $cell_attr);
          }
          $this->addElement('tr', false, 'Required: td | th', 'Common', $align_attr);

          // columns: column-specific attributes first, then the shared set
          $col_attr = array_merge(
              array(
                  'span'  => 'Number',
                  'width' => 'MultiLength',
              ),
              $align_attr
          );
          $this->addElement('col', false, 'Empty', 'Common', $col_attr);
          $this->addElement('colgroup', false, 'Optional: col', 'Common', $col_attr);

          // row groups: each requires tr children
          foreach (array('tbody', 'thead', 'tfoot') as $tag) {
              $this->addElement($tag, false, 'Required: tr', 'Common', $align_attr);
          }
      }

  }
  /**
   * XHTML 1.1 Target Module: declares the target attribute on link elements.
   */
  class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
  {

      public $name = 'Target';

      /**
       * Gives every supported element (currently only "a") a target
       * attribute validated by a FrameTarget definition.
       * @param $config Configuration object (not consulted by this module)
       */
      public function setup($config) {
          foreach (array('a') as $tag) {
              $def = $this->addBlankElement($tag);
              // this replaces the blank element's whole attr array
              $def->attr = array(
                  'target' => new HTMLPurifier_AttrDef_HTML_FrameTarget()
              );
          }
      }

  }
  /**
   * XHTML 1.1 Text Module: declares the basic text containers. Core Module.
   * @note The normative XML Schema specification further abstracts this
   *       module into the following modules:
   *          - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
   *          - Block Structural (div, p)
   *          - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
   *          - Inline Structural (br, span)
   *       Functionally this module makes no distinction between those
   *       sub-modules, but setup() is organised in the same groups.
   */
  class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
  {

      public $name = 'Text';
      public $content_sets = array(
          'Flow' => 'Heading | Block | Inline'
      );

      /**
       * Registers all text elements, group by group.
       * @param $config Configuration object (not consulted by this module)
       */
      public function setup($config) {

          // Inline Phrasal -------------------------------------------------
          foreach (array('abbr', 'acronym', 'cite', 'dfn', 'kbd') as $tag) {
              $this->addElement($tag, 'Inline', 'Inline', 'Common');
          }
          // q additionally accepts a cite URI
          $this->addElement('q', 'Inline', 'Inline', 'Common', array('cite' => 'URI'));
          foreach (array('samp', 'var') as $tag) {
              $this->addElement($tag, 'Inline', 'Inline', 'Common');
          }
          // these three get formatting = true on their definitions
          foreach (array('em', 'strong', 'code') as $tag) {
              $def = $this->addElement($tag, 'Inline', 'Inline', 'Common');
              $def->formatting = true;
          }

          // Inline Structural ----------------------------------------------
          $this->addElement('span', 'Inline', 'Inline', 'Common');
          // br takes only the Core attribute collection
          $this->addElement('br', 'Inline', 'Empty', 'Core');

          // Block Phrasal --------------------------------------------------
          $this->addElement('address', 'Block', 'Inline', 'Common');
          $this->addElement(
              'blockquote',
              'Block',
              'Optional: Heading | Block | List',
              'Common',
              array('cite' => 'URI')
          );
          $preformatted = $this->addElement('pre', 'Block', 'Inline', 'Common');
          $preformatted->excludes = $this->makeLookup(
              'img', 'big', 'small', 'object', 'applet', 'font', 'basefont'
          );
          foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $tag) {
              $this->addElement($tag, 'Heading', 'Inline', 'Common');
          }

          // Block Structural -----------------------------------------------
          $paragraph = $this->addElement('p', 'Block', 'Inline', 'Common');
          // autoclose is keyed by element name (array_flip of the name list)
          $autoclosing = array(
              "address", "blockquote", "center", "dir", "div",
              "dl", "fieldset", "ol", "p", "ul"
          );
          $paragraph->autoclose = array_flip($autoclosing);
          $this->addElement('div', 'Block', 'Flow', 'Common');

      }

  }
  /**
   * Abstract class for a set of proprietary modules that clean up (tidy)
   * poorly written HTML.
   *
   * Subclasses describe their fixes in makeFixes(); setup() then selects the
   * fixes that apply for the configured tidy level and installs them through
   * populate().
   * @todo Figure out how to protect some of these methods/properties
   */
  class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
  {

      /**
       * List of supported levels. Index zero is a special case "no fixes"
       * level.
       */
      public $levels = array(0 => 'none', 'light', 'medium', 'heavy');

      /**
       * Default level to place all fixes in. Disabled by default
       */
      public $defaultLevel = null;

      /**
       * Lists of fixes used by getFixesForLevel(). Format is:
       *      HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2');
       */
      public $fixesForLevel = array(
          'light'  => array(),
          'medium' => array(),
          'heavy'  => array()
      );

      /**
       * Lazy load constructs the module by determining the necessary
       * fixes to create and then delegating to the populate() function.
       * @param $config Configuration object, queried for HTML.TidyLevel,
       *        HTML.TidyAdd and HTML.TidyRemove
       * @todo Wildcard matching and error reporting when an added or
       *       subtracted fix has no effect.
       */
      public function setup($config) {

          // create fixes, initialize fixesForLevel
          $fixes = $this->makeFixes();
          // A makeFixes() implementation with no return statement yields
          // null; treat that as "no fixes" so the loops below do not warn.
          if ($fixes === null) $fixes = array();
          $this->makeFixesForLevel($fixes);

          // figure out which fixes to use
          $level = $config->get('HTML.TidyLevel');
          $fixes_lookup = $this->getFixesForLevel($level);

          // get custom fix declarations: these need namespace processing
          $add_fixes    = $config->get('HTML.TidyAdd');
          $remove_fixes = $config->get('HTML.TidyRemove');

          foreach ($fixes as $name => $fix) {
              // needs to be refactored a little to implement globbing
              // A fix is dropped when it was explicitly removed, or when it
              // is neither explicitly added nor enabled by the level.
              if (
                  isset($remove_fixes[$name]) ||
                  (!isset($add_fixes[$name]) && !isset($fixes_lookup[$name]))
              ) {
                  unset($fixes[$name]);
              }
          }

          // populate this module with necessary fixes
          $this->populate($fixes);

      }

      /**
       * Retrieves all fixes per a level, returning fixes for that specific
       * level as well as all levels below it.
       * @param $level String level identifier, see $levels for valid values
       * @return Lookup table of fixes (fix name => true); empty for the
       *         "none" level and for an unrecognized level (which also
       *         triggers an E_USER_WARNING)
       */
      public function getFixesForLevel($level) {
          if ($level == $this->levels[0]) {
              return array();
          }
          // collect every level from index 1 up to and including $level
          $activated_levels = array();
          for ($i = 1, $c = count($this->levels); $i < $c; $i++) {
              $activated_levels[] = $this->levels[$i];
              if ($this->levels[$i] == $level) break;
          }
          // the loop ran to completion without a match: unknown level
          if ($i == $c) {
              trigger_error(
                  'Tidy level ' . htmlspecialchars($level) . ' not recognized',
                  E_USER_WARNING
              );
              return array();
          }
          $ret = array();
          foreach ($activated_levels as $activated_level) {
              foreach ($this->fixesForLevel[$activated_level] as $fix) {
                  $ret[$fix] = true;
              }
          }
          return $ret;
      }

      /**
       * Dynamically populates the $fixesForLevel member variable using
       * the fixes array. It may be custom overloaded, used in conjunction
       * with $defaultLevel, or not used at all.
       * @param $fixes Associative array of fix name to fix implementation;
       *        all of its keys are assigned to $defaultLevel
       */
      public function makeFixesForLevel($fixes) {
          if (!isset($this->defaultLevel)) return;
          if (!isset($this->fixesForLevel[$this->defaultLevel])) {
              trigger_error(
                  'Default level ' . $this->defaultLevel . ' does not exist',
                  E_USER_ERROR
              );
              return;
          }
          $this->fixesForLevel[$this->defaultLevel] = array_keys($fixes);
      }

      /**
       * Populates the module with transforms and other special-case code
       * based on a list of fixes passed to it
       * @param $fixes Associative array of fix name to fix implementation
       *        to activate
       */
      public function populate($fixes) {
          foreach ($fixes as $name => $fix) {
              // determine what the fix is for
              list($type, $params) = $this->getFixType($name);
              switch ($type) {
                  case 'attr_transform_pre':
                  case 'attr_transform_post':
                      $attr = $params['attr'];
                      if (isset($params['element'])) {
                          // element-specific transform: reuse the element
                          // definition if present, otherwise create a blank one
                          $element = $params['element'];
                          if (empty($this->info[$element])) {
                              $e = $this->addBlankElement($element);
                          } else {
                              $e = $this->info[$element];
                          }
                      } else {
                          // no element: the transform is stored on the module
                          // itself, in the info_attr_transform_* property
                          $type = "info_$type";
                          $e = $this;
                      }
                      // PHP does some weird parsing when I do
                      // $e->$type[$attr], so I have to assign a ref.
                      $f =& $e->$type;
                      $f[$attr] = $fix;
                      break;
                  case 'tag_transform':
                      $this->info_tag_transform[$params['element']] = $fix;
                      break;
                  case 'child':
                  case 'content_model_type':
                      // the fix value is assigned directly to the like-named
                      // property of the element definition
                      $element = $params['element'];
                      if (empty($this->info[$element])) {
                          $e = $this->addBlankElement($element);
                      } else {
                          $e = $this->info[$element];
                      }
                      $e->$type = $fix;
                      break;
                  default:
                      trigger_error("Fix type $type not supported", E_USER_ERROR);
                      break;
              }
          }
      }

      /**
       * Parses a fix name and determines what kind of fix it is, as well
       * as other information defined by the fix.
       *
       * Names have the shape "element@attr#property", where every part is
       * optional: "font" is a tag transform, "img@name" an attribute
       * pre-transform on img, "@lang" an attribute pre-transform with no
       * element, and "blockquote#content_model_type" a fix of type
       * content_model_type on blockquote.
       * @param $name String name of fix
       * @return array(string $fix_type, array $fix_parameters)
       * @note $fix_parameters is type dependant, see populate() for usage
       *       of these parameters
       */
      public function getFixType($name) {
          // parse it: split off "#property" first, then "@attr"
          $property = $attr = null;
          if (strpos($name, '#') !== false) list($name, $property) = explode('#', $name);
          if (strpos($name, '@') !== false) list($name, $attr)     = explode('@', $name);

          // figure out the parameters
          $params = array();
          if ($name !== '')    $params['element'] = $name;
          if (!is_null($attr)) $params['attr'] = $attr;

          // special case: attribute transform
          if (!is_null($attr)) {
              // without an explicit property the transform runs "pre"
              if (is_null($property)) $property = 'pre';
              $type = 'attr_transform_' . $property;
              return array($type, $params);
          }

          // special case: tag transform
          if (is_null($property)) {
              return array('tag_transform', $params);
          }

          return array($property, $params);

      }

      /**
       * Defines all fixes the module will perform in a compact
       * associative array of fix name to fix implementation.
       * @return Associative array of fix name to fix implementation; the
       *         base implementation defines no fixes
       */
      public function makeFixes() {
          return array();
      }

  }
  /**
   * Module contributing the XML language attribute ("xml:lang") to the
   * "Lang" attribute collection.
   */
  class HTMLPurifier_HTMLModule_XMLCommonAttributes extends HTMLPurifier_HTMLModule
  {

      public $name = 'XMLCommonAttributes';

      // "xml:lang" is validated as a LanguageCode
      public $attr_collections = array(
          'Lang' => array(
              'xml:lang' => 'LanguageCode',
          ),
      );

  }
  /**
   * The name attribute is deprecated, but still allowed in strict doctypes,
   * so its transformation is only applied at the 'heavy' Tidy level.
   */
  class HTMLPurifier_HTMLModule_Tidy_Name extends HTMLPurifier_HTMLModule_Tidy
  {

      public $name = 'Tidy_Name';

      // every fix from makeFixes() is filed under this level
      public $defaultLevel = 'heavy';

      /**
       * Defines the @name fixes for img and a.
       * @return Associative array of fix name to fix implementation
       */
      public function makeFixes() {

          $fixes = array();

          // @name for img, a -----------------------------------------------
          // Technically, @name is permitted even on strict, so authors are
          // allowed to use it. However, it is deprecated in future versions
          // of XHTML. Both fix names share one transform instance.
          $fixes['img@name'] =
          $fixes['a@name']   = new HTMLPurifier_AttrTransform_Name();

          return $fixes;
      }

  }
  /**
   * Tidy module with fixes for the @background attribute on table-related
   * elements and for @height on table.
   */
  class HTMLPurifier_HTMLModule_Tidy_Proprietary extends HTMLPurifier_HTMLModule_Tidy
  {

      public $name = 'Tidy_Proprietary';

      // every fix from makeFixes() is filed under this level
      public $defaultLevel = 'light';

      /**
       * Defines the @background and table@height fixes.
       * @return Associative array of fix name to fix implementation
       */
      public function makeFixes() {
          $fixes = array();

          // @background: each element gets its own transform instance
          $with_background = array('table', 'td', 'th', 'tr', 'thead', 'tfoot', 'tbody');
          foreach ($with_background as $tag) {
              $fixes[$tag . '@background'] = new HTMLPurifier_AttrTransform_Background();
          }

          // @height for table
          $fixes['table@height'] = new HTMLPurifier_AttrTransform_Length('height');

          return $fixes;
      }

  }
  /**
   * Tidy fixes for deprecated tags and attributes. The class sets no module
   * name or default level itself; Tidy_Strict and Tidy_Transitional extend
   * it and supply both.
   */
  class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends HTMLPurifier_HTMLModule_Tidy
  {

      /**
       * Defines the tag transforms and attribute transforms for deprecated
       * markup.
       * @return Associative array of fix name to fix implementation
       */
      public function makeFixes() {

          $fixes = array();

          // == deprecated tag transforms ===================================

          $fixes['font'] = new HTMLPurifier_TagTransform_Font();
          // menu and dir both become ul (one transform instance each)
          foreach (array('menu', 'dir') as $tag) {
              $fixes[$tag] = new HTMLPurifier_TagTransform_Simple('ul');
          }
          $fixes['center'] = new HTMLPurifier_TagTransform_Simple('div', 'text-align:center;');
          $fixes['u']      = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:underline;');
          // s and strike both become a struck-through span
          foreach (array('s', 'strike') as $tag) {
              $fixes[$tag] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;');
          }

          // == deprecated attribute transforms =============================

          // @align for caption ---------------------------------------------
          // This follows IE's behavior rather than Firefox's, because no one
          // supports caption-side:right, W3C included (with CSS 2.1). This
          // is a slightly unreasonable attribute!
          $caption_align = array(
              'left'   => 'text-align:left;',
              'right'  => 'text-align:right;',
              'top'    => 'caption-side:top;',
              'bottom' => 'caption-side:bottom;' // not supported by IE
          );
          $fixes['caption@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $caption_align);

          // @align for img -------------------------------------------------
          $img_align = array(
              'left'   => 'float:left;',
              'right'  => 'float:right;',
              'top'    => 'vertical-align:top;',
              'middle' => 'vertical-align:middle;',
              'bottom' => 'vertical-align:baseline;',
          );
          $fixes['img@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $img_align);

          // @align for table -----------------------------------------------
          $table_align = array(
              'left'   => 'float:left;',
              'center' => 'margin-left:auto;margin-right:auto;',
              'right'  => 'float:right;'
          );
          $fixes['table@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $table_align);

          // @align for hr --------------------------------------------------
          // Both text-align and margin are emitted because they work for
          // different browsers (IE and Firefox, respectively); the mix makes
          // for a pretty cross-compatible solution.
          $hr_align = array(
              'left'   => 'margin-left:0;margin-right:auto;text-align:left;',
              'center' => 'margin-left:auto;margin-right:auto;text-align:center;',
              'right'  => 'margin-left:auto;margin-right:0;text-align:right;'
          );
          $fixes['hr@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $hr_align);

          // @align for h1, h2, h3, h4, h5, h6, p, div ----------------------
          // each keyword maps onto the like-named text-align value
          $text_align = array();
          foreach (array('left', 'right', 'center', 'justify') as $keyword) {
              $text_align[$keyword] = "text-align:$keyword;";
          }
          // one transform instance is shared by all eight fix names
          $fixes['h1@align'] =
          $fixes['h2@align'] =
          $fixes['h3@align'] =
          $fixes['h4@align'] =
          $fixes['h5@align'] =
          $fixes['h6@align'] =
          $fixes['p@align']  =
          $fixes['div@align'] =
              new HTMLPurifier_AttrTransform_EnumToCSS('align', $text_align);

          // @bgcolor for table, td, th -------------------------------------
          $fixes['table@bgcolor'] =
          $fixes['td@bgcolor'] =
          $fixes['th@bgcolor'] =
              new HTMLPurifier_AttrTransform_BgColor();

          // @border for img ------------------------------------------------
          $fixes['img@border'] = new HTMLPurifier_AttrTransform_Border();

          // @clear for br --------------------------------------------------
          $br_clear = array(
              'left'  => 'clear:left;',
              'right' => 'clear:right;',
              'all'   => 'clear:both;',
              'none'  => 'clear:none;',
          );
          $fixes['br@clear'] = new HTMLPurifier_AttrTransform_EnumToCSS('clear', $br_clear);

          // @height for td, th ---------------------------------------------
          $fixes['td@height'] =
          $fixes['th@height'] =
              new HTMLPurifier_AttrTransform_Length('height');

          // @hspace for img ------------------------------------------------
          $fixes['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace');

          // @noshade for hr ------------------------------------------------
          // This transformation is not precise but often good enough:
          // different browsers use different styles to designate noshade.
          $fixes['hr@noshade'] = new HTMLPurifier_AttrTransform_BoolToCSS(
              'noshade',
              'color:#808080;background-color:#808080;border:0;'
          );

          // @nowrap for td, th ---------------------------------------------
          $fixes['td@nowrap'] =
          $fixes['th@nowrap'] =
              new HTMLPurifier_AttrTransform_BoolToCSS(
                  'nowrap',
                  'white-space:nowrap;'
              );

          // @size for hr ---------------------------------------------------
          $fixes['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height');

          // @type for li, ol, ul -------------------------------------------
          $bullet_styles = array(
              'disc'   => 'list-style-type:disc;',
              'square' => 'list-style-type:square;',
              'circle' => 'list-style-type:circle;'
          );
          $numbering_styles = array(
              '1' => 'list-style-type:decimal;',
              'i' => 'list-style-type:lower-roman;',
              'I' => 'list-style-type:upper-roman;',
              'a' => 'list-style-type:lower-alpha;',
              'A' => 'list-style-type:upper-alpha;'
          );
          // li accepts the union of both tables
          $item_styles = $bullet_styles + $numbering_styles;

          // ol and li pass true as the third constructor argument
          // (presumably case-sensitive matching, since 'i'/'I' and 'a'/'A'
          // must stay distinct -- confirm against EnumToCSS)
          $fixes['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $bullet_styles);
          $fixes['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $numbering_styles, true);
          $fixes['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $item_styles, true);

          // @vspace for img ------------------------------------------------
          $fixes['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace');

          // @width for hr, td, th ------------------------------------------
          $fixes['td@width'] =
          $fixes['th@width'] =
          $fixes['hr@width'] = new HTMLPurifier_AttrTransform_Length('width');

          return $fixes;

      }

  }
  /**
   * Tidy module that extends the XHTMLAndHTML4 fixes with a
   * "strictblockquote" content model type for blockquote.
   */
  class HTMLPurifier_HTMLModule_Tidy_Strict extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
  {

      public $name = 'Tidy_Strict';

      // every fix from makeFixes() is filed under this level
      public $defaultLevel = 'light';

      // signals that this module supplies getChildDef()
      public $defines_child_def = true;

      /**
       * Returns the parent's fixes plus the blockquote content model fix.
       * @return Associative array of fix name to fix implementation
       */
      public function makeFixes() {
          $fixes = parent::makeFixes();
          $fixes['blockquote#content_model_type'] = 'strictblockquote';
          return $fixes;
      }

      /**
       * Builds the child definition for the "strictblockquote" content
       * model type; every other type is delegated to the parent.
       * @param $def Element definition carrying content_model_type and
       *        content_model
       * @return StrictBlockquote child definition, or whatever the parent
       *         implementation returns
       */
      public function getChildDef($def) {
          if ($def->content_model_type == 'strictblockquote') {
              return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
          }
          return parent::getChildDef($def);
      }

  }
  /**
   * Tidy module that uses the XHTMLAndHTML4 fixes unchanged and files all
   * of them under the 'heavy' level.
   */
  class HTMLPurifier_HTMLModule_Tidy_Transitional extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
  {

      public $name = 'Tidy_Transitional';

      // every inherited fix is filed under this level
      public $defaultLevel = 'heavy';

  }
  /**
   * Tidy module with a single, element-independent fix for @lang.
   */
  class HTMLPurifier_HTMLModule_Tidy_XHTML extends HTMLPurifier_HTMLModule_Tidy
  {

      public $name = 'Tidy_XHTML';

      // every fix from makeFixes() is filed under this level
      public $defaultLevel = 'medium';

      /**
       * Defines the @lang fix; the name has no element part, so it is not
       * tied to a particular element.
       * @return Associative array of fix name to fix implementation
       */
      public function makeFixes() {
          return array(
              '@lang' => new HTMLPurifier_AttrTransform_Lang(),
          );
      }

  }
  /**
   * Injector that wraps text in <p> elements, treating a double newline
   * ("\n\n") as the separator between paragraphs.
   *
   * Handlers receive the current token by reference and may replace it with an
   * array of tokens. The "State" labels in the comments below name the input
   * shapes each branch deals with.
   *
   * @todo Ensure all states are unit tested, including variations as well.
   * @todo Make a graph of the flow control for this Injector.
   */
  class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
  {

      public $name = 'AutoParagraph';
      public $needed = array('p');

      /**
       * Creates a <p> start token armored against the
       * 'MakeWellFormed_TagClosedError' check.
       */
      private function _pStart() {
          $start = new HTMLPurifier_Token_Start('p');
          $start->armor['MakeWellFormed_TagClosedError'] = true;
          return $start;
      }

      /**
       * Text handler: decides whether the text has to be wrapped in, or split
       * into, paragraphs.
       *
       * @param $token Text token, passed by reference; may be replaced by an
       *               array of tokens
       */
      public function handleText(&$token) {
          $text = $token->data;

          if (!$this->allowsElement('p')) {
              // The parent does not accept <p>. The only case we act on is
              // when the parent itself is a <p>:
              //   State 3.1: ...<p>PAR1
              //   State 3.2: ...<p>PAR1\n\nPAR2
              // Everything else (State 4.x: ...<b>PAR1) is left alone.
              if (empty($this->currentNesting)) {
                  return;
              }
              $parent = $this->currentNesting[count($this->currentNesting) - 1];
              if ($parent->name == 'p') {
                  $token = array();
                  $this->_splitText($text, $token);
              }
              return;
          }

          $hasDoubleNewline = strpos($text, "\n\n") !== false;
          if (!empty($this->currentNesting) && !$hasDoubleNewline) {
              // State 2: <div>PAR1...
              // We are inside an element that allows <p>, but only a look
              // ahead can tell whether a paragraph is actually needed.
              if ($this->_pLookAhead()) {
                  // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                  $token = array($this->_pStart(), $token);
              }
              // State 2.2.1: <div>PAR1<div>
              // State 2.2.2: <div>PAR1<b>PAR1</b></div>
              return;
          }

          // Either we are in the anonymous root node, or the text contains a
          // double newline; both are treated the same way.
          $i = $nesting = null;
          $hasFollower = $this->forwardUntilEndToken($i, $current, $nesting);

          if (!$hasFollower && $token->is_whitespace) {
              // State 1.1: whitespace followed by nothing; degenerate case.
              return;
          }

          if (!$token->is_whitespace || $this->_isInline($current)) {
              // State 1.2: PAR1
              // State 1.3: PAR1\n\nPAR2
              // State 1.4: <div>PAR1\n\nPAR2
              $token = array($this->_pStart());
              $this->_splitText($text, $token);
          }
          // Otherwise State 1.5: \n<hr /> -- nothing to do.
      }

      /**
       * Element handler: decides whether a <p> start tag (and possibly a
       * separating double newline) has to be put in front of the element.
       *
       * @param $token Element token, passed by reference; may be replaced by
       *               an array of tokens
       */
      public function handleElement(&$token) {
          // There is no need to check whether we are already inside a <p> for
          // block tokens: per the original author's note MakeWellFormed has
          // auto-closed it by now.
          if (!$this->allowsElement('p')) {
              // State 2.2: <ul><li>
              // State 2.4: <p><b>
              return;
          }

          if (empty($this->currentNesting)) {
              // ---- Root node ------------------------------------------------
              if ($this->_isInline($token)) {
                  // State 3.1: <b>
                  // The {p} inserted here is not yet part of inputTokens.
                  $token = array($this->_pStart(), $token);
              }
              // else State 3.2: <div>

              $i = null;
              if ($this->backward($i, $prev) && !$prev instanceof HTMLPurifier_Token_Text) {
                  // State 3.1.1: ...</p>{p}<b>
                  // State 3.2.1: ...</p><div>
                  // The previous token is not text, so add the separator.
                  if (!is_array($token)) {
                      $token = array($token);
                  }
                  array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
              }
              // else State 3.1.2 / 3.2.2: separator text is already there.
              // PAR<ELEM> cannot occur, PAR would already be wrapped in <p>.
              return;
          }

          // ---- Inside some element ------------------------------------------
          if (!$this->_isInline($token)) {
              // State 2.3: ...<div>
              return;
          }

          // State 1: <div>...<b>
          // Look at the token right before this one to learn whether we are
          // adjacent to the parent's start tag.
          $i = null;
          $this->backward($i, $prev);

          if ($prev instanceof HTMLPurifier_Token_Start) {
              // State 1.2.1: <div><b> -- look ahead to see if <p> is needed.
              //   State 1.3.1: <div><b>PAR1\n\nPAR2      (yes)
              //   State 1.3.2: <div><b>PAR1</b></div>    (no)
              //   State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div> (no)
              $needsParagraph = $this->_pLookAhead();
          } else {
              // Not adjacent to the parent.
              //   State 1.1.4: <div><p>PAR1</p>\n\n<b>   (yes)
              //   State 1.1.1: <div><p>PAR1</p><b>       (no)
              //   State 1.1.2: <div><br /><b>            (no)
              //   State 1.1.3: <div>PAR<b>               (no)
              $needsParagraph =
                  $prev instanceof HTMLPurifier_Token_Text &&
                  substr($prev->data, -2) === "\n\n";
          }

          if ($needsParagraph) {
              $token = array($this->_pStart(), $token);
          }
      }

      /**
       * Splits text on double newlines and appends the resulting paragraph
       * tokens (text, </p>, "\n\n", <p>) to the replacement token list.
       *
       * An empty $result on entry means the caller did not open a paragraph
       * itself, i.e. the text already lives inside a <p>.
       *
       * @param $data   String text data that will be processed into paragraphs
       * @param $result Reference to array of tokens that the tags will be
       *                appended onto
       */
      private function _splitText($data, &$result) {
          $pieces = explode("\n\n", $data);
          $pieceCount = count($pieces);

          if ($pieceCount == 1) {
              // No double newline at all; pass the text through unchanged.
              $result[] = new HTMLPurifier_Token_Text($data);
              return;
          }

          $paragraphs  = array(); // non-blank pieces only
          $needs_start = false;   // open a <p> before the first paragraph
          $needs_end   = false;   // keep the trailing </p> and "\n\n"

          foreach ($pieces as $i => $piece) {
              if (trim($piece) !== '') {
                  $paragraphs[] = $piece;
                  continue;
              }
              if ($i == 0) {
                  // Double newline at the very front.
                  if (empty($result)) {
                      // We have been inside a paragraph for a while: close it
                      // now. A new start tag is only appended further down,
                      // and only if real paragraphs follow.
                      $result[] = new HTMLPurifier_Token_End('p');
                      $result[] = new HTMLPurifier_Token_Text("\n\n");
                      $needs_start = true;
                  } else {
                      // A paragraph was just opened by the caller; put the
                      // double newline from the source back in front of it.
                      array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                  }
              } elseif ($i + 1 == $pieceCount) {
                  // Double newline at the very end.
                  $needs_end = true;
              }
          }

          if (empty($paragraphs)) {
              // Nothing but whitespace.
              return;
          }

          if ($needs_start) {
              $result[] = $this->_pStart();
          }

          // Emit "text </p> \n\n <p>" per paragraph. The last paragraph never
          // gets the trailing <p>, and gets "</p> \n\n" only when the data
          // ended in a double newline; otherwise the paragraph is left open.
          $lastIndex = count($paragraphs) - 1;
          foreach ($paragraphs as $index => $paragraph) {
              $result[] = new HTMLPurifier_Token_Text($paragraph);
              $isLast = ($index === $lastIndex);
              if ($isLast && !$needs_end) {
                  break;
              }
              $result[] = new HTMLPurifier_Token_End('p');
              $result[] = new HTMLPurifier_Token_Text("\n\n");
              if (!$isLast) {
                  $result[] = $this->_pStart();
              }
          }
      }

      /**
       * Returns true if the token's element name is among the child elements
       * of the <p> definition, i.e. it is allowed inside a paragraph.
       */
      private function _isInline($token) {
          return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
      }

      /**
       * Walks forward from the current token and reports whether a <p> has to
       * be inserted. Stops at the first token _checkNeedsP() has a verdict
       * on; returns false when the walk ends without one.
       */
      private function _pLookAhead() {
          $this->current($i, $current);
          // Start counting from inside the element if we sit on a start tag.
          $nesting = ($current instanceof HTMLPurifier_Token_Start) ? 1 : 0;

          while ($this->forwardUntilEndToken($i, $current, $nesting)) {
              $verdict = $this->_checkNeedsP($current);
              if ($verdict !== null) {
                  return $verdict;
              }
          }
          return false;
      }

      /**
       * Judges a single looked-ahead token.
       *
       * @return true  if the token is text containing a double newline,
       *         false if the token starts a non-inline (block) element,
       *         null  if this token does not settle the question
       */
      private function _checkNeedsP($current) {
          if ($current instanceof HTMLPurifier_Token_Start) {
              // <div>PAR1<div> : a block element ends the search.
              return $this->_isInline($current) ? null : false;
          }
          if (
              $current instanceof HTMLPurifier_Token_Text &&
              strpos($current->data, "\n\n") !== false
          ) {
              // <div>PAR1<b>PAR1\n\nPAR2
              return true;
          }
          return null;
      }

  }
  /**
   * Injector that turns a link into plain text followed by its URL: the href
   * attribute is removed from the start tag and " (URL)" is appended as text
   * right after the end tag.
   */
  class HTMLPurifier_Injector_DisplayLinkURI extends HTMLPurifier_Injector
  {

      public $name = 'DisplayLinkURI';
      public $needed = array('a');

      /**
       * Intentionally empty: all the work happens on the end tag.
       */
      public function handleElement(&$token) {
      }

      /**
       * If the start tag matching this end tag carries an href, strips it and
       * replaces the end token by itself plus a text token holding the URL.
       *
       * @param $token End token, passed by reference
       */
      public function handleEnd(&$token) {
          if (!isset($token->start->attr['href'])) {
              // nothing to display
              return;
          }
          $href = $token->start->attr['href'];
          unset($token->start->attr['href']);
          $token = array($token, new HTMLPurifier_Token_Text(" ($href)"));
      }

  }
  /**
   * Injector that converts http, https and ftp text URLs to actual links.
   */
  class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
  {

      public $name = 'Linkify';
      public $needed = array('a' => array('href'));

      /**
       * Replaces a text token that contains URLs by a sequence of text tokens
       * and <a href="URL">URL</a> token triples.
       *
       * The token is left untouched when links are not allowed here, when the
       * text contains no '://', when the URL pattern matches nothing, or when
       * the split itself fails.
       *
       * @param $token Text token, passed by reference
       */
      public function handleText(&$token) {
          if (!$this->allowsElement('a')) return;

          if (strpos($token->data, '://') === false) {
              // our really quick heuristic failed, abort
              // this may not work so well if we want to match things like
              // "google.com", but then again, most people don't
              return;
          }

          // there is/are URL(s). Let's split the string:
          // Note: this regex is extremely permissive
          $bits = preg_split('#((?:https?|ftp)://[^\s\'"<>()]+)#S', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

          // preg_split() returns false on a PCRE error, and a single piece
          // means the pattern did not match. In both cases there is nothing
          // to linkify, so keep the original token instead of replacing it.
          if ($bits === false || count($bits) === 1) {
              return;
          }

          $token = array();

          // With one capture group and PREG_SPLIT_DELIM_CAPTURE the pieces
          // alternate: even offsets are plain text, odd offsets are URLs.
          foreach ($bits as $index => $bit) {
              if ($index % 2 === 0) {
                  if ($bit === '') continue;
                  $token[] = new HTMLPurifier_Token_Text($bit);
              } else {
                  $token[] = new HTMLPurifier_Token_Start('a', array('href' => $bit));
                  $token[] = new HTMLPurifier_Token_Text($bit);
                  $token[] = new HTMLPurifier_Token_End('a');
              }
          }
      }

  }
  /**
   * Injector that converts configuration directive syntax %Namespace.Directive
   * to links
   */
  class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector
  {

      public $name = 'PurifierLinkify';

      /**
       * URL template read from 'AutoFormat.PurifierLinkify.DocURL'; every
       * '%s' in it is replaced by the matched Namespace.Directive.
       */
      public $docURL;

      public $needed = array('a' => array('href'));

      /**
       * Reads the documentation URL template from the configuration, then
       * defers to the parent's prepare() and returns its result.
       */
      public function prepare($config, $context) {
          $this->docURL = $config->get('AutoFormat.PurifierLinkify.DocURL');
          return parent::prepare($config, $context);
      }

      /**
       * Replaces a text token that contains %Namespace.Directive references by
       * a sequence of text tokens and link token triples.
       *
       * The token is left untouched when links are not allowed here, when the
       * text contains no '%', when the pattern matches nothing, or when the
       * split itself fails.
       *
       * @param $token Text token, passed by reference
       */
      public function handleText(&$token) {
          if (!$this->allowsElement('a')) return;
          if (strpos($token->data, '%') === false) return;

          $bits = preg_split('#%([a-z0-9]+\.[a-z0-9]+)#Si', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

          // preg_split() returns false on a PCRE error, and a single piece
          // means the pattern did not match. In both cases there is nothing
          // to linkify, so keep the original token instead of replacing it.
          if ($bits === false || count($bits) === 1) {
              return;
          }

          $token = array();

          // With one capture group and PREG_SPLIT_DELIM_CAPTURE the pieces
          // alternate: even offsets are plain text, odd offsets are the
          // captured directive names (without the leading '%').
          foreach ($bits as $index => $bit) {
              if ($index % 2 === 0) {
                  if ($bit === '') continue;
                  $token[] = new HTMLPurifier_Token_Text($bit);
              } else {
                  $token[] = new HTMLPurifier_Token_Start('a',
                      array('href' => str_replace('%s', $bit, $this->docURL)));
                  $token[] = new HTMLPurifier_Token_Text('%' . $bit);
                  $token[] = new HTMLPurifier_Token_End('a');
              }
          }
      }

  }
  /**
   * Injector that deletes elements whose content is empty, i.e. consists of
   * nothing but whitespace (and, if configured, non-breaking spaces) up to
   * the matching end tag.
   */
  class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
  {

      private $context;
      private $config;
      private $attrValidator;
      private $removeNbsp;
      private $removeNbspExceptions;

      /**
       * Stores config/context, reads the RemoveNbsp settings and creates the
       * attribute validator used by handleElement().
       */
      public function prepare($config, $context) {
          parent::prepare($config, $context);
          $this->config  = $config;
          $this->context = $context;
          $this->removeNbsp = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp');
          $this->removeNbspExceptions = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions');
          $this->attrValidator = new HTMLPurifier_AttrValidator();
      }

      /**
       * Removes a start tag, its ignorable content and its end tag when the
       * element turns out to be empty.
       *
       * Elements named 'colgroup' and elements that still carry an id or
       * name attribute after validation are kept.
       *
       * @param $token Element token, passed by reference; replaced by the
       *               number of tokens to delete when the element is removed
       */
      public function handleElement(&$token) {
          if (!$token instanceof HTMLPurifier_Token_Start) return;

          // Skip forward over ignorable text; $next ends up as the first
          // token that is not ignorable (or the last one looked at).
          $next  = false;
          $total = count($this->inputTokens);
          $i     = $this->inputIndex + 1;
          while ($i < $total) {
              $next = $this->inputTokens[$i];
              if (!$this->_isIgnorableText($next, $token->name)) break;
              $i++;
          }

          $closesImmediately =
              !$next ||
              ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name);
          if (!$closesImmediately) return;

          if ($token->name == 'colgroup') return;

          // Attributes are validated right here so that the id/name check
          // below looks at the validated attribute set; the armor flag
          // records that validation has been done.
          $this->attrValidator->validateToken($token, $this->config, $this->context);
          $token->armor['ValidateAttributes'] = true;
          if (isset($token->attr['id']) || isset($token->attr['name'])) return;

          // Number of tokens to delete: start tag up to and including $i.
          $token = $i - $this->inputIndex + 1;

          // Walk back over whitespace in front of the removed element.
          $b = $this->inputIndex - 1;
          while ($b > 0) {
              $prev = $this->inputTokens[$b];
              $isBlankText = $prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace;
              if (!$isBlankText) break;
              $b--;
          }

          // This is safe because we removed the token that triggered this.
          $this->rewind($b - 1);
      }

      /**
       * Tells whether a token is text that does not count as content: plain
       * whitespace, or, when RemoveNbsp is on and the element is not listed
       * as an exception, text made only of whitespace and U+00A0.
       *
       * @param $candidate   Token following the start tag
       * @param $elementName Name of the element being examined
       */
      private function _isIgnorableText($candidate, $elementName) {
          if (!$candidate instanceof HTMLPurifier_Token_Text) {
              return false;
          }
          if ($candidate->is_whitespace) {
              return true;
          }
          if (!$this->removeNbsp || isset($this->removeNbspExceptions[$elementName])) {
              return false;
          }
          // "\xC2\xA0" is the UTF-8 encoding of a non-breaking space.
          $stripped = str_replace("\xC2\xA0", "", $candidate->data);
          return $stripped === '' || ctype_space($stripped);
      }

  }
  /**
   * Injector that removes spans with no attributes
   */
  class HTMLPurifier_Injector_RemoveSpansWithoutAttributes extends HTMLPurifier_Injector
  {

      public $name = 'RemoveSpansWithoutAttributes';
      public $needed = array('span');

      private $attrValidator;

      /**
       * Used by AttrValidator
       */
      private $config;
      private $context;

      /**
       * Creates the attribute validator, remembers config/context for it and
       * returns the result of the parent's prepare().
       */
      public function prepare($config, $context) {
          $this->attrValidator = new HTMLPurifier_AttrValidator();
          $this->config = $config;
          $this->context = $context;
          return parent::prepare($config, $context);
      }

      /**
       * Deletes a <span> start tag that has no attributes left after
       * validation and flags its end tag so that handleEnd() deletes it too.
       *
       * @param $token Element token, passed by reference; set to false to
       *               delete it
       */
      public function handleElement(&$token) {
          if ($token->name !== 'span' || !$token instanceof HTMLPurifier_Token_Start) {
              return;
          }

          // We need to validate the attributes now since this doesn't normally
          // happen until after MakeWellFormed. If all the attributes are removed
          // the span needs to be removed too.
          $this->attrValidator->validateToken($token, $this->config, $this->context);
          $token->armor['ValidateAttributes'] = true;

          if (!empty($token->attr)) {
              return;
          }

          // Run forward until forwardUntilEndToken() stops; $current then
          // holds the token the walk stopped on. All three arguments are
          // taken by reference, so initialise them explicitly.
          $i = null;
          $current = null;
          $nesting = 0;
          while ($this->forwardUntilEndToken($i, $current, $nesting)) {}

          if ($current instanceof HTMLPurifier_Token_End && $current->name === 'span') {
              // Mark closing span tag for deletion
              $current->markForDeletion = true;
              // Delete open span tag
              $token = false;
          }
      }

      /**
       * Deletes end tags previously flagged by handleElement().
       *
       * The flag only exists on flagged tokens, so it is tested with empty()
       * rather than read directly: reading an undefined property would raise
       * a notice (a warning on PHP 8).
       *
       * @param $token End token, passed by reference; set to false to delete
       */
      public function handleEnd(&$token) {
          if (!empty($token->markForDeletion)) {
              $token = false;
          }
      }

  }
  /**
   * Adds important param elements to inside of object in order to make
   * things safe.
   *
   * Every <object> start tag is followed by the forced params listed in
   * $addParam. A <param> coming from the document survives only when it sits
   * directly inside an <object>, has a name, and that name is listed in
   * $allowedParam.
   */
  class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
  {

      public $name = 'SafeObject';
      public $needed = array('object', 'param');

      // Start tokens of the <object> elements currently open, innermost last.
      protected $objectStack = array();

      // One array per entry of $objectStack, keyed by forced param name.
      protected $paramStack  = array();

      // Params forced onto every object (name => value).
      // Keep this synchronized with AttrTransform/SafeParam.php
      protected $addParam = array(
          'allowScriptAccess' => 'never',
          'allowNetworking' => 'internal',
      );

      // Document-supplied params that are kept as they are.
      protected $allowedParam = array(
          'wmode' => true,
          'movie' => true,
          'flashvars' => true,
          'src' => true,
          'allowFullScreen' => true, // if omitted, assume to be 'false'
      );

      public function prepare($config, $context) {
          // NOTE(review): other injectors in this file return the parent's
          // result from prepare(); here it is discarded. Left unchanged --
          // confirm whether that is intentional before altering it.
          parent::prepare($config, $context);
      }

      /**
       * Handles <object> (push on the stacks, append forced params) and
       * <param> (keep or delete) elements; other elements are ignored.
       *
       * @param $token Element token, passed by reference; may become an array
       *               of tokens, or false to delete it
       */
      public function handleElement(&$token) {
          if ($token->name == 'object') {
              $this->objectStack[] = $token;
              $this->paramStack[] = array();
              $new = array($token);
              foreach ($this->addParam as $name => $value) {
                  $new[] = new HTMLPurifier_Token_Empty('param', array('name' => $name, 'value' => $value));
              }
              $token = $new;
              return;
          }

          if ($token->name != 'param') {
              return;
          }

          $nest = count($this->currentNesting) - 1;
          if ($nest < 0 || $this->currentNesting[$nest]->name !== 'object') {
              // not directly inside an object, DENY!
              $token = false;
              return;
          }

          if (!isset($token->attr['name'])) {
              $token = false;
              return;
          }

          $i = count($this->objectStack) - 1;
          $n = $token->attr['name'];

          // We need this fix because YouTube doesn't supply a data
          // attribute, which we need if a type is specified. This is
          // *very* Flash specific.
          // A param without a value has nothing to copy, so it is skipped
          // rather than read as an undefined index.
          if (
              !isset($this->objectStack[$i]->attr['data']) &&
              ($n == 'movie' || $n == 'src') &&
              isset($token->attr['value'])
          ) {
              $this->objectStack[$i]->attr['data'] = $token->attr['value'];
          }

          // Check if the parameter is the correct value but has not
          // already been added
          // NOTE(review): this compares the param *name* with the forced
          // *value*, which for the entries of $addParam can never be equal,
          // so forced params supplied by the document always fall through to
          // the branches below. Kept as in the original; changing it would
          // alter which tokens are emitted -- confirm the intended behaviour.
          if (
              !isset($this->paramStack[$i][$n]) &&
              isset($this->addParam[$n]) &&
              $n === $this->addParam[$n]
          ) {
              // keep token, and add to param stack
              $this->paramStack[$i][$n] = true;
          } elseif (isset($this->allowedParam[$n])) {
              // keep token, don't do anything to it
              // (could possibly check for duplicates here)
          } else {
              $token = false;
          }
      }

      /**
       * Pops both stacks when an <object> is closed.
       */
      public function handleEnd(&$token) {
          // This is the WRONG way of handling the object and param stacks;
          // we should be inserting them directly on the relevant object tokens
          // so that the global stack handling handles it.
          if ($token->name == 'object') {
              array_pop($this->objectStack);
              array_pop($this->paramStack);
          }
      }

  }
  /**
   * Lexer built on PHP 5's DOM extension.
   *
   * The HTML fragment is wrapped into a complete document, loaded with
   * DOMDocument::loadHTML() and the resulting tree is walked to produce the
   * token list.
   *
   * @note An element without child nodes always yields an empty token, even
   *       where the spec would not allow an empty element.
   *
   * @note Per the original author's notes, the DOM extension does not parse
   *       entities for us (this class relies on its own handling), and DOM
   *       tends to drop whitespace, which may affect indentation; run Tidy
   *       on the output or use HTMLPurifier_Lexer_DirectLex if that matters.
   */
  class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
  {

      // Factory used to create every token.
      private $factory;

      public function __construct() {
          parent::__construct();
          // setup the factory
          $this->factory = new HTMLPurifier_TokenFactory();
      }

      /**
       * Turns an HTML fragment into a list of tokens.
       *
       * @param $html    HTML fragment
       * @param $config  Configuration object
       * @param $context Context object
       * @returns Array-list of tokens
       */
      public function tokenizeHTML($html, $config, $context) {

          $html = $this->normalize($html, $config, $context);

          // attempt to armor stray angled brackets that cannot possibly
          // form tags and thus are probably being used as emoticons
          if ($config->get('Core.AggressivelyFixLt')) {
              $nonTagChar = '[^a-z!\/]';
              $commentPattern = "/<!--(.*?)(-->|\z)/is";

              // 1. protect ampersands inside comments
              $html = preg_replace_callback($commentPattern, array($this, 'callbackArmorCommentEntities'), $html);

              // 2. escape stray '<' until the text stops changing
              do {
                  $before = $html;
                  $html = preg_replace("/<($nonTagChar)/i", '&lt;\\1', $html);
              } while ($html !== $before);

              // 3. undo both substitutions inside comments
              $html = preg_replace_callback($commentPattern, array($this, 'callbackUndoCommentSubst'), $html);
          }

          // preprocess html, essential for UTF-8
          $html = $this->wrapHTML($html, $config, $context);

          $doc = new DOMDocument();
          $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

          // Parser messages are silenced while the document is loaded.
          set_error_handler(array($this, 'muteErrorHandler'));
          $doc->loadHTML($html);
          restore_error_handler();

          // Descend to the <div> that wrapHTML() put around the fragment.
          $htmlNode = $doc->getElementsByTagName('html')->item(0);
          $bodyNode = $htmlNode->getElementsByTagName('body')->item(0);
          $wrapper  = $bodyNode->getElementsByTagName('div')->item(0);

          $tokens = array();
          $this->tokenizeDOM($wrapper, $tokens);
          return $tokens;
      }

      /**
       * Recursive function that tokenizes a node, putting it into an accumulator.
       *
       * @param $node     DOMNode to be tokenized.
       * @param $tokens   Array-list of already tokenized tokens.
       * @param $collect  Whether start/end (or empty) tokens are emitted for
       *                  $node itself; false on the first call because that
       *                  node is the implicit wrapper DIV.
       * @returns Nothing; tokens of node are appended to $tokens.
       */
      protected function tokenizeDOM($node, &$tokens, $collect = false) {

          // intercept non element nodes. WE MUST catch all of them,
          // but we're not getting the character reference nodes because
          // those should have been preprocessed
          $type = $node->nodeType;

          if ($type === XML_TEXT_NODE) {
              $tokens[] = $this->factory->createText($node->data);
              return;
          }

          if ($type === XML_CDATA_SECTION_NODE) {
              // undo libxml's special treatment of <script> and <style> tags
              $last = end($tokens);
              $data = $node->data;
              $insideRawElement =
                  $last instanceof HTMLPurifier_Token_Start &&
                  ($last->name == 'script' || $last->name == 'style');
              if ($insideRawElement) {
                  $trimmed = trim($data);
                  if (substr($trimmed, 0, 4) === '<!--') {
                      // strip the comment opener, and the closer if present
                      $data = substr($trimmed, 4);
                      if (substr($data, -3) === '-->') {
                          $data = substr($data, 0, -3);
                      }
                      // an opener without a closer is highly suspicious,
                      // but the data is passed on as it is
                  }
              }
              $tokens[] = $this->factory->createText($this->parseData($data));
              return;
          }

          if ($type === XML_COMMENT_NODE) {
              // per the original note this is only reached for comments in
              // script/style with libxml before 2.6.28
              $tokens[] = $this->factory->createComment($node->data);
              return;
          }

          if ($type !== XML_ELEMENT_NODE) {
              // not-well tested: there may be other nodes we have to grab
              return;
          }

          $attr = $node->hasAttributes()
              ? $this->transformAttrToAssoc($node->attributes)
              : array();

          // We still have to make sure that the element actually IS empty
          if (!$node->childNodes->length) {
              if ($collect) {
                  $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
              }
              return;
          }

          $tagName = $node->tagName;

          if ($collect) { // don't wrap on first iteration
              $tokens[] = $this->factory->createStart($tagName, $attr);
          }

          foreach ($node->childNodes as $child) {
              // remember, it's an accumulator. Otherwise, we'd have
              // to use array_merge
              $this->tokenizeDOM($child, $tokens, true);
          }

          if ($collect) {
              $tokens[] = $this->factory->createEnd($tagName);
          }
      }

      /**
       * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
       *
       * @param $node_map DOMNamedNodeMap of DOMAttr objects.
       * @returns Associative array of attributes (name => value).
       */
      protected function transformAttrToAssoc($node_map) {
          // This relies on the node map being iterable and exposing ->length.
          $assoc = array();
          if ($node_map->length === 0) {
              return $assoc;
          }
          foreach ($node_map as $attribute) {
              $assoc[$attribute->name] = $attribute->value;
          }
          return $assoc;
      }

      /**
       * An error handler that mutes all errors
       */
      public function muteErrorHandler($errno, $errstr) {}

      /**
       * Callback function for undoing escaping of stray angled brackets
       * in comments
       *
       * @param $matches array(whole match, comment body, comment terminator)
       */
      public function callbackUndoCommentSubst($matches) {
          $restored = strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<'));
          return '<!--' . $restored . $matches[2];
      }

      /**
       * Callback function that entity-izes ampersands in comments so that
       * callbackUndoCommentSubst doesn't clobber them
       *
       * @param $matches array(whole match, comment body, comment terminator)
       */
      public function callbackArmorCommentEntities($matches) {
          $armored = str_replace('&', '&amp;', $matches[1]);
          return '<!--' . $armored . $matches[2];
      }

      /**
       * Wraps an HTML fragment in the necessary HTML: an optional doctype
       * taken from the HTML definition, a head declaring UTF-8, and a body
       * holding the fragment inside a single <div>.
       */
      protected function wrapHTML($html, $config, $context) {
          $def = $config->getDefinition('HTML');

          $hasPublic = !empty($def->doctype->dtdPublic);
          $hasSystem = !empty($def->doctype->dtdSystem);

          $parts = array();
          if ($hasPublic || $hasSystem) {
              $parts[] = '<!DOCTYPE html ';
              if ($hasPublic) {
                  $parts[] = 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
              }
              if ($hasSystem) {
                  $parts[] = '"' . $def->doctype->dtdSystem . '" ';
              }
              $parts[] = '>';
          }
          $parts[] = '<html><head>';
          $parts[] = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
          // No protection if $html contains a stray </div>!
          $parts[] = '</head><body><div>'.$html.'</div></body></html>';

          return implode('', $parts);
      }

  }
  11304. /**
  11305. * Our in-house implementation of a parser.
  11306. *
  11307. * A pure PHP parser, DirectLex has absolutely no dependencies, making
  11308. * it a reasonably good default for PHP4. Written with efficiency in mind,
  11309. * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
  11310. * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  11311. *
  11312. * @todo Reread XML spec and document differences.
  11313. */
  11314. class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  11315. {
  11316. public $tracksLineNumbers = true;
  11317. /**
  11318. * Whitespace characters for str(c)spn.
  11319. */
  11320. protected $_whitespace = "\x20\x09\x0D\x0A";
  11321. /**
  11322. * Callback function for script CDATA fudge
  11323. * @param $matches, in form of array(opening tag, contents, closing tag)
  11324. */
  11325. protected function scriptCallback($matches) {
  11326. return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  11327. }
  11328. public function tokenizeHTML($html, $config, $context) {
  11329. // special normalization for script tags without any armor
  11330. // our "armor" heurstic is a < sign any number of whitespaces after
  11331. // the first script tag
  11332. if ($config->get('HTML.Trusted')) {
  11333. $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  11334. array($this, 'scriptCallback'), $html);
  11335. }
  11336. $html = $this->normalize($html, $config, $context);
  11337. $cursor = 0; // our location in the text
  11338. $inside_tag = false; // whether or not we're parsing the inside of a tag
  11339. $array = array(); // result array
  11340. // This is also treated to mean maintain *column* numbers too
  11341. $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
  11342. if ($maintain_line_numbers === null) {
  11343. // automatically determine line numbering by checking
  11344. // if error collection is on
  11345. $maintain_line_numbers = $config->get('Core.CollectErrors');
  11346. }
  11347. if ($maintain_line_numbers) {
  11348. $current_line = 1;
  11349. $current_col = 0;
  11350. $length = strlen($html);
  11351. } else {
  11352. $current_line = false;
  11353. $current_col = false;
  11354. $length = false;
  11355. }
  11356. $context->register('CurrentLine', $current_line);
  11357. $context->register('CurrentCol', $current_col);
  11358. $nl = "\n";
  11359. // how often to manually recalculate. This will ALWAYS be right,
  11360. // but it's pretty wasteful. Set to 0 to turn off
  11361. $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
  11362. $e = false;
  11363. if ($config->get('Core.CollectErrors')) {
  11364. $e =& $context->get('ErrorCollector');
  11365. }
  11366. // for testing synchronization
  11367. $loops = 0;
  11368. while(++$loops) {
  11369. // $cursor is either at the start of a token, or inside of
  11370. // a tag (i.e. there was a < immediately before it), as indicated
  11371. // by $inside_tag
  11372. if ($maintain_line_numbers) {
  11373. // $rcursor, however, is always at the start of a token.
  11374. $rcursor = $cursor - (int) $inside_tag;
  11375. // Column number is cheap, so we calculate it every round.
  11376. // We're interested at the *end* of the newline string, so
  11377. // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
  11378. // from our "rcursor" position.
  11379. $nl_pos = strrpos($html, $nl, $rcursor - $length);
  11380. $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
  11381. // recalculate lines
  11382. if (
  11383. $synchronize_interval && // synchronization is on
  11384. $cursor > 0 && // cursor is further than zero
  11385. $loops % $synchronize_interval === 0 // time to synchronize!
  11386. ) {
  11387. $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
  11388. }
  11389. }
  11390. $position_next_lt = strpos($html, '<', $cursor);
  11391. $position_next_gt = strpos($html, '>', $cursor);
  11392. // triggers on "<b>asdf</b>" but not "asdf <b></b>"
  11393. // special case to set up context
  11394. if ($position_next_lt === $cursor) {
  11395. $inside_tag = true;
  11396. $cursor++;
  11397. }
  11398. if (!$inside_tag && $position_next_lt !== false) {
  11399. // We are not inside tag and there still is another tag to parse
  11400. $token = new
  11401. HTMLPurifier_Token_Text(
  11402. $this->parseData(
  11403. substr(
  11404. $html, $cursor, $position_next_lt - $cursor
  11405. )
  11406. )
  11407. );
  11408. if ($maintain_line_numbers) {
  11409. $token->rawPosition($current_line, $current_col);
  11410. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
  11411. }
  11412. $array[] = $token;
  11413. $cursor = $position_next_lt + 1;
  11414. $inside_tag = true;
  11415. continue;
  11416. } elseif (!$inside_tag) {
  11417. // We are not inside tag but there are no more tags
  11418. // If we're already at the end, break
  11419. if ($cursor === strlen($html)) break;
  11420. // Create Text of rest of string
  11421. $token = new
  11422. HTMLPurifier_Token_Text(
  11423. $this->parseData(
  11424. substr(
  11425. $html, $cursor
  11426. )
  11427. )
  11428. );
  11429. if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
  11430. $array[] = $token;
  11431. break;
  11432. } elseif ($inside_tag && $position_next_gt !== false) {
  11433. // We are in tag and it is well formed
  11434. // Grab the internals of the tag
  11435. $strlen_segment = $position_next_gt - $cursor;
  11436. if ($strlen_segment < 1) {
  11437. // there's nothing to process!
  11438. $token = new HTMLPurifier_Token_Text('<');
  11439. $cursor++;
  11440. continue;
  11441. }
  11442. $segment = substr($html, $cursor, $strlen_segment);
  11443. if ($segment === false) {
  11444. // somehow, we attempted to access beyond the end of
  11445. // the string, defense-in-depth, reported by Nate Abele
  11446. break;
  11447. }
  11448. // Check if it's a comment
  11449. if (
  11450. substr($segment, 0, 3) === '!--'
  11451. ) {
  11452. // re-determine segment length, looking for -->
  11453. $position_comment_end = strpos($html, '-->', $cursor);
  11454. if ($position_comment_end === false) {
  11455. // uh oh, we have a comment that extends to
  11456. // infinity. Can't be helped: set comment
  11457. // end position to end of string
  11458. if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
  11459. $position_comment_end = strlen($html);
  11460. $end = true;
  11461. } else {
  11462. $end = false;
  11463. }
  11464. $strlen_segment = $position_comment_end - $cursor;
  11465. $segment = substr($html, $cursor, $strlen_segment);
  11466. $token = new
  11467. HTMLPurifier_Token_Comment(
  11468. substr(
  11469. $segment, 3, $strlen_segment - 3
  11470. )
  11471. );
  11472. if ($maintain_line_numbers) {
  11473. $token->rawPosition($current_line, $current_col);
  11474. $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
  11475. }
  11476. $array[] = $token;
  11477. $cursor = $end ? $position_comment_end : $position_comment_end + 3;
  11478. $inside_tag = false;
  11479. continue;
  11480. }
  11481. // Check if it's an end tag
  11482. $is_end_tag = (strpos($segment,'/') === 0);
  11483. if ($is_end_tag) {
  11484. $type = substr($segment, 1);
  11485. $token = new HTMLPurifier_Token_End($type);
  11486. if ($maintain_line_numbers) {
  11487. $token->rawPosition($current_line, $current_col);
  11488. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  11489. }
  11490. $array[] = $token;
  11491. $inside_tag = false;
  11492. $cursor = $position_next_gt + 1;
  11493. continue;
  11494. }
  11495. // Check leading character is alnum, if not, we may
  11496. // have accidently grabbed an emoticon. Translate into
  11497. // text and go our merry way
  11498. if (!ctype_alpha($segment[0])) {
  11499. // XML: $segment[0] !== '_' && $segment[0] !== ':'
  11500. if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
  11501. $token = new HTMLPurifier_Token_Text('<');
  11502. if ($maintain_line_numbers) {
  11503. $token->rawPosition($current_line, $current_col);
  11504. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  11505. }
  11506. $array[] = $token;
  11507. $inside_tag = false;
  11508. continue;
  11509. }
  11510. // Check if it is explicitly self closing, if so, remove
  11511. // trailing slash. Remember, we could have a tag like <br>, so
  11512. // any later token processing scripts must convert improperly
  11513. // classified EmptyTags from StartTags.
  11514. $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
  11515. if ($is_self_closing) {
  11516. $strlen_segment--;
  11517. $segment = substr($segment, 0, $strlen_segment);
  11518. }
  11519. // Check if there are any attributes
  11520. $position_first_space = strcspn($segment, $this->_whitespace);
  11521. if ($position_first_space >= $strlen_segment) {
  11522. if ($is_self_closing) {
  11523. $token = new HTMLPurifier_Token_Empty($segment);
  11524. } else {
  11525. $token = new HTMLPurifier_Token_Start($segment);
  11526. }
  11527. if ($maintain_line_numbers) {
  11528. $token->rawPosition($current_line, $current_col);
  11529. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  11530. }
  11531. $array[] = $token;
  11532. $inside_tag = false;
  11533. $cursor = $position_next_gt + 1;
  11534. continue;
  11535. }
  11536. // Grab out all the data
  11537. $type = substr($segment, 0, $position_first_space);
  11538. $attribute_string =
  11539. trim(
  11540. substr(
  11541. $segment, $position_first_space
  11542. )
  11543. );
  11544. if ($attribute_string) {
  11545. $attr = $this->parseAttributeString(
  11546. $attribute_string
  11547. , $config, $context
  11548. );
  11549. } else {
  11550. $attr = array();
  11551. }
  11552. if ($is_self_closing) {
  11553. $token = new HTMLPurifier_Token_Empty($type, $attr);
  11554. } else {
  11555. $token = new HTMLPurifier_Token_Start($type, $attr);
  11556. }
  11557. if ($maintain_line_numbers) {
  11558. $token->rawPosition($current_line, $current_col);
  11559. $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
  11560. }
  11561. $array[] = $token;
  11562. $cursor = $position_next_gt + 1;
  11563. $inside_tag = false;
  11564. continue;
  11565. } else {
  11566. // inside tag, but there's no ending > sign
  11567. if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
  11568. $token = new
  11569. HTMLPurifier_Token_Text(
  11570. '<' .
  11571. $this->parseData(
  11572. substr($html, $cursor)
  11573. )
  11574. );
  11575. if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
  11576. // no cursor scroll? Hmm...
  11577. $array[] = $token;
  11578. break;
  11579. }
  11580. break;
  11581. }
  11582. $context->destroy('CurrentLine');
  11583. $context->destroy('CurrentCol');
  11584. return $array;
  11585. }
  11586. /**
  11587. * PHP 5.0.x compatible substr_count that implements offset and length
  11588. */
  11589. protected function substrCount($haystack, $needle, $offset, $length) {
  11590. static $oldVersion;
  11591. if ($oldVersion === null) {
  11592. $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
  11593. }
  11594. if ($oldVersion) {
  11595. $haystack = substr($haystack, $offset, $length);
  11596. return substr_count($haystack, $needle);
  11597. } else {
  11598. return substr_count($haystack, $needle, $offset, $length);
  11599. }
  11600. }
  11601. /**
  11602. * Takes the inside of an HTML tag and makes an assoc array of attributes.
  11603. *
  11604. * @param $string Inside of tag excluding name.
  11605. * @returns Assoc array of attributes.
  11606. */
  11607. public function parseAttributeString($string, $config, $context) {
  11608. $string = (string) $string; // quick typecast
  11609. if ($string == '') return array(); // no attributes
  11610. $e = false;
  11611. if ($config->get('Core.CollectErrors')) {
  11612. $e =& $context->get('ErrorCollector');
  11613. }
  11614. // let's see if we can abort as quickly as possible
  11615. // one equal sign, no spaces => one attribute
  11616. $num_equal = substr_count($string, '=');
  11617. $has_space = strpos($string, ' ');
  11618. if ($num_equal === 0 && !$has_space) {
  11619. // bool attribute
  11620. return array($string => $string);
  11621. } elseif ($num_equal === 1 && !$has_space) {
  11622. // only one attribute
  11623. list($key, $quoted_value) = explode('=', $string);
  11624. $quoted_value = trim($quoted_value);
  11625. if (!$key) {
  11626. if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
  11627. return array();
  11628. }
  11629. if (!$quoted_value) return array($key => '');
  11630. $first_char = @$quoted_value[0];
  11631. $last_char = @$quoted_value[strlen($quoted_value)-1];
  11632. $same_quote = ($first_char == $last_char);
  11633. $open_quote = ($first_char == '"' || $first_char == "'");
  11634. if ( $same_quote && $open_quote) {
  11635. // well behaved
  11636. $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
  11637. } else {
  11638. // not well behaved
  11639. if ($open_quote) {
  11640. if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
  11641. $value = substr($quoted_value, 1);
  11642. } else {
  11643. $value = $quoted_value;
  11644. }
  11645. }
  11646. if ($value === false) $value = '';
  11647. return array($key => $this->parseData($value));
  11648. }
  11649. // setup loop environment
  11650. $array = array(); // return assoc array of attributes
  11651. $cursor = 0; // current position in string (moves forward)
  11652. $size = strlen($string); // size of the string (stays the same)
  11653. // if we have unquoted attributes, the parser expects a terminating
  11654. // space, so let's guarantee that there's always a terminating space.
  11655. $string .= ' ';
  11656. while(true) {
  11657. if ($cursor >= $size) {
  11658. break;
  11659. }
  11660. $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
  11661. // grab the key
  11662. $key_begin = $cursor; //we're currently at the start of the key
  11663. // scroll past all characters that are the key (not whitespace or =)
  11664. $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
  11665. $key_end = $cursor; // now at the end of the key
  11666. $key = substr($string, $key_begin, $key_end - $key_begin);
  11667. if (!$key) {
  11668. if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
  11669. $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
  11670. continue; // empty key
  11671. }
  11672. // scroll past all whitespace
  11673. $cursor += strspn($string, $this->_whitespace, $cursor);
  11674. if ($cursor >= $size) {
  11675. $array[$key] = $key;
  11676. break;
  11677. }
  11678. // if the next character is an equal sign, we've got a regular
  11679. // pair, otherwise, it's a bool attribute
  11680. $first_char = @$string[$cursor];
  11681. if ($first_char == '=') {
  11682. // key="value"
  11683. $cursor++;
  11684. $cursor += strspn($string, $this->_whitespace, $cursor);
  11685. if ($cursor === false) {
  11686. $array[$key] = '';
  11687. break;
  11688. }
  11689. // we might be in front of a quote right now
  11690. $char = @$string[$cursor];
  11691. if ($char == '"' || $char == "'") {
  11692. // it's quoted, end bound is $char
  11693. $cursor++;
  11694. $value_begin = $cursor;
  11695. $cursor = strpos($string, $char, $cursor);
  11696. $value_end = $cursor;
  11697. } else {
  11698. // it's not quoted, end bound is whitespace
  11699. $value_begin = $cursor;
  11700. $cursor += strcspn($string, $this->_whitespace, $cursor);
  11701. $value_end = $cursor;
  11702. }
  11703. // we reached a premature end
  11704. if ($cursor === false) {
  11705. $cursor = $size;
  11706. $value_end = $cursor;
  11707. }
  11708. $value = substr($string, $value_begin, $value_end - $value_begin);
  11709. if ($value === false) $value = '';
  11710. $array[$key] = $this->parseData($value);
  11711. $cursor++;
  11712. } else {
  11713. // boolattr
  11714. if ($key !== '') {
  11715. $array[$key] = $key;
  11716. } else {
  11717. // purely theoretical
  11718. if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
  11719. }
  11720. }
  11721. }
  11722. return $array;
  11723. }
  11724. }
  11725. /**
  11726. * Composite strategy that runs multiple strategies on tokens.
  11727. */
  11728. abstract class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
  11729. {
  11730. /**
  11731. * List of strategies to run tokens through.
  11732. */
  11733. protected $strategies = array();
  11734. abstract public function __construct();
  11735. public function execute($tokens, $config, $context) {
  11736. foreach ($this->strategies as $strategy) {
  11737. $tokens = $strategy->execute($tokens, $config, $context);
  11738. }
  11739. return $tokens;
  11740. }
  11741. }
  11742. /**
  11743. * Core strategy composed of the big four strategies.
  11744. */
  11745. class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
  11746. {
  11747. public function __construct() {
  11748. $this->strategies[] = new HTMLPurifier_Strategy_RemoveForeignElements();
  11749. $this->strategies[] = new HTMLPurifier_Strategy_MakeWellFormed();
  11750. $this->strategies[] = new HTMLPurifier_Strategy_FixNesting();
  11751. $this->strategies[] = new HTMLPurifier_Strategy_ValidateAttributes();
  11752. }
  11753. }
  11754. /**
  11755. * Takes a well formed list of tokens and fixes their nesting.
  11756. *
  11757. * HTML elements dictate which elements are allowed to be their children,
  11758. * for example, you can't have a p tag in a span tag. Other elements have
  11759. * much more rigorous definitions: tables, for instance, require a specific
  11760. * order for their elements. There are also constraints not expressible by
  11761. * document type definitions, such as the chameleon nature of ins/del
  11762. * tags and global child exclusions.
  11763. *
  11764. * The first major objective of this strategy is to iterate through all the
  11765. * nodes (not tokens) of the list of tokens and determine whether or not
  11766. * their children conform to the element's definition. If they do not, the
  11767. * child definition may optionally supply an amended list of elements that
  11768. * is valid or require that the entire node be deleted (and the previous
  11769. * node rescanned).
  11770. *
  11771. * The second objective is to ensure that explicitly excluded elements of
  11772. * an element do not appear in its children. Code that accomplishes this
  11773. * task is pervasive through the strategy, though the two are distinct tasks
  11774. * and could, theoretically, be seperated (although it's not recommended).
  11775. *
  11776. * @note Whether or not unrecognized children are silently dropped or
  11777. * translated into text depends on the child definitions.
  11778. *
  11779. * @todo Enable nodes to be bubbled out of the structure.
  11780. */
  11781. class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
  11782. {
  11783. public function execute($tokens, $config, $context) {
  11784. //####################################################################//
  11785. // Pre-processing
  11786. // get a copy of the HTML definition
  11787. $definition = $config->getHTMLDefinition();
  11788. // insert implicit "parent" node, will be removed at end.
  11789. // DEFINITION CALL
  11790. $parent_name = $definition->info_parent;
  11791. array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
  11792. $tokens[] = new HTMLPurifier_Token_End($parent_name);
  11793. // setup the context variable 'IsInline', for chameleon processing
  11794. // is 'false' when we are not inline, 'true' when it must always
  11795. // be inline, and an integer when it is inline for a certain
  11796. // branch of the document tree
  11797. $is_inline = $definition->info_parent_def->descendants_are_inline;
  11798. $context->register('IsInline', $is_inline);
  11799. // setup error collector
  11800. $e =& $context->get('ErrorCollector', true);
  11801. //####################################################################//
  11802. // Loop initialization
  11803. // stack that contains the indexes of all parents,
  11804. // $stack[count($stack)-1] being the current parent
  11805. $stack = array();
  11806. // stack that contains all elements that are excluded
  11807. // it is organized by parent elements, similar to $stack,
  11808. // but it is only populated when an element with exclusions is
  11809. // processed, i.e. there won't be empty exclusions.
  11810. $exclude_stack = array();
  11811. // variable that contains the start token while we are processing
  11812. // nodes. This enables error reporting to do its job
  11813. $start_token = false;
  11814. $context->register('CurrentToken', $start_token);
  11815. //####################################################################//
  11816. // Loop
  11817. // iterate through all start nodes. Determining the start node
  11818. // is complicated so it has been omitted from the loop construct
  11819. for ($i = 0, $size = count($tokens) ; $i < $size; ) {
  11820. //################################################################//
  11821. // Gather information on children
  11822. // child token accumulator
  11823. $child_tokens = array();
  11824. // scroll to the end of this node, report number, and collect
  11825. // all children
  11826. for ($j = $i, $depth = 0; ; $j++) {
  11827. if ($tokens[$j] instanceof HTMLPurifier_Token_Start) {
  11828. $depth++;
  11829. // skip token assignment on first iteration, this is the
  11830. // token we currently are on
  11831. if ($depth == 1) continue;
  11832. } elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) {
  11833. $depth--;
  11834. // skip token assignment on last iteration, this is the
  11835. // end token of the token we're currently on
  11836. if ($depth == 0) break;
  11837. }
  11838. $child_tokens[] = $tokens[$j];
  11839. }
  11840. // $i is index of start token
  11841. // $j is index of end token
  11842. $start_token = $tokens[$i]; // to make token available via CurrentToken
  11843. //################################################################//
  11844. // Gather information on parent
  11845. // calculate parent information
  11846. if ($count = count($stack)) {
  11847. $parent_index = $stack[$count-1];
  11848. $parent_name = $tokens[$parent_index]->name;
  11849. if ($parent_index == 0) {
  11850. $parent_def = $definition->info_parent_def;
  11851. } else {
  11852. $parent_def = $definition->info[$parent_name];
  11853. }
  11854. } else {
  11855. // processing as if the parent were the "root" node
  11856. // unknown info, it won't be used anyway, in the future,
  11857. // we may want to enforce one element only (this is
  11858. // necessary for HTML Purifier to clean entire documents
  11859. $parent_index = $parent_name = $parent_def = null;
  11860. }
  11861. // calculate context
  11862. if ($is_inline === false) {
  11863. // check if conditions make it inline
  11864. if (!empty($parent_def) && $parent_def->descendants_are_inline) {
  11865. $is_inline = $count - 1;
  11866. }
  11867. } else {
  11868. // check if we're out of inline
  11869. if ($count === $is_inline) {
  11870. $is_inline = false;
  11871. }
  11872. }
  11873. //################################################################//
  11874. // Determine whether element is explicitly excluded SGML-style
  11875. // determine whether or not element is excluded by checking all
  11876. // parent exclusions. The array should not be very large, two
  11877. // elements at most.
  11878. $excluded = false;
  11879. if (!empty($exclude_stack)) {
  11880. foreach ($exclude_stack as $lookup) {
  11881. if (isset($lookup[$tokens[$i]->name])) {
  11882. $excluded = true;
  11883. // no need to continue processing
  11884. break;
  11885. }
  11886. }
  11887. }
  11888. //################################################################//
  11889. // Perform child validation
  11890. if ($excluded) {
  11891. // there is an exclusion, remove the entire node
  11892. $result = false;
  11893. $excludes = array(); // not used, but good to initialize anyway
  11894. } else {
  11895. // DEFINITION CALL
  11896. if ($i === 0) {
  11897. // special processing for the first node
  11898. $def = $definition->info_parent_def;
  11899. } else {
  11900. $def = $definition->info[$tokens[$i]->name];
  11901. }
  11902. if (!empty($def->child)) {
  11903. // have DTD child def validate children
  11904. $result = $def->child->validateChildren(
  11905. $child_tokens, $config, $context);
  11906. } else {
  11907. // weird, no child definition, get rid of everything
  11908. $result = false;
  11909. }
  11910. // determine whether or not this element has any exclusions
  11911. $excludes = $def->excludes;
  11912. }
  11913. // $result is now a bool or array
  11914. //################################################################//
  11915. // Process result by interpreting $result
  11916. if ($result === true || $child_tokens === $result) {
  11917. // leave the node as is
  11918. // register start token as a parental node start
  11919. $stack[] = $i;
  11920. // register exclusions if there are any
  11921. if (!empty($excludes)) $exclude_stack[] = $excludes;
  11922. // move cursor to next possible start node
  11923. $i++;
  11924. } elseif($result === false) {
  11925. // remove entire node
  11926. if ($e) {
  11927. if ($excluded) {
  11928. $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
  11929. } else {
  11930. $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
  11931. }
  11932. }
  11933. // calculate length of inner tokens and current tokens
  11934. $length = $j - $i + 1;
  11935. // perform removal
  11936. array_splice($tokens, $i, $length);
  11937. // update size
  11938. $size -= $length;
  11939. // there is no start token to register,
  11940. // current node is now the next possible start node
  11941. // unless it turns out that we need to do a double-check
  11942. // this is a rought heuristic that covers 100% of HTML's
  11943. // cases and 99% of all other cases. A child definition
  11944. // that would be tricked by this would be something like:
  11945. // ( | a b c) where it's all or nothing. Fortunately,
  11946. // our current implementation claims that that case would
  11947. // not allow empty, even if it did
  11948. if (!$parent_def->child->allow_empty) {
  11949. // we need to do a double-check
  11950. $i = $parent_index;
  11951. array_pop($stack);
  11952. }
  11953. // PROJECTED OPTIMIZATION: Process all children elements before
  11954. // reprocessing parent node.
  11955. } else {
  11956. // replace node with $result
  11957. // calculate length of inner tokens
  11958. $length = $j - $i - 1;
  11959. if ($e) {
  11960. if (empty($result) && $length) {
  11961. $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
  11962. } else {
  11963. $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
  11964. }
  11965. }
  11966. // perform replacement
  11967. array_splice($tokens, $i + 1, $length, $result);
  11968. // update size
  11969. $size -= $length;
  11970. $size += count($result);
  11971. // register start token as a parental node start
  11972. $stack[] = $i;
  11973. // register exclusions if there are any
  11974. if (!empty($excludes)) $exclude_stack[] = $excludes;
  11975. // move cursor to next possible start node
  11976. $i++;
  11977. }
  11978. //################################################################//
  11979. // Scroll to next start node
  11980. // We assume, at this point, that $i is the index of the token
  11981. // that is the first possible new start point for a node.
  11982. // Test if the token indeed is a start tag, if not, move forward
  11983. // and test again.
  11984. $size = count($tokens);
  11985. while ($i < $size and !$tokens[$i] instanceof HTMLPurifier_Token_Start) {
  11986. if ($tokens[$i] instanceof HTMLPurifier_Token_End) {
  11987. // pop a token index off the stack if we ended a node
  11988. array_pop($stack);
  11989. // pop an exclusion lookup off exclusion stack if
  11990. // we ended node and that node had exclusions
  11991. if ($i == 0 || $i == $size - 1) {
  11992. // use specialized var if it's the super-parent
  11993. $s_excludes = $definition->info_parent_def->excludes;
  11994. } else {
  11995. $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
  11996. }
  11997. if ($s_excludes) {
  11998. array_pop($exclude_stack);
  11999. }
  12000. }
  12001. $i++;
  12002. }
  12003. }
  12004. //####################################################################//
  12005. // Post-processing
  12006. // remove implicit parent tokens at the beginning and end
  12007. array_shift($tokens);
  12008. array_pop($tokens);
  12009. // remove context variables
  12010. $context->destroy('IsInline');
  12011. $context->destroy('CurrentToken');
  12012. //####################################################################//
  12013. // Return
  12014. return $tokens;
  12015. }
  12016. }
  12017. /**
  12018. * Takes tokens makes them well-formed (balance end tags, etc.)
  12019. */
  12020. class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
  12021. {
  12022. /**
  12023. * Array stream of tokens being processed.
  12024. */
  12025. protected $tokens;
  12026. /**
  12027. * Current index in $tokens.
  12028. */
  12029. protected $t;
  12030. /**
  12031. * Current nesting of elements.
  12032. */
  12033. protected $stack;
  12034. /**
  12035. * Injectors active in this stream processing.
  12036. */
  12037. protected $injectors;
  12038. /**
  12039. * Current instance of HTMLPurifier_Config.
  12040. */
  12041. protected $config;
  12042. /**
  12043. * Current instance of HTMLPurifier_Context.
  12044. */
  12045. protected $context;
  12046. public function execute($tokens, $config, $context) {
  12047. $definition = $config->getHTMLDefinition();
  12048. // local variables
  12049. $generator = new HTMLPurifier_Generator($config, $context);
  12050. $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
  12051. $e = $context->get('ErrorCollector', true);
  12052. $t = false; // token index
  12053. $i = false; // injector index
  12054. $token = false; // the current token
  12055. $reprocess = false; // whether or not to reprocess the same token
  12056. $stack = array();
  12057. // member variables
  12058. $this->stack =& $stack;
  12059. $this->t =& $t;
  12060. $this->tokens =& $tokens;
  12061. $this->config = $config;
  12062. $this->context = $context;
  12063. // context variables
  12064. $context->register('CurrentNesting', $stack);
  12065. $context->register('InputIndex', $t);
  12066. $context->register('InputTokens', $tokens);
  12067. $context->register('CurrentToken', $token);
  12068. // -- begin INJECTOR --
  12069. $this->injectors = array();
  12070. $injectors = $config->getBatch('AutoFormat');
  12071. $def_injectors = $definition->info_injector;
  12072. $custom_injectors = $injectors['Custom'];
  12073. unset($injectors['Custom']); // special case
  12074. foreach ($injectors as $injector => $b) {
  12075. // XXX: Fix with a legitimate lookup table of enabled filters
  12076. if (strpos($injector, '.') !== false) continue;
  12077. $injector = "HTMLPurifier_Injector_$injector";
  12078. if (!$b) continue;
  12079. $this->injectors[] = new $injector;
  12080. }
  12081. foreach ($def_injectors as $injector) {
  12082. // assumed to be objects
  12083. $this->injectors[] = $injector;
  12084. }
  12085. foreach ($custom_injectors as $injector) {
  12086. if (!$injector) continue;
  12087. if (is_string($injector)) {
  12088. $injector = "HTMLPurifier_Injector_$injector";
  12089. $injector = new $injector;
  12090. }
  12091. $this->injectors[] = $injector;
  12092. }
  12093. // give the injectors references to the definition and context
  12094. // variables for performance reasons
  12095. foreach ($this->injectors as $ix => $injector) {
  12096. $error = $injector->prepare($config, $context);
  12097. if (!$error) continue;
  12098. array_splice($this->injectors, $ix, 1); // rm the injector
  12099. trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
  12100. }
  12101. // -- end INJECTOR --
  12102. // a note on punting:
  12103. // In order to reduce code duplication, whenever some code needs
  12104. // to make HTML changes in order to make things "correct", the
  12105. // new HTML gets sent through the purifier, regardless of its
  12106. // status. This means that if we add a start token, because it
  12107. // was totally necessary, we don't have to update nesting; we just
  12108. // punt ($reprocess = true; continue;) and it does that for us.
  12109. // isset is in loop because $tokens size changes during loop exec
  12110. for (
  12111. $t = 0;
  12112. $t == 0 || isset($tokens[$t - 1]);
  12113. // only increment if we don't need to reprocess
  12114. $reprocess ? $reprocess = false : $t++
  12115. ) {
  12116. // check for a rewind
  12117. if (is_int($i) && $i >= 0) {
  12118. // possibility: disable rewinding if the current token has a
  12119. // rewind set on it already. This would offer protection from
  12120. // infinite loop, but might hinder some advanced rewinding.
  12121. $rewind_to = $this->injectors[$i]->getRewind();
  12122. if (is_int($rewind_to) && $rewind_to < $t) {
  12123. if ($rewind_to < 0) $rewind_to = 0;
  12124. while ($t > $rewind_to) {
  12125. $t--;
  12126. $prev = $tokens[$t];
  12127. // indicate that other injectors should not process this token,
  12128. // but we need to reprocess it
  12129. unset($prev->skip[$i]);
  12130. $prev->rewind = $i;
  12131. if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->stack);
  12132. elseif ($prev instanceof HTMLPurifier_Token_End) $this->stack[] = $prev->start;
  12133. }
  12134. }
  12135. $i = false;
  12136. }
  12137. // handle case of document end
  12138. if (!isset($tokens[$t])) {
  12139. // kill processing if stack is empty
  12140. if (empty($this->stack)) break;
  12141. // peek
  12142. $top_nesting = array_pop($this->stack);
  12143. $this->stack[] = $top_nesting;
  12144. // send error
  12145. if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
  12146. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
  12147. }
  12148. // append, don't splice, since this is the end
  12149. $tokens[] = new HTMLPurifier_Token_End($top_nesting->name);
  12150. // punt!
  12151. $reprocess = true;
  12152. continue;
  12153. }
  12154. $token = $tokens[$t];
  12155. //echo '<br>'; printTokens($tokens, $t); printTokens($this->stack);
  12156. //flush();
  12157. // quick-check: if it's not a tag, no need to process
  12158. if (empty($token->is_tag)) {
  12159. if ($token instanceof HTMLPurifier_Token_Text) {
  12160. foreach ($this->injectors as $i => $injector) {
  12161. if (isset($token->skip[$i])) continue;
  12162. if ($token->rewind !== null && $token->rewind !== $i) continue;
  12163. $injector->handleText($token);
  12164. $this->processToken($token, $i);
  12165. $reprocess = true;
  12166. break;
  12167. }
  12168. }
  12169. // another possibility is a comment
  12170. continue;
  12171. }
  12172. if (isset($definition->info[$token->name])) {
  12173. $type = $definition->info[$token->name]->child->type;
  12174. } else {
  12175. $type = false; // Type is unknown, treat accordingly
  12176. }
  12177. // quick tag checks: anything that's *not* an end tag
  12178. $ok = false;
  12179. if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
  12180. // claims to be a start tag but is empty
  12181. $token = new HTMLPurifier_Token_Empty($token->name, $token->attr);
  12182. $ok = true;
  12183. } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
  12184. // claims to be empty but really is a start tag
  12185. $this->swap(new HTMLPurifier_Token_End($token->name));
  12186. $this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr));
  12187. // punt (since we had to modify the input stream in a non-trivial way)
  12188. $reprocess = true;
  12189. continue;
  12190. } elseif ($token instanceof HTMLPurifier_Token_Empty) {
  12191. // real empty token
  12192. $ok = true;
  12193. } elseif ($token instanceof HTMLPurifier_Token_Start) {
  12194. // start tag
  12195. // ...unless they also have to close their parent
  12196. if (!empty($this->stack)) {
  12197. $parent = array_pop($this->stack);
  12198. $this->stack[] = $parent;
  12199. if (isset($definition->info[$parent->name])) {
  12200. $elements = $definition->info[$parent->name]->child->getAllowedElements($config);
  12201. $autoclose = !isset($elements[$token->name]);
  12202. } else {
  12203. $autoclose = false;
  12204. }
  12205. if ($autoclose && $definition->info[$token->name]->wrap) {
  12206. // Check if an element can be wrapped by another
  12207. // element to make it valid in a context (for
  12208. // example, <ul><ul> needs a <li> in between)
  12209. $wrapname = $definition->info[$token->name]->wrap;
  12210. $wrapdef = $definition->info[$wrapname];
  12211. $elements = $wrapdef->child->getAllowedElements($config);
  12212. $parent_elements = $definition->info[$parent->name]->child->getAllowedElements($config);
  12213. if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
  12214. $newtoken = new HTMLPurifier_Token_Start($wrapname);
  12215. $this->insertBefore($newtoken);
  12216. $reprocess = true;
  12217. continue;
  12218. }
  12219. }
  12220. $carryover = false;
  12221. if ($autoclose && $definition->info[$parent->name]->formatting) {
  12222. $carryover = true;
  12223. }
  12224. if ($autoclose) {
  12225. // errors need to be updated
  12226. $new_token = new HTMLPurifier_Token_End($parent->name);
  12227. $new_token->start = $parent;
  12228. if ($carryover) {
  12229. $element = clone $parent;
  12230. $element->armor['MakeWellFormed_TagClosedError'] = true;
  12231. $element->carryover = true;
  12232. $this->processToken(array($new_token, $token, $element));
  12233. } else {
  12234. $this->insertBefore($new_token);
  12235. }
  12236. if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
  12237. if (!$carryover) {
  12238. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
  12239. } else {
  12240. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
  12241. }
  12242. }
  12243. $reprocess = true;
  12244. continue;
  12245. }
  12246. }
  12247. $ok = true;
  12248. }
  12249. if ($ok) {
  12250. foreach ($this->injectors as $i => $injector) {
  12251. if (isset($token->skip[$i])) continue;
  12252. if ($token->rewind !== null && $token->rewind !== $i) continue;
  12253. $injector->handleElement($token);
  12254. $this->processToken($token, $i);
  12255. $reprocess = true;
  12256. break;
  12257. }
  12258. if (!$reprocess) {
  12259. // ah, nothing interesting happened; do normal processing
  12260. $this->swap($token);
  12261. if ($token instanceof HTMLPurifier_Token_Start) {
  12262. $this->stack[] = $token;
  12263. } elseif ($token instanceof HTMLPurifier_Token_End) {
  12264. throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed');
  12265. }
  12266. }
  12267. continue;
  12268. }
  12269. // sanity check: we should be dealing with a closing tag
  12270. if (!$token instanceof HTMLPurifier_Token_End) {
  12271. throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
  12272. }
  12273. // make sure that we have something open
  12274. if (empty($this->stack)) {
  12275. if ($escape_invalid_tags) {
  12276. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
  12277. $this->swap(new HTMLPurifier_Token_Text(
  12278. $generator->generateFromToken($token)
  12279. ));
  12280. } else {
  12281. $this->remove();
  12282. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
  12283. }
  12284. $reprocess = true;
  12285. continue;
  12286. }
  12287. // first, check for the simplest case: everything closes neatly.
  12288. // Eventually, everything passes through here; if there are problems
  12289. // we modify the input stream accordingly and then pu