PageRenderTime 128ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 2ms

/yii/vendors/htmlpurifier/HTMLPurifier.standalone.php

https://bitbucket.org/Crisu83/webgames
PHP | 14492 lines | 7496 code | 2503 blank | 4493 comment | 1442 complexity | 7d678df995783a3e3f130996946c003d MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0, GPL-3.0, LGPL-2.0, LGPL-2.1, BSD-2-Clause
  1. <?php
  2. /**
  3. * @file
  4. * This file was auto-generated by generate-includes.php and includes all of
  5. * the core files required by HTML Purifier. Use this if performance is a
  6. * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
  7. * FILE, changes will be overwritten the next time the script is run.
  8. *
  9. * @version 4.4.0
  10. *
  11. * @warning
  12. * You must *not* include any other HTML Purifier files before this file,
  13. * because 'require' not 'require_once' is used.
  14. *
  15. * @warning
  16. * This file requires that the include path contains the HTML Purifier
  17. * library directory; this is not auto-set.
  18. */
  19. /*! @mainpage
  20. *
  21. * HTML Purifier is an HTML filter that will take an arbitrary snippet of
  22. * HTML and rigorously test, validate and filter it into a version that
  23. * is safe for output onto webpages. It achieves this by:
  24. *
  25. * -# Lexing (parsing into tokens) the document,
  26. * -# Executing various strategies on the tokens:
  27. * -# Removing all elements not in the whitelist,
  28. * -# Making the tokens well-formed,
  29. * -# Fixing the nesting of the nodes, and
  30. * -# Validating attributes of the nodes; and
  31. * -# Generating HTML from the purified tokens.
  32. *
  33. * However, most users will only need to interface with the HTMLPurifier
  34. * and HTMLPurifier_Config.
  35. */
  36. /*
  37. HTML Purifier 4.4.0 - Standards Compliant HTML Filtering
  38. Copyright (C) 2006-2008 Edward Z. Yang
  39. This library is free software; you can redistribute it and/or
  40. modify it under the terms of the GNU Lesser General Public
  41. License as published by the Free Software Foundation; either
  42. version 2.1 of the License, or (at your option) any later version.
  43. This library is distributed in the hope that it will be useful,
  44. but WITHOUT ANY WARRANTY; without even the implied warranty of
  45. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  46. Lesser General Public License for more details.
  47. You should have received a copy of the GNU Lesser General Public
  48. License along with this library; if not, write to the Free Software
  49. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  50. */
  /**
   * Main facade of the library: it wires the lexer, the strategy pipeline and
   * the generator together in order to turn an HTML snippet into purified HTML.
   *
   * @note Configuration can be supplied at two points. From lowest to highest
   *       precedence they are:
   *          -# Instance: new HTMLPurifier($config)
   *          -# Invocation: purify($html, $config)
   *       The two configurations are independent of each other and are *not*
   *       merged (this behavior may change in the future).
   *
   * @todo We need an easier way to inject strategies using the configuration
   *       object.
   */
  class HTMLPurifier
  {

      /** Version of HTML Purifier */
      public $version = '4.4.0';

      /** Constant with version of HTML Purifier */
      const VERSION = '4.4.0';

      /** Global configuration object */
      public $config;

      /** Extra HTMLPurifier_Filter objects added through addFilter(); kept for backwards compatibility */
      private $filters = array();

      /** Shared instance handed out by instance() */
      private static $instance;

      protected $strategy, $generator;

      /**
       * HTMLPurifier_Context produced by the most recent purify() call. After
       * purifyArray() it is an array of contexts, keyed like the input array.
       */
      public $context;

      /**
       * Initializes the purifier.
       * @param $config Optional HTMLPurifier_Config object used by default for
       *                every purification done through this object (it can be
       *                overridden per call). Anything accepted by
       *                HTMLPurifier_Config::create() may be passed.
       */
      public function __construct($config = null) {
          $this->config   = HTMLPurifier_Config::create($config);
          $this->strategy = new HTMLPurifier_Strategy_Core();
      }

      /**
       * Appends a filter that is run around the purification; filters are
       * applied in the order they were added.
       * @param $filter HTMLPurifier_Filter object
       * @note Deprecated: emits an E_USER_WARNING on every call.
       */
      public function addFilter($filter) {
          trigger_error('HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom', E_USER_WARNING);
          $this->filters[] = $filter;
      }

      /**
       * Filters an HTML snippet/document to be XSS-free and standards-compliant.
       *
       * @param $html   String of HTML to purify
       * @param $config HTMLPurifier_Config object for this operation; when
       *                omitted (or falsy) the configuration given at
       *                construction time is used. Anything accepted by
       *                HTMLPurifier_Config::create() may be passed.
       * @return Purified HTML
       */
      public function purify($html, $config = null) {

          // :TODO: make the config merge in, instead of replace
          if ($config) {
              $config = HTMLPurifier_Config::create($config);
          } else {
              $config = $this->config;
          }

          // which lexer we get depends partly on the environment and partly
          // on the configuration
          $lexer   = HTMLPurifier_Lexer::create($config);
          $context = new HTMLPurifier_Context();

          // the generator is shared with the rest of the run via the context
          $this->generator = new HTMLPurifier_Generator($config, $context);
          $context->register('Generator', $this->generator);

          // error collection is opt-in; it needs a locale for its messages
          if ($config->get('Core.CollectErrors')) {
              // may get moved out if other facilities use it
              $language_factory = HTMLPurifier_LanguageFactory::instance();
              $language = $language_factory->create($config, $context);
              $context->register('Locale', $language);

              $error_collector = new HTMLPurifier_ErrorCollector($context);
              $context->register('ErrorCollector', $error_collector);
          }

          // the ID accumulator lives in the context because AttrValidator
          // can be reached from many places
          $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
          $context->register('IDAccumulator', $id_accumulator);

          $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

          // assemble the filter pipeline: built-in filters switched on in the
          // Filter namespace, then Filter.Custom, then addFilter() leftovers
          $flags  = $config->getBatch('Filter');
          $custom = $flags['Custom'];
          unset($flags['Custom']);

          $pipeline = array();
          foreach ($flags as $name => $enabled) {
              // skip disabled flags and sub-directives (names containing a dot)
              if ($enabled && strpos($name, '.') === false) {
                  $class = "HTMLPurifier_Filter_$name";
                  $pipeline[] = new $class;
              }
          }
          foreach ($custom as $custom_filter) {
              // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
              $pipeline[] = $custom_filter;
          }
          $pipeline = array_merge($pipeline, $this->filters);
          // maybe prepare(), but later

          // pre-filters run first to last ...
          foreach ($pipeline as $filter) {
              $html = $filter->preFilter($html, $config, $context);
          }

          // un-purified HTML -> un-purified tokens -> purified tokens -> HTML
          $tokens = $lexer->tokenizeHTML($html, $config, $context);
          $tokens = $this->strategy->execute($tokens, $config, $context);
          $html   = $this->generator->generateFromTokens($tokens);

          // ... and post-filters run last to first
          foreach (array_reverse($pipeline) as $filter) {
              $html = $filter->postFilter($html, $config, $context);
          }

          $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
          $this->context =& $context;
          return $html;
      }

      /**
       * Filters an array of HTML snippets
       * @param $array_of_html Array of HTML strings; keys are preserved.
       * @param $config Optional HTMLPurifier_Config object for this operation.
       *                See HTMLPurifier::purify() for more details.
       * @return Array of purified HTML
       */
      public function purifyArray($array_of_html, $config = null) {
          $contexts = array();
          foreach ($array_of_html as $key => $snippet) {
              $array_of_html[$key] = $this->purify($snippet, $config);
              // purify() leaves its context in $this->context; keep one per entry
              $contexts[$key] = $this->context;
          }
          $this->context = $contexts;
          return $array_of_html;
      }

      /**
       * Singleton for enforcing just one HTML Purifier in your system
       * @param $prototype Optional prototype HTMLPurifier instance to
       *                   overload singleton with, or HTMLPurifier_Config
       *                   instance to configure the generated version with.
       * @return The shared HTMLPurifier instance
       */
      public static function instance($prototype = null) {
          // nothing to (re)build: reuse what we already have
          if (self::$instance && !$prototype) {
              return self::$instance;
          }
          if ($prototype instanceof HTMLPurifier) {
              self::$instance = $prototype;
          } elseif ($prototype) {
              self::$instance = new HTMLPurifier($prototype);
          } else {
              self::$instance = new HTMLPurifier();
          }
          return self::$instance;
      }

      /**
       * @note Backwards compatibility, see instance()
       */
      public static function getInstance($prototype = null) {
          return HTMLPurifier::instance($prototype);
      }

  }
  /**
   * Holds the shared, named attribute collections that modules refer to.
   */
  class HTMLPurifier_AttrCollections
  {

      /**
       * Associative array of attribute collections, indexed by name
       */
      public $info = array();

      /**
       * Gathers the attribute collections declared by all modules, then
       * resolves inclusions and string identifiers so the result can be used
       * directly by other code.
       * @param $attr_types HTMLPurifier_AttrTypes instance
       * @param $modules Hash array of HTMLPurifier_HTMLModule members
       */
      public function __construct($attr_types, $modules) {
          // step 1: collect what every module contributes
          foreach ($modules as $module) {
              foreach ($module->attr_collections as $coll_name => $collection) {
                  if (!isset($this->info[$coll_name])) {
                      $this->info[$coll_name] = array();
                  }
                  foreach ($collection as $attr_name => $definition) {
                      if ($attr_name === 0 && isset($this->info[$coll_name][0])) {
                          // index 0 is the include list: append instead of overwrite
                          $this->info[$coll_name][0] = array_merge(
                              $this->info[$coll_name][0],
                              $definition
                          );
                      } else {
                          $this->info[$coll_name][$attr_name] = $definition;
                      }
                  }
              }
          }
          // step 2: resolve each collection in place
          foreach (array_keys($this->info) as $name) {
              // pull in the attributes of included collections
              $this->performInclusions($this->info[$name]);
              // swap string identifiers for attribute definition objects
              $this->expandIdentifiers($this->info[$name], $attr_types);
          }
      }

      /**
       * Copies into $attr every attribute of the collections named in its
       * zero index (following nested include lists), then drops that index.
       * Attributes already set in $attr are never overwritten.
       * @param &$attr Reference to attribute array
       */
      public function performInclusions(&$attr) {
          if (!isset($attr[0])) return;
          $queue   = $attr[0];
          $visited = array(); // recursion guard
          $pos     = 0;
          // $queue grows while we walk it, hence the manual cursor
          while (isset($queue[$pos])) {
              $included = $queue[$pos];
              $pos++;
              if (isset($visited[$included])) continue;
              $visited[$included] = true;
              if (!isset($this->info[$included])) continue;
              foreach ($this->info[$included] as $key => $value) {
                  // existing keys win; this also skips the include list at
                  // index 0, which is still set on $attr at this point
                  if (!isset($attr[$key])) {
                      $attr[$key] = $value;
                  }
              }
              if (isset($this->info[$included][0])) {
                  // the included collection has includes of its own
                  $queue = array_merge($queue, $this->info[$included][0]);
              }
          }
          unset($attr[0]);
      }

      /**
       * Replaces every string identifier in an attribute array with the
       * object HTMLPurifier_AttrTypes returns for it. Names containing '*'
       * are renamed without the asterisks and flagged as required; entries
       * that are false or that resolve to nothing are removed.
       * @param &$attr Reference to attribute array
       * @param $attr_types HTMLPurifier_AttrTypes instance
       */
      public function expandIdentifiers(&$attr, $attr_types) {
          // foreach may hand us entries we added ourselves while renaming,
          // so remember which names were already handled
          $done = array();
          foreach ($attr as $name => $def) {
              // the include list is not an attribute
              if ($name === 0) continue;
              if (isset($done[$name])) continue;

              $required = (strpos($name, '*') !== false);
              if ($required) {
                  // re-key the entry under its name without the marker
                  unset($attr[$name]);
                  $name = trim($name, '*');
                  $attr[$name] = $def;
              }
              $done[$name] = true;

              if (is_object($def)) {
                  // already a definition object: only widen its required flag
                  $attr[$name]->required = ($required || $attr[$name]->required);
              } elseif ($def === false) {
                  unset($attr[$name]);
              } elseif ($type = $attr_types->get($def)) {
                  $attr[$name] = $type;
                  $attr[$name]->required = $required;
              } else {
                  unset($attr[$name]);
              }
          }
      }

  }
  /**
   * Base class for all validating attribute definitions.
   *
   * This family of classes forms the core for not only HTML attribute validation,
   * but also any sort of string that needs to be validated or cleaned (which
   * means CSS properties and composite definitions are defined here too).
   * Besides defining (through code) what precisely makes the string valid,
   * subclasses are also responsible for cleaning the code if possible.
   */
  abstract class HTMLPurifier_AttrDef
  {

      /**
       * Tells us whether or not an HTML attribute is minimized. Has no
       * meaning in other contexts.
       */
      public $minimized = false;

      /**
       * Tells us whether or not an HTML attribute is required. Has no
       * meaning in other contexts
       */
      public $required = false;

      /**
       * Validates and cleans passed string according to a definition.
       *
       * @param $string String to be validated and cleaned.
       * @param $config Mandatory HTMLPurifier_Config object.
       * @param $context Mandatory HTMLPurifier_AttrContext object.
       */
      abstract public function validate($string, $config, $context);

      /**
       * Convenience method that parses a string as if it were CDATA.
       *
       * The string is trimmed first; every remaining line feed, tab and
       * carriage return is then replaced by a single space. The rules this
       * is modelled on are at <http://www.w3.org/TR/html4/types.html#h-6.2>.
       * While most useful for HTML attributes specified as CDATA, it can
       * also be applied to most CSS values.
       *
       * @note This method is not entirely standards compliant, as trim() removes
       *       more types of whitespace than specified in the spec. In practice,
       *       this is rarely a problem, as those extra characters usually have
       *       already been removed by HTMLPurifier_Encoder.
       *
       * @warning This processing is inconsistent with XML's whitespace handling
       *          as specified by section 3.3.3 and referenced XHTML 1.0 section
       *          4.7.  However, note that we are NOT necessarily
       *          parsing XML, thus, this behavior may still be correct. We
       *          assume that newlines have been normalized.
       *
       * @param $string String to normalize
       * @return Normalized string
       */
      public function parseCDATA($string) {
          $string = trim($string);
          $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
          return $string;
      }

      /**
       * Factory method for creating this class from a string.
       * @param $string String construction info
       * @return Created AttrDef object corresponding to $string
       */
      public function make($string) {
          // default implementation, return a flyweight of this object.
          // If $string has an effect on the returned object (i.e. you
          // need to overload this method), it is best
          // to clone or instantiate new copies. (Instantiation is safer.)
          return $this;
      }

      /**
       * Removes the whitespace inside rgb(...) literals so that a colour is a
       * single space-free token and shorthand CSS properties work properly.
       * THIS IS A HACK!
       *
       * Each component may be an integer, a decimal number or a percentage,
       * and may be padded with whitespace on either side, e.g.
       * "rgb( 10% , 20%, 30% )" becomes "rgb(10%,20%,30%)". Text that does
       * not look like an rgb() triple is returned untouched.
       *
       * @param $string CSS value possibly containing rgb() literals
       * @return The value with every rgb() triple collapsed
       */
      protected function mungeRgb($string) {
          // one colour component with optional surrounding whitespace
          $component = '\s*(\d+(?:\.\d+)?%?)\s*';
          return preg_replace(
              '/rgb\(' . $component . ',' . $component . ',' . $component . '\)/',
              'rgb(\1,\2,\3)',
              $string
          );
      }

      /**
       * Parses a possibly escaped CSS string and returns the "pure"
       * version of it.
       *
       * A backslash followed by one to six hex digits is replaced by the
       * character with that code point (dropped when it does not survive
       * HTMLPurifier_Encoder::cleanUTF8()); a backslash followed by a line
       * feed is removed; a backslash followed by anything else yields that
       * character; a lone trailing backslash is kept.
       *
       * @param $string Possibly escaped CSS string
       * @return Unescaped string
       */
      protected function expandCSSEscape($string) {
          // flexibly parse it
          $ret = '';
          for ($i = 0, $c = strlen($string); $i < $c; $i++) {
              if ($string[$i] === '\\') {
                  $i++;
                  if ($i >= $c) {
                      // backslash was the last character: keep it verbatim
                      $ret .= '\\';
                      break;
                  }
                  if (ctype_xdigit($string[$i])) {
                      // collect up to six hex digits
                      $code = $string[$i];
                      for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                          if (!ctype_xdigit($string[$i])) break;
                          $code .= $string[$i];
                      }
                      // We have to be extremely careful when adding
                      // new characters, to make sure we're not breaking
                      // the encoding.
                      $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                      // NOTE(review): on this path the loop's $i++ also skips
                      // the character right after the escape, even when it is
                      // not whitespace - confirm this is intended.
                      if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
                      $ret .= $char;
                      // a whitespace character directly after the escape is
                      // consumed; anything else must be looked at again
                      if ($i < $c && trim($string[$i]) !== '') $i--;
                      continue;
                  }
                  // escaped line feed: drop both characters
                  if ($string[$i] === "\n") continue;
              }
              $ret .= $string[$i];
          }
          return $ret;
      }

  }
  /**
   * Processes an entire attribute array for corrections needing multiple values.
   *
   * Some fixes cannot be expressed per attribute: a value has to be taken
   * off one attribute and moved onto another. Rather than inventing a
   * complex return syntax for HTMLPurifier_AttrDef, the whole attribute
   * array is handed to a specialized object that does the work. Those
   * objects are the HTMLPurifier_AttrTransform family.
   *
   * An attribute transformation can be assigned to run before or after
   * HTMLPurifier_AttrDef validation. See HTMLPurifier_HTMLDefinition for
   * more details.
   */
  abstract class HTMLPurifier_AttrTransform
  {

      /**
       * Abstract: makes changes to the attributes dependent on multiple values.
       *
       * @param $attr Assoc array of attributes, usually from
       *              HTMLPurifier_Token_Tag::$attr
       * @param $config Mandatory HTMLPurifier_Config object.
       * @param $context Mandatory HTMLPurifier_Context object
       * @returns Processed attribute array.
       */
      abstract public function transform($attr, $config, $context);

      /**
       * Puts CSS in front of whatever the style attribute already holds;
       * the attribute is created when it is not set.
       * @param $attr Attribute array to process (passed by reference)
       * @param $css CSS to prepend
       */
      public function prependCSS(&$attr, $css) {
          $existing = '';
          if (isset($attr['style'])) {
              $existing = $attr['style'];
          }
          $attr['style'] = $css . $existing;
      }

      /**
       * Takes an attribute out of the array and hands back its value.
       * @param $attr Attribute array to process (passed by reference)
       * @param $key Key of attribute to confiscate
       * @return The removed value, or null when the attribute is not set
       *         (the array is left untouched in that case)
       */
      public function confiscateAttr(&$attr, $key) {
          if (isset($attr[$key])) {
              $taken = $attr[$key];
              unset($attr[$key]);
              return $taken;
          }
          return null;
      }

  }
  /**
   * Registry that maps attribute type names to HTMLPurifier_AttrDef objects
   */
  class HTMLPurifier_AttrTypes
  {

      /**
       * Lookup array of attribute string identifiers to concrete implementations
       */
      protected $info = array();

      /**
       * Fills the lookup array with the default implementation of every
       * attribute type.
       */
      public function __construct() {
          // XXX This is kind of poor, since we don't actually /clone/
          // instances; instead, we use the supplied make() attribute. So,
          // the underlying class must know how to deal with arguments.
          // With the old implementation of Enum, that ignored its
          // arguments when handling a make dispatch, the IAlign
          // definition wouldn't work.
          $defaults = array(
              // pseudo-types, must be instantiated via shorthand
              'Enum'          => new HTMLPurifier_AttrDef_Enum(),
              'Bool'          => new HTMLPurifier_AttrDef_HTML_Bool(),

              'CDATA'         => new HTMLPurifier_AttrDef_Text(),
              'ID'            => new HTMLPurifier_AttrDef_HTML_ID(),
              'Length'        => new HTMLPurifier_AttrDef_HTML_Length(),
              'MultiLength'   => new HTMLPurifier_AttrDef_HTML_MultiLength(),
              'NMTOKENS'      => new HTMLPurifier_AttrDef_HTML_Nmtokens(),
              'Pixels'        => new HTMLPurifier_AttrDef_HTML_Pixels(),
              'Text'          => new HTMLPurifier_AttrDef_Text(),
              'URI'           => new HTMLPurifier_AttrDef_URI(),
              'LanguageCode'  => new HTMLPurifier_AttrDef_Lang(),
              'Color'         => new HTMLPurifier_AttrDef_HTML_Color(),
              'IAlign'        => self::makeEnum('top,middle,bottom,left,right'),
              'LAlign'        => self::makeEnum('top,bottom,left,right'),
              'FrameTarget'   => new HTMLPurifier_AttrDef_HTML_FrameTarget(),

              // unimplemented aliases
              'ContentType'   => new HTMLPurifier_AttrDef_Text(),
              'ContentTypes'  => new HTMLPurifier_AttrDef_Text(),
              'Charsets'      => new HTMLPurifier_AttrDef_Text(),
              'Character'     => new HTMLPurifier_AttrDef_Text(),

              // "proprietary" types
              'Class'         => new HTMLPurifier_AttrDef_HTML_Class(),

              // number is really a positive integer (one or more digits)
              // FIXME: ^^ not always, see start and value of list items
              'Number'        => new HTMLPurifier_AttrDef_Integer(false, false, true),
          );
          foreach ($defaults as $type => $impl) {
              $this->info[$type] = $impl;
          }
      }

      /**
       * Builds an Enum definition from a comma-separated list of values,
       * wrapped in HTMLPurifier_AttrDef_Clone.
       * @param $in Comma-separated string of allowed values
       */
      private static function makeEnum($in) {
          $values = explode(',', $in);
          $enum   = new HTMLPurifier_AttrDef_Enum($values);
          return new HTMLPurifier_AttrDef_Clone($enum);
      }

      /**
       * Retrieves a type
       * @param $type String type name, optionally followed by '#' and extra
       *              construction info that is handed to the type's make()
       * @return Object AttrDef for type; an E_USER_ERROR is triggered (and
       *         nothing returned) when the type name is not registered
       */
      public function get($type) {

          // split off any extra info tacked on after the first '#'
          $string = '';
          if (strpos($type, '#') !== false) {
              $pieces = explode('#', $type, 2);
              $type   = $pieces[0];
              $string = $pieces[1];
          }

          if (isset($this->info[$type])) {
              return $this->info[$type]->make($string);
          }

          trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
          return;

      }

      /**
       * Sets a new implementation for a type
       * @param $type String type name
       * @param $impl Object AttrDef for type
       */
      public function set($type, $impl) {
          $this->info[$type] = $impl;
      }
  }
  /**
   * Validates the attributes of a token. Doesn't manage required attributes
   * very well. The only reason we factored this out was because RemoveForeignElements
   * also needed it besides ValidateAttributes.
   */
  class HTMLPurifier_AttrValidator
  {

      /**
       * Validates the attributes of a token in place: attribute transforms
       * are applied, every attribute is checked against the element-local or
       * global definition, invalid ones are removed and corrected values are
       * substituted.
       *
       * @param $token Reference to token to validate. We require a reference
       *     because the operation this class performs on the token are
       *     not atomic, so the context CurrentToken to be updated
       *     throughout
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return The unchanged token when it is neither a start nor an empty
       *     tag; nothing otherwise (the token is updated through the reference)
       */
      public function validateToken(&$token, &$config, $context) {

          $definition = $config->getHTMLDefinition();
          $e =& $context->get('ErrorCollector', true);

          // initialize IDAccumulator if necessary
          $ok =& $context->get('IDAccumulator', true);
          if (!$ok) {
              $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
              $context->register('IDAccumulator', $id_accumulator);
          }

          // Only start and empty tags carry attributes. Leave *before*
          // CurrentToken is registered, otherwise an entry we created would
          // stay behind in the context after this early return.
          if (
              !$token instanceof HTMLPurifier_Token_Start &&
              !$token instanceof HTMLPurifier_Token_Empty
          ) return $token;

          // initialize CurrentToken if necessary
          $current_token =& $context->get('CurrentToken', true);
          if (!$current_token) $context->register('CurrentToken', $token);

          // create alias to global definition array, see also $defs
          // DEFINITION CALL
          $d_defs = $definition->info_global_attr;

          // definition of the element this token belongs to
          $element = $definition->info[$token->name];

          // don't update token until the very end, to ensure an atomic update
          $attr = $token->attr;

          // do global transformations (pre)
          // nothing currently utilizes this
          $attr = $this->applyTransforms($definition->info_attr_transform_pre, $attr, $config, $context, $e);

          // do local transformations only applicable to this element (pre)
          // ex. <p align="right"> to <p style="text-align:right;">
          $attr = $this->applyTransforms($element->attr_transform_pre, $attr, $config, $context, $e);

          // create alias to this element's attribute definition array, see
          // also $d_defs (global attribute definition array)
          // DEFINITION CALL
          $defs = $element->attr;

          // registered by reference, so the context always sees the
          // attribute name the loop below is currently working on
          $attr_key = false;
          $context->register('CurrentAttr', $attr_key);

          // iterate through all the attribute keypairs
          // Watch out for name collisions: $key has previously been used
          foreach ($attr as $attr_key => $value) {

              // call the definition
              if ( isset($defs[$attr_key]) ) {
                  // there is a local definition defined
                  if ($defs[$attr_key] === false) {
                      // We've explicitly been told not to allow this element.
                      // This is usually when there's a global definition
                      // that must be overridden.
                      // Theoretically speaking, we could have a
                      // AttrDef_DenyAll, but this is faster!
                      $result = false;
                  } else {
                      // validate according to the element's definition
                      $result = $defs[$attr_key]->validate(
                          $value, $config, $context
                      );
                  }
              } elseif ( isset($d_defs[$attr_key]) ) {
                  // there is a global definition defined, validate according
                  // to the global definition
                  $result = $d_defs[$attr_key]->validate(
                      $value, $config, $context
                  );
              } else {
                  // system never heard of the attribute? DELETE!
                  $result = false;
              }

              // put the results into effect
              if ($result === false || $result === null) {
                  // this is a generic error message that should replaced
                  // with more specific ones when possible
                  if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed');

                  // remove the attribute
                  unset($attr[$attr_key]);
              } elseif (is_string($result)) {
                  // generally, if a substitution is happening, there
                  // was some sort of implicit correction going on. We'll
                  // delegate it to the attribute classes to say exactly what.

                  // simple substitution
                  $attr[$attr_key] = $result;
              } else {
                  // nothing happens
              }

              // we'd also want slightly more complicated substitution
              // involving an array as the return value,
              // although we're not sure how colliding attributes would
              // resolve (certain ones would be completely overriden,
              // others would prepend themselves).
          }

          $context->destroy('CurrentAttr');

          // post transforms

          // global (error reporting untested)
          $attr = $this->applyTransforms($definition->info_attr_transform_post, $attr, $config, $context, $e);

          // local (error reporting untested)
          $attr = $this->applyTransforms($element->attr_transform_post, $attr, $config, $context, $e);

          $token->attr = $attr;

          // destroy CurrentToken if we made it ourselves
          if (!$current_token) $context->destroy('CurrentToken');

      }

      /**
       * Runs a list of attribute transforms one after another, reporting each
       * change to the error collector when one is present.
       * @param $transforms Array of objects with a transform() method
       * @param $attr Attribute array to start from
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @param $e Error collector, or a falsy value when none is in use
       * @return Attribute array after all transforms were applied
       */
      private function applyTransforms($transforms, $attr, $config, $context, $e) {
          foreach ($transforms as $transform) {
              $o = $attr;
              $attr = $transform->transform($o, $config, $context);
              if ($e) {
                  if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
              }
          }
          return $attr;
      }

  }
  // Constants are slow, so as few as possible are used. HTMLPURIFIER_PREFIX
  // is the "standalone" directory next to this file; when we are the ones
  // defining it, it is also put at the front of the include path.
  if (defined('HTMLPURIFIER_PREFIX') === false) {
      define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
      set_include_path(implode(PATH_SEPARATOR, array(HTMLPURIFIER_PREFIX, get_include_path())));
  }

  // Accommodation for PHP versions earlier than 5.0.2, which lack PHP_EOL;
  // borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>.
  // The line ending is picked from the first three letters of PHP_OS.
  if (defined('PHP_EOL') === false) {
      define(
          'PHP_EOL',
          strtoupper(substr(PHP_OS, 0, 3)) == 'WIN'
              ? "\r\n"
              : (strtoupper(substr(PHP_OS, 0, 3)) == 'DAR' ? "\r" : "\n")
      );
  }
  /**
   * Bootstrap class that contains meta-functionality for HTML Purifier such as
   * the autoload function.
   *
   * @note
   *      This class may be used without any other files from HTML Purifier.
   */
  class HTMLPurifier_Bootstrap
  {

      /**
       * Autoload function for HTML Purifier
       * @param $class Class to load
       * @return True when a file for the class was found and included,
       *         false otherwise
       */
      public static function autoload($class) {
          $file = HTMLPurifier_Bootstrap::getPath($class);
          if (!$file) return false;
          // Technically speaking, it should be ok and more efficient to
          // just do 'require', but Antonio Parraga reports that with
          // Zend extensions such as Zend debugger and APC, this invariant
          // may be broken.  Since we have efficient alternatives, pay
          // the cost here and avoid the bug.
          require_once HTMLPURIFIER_PREFIX . '/' . $file;
          return true;
      }

      /**
       * Returns the path for a specific class.
       * @param $class Class name to resolve
       * @return Path relative to HTMLPURIFIER_PREFIX, or false when the name
       *         does not start with "HTMLPurifier" or no such file exists
       */
      public static function getPath($class) {
          if (strncmp('HTMLPurifier', $class, 12) !== 0) return false;
          // Custom implementations
          if (strncmp('HTMLPurifier_Language_', $class, 22) === 0) {
              // language classes live in one directory and use dashes in
              // their file names
              $code = str_replace('_', '-', substr($class, 22));
              $file = 'HTMLPurifier/Language/classes/' . $code . '.php';
          } else {
              $file = str_replace('_', '/', $class) . '.php';
          }
          if (!file_exists(HTMLPURIFIER_PREFIX . '/' . $file)) return false;
          return $file;
      }

      /**
       * "Pre-registers" our autoloader on the SPL stack, i.e. puts it in
       * front of every autoloader that is already registered.
       */
      public static function registerAutoload() {
          $autoload = array('HTMLPurifier_Bootstrap', 'autoload');
          if ( ($funcs = spl_autoload_functions()) === false ) {
              spl_autoload_register($autoload);
          } elseif (version_compare(PHP_VERSION, '5.3.0', '>=')) {
              // The prepend flag exists from PHP 5.3.0 on, so the other
              // autoloaders never have to be taken off the stack. An
              // existing registration of ours is dropped first, because
              // registering an already registered callback does not move it.
              spl_autoload_unregister($autoload);
              spl_autoload_register($autoload, true, true);
          } elseif (function_exists('spl_autoload_unregister')) {
              $buggy  = version_compare(PHP_VERSION, '5.2.11', '<');
              $compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
                        version_compare(PHP_VERSION, '5.1.0', '>=');
              // First pass: check every registered autoloader and work out
              // the form it has to be unregistered in. Nothing is touched
              // yet, so erroring out here leaves the SPL stack intact.
              $to_unregister = array();
              foreach ($funcs as $func) {
                  if ($buggy && is_array($func)) {
                      // :TRICKY: There are some compatibility issues and some
                      // places where we need to error out
                      $reflector = new ReflectionMethod($func[0], $func[1]);
                      if (!$reflector->isStatic()) {
                          throw new Exception('
HTML Purifier autoloader registrar is not compatible
with non-static object methods due to PHP Bug #44144;
Please do not use HTMLPurifier.autoload.php (or any
file that includes this file); instead, place the code:
spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
after your own autoloaders.
');
                      }
                      // Suprisingly, spl_autoload_register supports the
                      // Class::staticMethod callback format, although call_user_func doesn't
                      if ($compat) $func = implode('::', $func);
                  }
                  $to_unregister[] = $func;
              }
              // Second pass: empty the stack, put ourselves first, then
              // restore the previous autoloaders in their original order.
              foreach ($to_unregister as $func) spl_autoload_unregister($func);
              spl_autoload_register($autoload);
              foreach ($funcs as $func) spl_autoload_register($func);
          }
      }

  }
  /**
   * Base class for definition datatype objects. It records whether the
   * definition has been set up and makes sure the concrete setup routine
   * runs at most once.
   */
  abstract class HTMLPurifier_Definition
  {

      /**
       * Becomes true as soon as setup() has been entered.
       */
      public $setup = false;

      /**
       * When true, the finished definition object is written to the cache
       * after setup. This is only meant to be true when every request for a
       * raw definition object is optimized as well. It does not thrash the
       * file system: later calls use the cached object and writes to the raw
       * definition object are short circuited. See enduser-customize.html
       * for the high-level picture.
       */
      public $optimized = null;

      /**
       * Kind of definition this object represents.
       */
      public $type;

      /**
       * Brings the definition object into its final form; this work is
       * deliberately not done by the constructor.
       * @param $config HTMLPurifier_Config instance
       */
      abstract protected function doSetup($config);

      /**
       * Runs doSetup() the first time it is called and does nothing on any
       * later call. The flag is raised before doSetup() starts.
       * @param $config HTMLPurifier_Config instance
       */
      public function setup($config) {
          if (!$this->setup) {
              $this->setup = true;
              $this->doSetup($config);
          }
      }

  }
  /**
   * Registry of the CSS properties that are accepted, mapping each property
   * name to the attribute-definition object that validates its value.
   * @see HTMLPurifier_HTMLDefinition
   */
  class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
  {

      public $type = 'CSS';

      /**
       * Lookup of CSS property name => definition object.
       */
      public $info = array();

      /**
       * Fills the $info lookup. This is the core of the class.
       *
       * Statement order matters: the shorthand validators that receive
       * $config (list-style, background, font, border) are created only after
       * the longhand properties above them have been registered.
       * @param $config HTMLPurifier_Config instance
       */
      protected function doSetup($config) {

          $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
              array('left', 'right', 'center', 'justify'),
              false
          );

          // one validator instance is shared by all four sides
          $border_style = new HTMLPurifier_AttrDef_Enum(
              array('none', 'hidden', 'dotted', 'dashed', 'solid', 'double',
                  'groove', 'ridge', 'inset', 'outset'),
              false
          );
          $this->info['border-top-style']    = $border_style;
          $this->info['border-left-style']   = $border_style;
          $this->info['border-right-style']  = $border_style;
          $this->info['border-bottom-style'] = $border_style;
          $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style);

          $this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
              array('none', 'left', 'right', 'both'),
              false
          );
          $this->info['float'] = new HTMLPurifier_AttrDef_Enum(
              array('none', 'left', 'right'),
              false
          );
          $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
              array('normal', 'italic', 'oblique'),
              false
          );
          $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
              array('normal', 'small-caps'),
              false
          );

          // shared by list-style-image and background-image
          $uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('none')),
              new HTMLPurifier_AttrDef_CSS_URI()
          ));

          $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
              array('inside', 'outside'),
              false
          );
          $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
              array('disc', 'circle', 'square', 'decimal', 'lower-roman',
                  'upper-roman', 'lower-alpha', 'upper-alpha', 'none'),
              false
          );
          $this->info['list-style-image'] = $uri_or_none;
          $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);

          $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
              array('capitalize', 'uppercase', 'lowercase', 'none'),
              false
          );
          $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();

          $this->info['background-image'] = $uri_or_none;
          $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
              array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
          );
          $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
              array('scroll', 'fixed')
          );
          $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();

          // background-color and the per-side border colors share one validator
          $border_color = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('transparent')),
              new HTMLPurifier_AttrDef_CSS_Color()
          ));
          $this->info['background-color']    = $border_color;
          $this->info['border-right-color']  = $border_color;
          $this->info['border-left-color']   = $border_color;
          $this->info['border-bottom-color'] = $border_color;
          $this->info['border-top-color']    = $border_color;

          $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);

          $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color);

          $border_width = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
              new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative
          ));
          $this->info['border-right-width']  = $border_width;
          $this->info['border-left-width']   = $border_width;
          $this->info['border-bottom-width'] = $border_width;
          $this->info['border-top-width']    = $border_width;
          $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);

          $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('normal')),
              new HTMLPurifier_AttrDef_CSS_Length()
          ));
          $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('normal')),
              new HTMLPurifier_AttrDef_CSS_Length()
          ));

          $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('xx-small', 'x-small',
                  'small', 'medium', 'large', 'x-large', 'xx-large',
                  'larger', 'smaller')),
              new HTMLPurifier_AttrDef_CSS_Percentage(),
              new HTMLPurifier_AttrDef_CSS_Length()
          ));
          $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('normal')),
              new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
              new HTMLPurifier_AttrDef_CSS_Length('0'),
              new HTMLPurifier_AttrDef_CSS_Percentage(true)
          ));

          $margin = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage(),
              new HTMLPurifier_AttrDef_Enum(array('auto'))
          ));
          $this->info['margin-right']  = $margin;
          $this->info['margin-left']   = $margin;
          $this->info['margin-bottom'] = $margin;
          $this->info['margin-top']    = $margin;
          $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin);

          // padding must be non-negative
          $padding = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_CSS_Length('0'),
              new HTMLPurifier_AttrDef_CSS_Percentage(true)
          ));
          $this->info['padding-right']  = $padding;
          $this->info['padding-left']   = $padding;
          $this->info['padding-bottom'] = $padding;
          $this->info['padding-top']    = $padding;
          $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding);

          $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage()
          ));

          $trusted_wh = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_CSS_Length('0'),
              new HTMLPurifier_AttrDef_CSS_Percentage(true),
              new HTMLPurifier_AttrDef_Enum(array('auto'))
          ));
          $max = $config->get('CSS.MaxImgLength');
          if ($max === null) {
              // no image size cap configured: same rule for every element
              $dimension = $trusted_wh;
          } else {
              $dimension = new HTMLPurifier_AttrDef_Switch('img',
                  // For img tags:
                  new HTMLPurifier_AttrDef_CSS_Composite(array(
                      new HTMLPurifier_AttrDef_CSS_Length('0', $max),
                      new HTMLPurifier_AttrDef_Enum(array('auto'))
                  )),
                  // For everyone else:
                  $trusted_wh
              );
          }
          $this->info['height'] = $dimension;
          $this->info['width']  = $dimension;

          $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
          $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();

          // this could use specialized code
          $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
              array('normal', 'bold', 'bolder', 'lighter', '100', '200', '300',
                  '400', '500', '600', '700', '800', '900'),
              false
          );

          // MUST come after the other font properties, as it references
          // a CSSDefinition object
          $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);

          // same constraint applies to the border shorthands
          $border = new HTMLPurifier_AttrDef_CSS_Border($config);
          $this->info['border-right']  = $border;
          $this->info['border-left']   = $border;
          $this->info['border-top']    = $border;
          $this->info['border-bottom'] = $border;
          $this->info['border']        = $border;

          $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(
              array('collapse', 'separate')
          );
          $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(
              array('top', 'bottom')
          );
          $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(
              array('auto', 'fixed')
          );
          $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super',
                  'top', 'text-top', 'middle', 'bottom', 'text-bottom')),
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage()
          ));

          $this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(
              new HTMLPurifier_AttrDef_CSS_Length(),
              2
          );

          // partial support
          $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap'));

          // optional property groups, each behind its own directive
          if ($config->get('CSS.Proprietary')) {
              $this->doSetupProprietary($config);
          }
          if ($config->get('CSS.AllowTricky')) {
              $this->doSetupTricky($config);
          }
          if ($config->get('CSS.Trusted')) {
              $this->doSetupTrusted($config);
          }

          $allow_important = $config->get('CSS.AllowImportant');
          // every validator gets wrapped in the decorator that deals with
          // !important
          foreach ($this->info as $property => $validator) {
              $this->info[$property] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator(
                  $validator,
                  $allow_important
              );
          }

          $this->setupConfigStuff($config);
      }

      /**
       * Registers the properties enabled by the CSS.Proprietary directive.
       * @param $config HTMLPurifier_Config instance (not used here)
       */
      protected function doSetupProprietary($config) {
          // Internet Explorer only scrollbar colors; each gets its own validator
          $scrollbar_parts = array('arrow', 'base', 'darkshadow', 'face', 'highlight', 'shadow');
          foreach ($scrollbar_parts as $part) {
              $this->info['scrollbar-' . $part . '-color'] = new HTMLPurifier_AttrDef_CSS_Color();
          }

          // technically not proprietary, but CSS3, and no one supports it
          foreach (array('opacity', '-moz-opacity', '-khtml-opacity') as $property) {
              $this->info[$property] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
          }

          // only opacity, for now
          $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();
      }

      /**
       * Registers the properties enabled by the CSS.AllowTricky directive.
       * @param $config HTMLPurifier_Config instance (not used here)
       */
      protected function doSetupTricky($config) {
          $display_values = array(
              'inline', 'block', 'list-item', 'run-in', 'compact',
              'marker', 'table', 'inline-table', 'table-row-group',
              'table-header-group', 'table-footer-group', 'table-row',
              'table-column-group', 'table-column', 'table-cell', 'table-caption', 'none'
          );
          $this->info['display'] = new HTMLPurifier_AttrDef_Enum($display_values);
          $this->info['visibility'] = new HTMLPurifier_AttrDef_Enum(
              array('visible', 'hidden', 'collapse')
          );
          $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum(
              array('visible', 'hidden', 'auto', 'scroll')
          );
      }

      /**
       * Registers the properties enabled by the CSS.Trusted directive.
       * @param $config HTMLPurifier_Config instance (not used here)
       */
      protected function doSetupTrusted($config) {
          $this->info['position'] = new HTMLPurifier_AttrDef_Enum(
              array('static', 'relative', 'absolute', 'fixed')
          );

          // the four offsets share one validator instance
          $offset = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage(),
              new HTMLPurifier_AttrDef_Enum(array('auto')),
          ));
          $this->info['bottom'] = $offset;
          $this->info['right']  = $offset;
          $this->info['left']   = $offset;
          $this->info['top']    = $offset;

          $this->info['z-index'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
              new HTMLPurifier_AttrDef_Integer(),
              new HTMLPurifier_AttrDef_Enum(array('auto')),
          ));
      }

      /**
       * Applies the CSS.AllowedProperties and CSS.ForbiddenProperties
       * directives to $info. Modelled on HTMLPurifier_HTMLDefinition.
       * @param $config HTMLPurifier_Config instance
       * @todo Refactor duplicate elements into common class (probably using
       *       composition, not inheritance).
       */
      protected function setupConfigStuff($config) {

          $support_note = "(for information on implementing this, see the ".
              "support forums) ";

          // whitelist: keep only what was asked for
          $allowed = $config->get('CSS.AllowedProperties');
          if ($allowed !== null) {
              foreach ($this->info as $property => $unused) {
                  if (!isset($allowed[$property])) {
                      unset($this->info[$property]);
                  }
                  // whatever survives in $allowed afterwards was requested
                  // but is not a property we know about
                  unset($allowed[$property]);
              }
              foreach ($allowed as $unknown => $unused) {
                  // :TODO: Is this htmlspecialchars() call really necessary?
                  $escaped = htmlspecialchars($unknown);
                  trigger_error(
                      "Style attribute '" . $escaped . "' is not supported " . $support_note,
                      E_USER_WARNING
                  );
              }
          }

          // blacklist: drop anything explicitly forbidden
          $forbidden = $config->get('CSS.ForbiddenProperties');
          if ($forbidden === null) {
              return;
          }
          foreach ($this->info as $property => $unused) {
              if (isset($forbidden[$property])) {
                  unset($this->info[$property]);
              }
          }
      }

  }
  /**
   * Describes which child nodes an element may contain and validates a list
   * of child tokens against that description.
   */
  abstract class HTMLPurifier_ChildDef
  {

      /**
       * Kind of child definition, usually the lowercased right-most part of
       * the class name. Occasionally used in terms of context.
       */
      public $type;

      /**
       * Bool stating whether an empty array of children is acceptable.
       *
       * Needed for redundant checking when a change that affects a child
       * node may cause its parent node to become disallowed.
       */
      public $allow_empty;

      /**
       * Lookup array of every element this definition could possibly allow.
       */
      public $elements = array();

      /**
       * Returns the $elements lookup. The original notes describe the result
       * as the tag names that should not close this element automatically
       * (all other elements will do so).
       * @param $config HTMLPurifier_Config object (unused in this base version)
       */
      public function getAllowedElements($config) {
          $allowed = $this->elements;
          return $allowed;
      }

      /**
       * Validates child nodes against the definition and reports what should
       * be done with them.
       *
       * @param $tokens_of_children Array of HTMLPurifier_Token
       * @param $config HTMLPurifier_Config object
       * @param $context HTMLPurifier_Context object
       * @return bool true to leave nodes as is
       * @return bool false to remove parent node
       * @return array of replacement child tokens
       */
      abstract public function validateChildren($tokens_of_children, $config, $context);

  }
  1128. /**
  1129. * Configuration object that triggers customizable behavior.
  1130. *
  1131. * @warning This class is strongly defined: that means that the class
  1132. * will fail if an undefined directive is retrieved or set.
  1133. *
  1134. * @note Many classes that could (although many times don't) use the
  1135. * configuration object make it a mandatory parameter. This is
  1136. * because a configuration object should always be forwarded,
  1137. * otherwise, you run the risk of missing a parameter and then
  1138. * being stumped when a configuration directive doesn't work.
  1139. *
  1140. * @todo Reconsider some of the public member variables
  1141. */
  1142. class HTMLPurifier_Config
  1143. {
  1144. /**
  1145. * HTML Purifier's version
  1146. */
  1147. public $version = '4.4.0';
  1148. /**
  1149. * Bool indicator whether or not to automatically finalize
  1150. * the object if a read operation is done
  1151. */
  1152. public $autoFinalize = true;
  1153. // protected member variables
  1154. /**
  1155. * Namespace indexed array of serials for specific namespaces (see
  1156. * getSerial() for more info).
  1157. */
  1158. protected $serials = array();
  1159. /**
  1160. * Serial for entire configuration object
  1161. */
  1162. protected $serial;
  1163. /**
  1164. * Parser for variables
  1165. */
  1166. protected $parser = null;
  1167. /**
  1168. * Reference HTMLPurifier_ConfigSchema for value checking
  1169. * @note This is public for introspective purposes. Please don't
  1170. * abuse!
  1171. */
  1172. public $def;
  1173. /**
  1174. * Indexed array of definitions
  1175. */
  1176. protected $definitions;
  1177. /**
  1178. * Bool indicator whether or not config is finalized
  1179. */
  1180. protected $finalized = false;
  1181. /**
  1182. * Property list containing configuration directives.
  1183. */
  1184. protected $plist;
  1185. /**
  1186. * Whether or not a set is taking place due to an
  1187. * alias lookup.
  1188. */
  1189. private $aliasMode;
  1190. /**
  1191. * Set to false if you do not want line and file numbers in errors
  1192. * (useful when unit testing). This will also compress some errors
  1193. * and exceptions.
  1194. */
  1195. public $chatty = true;
  1196. /**
  1197. * Current lock; only gets to this namespace are allowed.
  1198. */
  1199. private $lock;
  /**
   * Creates a configuration object bound to a schema.
   * @param $definition HTMLPurifier_ConfigSchema that defines what directives
   *                    are allowed.
   * @param $parent Optional parent for the property list; when omitted (or
   *                otherwise falsy) the schema's defaultPlist is used.
   */
  public function __construct($definition, $parent = null) {
      if (!$parent) {
          $parent = $definition->defaultPlist;
      }
      $this->plist  = new HTMLPurifier_PropertyList($parent);
      $this->def    = $definition; // kept for later validity checks
      $this->parser = new HTMLPurifier_VarParser_Flexible();
  }
  /**
   * Convenience factory that turns a mixed value into a config object.
   * @param mixed $config Describes the desired configuration. An existing
   *                      HTMLPurifier_Config is returned unchanged, an array
   *                      is passed to loadArray(), a string is passed to
   *                      loadIni() as an ini filename; any other value
   *                      yields an unmodified new object.
   * @param HTMLPurifier_ConfigSchema Schema object; when empty, the default
   *                      configuration from createDefault() is the base.
   * @return Configured HTMLPurifier_Config object
   */
  public static function create($config, $schema = null) {
      if ($config instanceof HTMLPurifier_Config) {
          // already a config object: hand it straight back
          return $config;
      }
      $ret = $schema
          ? new HTMLPurifier_Config($schema)
          : HTMLPurifier_Config::createDefault();
      if (is_string($config)) {
          $ret->loadIni($config);
      } elseif (is_array($config)) {
          $ret->loadArray($config);
      }
      return $ret;
  }
  /**
   * Builds a new config object that uses an existing one as its parent.
   * @param HTMLPurifier_Config $config Configuration object to inherit
   *                                    from.
   * @return HTMLPurifier_Config object created from $config's schema, with
   *         $config's property list as the parent.
   */
  public static function inherit(HTMLPurifier_Config $config) {
      $child = new HTMLPurifier_Config($config->def, $config->plist);
      return $child;
  }
  /**
   * Convenience factory for a config object based on the shared
   * HTMLPurifier_ConfigSchema instance.
   * @return Default HTMLPurifier_Config object.
   */
  public static function createDefault() {
      return new HTMLPurifier_Config(HTMLPurifier_ConfigSchema::instance());
  }
  /**
   * Retrieves a value from the configuration.
   * @param $key String key in 'Namespace.Directive' form
   * @param $a Deprecated: directive name when $key holds only the namespace
   * @return The stored value; nothing (null) after an error was reported for
   *         an undefined directive, an alias, or a namespace lock violation
   */
  public function get($key, $a = null) {
      if ($a !== null) {
          // legacy two-argument call: report it, then merge into one key
          $this->triggerError("Using deprecated API: use \$config->get('$key.$a') instead", E_USER_WARNING);
          $key = $key . '.' . $a;
      }
      if (!$this->finalized) {
          $this->autoFinalize();
      }

      $known = isset($this->def->info[$key]);
      if (!$known) {
          // can't add % due to SimpleTest bug
          $this->triggerError(
              'Cannot retrieve value of undefined directive ' . htmlspecialchars($key),
              E_USER_WARNING
          );
          return;
      }

      $directive = $this->def->info[$key];
      if (isset($directive->isAlias)) {
          $this->triggerError(
              'Cannot get value from aliased directive, use real name ' . $directive->key,
              E_USER_ERROR
          );
          return;
      }

      if ($this->lock) {
          // while a lock is active only its own namespace may be read
          $pieces = explode('.', $key);
          $ns = $pieces[0];
          if ($ns !== $this->lock) {
              $this->triggerError('Cannot get value of namespace ' . $ns . ' when lock for ' . $this->lock . ' is active, this probably indicates a Definition setup method is accessing directives that are not within its namespace', E_USER_ERROR);
              return;
          }
      }

      return $this->plist->get($key);
  }
  /**
   * Retrieves all directives of one namespace as a directive => value array.
   * @param $namespace String namespace
   * @return Array for that namespace; nothing (null) after a warning when
   *         the namespace is not present
   */
  public function getBatch($namespace) {
      if (!$this->finalized) {
          $this->autoFinalize();
      }
      $everything = $this->getAll();
      if (isset($everything[$namespace])) {
          return $everything[$namespace];
      }
      $this->triggerError(
          'Cannot retrieve undefined namespace ' . htmlspecialchars($namespace),
          E_USER_WARNING
      );
      return;
  }
  /**
   * Returns an md5 signature of one namespace of the configuration that
   * identifies that particular set of values. The result is memoised per
   * namespace in $this->serials.
   * @note DefinitionRev is handled specially: it is removed from the batch
   *       before hashing!
   * @param $namespace Namespace to get serial for
   */
  public function getBatchSerial($namespace) {
      if (!empty($this->serials[$namespace])) {
          return $this->serials[$namespace];
      }
      $batch = $this->getBatch($namespace);
      unset($batch['DefinitionRev']);
      $signature = md5(serialize($batch));
      $this->serials[$namespace] = $signature;
      return $signature;
  }
  /**
   * Returns an md5 signature of the whole configuration object that
   * identifies that particular configuration. The result is memoised in
   * $this->serial.
   */
  public function getSerial() {
      if (!empty($this->serial)) {
          return $this->serial;
      }
      $signature = md5(serialize($this->getAll()));
      $this->serial = $signature;
      return $signature;
  }
  /**
   * Retrieves every directive, grouped as namespace => directive => value.
   * Keys are split at the first dot only.
   * @warning This is a pretty inefficient function, avoid if you can
   */
  public function getAll() {
      if (!$this->finalized) {
          $this->autoFinalize();
      }
      $grouped = array();
      $flat = $this->plist->squash();
      foreach ($flat as $name => $value) {
          $pieces = explode('.', $name, 2);
          $grouped[$pieces[0]][$pieces[1]] = $value;
      }
      return $grouped;
  }
  /**
   * Sets a value to configuration.
   *
   * The value is type-checked by the variable parser according to the
   * directive's schema entry, value aliases are resolved, and an 'allowed'
   * list (if the schema has one) is enforced. Problems are reported through
   * triggerError() and leave the configuration unchanged.
   *
   * @param $key String key in 'Namespace.Directive' form
   * @param $value Mixed value
   * @param $a Deprecated: the value, when the legacy three-argument form
   *           set($namespace, $directive, $value) is used
   */
  public function set($key, $value, $a = null) {
      if (strpos($key, '.') === false) {
          // legacy form: $key is the namespace, $value the directive name
          $namespace = $key;
          $directive = $value;
          $value = $a;
          $key = "$key.$directive";
          $this->triggerError("Using deprecated API: use \$config->set('$key', ...) instead", E_USER_NOTICE);
      } else {
          list($namespace) = explode('.', $key);
      }
      if ($this->isFinalized('Cannot set directive after finalization')) return;
      if (!isset($this->def->info[$key])) {
          $this->triggerError('Cannot set undefined directive ' . htmlspecialchars($key) . ' to value',
              E_USER_WARNING);
          return;
      }
      $def = $this->def->info[$key];

      if (isset($def->isAlias)) {
          // aliasMode guards against an alias that points at another alias
          if ($this->aliasMode) {
              $this->triggerError('Double-aliases not allowed, please fix '.
                  'ConfigSchema bug with ' . $key, E_USER_ERROR);
              return;
          }
          $this->aliasMode = true;
          $this->set($def->key, $value);
          $this->aliasMode = false;
          $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE);
          return;
      }

      // Raw type might be negative when using the fully optimized form
      // of stdclass, which indicates allow_null == true
      $rtype = is_int($def) ? $def : $def->type;
      if ($rtype < 0) {
          $type = -$rtype;
          $allow_null = true;
      } else {
          $type = $rtype;
          $allow_null = isset($def->allow_null);
      }

      try {
          $value = $this->parser->parse($value, $type, $allow_null);
      } catch (HTMLPurifier_VarParserException $e) {
          $this->triggerError('Value for ' . $key . ' is of invalid type, should be ' . HTMLPurifier_VarParser::getTypeName($type), E_USER_WARNING);
          return;
      }
      if (is_string($value) && is_object($def)) {
          // resolve value alias if defined
          if (isset($def->aliases[$value])) {
              $value = $def->aliases[$value];
          }
          // check to see if the value is allowed
          if (isset($def->allowed) && !isset($def->allowed[$value])) {
              $this->triggerError('Value not supported, valid values are: ' .
                  $this->_listify($def->allowed), E_USER_WARNING);
              return;
          }
      }
      $this->plist->set($key, $value);

      // reset definitions if the directives they depend on changed
      // this is a very costly process, so it's discouraged
      // with finalization
      if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') {
          $this->definitions[$namespace] = null;
      }

      // drop memoised signatures so getBatchSerial()/getSerial() recompute
      // them from the new value instead of returning a stale hash
      $this->serials[$namespace] = false;
      $this->serial = null;
  }
  /**
   * Error-reporting helper: joins the keys of a lookup with ', '.
   * @param $lookup Lookup whose keys should be listed
   * @return String of the keys in iteration order, '' when there are none
   */
  private function _listify($lookup) {
      $joined = '';
      $separator = '';
      foreach ($lookup as $name => $unused) {
          $joined .= $separator . $name;
          $separator = ', ';
      }
      return $joined;
  }
  /**
   * Retrieves object reference to the HTML definition; shorthand for
   * getDefinition('HTML', ...).
   * @param $raw Return a copy that has not been setup yet. Must be
   *             called before it's been setup, otherwise won't work.
   * @param $optimized If true, this method may return null, to
   *             indicate that a cached version of the modified
   *             definition object is available and no further edits
   *             are necessary. Consider using
   *             maybeGetRawHTMLDefinition, which is more explicitly
   *             named, instead.
   */
  public function getHTMLDefinition($raw = false, $optimized = false) {
      $definition = $this->getDefinition('HTML', $raw, $optimized);
      return $definition;
  }
  /**
   * Retrieves object reference to the CSS definition; shorthand for
   * getDefinition('CSS', ...).
   * @param $raw Return a copy that has not been setup yet. Must be
   *             called before it's been setup, otherwise won't work.
   * @param $optimized If true, this method may return null, to
   *             indicate that a cached version of the modified
   *             definition object is available and no further edits
   *             are necessary. Consider using
   *             maybeGetRawCSSDefinition, which is more explicitly
   *             named, instead.
   */
  public function getCSSDefinition($raw = false, $optimized = false) {
      $definition = $this->getDefinition('CSS', $raw, $optimized);
      return $definition;
  }
  /**
   * Retrieves object reference to the URI definition; shorthand for
   * getDefinition('URI', ...).
   * @param $raw Return a copy that has not been setup yet. Must be
   *             called before it's been setup, otherwise won't work.
   * @param $optimized If true, this method may return null, to
   *             indicate that a cached version of the modified
   *             definition object is available and no further edits
   *             are necessary. Consider using
   *             maybeGetRawURIDefinition, which is more explicitly
   *             named, instead.
   */
  public function getURIDefinition($raw = false, $optimized = false) {
      $definition = $this->getDefinition('URI', $raw, $optimized);
      return $definition;
  }
  1455. /**
  1456. * Retrieves a definition
  1457. * @param $type Type of definition: HTML, CSS, etc
  1458. * @param $raw Whether or not definition should be returned raw
  1459. * @param $optimized Only has an effect when $raw is true. Whether
  1460. * or not to return null if the result is already present in
  1461. * the cache. This is off by default for backwards
  1462. * compatibility reasons, but you need to do things this
  1463. * way in order to ensure that caching is done properly.
  1464. * Check out enduser-customize.html for more details.
  1465. * We probably won't ever change this default, as much as the
  1466. * maybe semantics is the "right thing to do."
  1467. */
  1468. public function getDefinition($type, $raw = false, $optimized = false) {
  1469. if ($optimized && !$raw) {
  1470. throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
  1471. }
  1472. if (!$this->finalized) $this->autoFinalize();
  1473. // temporarily suspend locks, so we can handle recursive definition calls
  1474. $lock = $this->lock;
  1475. $this->lock = null;
  1476. $factory = HTMLPurifier_DefinitionCacheFactory::instance();
  1477. $cache = $factory->create($type, $this);
  1478. $this->lock = $lock;
  1479. if (!$raw) {
  1480. // full definition
  1481. // ---------------
  1482. // check if definition is in memory
  1483. if (!empty($this->definitions[$type])) {
  1484. $def = $this->definitions[$type];
  1485. // check if the definition is setup
  1486. if ($def->setup) {
  1487. return $def;
  1488. } else {
  1489. $def->setup($this);
  1490. if ($def->optimized) $cache->add($def, $this);
  1491. return $def;
  1492. }
  1493. }
  1494. // check if definition is in cache
  1495. $def = $cache->get($this);
  1496. if ($def) {
  1497. // definition in cache, save to memory and return it
  1498. $this->definitions[$type] = $def;
  1499. return $def;
  1500. }
  1501. // initialize it
  1502. $def = $this->initDefinition($type);
  1503. // set it up
  1504. $this->lock = $type;
  1505. $def->setup($this);
  1506. $this->lock = null;
  1507. // save in cache
  1508. $cache->add($def, $this);
  1509. // return it
  1510. return $def;
  1511. } else {
  1512. // raw definition
  1513. // --------------
  1514. // check preconditions
  1515. $def = null;
  1516. if ($optimized) {
  1517. if (is_null($this->get($type . '.DefinitionID'))) {
  1518. // fatally error out if definition ID not set
  1519. throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
  1520. }
  1521. }
  1522. if (!empty($this->definitions[$type])) {
  1523. $def = $this->definitions[$type];
  1524. if ($def->setup && !$optimized) {
  1525. $extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : "";
  1526. throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra);
  1527. }
  1528. if ($def->optimized === null) {
  1529. $extra = $this->chatty ? " (try flushing your cache)" : "";
  1530. throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra);
  1531. }
  1532. if ($def->optimized !== $optimized) {
  1533. $msg = $optimized ? "optimized" : "unoptimized";
  1534. $extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : "";
  1535. throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra);
  1536. }
  1537. }
  1538. // check if definition was in memory
  1539. if ($def) {
  1540. if ($def->setup) {
  1541. // invariant: $optimized === true (checked above)
  1542. return null;
  1543. } else {
  1544. return $def;
  1545. }
  1546. }
  1547. // if optimized, check if definition was in cache
  1548. // (because we do the memory check first, this formulation
  1549. // is prone to cache slamming, but I think
  1550. // guaranteeing that either /all/ of the raw
  1551. // setup code or /none/ of it is run is more important.)
  1552. if ($optimized) {
  1553. // This code path only gets run once; once we put
  1554. // something in $definitions (which is guaranteed by the
  1555. // trailing code), we always short-circuit above.
  1556. $def = $cache->get($this);
  1557. if ($def) {
  1558. // save the full definition for later, but don't
  1559. // return it yet
  1560. $this->definitions[$type] = $def;
  1561. return null;
  1562. }
  1563. }
  1564. // check invariants for creation
  1565. if (!$optimized) {
  1566. if (!is_null($this->get($type . '.DefinitionID'))) {
  1567. if ($this->chatty) {
  1568. $this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING);
  1569. } else {
  1570. $this->triggerError("Useless DefinitionID declaration", E_USER_WARNING);
  1571. }
  1572. }
  1573. }
  1574. // initialize it
  1575. $def = $this->initDefinition($type);
  1576. $def->optimized = $optimized;
  1577. return $def;
  1578. }
  1579. throw new HTMLPurifier_Exception("The impossible happened!");
  1580. }
  1581. private function initDefinition($type) {
  1582. // quick checks failed, let's create the object
  1583. if ($type == 'HTML') {
  1584. $def = new HTMLPurifier_HTMLDefinition();
  1585. } elseif ($type == 'CSS') {
  1586. $def = new HTMLPurifier_CSSDefinition();
  1587. } elseif ($type == 'URI') {
  1588. $def = new HTMLPurifier_URIDefinition();
  1589. } else {
  1590. throw new HTMLPurifier_Exception("Definition of $type type not supported");
  1591. }
  1592. $this->definitions[$type] = $def;
  1593. return $def;
  1594. }
  1595. public function maybeGetRawDefinition($name) {
  1596. return $this->getDefinition($name, true, true);
  1597. }
  1598. public function maybeGetRawHTMLDefinition() {
  1599. return $this->getDefinition('HTML', true, true);
  1600. }
  1601. public function maybeGetRawCSSDefinition() {
  1602. return $this->getDefinition('CSS', true, true);
  1603. }
  1604. public function maybeGetRawURIDefinition() {
  1605. return $this->getDefinition('URI', true, true);
  1606. }
  1607. /**
  1608. * Loads configuration values from an array with the following structure:
  1609. * Namespace.Directive => Value
  1610. * @param $config_array Configuration associative array
  1611. */
  1612. public function loadArray($config_array) {
  1613. if ($this->isFinalized('Cannot load directives after finalization')) return;
  1614. foreach ($config_array as $key => $value) {
  1615. $key = str_replace('_', '.', $key);
  1616. if (strpos($key, '.') !== false) {
  1617. $this->set($key, $value);
  1618. } else {
  1619. $namespace = $key;
  1620. $namespace_values = $value;
  1621. foreach ($namespace_values as $directive => $value) {
  1622. $this->set($namespace .'.'. $directive, $value);
  1623. }
  1624. }
  1625. }
  1626. }
  1627. /**
  1628. * Returns a list of array(namespace, directive) for all directives
  1629. * that are allowed in a web-form context as per an allowed
  1630. * namespaces/directives list.
  1631. * @param $allowed List of allowed namespaces/directives
  1632. */
  1633. public static function getAllowedDirectivesForForm($allowed, $schema = null) {
  1634. if (!$schema) {
  1635. $schema = HTMLPurifier_ConfigSchema::instance();
  1636. }
  1637. if ($allowed !== true) {
  1638. if (is_string($allowed)) $allowed = array($allowed);
  1639. $allowed_ns = array();
  1640. $allowed_directives = array();
  1641. $blacklisted_directives = array();
  1642. foreach ($allowed as $ns_or_directive) {
  1643. if (strpos($ns_or_directive, '.') !== false) {
  1644. // directive
  1645. if ($ns_or_directive[0] == '-') {
  1646. $blacklisted_directives[substr($ns_or_directive, 1)] = true;
  1647. } else {
  1648. $allowed_directives[$ns_or_directive] = true;
  1649. }
  1650. } else {
  1651. // namespace
  1652. $allowed_ns[$ns_or_directive] = true;
  1653. }
  1654. }
  1655. }
  1656. $ret = array();
  1657. foreach ($schema->info as $key => $def) {
  1658. list($ns, $directive) = explode('.', $key, 2);
  1659. if ($allowed !== true) {
  1660. if (isset($blacklisted_directives["$ns.$directive"])) continue;
  1661. if (!isset($allowed_directives["$ns.$directive"]) && !isset($allowed_ns[$ns])) continue;
  1662. }
  1663. if (isset($def->isAlias)) continue;
  1664. if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') continue;
  1665. $ret[] = array($ns, $directive);
  1666. }
  1667. return $ret;
  1668. }
  1669. /**
  1670. * Loads configuration values from $_GET/$_POST that were posted
  1671. * via ConfigForm
  1672. * @param $array $_GET or $_POST array to import
  1673. * @param $index Index/name that the config variables are in
  1674. * @param $allowed List of allowed namespaces/directives
  1675. * @param $mq_fix Boolean whether or not to enable magic quotes fix
  1676. * @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy
  1677. */
  1678. public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
  1679. $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema);
  1680. $config = HTMLPurifier_Config::create($ret, $schema);
  1681. return $config;
  1682. }
  1683. /**
  1684. * Merges in configuration values from $_GET/$_POST to object. NOT STATIC.
  1685. * @note Same parameters as loadArrayFromForm
  1686. */
  1687. public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true) {
  1688. $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $this->def);
  1689. $this->loadArray($ret);
  1690. }
  1691. /**
  1692. * Prepares an array from a form into something usable for the more
  1693. * strict parts of HTMLPurifier_Config
  1694. */
  1695. public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
  1696. if ($index !== false) $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
  1697. $mq = $mq_fix && function_exists('get_magic_quotes_gpc') && get_magic_quotes_gpc();
  1698. $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema);
  1699. $ret = array();
  1700. foreach ($allowed as $key) {
  1701. list($ns, $directive) = $key;
  1702. $skey = "$ns.$directive";
  1703. if (!empty($array["Null_$skey"])) {
  1704. $ret[$ns][$directive] = null;
  1705. continue;
  1706. }
  1707. if (!isset($array[$skey])) continue;
  1708. $value = $mq ? stripslashes($array[$skey]) : $array[$skey];
  1709. $ret[$ns][$directive] = $value;
  1710. }
  1711. return $ret;
  1712. }
  1713. /**
  1714. * Loads configuration values from an ini file
  1715. * @param $filename Name of ini file
  1716. */
  1717. public function loadIni($filename) {
  1718. if ($this->isFinalized('Cannot load directives after finalization')) return;
  1719. $array = parse_ini_file($filename, true);
  1720. $this->loadArray($array);
  1721. }
  1722. /**
  1723. * Checks whether or not the configuration object is finalized.
  1724. * @param $error String error message, or false for no error
  1725. */
  1726. public function isFinalized($error = false) {
  1727. if ($this->finalized && $error) {
  1728. $this->triggerError($error, E_USER_ERROR);
  1729. }
  1730. return $this->finalized;
  1731. }
  1732. /**
  1733. * Finalizes configuration only if auto finalize is on and not
  1734. * already finalized
  1735. */
  1736. public function autoFinalize() {
  1737. if ($this->autoFinalize) {
  1738. $this->finalize();
  1739. } else {
  1740. $this->plist->squash(true);
  1741. }
  1742. }
  1743. /**
  1744. * Finalizes a configuration object, prohibiting further change
  1745. */
  1746. public function finalize() {
  1747. $this->finalized = true;
  1748. $this->parser = null;
  1749. }
  1750. /**
  1751. * Produces a nicely formatted error message by supplying the
  1752. * stack frame information OUTSIDE of HTMLPurifier_Config.
  1753. */
  1754. protected function triggerError($msg, $no) {
  1755. // determine previous stack frame
  1756. $extra = '';
  1757. if ($this->chatty) {
  1758. $trace = debug_backtrace();
  1759. // zip(tail(trace), trace) -- but PHP is not Haskell har har
  1760. for ($i = 0, $c = count($trace); $i < $c - 1; $i++) {
  1761. if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') {
  1762. continue;
  1763. }
  1764. $frame = $trace[$i];
  1765. $extra = " invoked on line {$frame['line']} in file {$frame['file']}";
  1766. break;
  1767. }
  1768. }
  1769. trigger_error($msg . $extra, $no);
  1770. }
  1771. /**
  1772. * Returns a serialized form of the configuration object that can
  1773. * be reconstituted.
  1774. */
  1775. public function serialize() {
  1776. $this->getDefinition('HTML');
  1777. $this->getDefinition('CSS');
  1778. $this->getDefinition('URI');
  1779. return serialize($this);
  1780. }
  1781. }
  1782. /**
  1783. * Configuration definition, defines directives and their defaults.
  1784. */
  1785. class HTMLPurifier_ConfigSchema {
  1786. /**
  1787. * Defaults of the directives and namespaces.
  1788. * @note This shares the exact same structure as HTMLPurifier_Config::$conf
  1789. */
  1790. public $defaults = array();
  1791. /**
  1792. * The default property list. Do not edit this property list.
  1793. */
  1794. public $defaultPlist;
  1795. /**
  1796. * Definition of the directives. The structure of this is:
  1797. *
  1798. * array(
  1799. * 'Namespace' => array(
  1800. * 'Directive' => new stdclass(),
  1801. * )
  1802. * )
  1803. *
  1804. * The stdclass may have the following properties:
  1805. *
  1806. * - If isAlias isn't set:
  1807. * - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
  1808. * - allow_null: If set, this directive allows null values
  1809. * - aliases: If set, an associative array of value aliases to real values
  1810. * - allowed: If set, a lookup array of allowed (string) values
  1811. * - If isAlias is set:
  1812. * - namespace: Namespace this directive aliases to
  1813. * - name: Directive name this directive aliases to
  1814. *
  1815. * In certain degenerate cases, stdclass will actually be an integer. In
  1816. * that case, the value is equivalent to an stdclass with the type
  1817. * property set to the integer. If the integer is negative, type is
  1818. * equal to the absolute value of integer, and allow_null is true.
  1819. *
  1820. * This class is friendly with HTMLPurifier_Config. If you need introspection
  1821. * about the schema, you're better of using the ConfigSchema_Interchange,
  1822. * which uses more memory but has much richer information.
  1823. */
  1824. public $info = array();
  1825. /**
  1826. * Application-wide singleton
  1827. */
  1828. static protected $singleton;
  1829. public function __construct() {
  1830. $this->defaultPlist = new HTMLPurifier_PropertyList();
  1831. }
  1832. /**
  1833. * Unserializes the default ConfigSchema.
  1834. */
  1835. public static function makeFromSerial() {
  1836. $contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser');
  1837. $r = unserialize($contents);
  1838. if (!$r) {
  1839. $hash = sha1($contents);
  1840. trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR);
  1841. }
  1842. return $r;
  1843. }
  1844. /**
  1845. * Retrieves an instance of the application-wide configuration definition.
  1846. */
  1847. public static function instance($prototype = null) {
  1848. if ($prototype !== null) {
  1849. HTMLPurifier_ConfigSchema::$singleton = $prototype;
  1850. } elseif (HTMLPurifier_ConfigSchema::$singleton === null || $prototype === true) {
  1851. HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
  1852. }
  1853. return HTMLPurifier_ConfigSchema::$singleton;
  1854. }
  1855. /**
  1856. * Defines a directive for configuration
  1857. * @warning Will fail of directive's namespace is defined.
  1858. * @warning This method's signature is slightly different from the legacy
  1859. * define() static method! Beware!
  1860. * @param $namespace Namespace the directive is in
  1861. * @param $name Key of directive
  1862. * @param $default Default value of directive
  1863. * @param $type Allowed type of the directive. See
  1864. * HTMLPurifier_DirectiveDef::$type for allowed values
  1865. * @param $allow_null Whether or not to allow null values
  1866. */
  1867. public function add($key, $default, $type, $allow_null) {
  1868. $obj = new stdclass();
  1869. $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
  1870. if ($allow_null) $obj->allow_null = true;
  1871. $this->info[$key] = $obj;
  1872. $this->defaults[$key] = $default;
  1873. $this->defaultPlist->set($key, $default);
  1874. }
  1875. /**
  1876. * Defines a directive value alias.
  1877. *
  1878. * Directive value aliases are convenient for developers because it lets
  1879. * them set a directive to several values and get the same result.
  1880. * @param $namespace Directive's namespace
  1881. * @param $name Name of Directive
  1882. * @param $aliases Hash of aliased values to the real alias
  1883. */
  1884. public function addValueAliases($key, $aliases) {
  1885. if (!isset($this->info[$key]->aliases)) {
  1886. $this->info[$key]->aliases = array();
  1887. }
  1888. foreach ($aliases as $alias => $real) {
  1889. $this->info[$key]->aliases[$alias] = $real;
  1890. }
  1891. }
  1892. /**
  1893. * Defines a set of allowed values for a directive.
  1894. * @warning This is slightly different from the corresponding static
  1895. * method definition.
  1896. * @param $namespace Namespace of directive
  1897. * @param $name Name of directive
  1898. * @param $allowed Lookup array of allowed values
  1899. */
  1900. public function addAllowedValues($key, $allowed) {
  1901. $this->info[$key]->allowed = $allowed;
  1902. }
  1903. /**
  1904. * Defines a directive alias for backwards compatibility
  1905. * @param $namespace
  1906. * @param $name Directive that will be aliased
  1907. * @param $new_namespace
  1908. * @param $new_name Directive that the alias will be to
  1909. */
  1910. public function addAlias($key, $new_key) {
  1911. $obj = new stdclass;
  1912. $obj->key = $new_key;
  1913. $obj->isAlias = true;
  1914. $this->info[$key] = $obj;
  1915. }
  1916. /**
  1917. * Replaces any stdclass that only has the type property with type integer.
  1918. */
  1919. public function postProcess() {
  1920. foreach ($this->info as $key => $v) {
  1921. if (count((array) $v) == 1) {
  1922. $this->info[$key] = $v->type;
  1923. } elseif (count((array) $v) == 2 && isset($v->allow_null)) {
  1924. $this->info[$key] = -$v->type;
  1925. }
  1926. }
  1927. }
  1928. }
  1929. /**
  1930. * @todo Unit test
  1931. */
  1932. class HTMLPurifier_ContentSets
  1933. {
  1934. /**
  1935. * List of content set strings (pipe seperators) indexed by name.
  1936. */
  1937. public $info = array();
  1938. /**
  1939. * List of content set lookups (element => true) indexed by name.
  1940. * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
  1941. */
  1942. public $lookup = array();
  1943. /**
  1944. * Synchronized list of defined content sets (keys of info)
  1945. */
  1946. protected $keys = array();
  1947. /**
  1948. * Synchronized list of defined content values (values of info)
  1949. */
  1950. protected $values = array();
  1951. /**
  1952. * Merges in module's content sets, expands identifiers in the content
  1953. * sets and populates the keys, values and lookup member variables.
  1954. * @param $modules List of HTMLPurifier_HTMLModule
  1955. */
  1956. public function __construct($modules) {
  1957. if (!is_array($modules)) $modules = array($modules);
  1958. // populate content_sets based on module hints
  1959. // sorry, no way of overloading
  1960. foreach ($modules as $module_i => $module) {
  1961. foreach ($module->content_sets as $key => $value) {
  1962. $temp = $this->convertToLookup($value);
  1963. if (isset($this->lookup[$key])) {
  1964. // add it into the existing content set
  1965. $this->lookup[$key] = array_merge($this->lookup[$key], $temp);
  1966. } else {
  1967. $this->lookup[$key] = $temp;
  1968. }
  1969. }
  1970. }
  1971. $old_lookup = false;
  1972. while ($old_lookup !== $this->lookup) {
  1973. $old_lookup = $this->lookup;
  1974. foreach ($this->lookup as $i => $set) {
  1975. $add = array();
  1976. foreach ($set as $element => $x) {
  1977. if (isset($this->lookup[$element])) {
  1978. $add += $this->lookup[$element];
  1979. unset($this->lookup[$i][$element]);
  1980. }
  1981. }
  1982. $this->lookup[$i] += $add;
  1983. }
  1984. }
  1985. foreach ($this->lookup as $key => $lookup) {
  1986. $this->info[$key] = implode(' | ', array_keys($lookup));
  1987. }
  1988. $this->keys = array_keys($this->info);
  1989. $this->values = array_values($this->info);
  1990. }
  1991. /**
  1992. * Accepts a definition; generates and assigns a ChildDef for it
  1993. * @param $def HTMLPurifier_ElementDef reference
  1994. * @param $module Module that defined the ElementDef
  1995. */
  1996. public function generateChildDef(&$def, $module) {
  1997. if (!empty($def->child)) return; // already done!
  1998. $content_model = $def->content_model;
  1999. if (is_string($content_model)) {
  2000. // Assume that $this->keys is alphanumeric
  2001. $def->content_model = preg_replace_callback(
  2002. '/\b(' . implode('|', $this->keys) . ')\b/',
  2003. array($this, 'generateChildDefCallback'),
  2004. $content_model
  2005. );
  2006. //$def->content_model = str_replace(
  2007. // $this->keys, $this->values, $content_model);
  2008. }
  2009. $def->child = $this->getChildDef($def, $module);
  2010. }
  2011. public function generateChildDefCallback($matches) {
  2012. return $this->info[$matches[0]];
  2013. }
  2014. /**
  2015. * Instantiates a ChildDef based on content_model and content_model_type
  2016. * member variables in HTMLPurifier_ElementDef
  2017. * @note This will also defer to modules for custom HTMLPurifier_ChildDef
  2018. * subclasses that need content set expansion
  2019. * @param $def HTMLPurifier_ElementDef to have ChildDef extracted
  2020. * @return HTMLPurifier_ChildDef corresponding to ElementDef
  2021. */
  2022. public function getChildDef($def, $module) {
  2023. $value = $def->content_model;
  2024. if (is_object($value)) {
  2025. trigger_error(
  2026. 'Literal object child definitions should be stored in '.
  2027. 'ElementDef->child not ElementDef->content_model',
  2028. E_USER_NOTICE
  2029. );
  2030. return $value;
  2031. }
  2032. switch ($def->content_model_type) {
  2033. case 'required':
  2034. return new HTMLPurifier_ChildDef_Required($value);
  2035. case 'optional':
  2036. return new HTMLPurifier_ChildDef_Optional($value);
  2037. case 'empty':
  2038. return new HTMLPurifier_ChildDef_Empty();
  2039. case 'custom':
  2040. return new HTMLPurifier_ChildDef_Custom($value);
  2041. }
  2042. // defer to its module
  2043. $return = false;
  2044. if ($module->defines_child_def) { // save a func call
  2045. $return = $module->getChildDef($def);
  2046. }
  2047. if ($return !== false) return $return;
  2048. // error-out
  2049. trigger_error(
  2050. 'Could not determine which ChildDef class to instantiate',
  2051. E_USER_ERROR
  2052. );
  2053. return false;
  2054. }
  2055. /**
  2056. * Converts a string list of elements separated by pipes into
  2057. * a lookup array.
  2058. * @param $string List of elements
  2059. * @return Lookup array of elements
  2060. */
  2061. protected function convertToLookup($string) {
  2062. $array = explode('|', str_replace(' ', '', $string));
  2063. $ret = array();
  2064. foreach ($array as $i => $k) {
  2065. $ret[$k] = true;
  2066. }
  2067. return $ret;
  2068. }
  2069. }
  2070. /**
  2071. * Registry object that contains information about the current context.
  2072. * @warning Is a bit buggy when variables are set to null: it thinks
  2073. * they don't exist! So use false instead, please.
  2074. * @note Since the variables Context deals with may not be objects,
  2075. * references are very important here! Do not remove!
  2076. */
  2077. class HTMLPurifier_Context
  2078. {
  2079. /**
  2080. * Private array that stores the references.
  2081. */
  2082. private $_storage = array();
  2083. /**
  2084. * Registers a variable into the context.
  2085. * @param $name String name
  2086. * @param $ref Reference to variable to be registered
  2087. */
  2088. public function register($name, &$ref) {
  2089. if (isset($this->_storage[$name])) {
  2090. trigger_error("Name $name produces collision, cannot re-register",
  2091. E_USER_ERROR);
  2092. return;
  2093. }
  2094. $this->_storage[$name] =& $ref;
  2095. }
  2096. /**
  2097. * Retrieves a variable reference from the context.
  2098. * @param $name String name
  2099. * @param $ignore_error Boolean whether or not to ignore error
  2100. */
  2101. public function &get($name, $ignore_error = false) {
  2102. if (!isset($this->_storage[$name])) {
  2103. if (!$ignore_error) {
  2104. trigger_error("Attempted to retrieve non-existent variable $name",
  2105. E_USER_ERROR);
  2106. }
  2107. $var = null; // so we can return by reference
  2108. return $var;
  2109. }
  2110. return $this->_storage[$name];
  2111. }
  2112. /**
  2113. * Destorys a variable in the context.
  2114. * @param $name String name
  2115. */
  2116. public function destroy($name) {
  2117. if (!isset($this->_storage[$name])) {
  2118. trigger_error("Attempted to destroy non-existent variable $name",
  2119. E_USER_ERROR);
  2120. return;
  2121. }
  2122. unset($this->_storage[$name]);
  2123. }
  2124. /**
  2125. * Checks whether or not the variable exists.
  2126. * @param $name String name
  2127. */
  2128. public function exists($name) {
  2129. return isset($this->_storage[$name]);
  2130. }
  2131. /**
  2132. * Loads a series of variables from an associative array
  2133. * @param $context_array Assoc array of variables to load
  2134. */
  2135. public function loadArray($context_array) {
  2136. foreach ($context_array as $key => $discard) {
  2137. $this->register($key, $context_array[$key]);
  2138. }
  2139. }
  2140. }
  2141. /**
  2142. * Abstract class representing Definition cache managers that implements
  2143. * useful common methods and is a factory.
  2144. * @todo Create a separate maintenance file advanced users can use to
  2145. * cache their custom HTMLDefinition, which can be loaded
  2146. * via a configuration directive
  2147. * @todo Implement memcached
  2148. */
  2149. abstract class HTMLPurifier_DefinitionCache
  2150. {
  2151. public $type;
  2152. /**
  2153. * @param $name Type of definition objects this instance of the
  2154. * cache will handle.
  2155. */
  2156. public function __construct($type) {
  2157. $this->type = $type;
  2158. }
  2159. /**
  2160. * Generates a unique identifier for a particular configuration
  2161. * @param Instance of HTMLPurifier_Config
  2162. */
  2163. public function generateKey($config) {
  2164. return $config->version . ',' . // possibly replace with function calls
  2165. $config->getBatchSerial($this->type) . ',' .
  2166. $config->get($this->type . '.DefinitionRev');
  2167. }
  2168. /**
  2169. * Tests whether or not a key is old with respect to the configuration's
  2170. * version and revision number.
  2171. * @param $key Key to test
  2172. * @param $config Instance of HTMLPurifier_Config to test against
  2173. */
  2174. public function isOld($key, $config) {
  2175. if (substr_count($key, ',') < 2) return true;
  2176. list($version, $hash, $revision) = explode(',', $key, 3);
  2177. $compare = version_compare($version, $config->version);
  2178. // version mismatch, is always old
  2179. if ($compare != 0) return true;
  2180. // versions match, ids match, check revision number
  2181. if (
  2182. $hash == $config->getBatchSerial($this->type) &&
  2183. $revision < $config->get($this->type . '.DefinitionRev')
  2184. ) return true;
  2185. return false;
  2186. }
  2187. /**
  2188. * Checks if a definition's type jives with the cache's type
  2189. * @note Throws an error on failure
  2190. * @param $def Definition object to check
  2191. * @return Boolean true if good, false if not
  2192. */
  2193. public function checkDefType($def) {
  2194. if ($def->type !== $this->type) {
  2195. trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
  2196. return false;
  2197. }
  2198. return true;
  2199. }
  2200. /**
  2201. * Adds a definition object to the cache
  2202. */
  2203. abstract public function add($def, $config);
  2204. /**
  2205. * Unconditionally saves a definition object to the cache
  2206. */
  2207. abstract public function set($def, $config);
  2208. /**
  2209. * Replace an object in the cache
  2210. */
  2211. abstract public function replace($def, $config);
  2212. /**
  2213. * Retrieves a definition object from the cache
  2214. */
  2215. abstract public function get($config);
  2216. /**
  2217. * Removes a definition object to the cache
  2218. */
  2219. abstract public function remove($config);
  2220. /**
  2221. * Clears all objects from cache
  2222. */
  2223. abstract public function flush($config);
  2224. /**
  2225. * Clears all expired (older version or revision) objects from cache
  2226. * @note Be carefuly implementing this method as flush. Flush must
  2227. * not interfere with other Definition types, and cleanup()
  2228. * should not be repeatedly called by userland code.
  2229. */
  2230. abstract public function cleanup($config);
  2231. }
  2232. /**
  2233. * Responsible for creating definition caches.
  2234. */
  2235. class HTMLPurifier_DefinitionCacheFactory
  2236. {
  2237. protected $caches = array('Serializer' => array());
  2238. protected $implementations = array();
  2239. protected $decorators = array();
  2240. /**
  2241. * Initialize default decorators
  2242. */
  2243. public function setup() {
  2244. $this->addDecorator('Cleanup');
  2245. }
  2246. /**
  2247. * Retrieves an instance of global definition cache factory.
  2248. */
  2249. public static function instance($prototype = null) {
  2250. static $instance;
  2251. if ($prototype !== null) {
  2252. $instance = $prototype;
  2253. } elseif ($instance === null || $prototype === true) {
  2254. $instance = new HTMLPurifier_DefinitionCacheFactory();
  2255. $instance->setup();
  2256. }
  2257. return $instance;
  2258. }
  2259. /**
  2260. * Registers a new definition cache object
  2261. * @param $short Short name of cache object, for reference
  2262. * @param $long Full class name of cache object, for construction
  2263. */
  2264. public function register($short, $long) {
  2265. $this->implementations[$short] = $long;
  2266. }
  2267. /**
  2268. * Factory method that creates a cache object based on configuration
  2269. * @param $name Name of definitions handled by cache
  2270. * @param $config Instance of HTMLPurifier_Config
  2271. */
  2272. public function create($type, $config) {
  2273. $method = $config->get('Cache.DefinitionImpl');
  2274. if ($method === null) {
  2275. return new HTMLPurifier_DefinitionCache_Null($type);
  2276. }
  2277. if (!empty($this->caches[$method][$type])) {
  2278. return $this->caches[$method][$type];
  2279. }
  2280. if (
  2281. isset($this->implementations[$method]) &&
  2282. class_exists($class = $this->implementations[$method], false)
  2283. ) {
  2284. $cache = new $class($type);
  2285. } else {
  2286. if ($method != 'Serializer') {
  2287. trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
  2288. }
  2289. $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
  2290. }
  2291. foreach ($this->decorators as $decorator) {
  2292. $new_cache = $decorator->decorate($cache);
  2293. // prevent infinite recursion in PHP 4
  2294. unset($cache);
  2295. $cache = $new_cache;
  2296. }
  2297. $this->caches[$method][$type] = $cache;
  2298. return $this->caches[$method][$type];
  2299. }
  2300. /**
  2301. * Registers a decorator to add to all new cache objects
  2302. * @param
  2303. */
  2304. public function addDecorator($decorator) {
  2305. if (is_string($decorator)) {
  2306. $class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
  2307. $decorator = new $class;
  2308. }
  2309. $this->decorators[$decorator->name] = $decorator;
  2310. }
  2311. }
  /**
   * Represents a document type, contains information on which modules
   * need to be loaded.
   * @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
   *       If structure changes, please update that function.
   */
  class HTMLPurifier_Doctype
  {

      /** Full name of doctype. */
      public $name;

      /**
       * Standard modules used by this doctype; entries are string
       * identifiers or literal module objects.
       */
      public $modules = array();

      /** Modules to use for tidying up code. */
      public $tidyModules = array();

      /** Whether the language is derived from XML (i.e. XHTML). */
      public $xml = true;

      /** Alias names for this doctype. */
      public $aliases = array();

      /** Public DTD identifier. */
      public $dtdPublic;

      /** System DTD identifier. */
      public $dtdSystem;

      /**
       * Stores each argument verbatim in the matching public property;
       * no validation or normalisation is performed here.
       * @param $name        Full name of doctype
       * @param $xml         Whether the doctype is XML-based
       * @param $modules     Standard modules of the doctype
       * @param $tidyModules Modules used for tidying
       * @param $aliases     Alias names for the doctype
       * @param $dtd_public  Public DTD identifier
       * @param $dtd_system  System DTD identifier
       */
      public function __construct(
          $name = null,
          $xml = true,
          $modules = array(),
          $tidyModules = array(),
          $aliases = array(),
          $dtd_public = null,
          $dtd_system = null
      ) {
          // identity
          $this->name      = $name;
          $this->aliases   = $aliases;
          $this->dtdPublic = $dtd_public;
          $this->dtdSystem = $dtd_system;
          // behaviour
          $this->xml         = $xml;
          $this->modules     = $modules;
          $this->tidyModules = $tidyModules;
      }

  }
  /**
   * Registry mapping doctype names (and their aliases) to doctype objects.
   */
  class HTMLPurifier_DoctypeRegistry
  {

      /**
       * Hash of doctype names to doctype objects
       */
      protected $doctypes;

      /**
       * Lookup table of aliases to real doctype names
       */
      protected $aliases;

      /**
       * Registers a doctype to the registry
       * @note Accepts a fully-formed doctype object, or the
       *       parameters for constructing a doctype object. When an
       *       object is given, the remaining arguments are not used.
       * @param $doctype      Name of doctype or literal doctype object
       * @param $xml          Whether the doctype is XML-based
       * @param $modules      Modules doctype will load (non-array is wrapped)
       * @param $tidy_modules Modules doctype will use for tidying
       *                      (non-array is wrapped)
       * @param $aliases      Alias names for doctype (non-array is wrapped)
       * @param $dtd_public   Public DTD identifier
       * @param $dtd_system   System DTD identifier
       * @return Editable registered doctype
       */
      public function register($doctype, $xml = true, $modules = array(),
          $tidy_modules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
      ) {
          $modules      = is_array($modules)      ? $modules      : array($modules);
          $tidy_modules = is_array($tidy_modules) ? $tidy_modules : array($tidy_modules);
          $aliases      = is_array($aliases)      ? $aliases      : array($aliases);

          $registered = is_object($doctype)
              ? $doctype
              : new HTMLPurifier_Doctype(
                  $doctype, $xml, $modules, $tidy_modules, $aliases, $dtd_public, $dtd_system
              );

          $realName = $registered->name;
          $this->doctypes[$realName] = $registered;

          // hookup aliases; a real doctype of the same name always wins
          foreach ($registered->aliases as $aliasName) {
              if (!isset($this->doctypes[$aliasName])) {
                  $this->aliases[$aliasName] = $realName;
              }
          }

          // the name now denotes a real doctype, so drop any stale alias
          unset($this->aliases[$realName]);

          return $registered;
      }

      /**
       * Retrieves reference to a doctype of a certain name
       * @note This function resolves aliases
       * @note When possible, use the more fully-featured make()
       * @note For an unknown name an E_USER_ERROR is raised; if execution
       *       continues past it, a fresh HTMLPurifier_Doctype carrying only
       *       the requested name is returned.
       * @param $doctype Name of doctype
       * @return Editable doctype object
       */
      public function get($doctype) {
          $resolved = isset($this->aliases[$doctype]) ? $this->aliases[$doctype] : $doctype;
          if (isset($this->doctypes[$resolved])) {
              return $this->doctypes[$resolved];
          }
          trigger_error('Doctype ' . htmlspecialchars($resolved) . ' does not exist', E_USER_ERROR);
          return new HTMLPurifier_Doctype($resolved);
      }

      /**
       * Creates a doctype based on a configuration object,
       * will perform initialization on the doctype
       * @note Use this function to get a copy of doctype that config
       *       can hold on to (this is necessary in order to tell
       *       Generator whether or not the current document is XML
       *       based or not).
       * @param $config Configuration object read via getDoctypeFromConfig()
       * @return Clone of the registered doctype
       */
      public function make($config) {
          $doctypeName = $this->getDoctypeFromConfig($config);
          $registered  = $this->get($doctypeName);
          return clone $registered;
      }

      /**
       * Retrieves the doctype from the configuration object
       * @note %HTML.Doctype is consulted first, then %HTML.CustomDoctype;
       *       only if both are empty is the name assembled from the
       *       legacy %HTML.XHTML and %HTML.Strict flags.
       * @param $config Object exposing get($directive)
       * @return Doctype name
       */
      public function getDoctypeFromConfig($config) {
          // recommended test
          foreach (array('HTML.Doctype', 'HTML.CustomDoctype') as $directive) {
              $explicit = $config->get($directive);
              if (!empty($explicit)) {
                  return $explicit;
              }
          }
          // backwards-compatibility
          $family = $config->get('HTML.XHTML')  ? 'XHTML 1.0' : 'HTML 4.01';
          $flavor = $config->get('HTML.Strict') ? ' Strict'   : ' Transitional';
          return $family . $flavor;
      }

  }
  /**
   * Structure that stores an HTML element definition. Used by
   * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
   * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
   *       Please update that class too.
   * @warning If you add new properties to this class, you MUST update
   *          the mergeIn() method.
   */
  class HTMLPurifier_ElementDef
  {

      /**
       * Does the definition work by itself, or is it created solely
       * for the purpose of merging into another definition?
       */
      public $standalone = true;

      /**
       * Associative array of attribute name to HTMLPurifier_AttrDef
       * @note Before being processed by HTMLPurifier_AttrCollections
       *       when modules are finalized during
       *       HTMLPurifier_HTMLDefinition->setup(), this array may also
       *       contain an array at index 0 that indicates which attribute
       *       collections to load into the full array. It may also
       *       contain string identifiers in lieu of HTMLPurifier_AttrDef,
       *       see HTMLPurifier_AttrTypes on how they are expanded during
       *       HTMLPurifier_HTMLDefinition->setup() processing.
       */
      public $attr = array();

      /**
       * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
       */
      public $attr_transform_pre = array();

      /**
       * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
       */
      public $attr_transform_post = array();

      /**
       * HTMLPurifier_ChildDef of this tag.
       */
      public $child;

      /**
       * Abstract string representation of internal ChildDef rules. See
       * HTMLPurifier_ContentSets for how this is parsed and then transformed
       * into an HTMLPurifier_ChildDef.
       * @warning This is a temporary variable that is not available after
       *      being processed by HTMLDefinition
       */
      public $content_model;

      /**
       * Value of $child->type, used to determine which ChildDef to use,
       * used in combination with $content_model.
       * @warning This must be lowercase
       * @warning This is a temporary variable that is not available after
       *      being processed by HTMLDefinition
       */
      public $content_model_type;

      /**
       * Does the element have a content model (#PCDATA | Inline)*? This
       * is important for chameleon ins and del processing in
       * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
       * have to worry about this one.
       */
      public $descendants_are_inline = false;

      /**
       * List of the names of required attributes this element has. Dynamically
       * populated by HTMLPurifier_HTMLDefinition::getElement
       */
      public $required_attr = array();

      /**
       * Lookup table of tags excluded from all descendants of this tag.
       * @note SGML permits exclusions for all descendants, but this is
       *       not possible with DTDs or XML Schemas. W3C has elected to
       *       use complicated compositions of content_models to simulate
       *       exclusion for children, but we go the simpler, SGML-style
       *       route of flat-out exclusions, which correctly apply to
       *       all descendants and not just children. Note that the XHTML
       *       Modularization Abstract Modules are blithely unaware of such
       *       distinctions.
       */
      public $excludes = array();

      /**
       * This tag is explicitly auto-closed by the following tags.
       */
      public $autoclose = array();

      /**
       * If a foreign element is found in this element, test if it is
       * allowed by this sub-element; if it is, instead of closing the
       * current element, place it inside this element.
       */
      public $wrap;

      /**
       * Whether or not this is a formatting element affected by the
       * "Active Formatting Elements" algorithm.
       */
      public $formatting;

      /**
       * Low-level factory constructor for creating new standalone element defs
       * @param $content_model      Value for the $content_model property
       * @param $content_model_type Value for the $content_model_type property
       * @param $attr               Value for the $attr property
       * @return New HTMLPurifier_ElementDef; all other properties keep
       *         their declared defaults
       */
      public static function create($content_model, $content_model_type, $attr) {
          $def = new HTMLPurifier_ElementDef();
          $def->content_model = $content_model;
          $def->content_model_type = $content_model_type;
          $def->attr = $attr;
          return $def;
      }

      /**
       * Merges the values of another element definition into this one.
       * Values from the new element def take precedence if a value is
       * not mergeable.
       * @note Only attr, attr_transform_pre, attr_transform_post, excludes,
       *       content_model, content_model_type, child, formatting and
       *       descendants_are_inline are merged by this method.
       * @param $def Element definition whose values are merged in
       */
      public function mergeIn($def) {
          // later keys takes precedence
          foreach ($def->attr as $k => $v) {
              if ($k === 0) {
                  // merge in the includes
                  // sorry, no way to override an include
                  foreach ($v as $v2) {
                      $this->attr[0][] = $v2;
                  }
                  continue;
              }
              if ($v === false) {
                  // a literal false removes the attribute from this definition
                  if (isset($this->attr[$k])) unset($this->attr[$k]);
                  continue;
              }
              $this->attr[$k] = $v;
          }
          $this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
          $this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
          $this->_mergeAssocArray($this->excludes, $def->excludes);

          if (!empty($def->content_model)) {
              // "#SUPER" in the incoming model is a placeholder for this
              // definition's current model. The current model may still be
              // null (never set); cast it so that str_replace() receives a
              // string, since passing null there is deprecated in PHP 8.1+.
              $this->content_model =
                  str_replace("#SUPER", (string) $this->content_model, $def->content_model);
              // the cached child definition no longer matches the model
              $this->child = false;
          }
          if (!empty($def->content_model_type)) {
              $this->content_model_type = $def->content_model_type;
              $this->child = false;
          }
          if (!is_null($def->child)) $this->child = $def->child;
          if (!is_null($def->formatting)) $this->formatting = $def->formatting;
          if ($def->descendants_are_inline) $this->descendants_are_inline = $def->descendants_are_inline;
      }

      /**
       * Merges one array into another, removes values which equal false
       * @param $a1 Array by reference that is merged into
       * @param $a2 Array that merges into $a1; an entry whose value is
       *            identical to false deletes the same key from $a1
       */
      private function _mergeAssocArray(&$a1, $a2) {
          foreach ($a2 as $k => $v) {
              if ($v === false) {
                  if (isset($a1[$k])) unset($a1[$k]);
                  continue;
              }
              $a1[$k] = $v;
          }
      }

  }
  /**
   * A UTF-8 specific character encoder that handles cleaning and transforming.
   * @note All functions in this class should be static.
   */
  class HTMLPurifier_Encoder
  {

      /**
       * Constructor throws fatal error if you attempt to instantiate class
       */
      private function __construct() {
          trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
      }

      /**
       * Error-handler that mutes errors, alternative to shut-up operator.
       */
      public static function muteErrorHandler() {}

      /**
       * iconv wrapper which mutes errors, but doesn't work around bugs.
       * @param $in   Source encoding name, passed straight to iconv()
       * @param $out  Target encoding name, passed straight to iconv()
       * @param $text String to convert
       * @return Whatever iconv() returned (false on failure)
       */
      public static function unsafeIconv($in, $out, $text) {
          set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
          $r = iconv($in, $out, $text);
          restore_error_handler();
          return $r;
      }

      /**
       * iconv wrapper which mutes errors and works around bugs.
       * @param $in   Source encoding name
       * @param $out  Target encoding name
       * @param $text String to convert
       * @param $max_chunk_size Largest number of bytes handed to iconv in
       *        one call when the truncation workaround is active; must be
       *        at least 4
       * @return Converted string, or false when conversion is impossible
       *         (iconv unusable, workaround needed for a non 'utf-8'
       *         source, chunk size too small, or malformed UTF-8 at a
       *         chunk boundary)
       */
      public static function iconv($in, $out, $text, $max_chunk_size = 8000) {
          $code = self::testIconvTruncateBug();
          if ($code == self::ICONV_OK) {
              return self::unsafeIconv($in, $out, $text);
          } elseif ($code == self::ICONV_TRUNCATES) {
              // we can only work around this if the input character set
              // is utf-8
              if ($in == 'utf-8') {
                  if ($max_chunk_size < 4) {
                      trigger_error('max_chunk_size is too small', E_USER_WARNING);
                      return false;
                  }
                  // split into chunks of at most $max_chunk_size bytes, but
                  // be careful to handle multibyte boundaries properly
                  if (($c = strlen($text)) <= $max_chunk_size) {
                      return self::unsafeIconv($in, $out, $text);
                  }
                  $r = '';
                  $i = 0;
                  while (true) {
                      if ($i + $max_chunk_size >= $c) {
                          // remainder fits in a single chunk
                          $r .= self::unsafeIconv($in, $out, substr($text, $i));
                          break;
                      }
                      // wibble the boundary: the chunk may only end right
                      // before a byte that is not a UTF-8 continuation
                      // byte (10xxxxxx), so back off up to three bytes
                      if (0x80 != (0xC0 & ord($text[$i + $max_chunk_size]))) {
                          $chunk_size = $max_chunk_size;
                      } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 1]))) {
                          $chunk_size = $max_chunk_size - 1;
                      } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 2]))) {
                          $chunk_size = $max_chunk_size - 2;
                      } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 3]))) {
                          $chunk_size = $max_chunk_size - 3;
                      } else {
                          return false; // rather confusing UTF-8...
                      }
                      $chunk = substr($text, $i, $chunk_size); // substr doesn't mind overlong lengths
                      $r .= self::unsafeIconv($in, $out, $chunk);
                      $i += $chunk_size;
                  }
                  return $r;
              } else {
                  return false;
              }
          } else {
              return false;
          }
      }

      /**
       * Cleans a UTF-8 string for well-formedness and SGML validity
       *
       * It will parse according to UTF-8 and return a valid UTF8 string, with
       * non-SGML codepoints excluded.
       *
       * @param $str String to clean
       * @param $force_php Accepted for interface compatibility; this
       *        implementation never reads it
       * @return Cleaned string
       *
       * @note Just for reference, the non-SGML code points are 0 to 31 and
       *       127 to 159, inclusive.  However, we allow code points 9, 10
       *       and 13, which are the tab, line feed and carriage return
       *       respectively. 128 and above the code points map to multibyte
       *       UTF-8 representations.
       *
       * @note The byte-wise fallback accepts the same code point ranges
       *       as the regular expression fast path, with one deliberate
       *       difference: a byte order mark (U+FEFF) is dropped by the
       *       fallback.
       *
       * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
       *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
       *       LGPL license.  Notes on what changed are inside, but in general,
       *       the original code transformed UTF-8 text into an array of integer
       *       Unicode codepoints. Understandably, transforming that back to
       *       a string would be somewhat expensive, so the function was modded to
       *       directly operate on the string.  However, this discourages code
       *       reuse, and the logic enumerated here would be useful for any
       *       function that needs to be able to understand UTF-8 characters.
       *       As of right now, only smart lossless character encoding converters
       *       would need that, and I'm probably not going to implement them.
       *       Once again, PHP 6 should solve all our problems.
       */
      public static function cleanUTF8($str, $force_php = false) {

          // UTF-8 validity is checked since PHP 4.3.5
          // This is an optimization: if the string is already valid UTF-8, no
          // need to do PHP stuff. 99% of the time, this will be the case.
          // The regexp matches the XML char production, as well as
          // excluding non-SGML codepoints U+007F to U+009F
          if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
              return $str;
          }

          $mState = 0; // cached expected number of octets after the current octet
                       // until the beginning of the next UTF8 character sequence
          $mUcs4  = 0; // cached Unicode character
          $mBytes = 1; // cached expected number of octets in the current sequence

          // original code involved an $out that was an array of Unicode
          // codepoints.  Instead of having to convert back into UTF-8, we've
          // decided to directly append valid UTF-8 characters onto a string
          // $out once they're done.  $char accumulates raw bytes, while $mUcs4
          // turns into the Unicode code point, so there's some redundancy.

          $out = '';
          $char = '';

          $len = strlen($str);
          for ($i = 0; $i < $len; $i++) {
              // square-bracket offset: the curly-brace form ($str{$i}) is
              // rejected by PHP 8
              $in = ord($str[$i]);
              $char .= $str[$i]; // append byte to char
              if (0 == $mState) {
                  // When mState is zero we expect either a US-ASCII character
                  // or a multi-octet sequence.
                  if (0 == (0x80 & ($in))) {
                      // US-ASCII, pass straight through.
                      if (($in <= 31 || $in == 127) &&
                          !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
                      ) {
                          // control characters, remove
                      } else {
                          $out .= $char;
                      }
                      // reset
                      $char = '';
                      $mBytes = 1;
                  } elseif (0xC0 == (0xE0 & ($in))) {
                      // First octet of 2 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x1F) << 6;
                      $mState = 1;
                      $mBytes = 2;
                  } elseif (0xE0 == (0xF0 & ($in))) {
                      // First octet of 3 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x0F) << 12;
                      $mState = 2;
                      $mBytes = 3;
                  } elseif (0xF0 == (0xF8 & ($in))) {
                      // First octet of 4 octet sequence
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x07) << 18;
                      $mState = 3;
                      $mBytes = 4;
                  } elseif (0xF8 == (0xFC & ($in))) {
                      // First octet of 5 octet sequence.
                      //
                      // This is illegal because the encoded codepoint must be
                      // either:
                      // (a) not the shortest form or
                      // (b) outside the Unicode range of 0-0x10FFFF.
                      // Rather than trying to resynchronize, we will carry on
                      // until the end of the sequence and let the later error
                      // handling code catch it.
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 0x03) << 24;
                      $mState = 4;
                      $mBytes = 5;
                  } elseif (0xFC == (0xFE & ($in))) {
                      // First octet of 6 octet sequence, see comments for 5
                      // octet sequence.
                      $mUcs4 = ($in);
                      $mUcs4 = ($mUcs4 & 1) << 30;
                      $mState = 5;
                      $mBytes = 6;
                  } else {
                      // Current octet is neither in the US-ASCII range nor a
                      // legal first octet of a multi-octet sequence.
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                      $char = '';
                  }
              } else {
                  // When mState is non-zero, we expect a continuation of the
                  // multi-octet sequence
                  if (0x80 == (0xC0 & ($in))) {
                      // Legal continuation.
                      $shift = ($mState - 1) * 6;
                      $tmp = $in;
                      $tmp = ($tmp & 0x0000003F) << $shift;
                      $mUcs4 |= $tmp;

                      if (0 == --$mState) {
                          // End of the multi-octet sequence. mUcs4 now contains
                          // the final Unicode codepoint to be output

                          // Check for illegal sequences and codepoints.

                          // From Unicode 3.1, non-shortest form is illegal
                          if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                              ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                              ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                              (4 < $mBytes) ||
                              // From Unicode 3.2, surrogate characters = illegal
                              (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                              // Codepoints outside the Unicode range are illegal
                              ($mUcs4 > 0x10FFFF)
                          ) {
                              // illegal sequence: drop it
                          } elseif (0xFEFF != $mUcs4 && // omit BOM
                              // check for valid Char unicode codepoints
                              (
                                  0x9 == $mUcs4 ||
                                  0xA == $mUcs4 ||
                                  0xD == $mUcs4 ||
                                  (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                                  // 7F-9F is not strictly prohibited by XML,
                                  // but it is non-SGML, and thus we don't allow it
                                  (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                                  // same range the fast-path regexp accepts;
                                  // without it every character from U+E000
                                  // to U+FFFD would be silently discarded
                                  (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                                  (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                              )
                          ) {
                              $out .= $char;
                          }
                          // initialize UTF8 cache (reset)
                          $mState = 0;
                          $mUcs4  = 0;
                          $mBytes = 1;
                          $char = '';
                      }
                  } else {
                      // ((0xC0 & (*in) != 0x80) && (mState != 0))
                      // Incomplete multi-octet sequence.
                      // used to result in complete fail, but we'll reset
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                      $char ='';
                  }
              }
          }
          return $out;
      }

      /**
       * Translates a Unicode codepoint into its corresponding UTF-8 character.
       * @param $code Integer code point
       * @return UTF-8 byte string, or an empty string when $code is
       *         negative, above 0x10FFFF, or a surrogate (0xD800-0xDFFF)
       * @note Based on Feyd's function at
       *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
       *       which is in public domain.
       * @note While we're going to do code point parsing anyway, a good
       *       optimization would be to refuse to translate code points that
       *       are non-SGML characters.  However, this could lead to duplication.
       * @note This is very similar to the unichr function in
       *       maintenance/generate-entity-file.php (although this is superior,
       *       due to its sanity checks).
       */

      // +----------+----------+----------+----------+
      // | 33222222 | 22221111 | 111111   |          |
      // | 10987654 | 32109876 | 54321098 | 76543210 | bit
      // +----------+----------+----------+----------+
      // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
      // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
      // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
      // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
      // +----------+----------+----------+----------+
      // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
      // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
      // +----------+----------+----------+----------+

      public static function unichr($code) {
          if ($code > 1114111 or $code < 0 or
            ($code >= 55296 and $code <= 57343) ) {
              // bits are set outside the "valid" range as defined
              // by UNICODE 4.1.0
              return '';
          }

          $x = $y = $z = $w = 0;
          if ($code < 128) {
              // regular ASCII character
              $x = $code;
          } else {
              // set up bits for UTF-8
              $x = ($code & 63) | 128;
              if ($code < 2048) {
                  $y = (($code & 2047) >> 6) | 192;
              } else {
                  $y = (($code & 4032) >> 6) | 128;
                  if ($code < 65536) {
                      $z = (($code >> 12) & 15) | 224;
                  } else {
                      $z = (($code >> 12) & 63) | 128;
                      $w = (($code >> 18) & 7)  | 240;
                  }
              }
          }
          // set up the actual character; lead bytes that stayed 0 are
          // simply not emitted
          $ret = '';
          if ($w) $ret .= chr($w);
          if ($z) $ret .= chr($z);
          if ($y) $ret .= chr($y);
          $ret .= chr($x);

          return $ret;
      }

      /**
       * Reports whether iconv can be used for transcoding.
       * @return True when the iconv() function exists and
       *         testIconvTruncateBug() does not report ICONV_UNUSABLE;
       *         the answer is computed once and then cached
       */
      public static function iconvAvailable() {
          static $iconv = null;
          if ($iconv === null) {
              $iconv = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;
          }
          return $iconv;
      }

      /**
       * Converts a string to UTF-8 based on configuration.
       * @param $str     String in the encoding named by %Core.Encoding
       * @param $config  Configuration object, read via get()
       * @param $context Not read by this method
       * @return UTF-8 string; an empty string after the "Invalid encoding"
       *         error. Nothing is returned after the "Encoding not
       *         supported" error.
       */
      public static function convertToUTF8($str, $config, $context) {
          $encoding = $config->get('Core.Encoding');
          if ($encoding === 'utf-8') return $str;
          static $iconv = null;
          if ($iconv === null) $iconv = self::iconvAvailable();
          if ($iconv && !$config->get('Test.ForceNoIconv')) {
              // unaffected by bugs, since UTF-8 support all characters
              $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str);
              if ($str === false) {
                  // $encoding is not a valid encoding
                  trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                  return '';
              }
              // If the string is bjorked by Shift_JIS or a similar encoding
              // that doesn't support all of ASCII, convert the naughty
              // characters to their true byte-wise ASCII/UTF-8 equivalents.
              $str = strtr($str, self::testEncodingSupportsASCII($encoding));
              return $str;
          } elseif ($encoding === 'iso-8859-1') {
              // NOTE(review): utf8_encode() is deprecated as of PHP 8.2
              $str = utf8_encode($str);
              return $str;
          }
          trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
      }

      /**
       * Converts a string from UTF-8 based on configuration.
       * @param $str     UTF-8 string
       * @param $config  Configuration object, read via get()
       * @param $context Not read by this method
       * @return String in the encoding named by %Core.Encoding
       * @note Currently, this is a lossy conversion, with unexpressable
       *       characters being omitted.
       */
      public static function convertFromUTF8($str, $config, $context) {
          $encoding = $config->get('Core.Encoding');
          if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
              $str = self::convertToASCIIDumbLossless($str);
          }
          if ($encoding === 'utf-8') return $str;
          static $iconv = null;
          if ($iconv === null) $iconv = self::iconvAvailable();
          if ($iconv && !$config->get('Test.ForceNoIconv')) {
              // Undo our previous fix in convertToUTF8, otherwise iconv will barf
              $ascii_fix = self::testEncodingSupportsASCII($encoding);
              if (!$escape && !empty($ascii_fix)) {
                  $clear_fix = array();
                  foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
                  $str = strtr($str, $clear_fix);
              }
              $str = strtr($str, array_flip($ascii_fix));
              // Normal stuff
              $str = self::iconv('utf-8', $encoding . '//IGNORE', $str);
              return $str;
          } elseif ($encoding === 'iso-8859-1') {
              // NOTE(review): utf8_decode() is deprecated as of PHP 8.2
              $str = utf8_decode($str);
              return $str;
          }
          trigger_error('Encoding not supported', E_USER_ERROR);
          // You might be tempted to assume that the ASCII representation
          // might be OK, however, this is *not* universally true over all
          // encodings.  So we take the conservative route here, rather
          // than forcibly turn on %Core.EscapeNonASCIICharacters
      }

      /**
       * Lossless (character-wise) conversion of HTML to ASCII
       * @param $str UTF-8 string to be converted to ASCII
       * @returns ASCII encoded string with non-ASCII character entity-ized
       * @warning Adapted from MediaWiki, claiming fair use: this is a common
       *       algorithm. If you disagree with this license fudgery,
       *       implement it yourself.
       * @note Uses decimal numeric entities since they are best supported.
       * @note This is a DUMB function: it has no concept of keeping
       *       character entities that the projected character encoding
       *       can allow. We could possibly implement a smart version
       *       but that would require it to also know which Unicode
       *       codepoints the charset supported (not an easy task).
       * @note Sort of with cleanUTF8() but it assumes that $str is
       *       well-formed UTF-8
       */
      public static function convertToASCIIDumbLossless($str) {
          $bytesleft = 0;
          $result = '';
          $working = 0;
          $len = strlen($str);
          for ($i = 0; $i < $len; $i++) {
              $bytevalue = ord($str[$i]);
              if ($bytevalue <= 0x7F) { //0xxx xxxx
                  $result .= chr($bytevalue);
                  $bytesleft = 0;
              } elseif ($bytevalue <= 0xBF) { //10xx xxxx
                  $working = $working << 6;
                  $working += ($bytevalue & 0x3F);
                  $bytesleft--;
                  if ($bytesleft <= 0) {
                      // last continuation byte: emit the decimal entity
                      $result .= "&#" . $working . ";";
                  }
              } elseif ($bytevalue <= 0xDF) { //110x xxxx
                  $working = $bytevalue & 0x1F;
                  $bytesleft = 1;
              } elseif ($bytevalue <= 0xEF) { //1110 xxxx
                  $working = $bytevalue & 0x0F;
                  $bytesleft = 2;
              } else { //1111 0xxx
                  $working = $bytevalue & 0x07;
                  $bytesleft = 3;
              }
          }
          return $result;
      }

      /** No bugs detected in iconv. */
      const ICONV_OK = 0;

      /** Iconv truncates output if converting from UTF-8 to another
       *  character set with //IGNORE, and a non-encodable character is found */
      const ICONV_TRUNCATES = 1;

      /** Iconv does not support //IGNORE, making it unusable for
       *  transcoding purposes */
      const ICONV_UNUSABLE = 2;

      /**
       * glibc iconv has a known bug where it doesn't handle the magic
       * //IGNORE stanza correctly.  In particular, rather than ignore
       * characters, it will return an EILSEQ after consuming some number
       * of characters, and expect you to restart iconv as if it were
       * an E2BIG.  Old versions of PHP did not respect the errno, and
       * returned the fragment, so as a result you would see iconv
       * mysteriously truncating output. We can work around this by
       * manually chopping our input into segments of about 8000
       * characters, as long as PHP ignores the error code.  If PHP starts
       * paying attention to the error code, iconv becomes unusable.
       *
       * @returns Error code indicating severity of bug; the probe runs
       *          once and its result is cached for later calls.
       */
      public static function testIconvTruncateBug() {
          static $code = null;
          if ($code === null) {
              // better not use iconv, otherwise infinite loop!
              // Probe: one non-ASCII character followed by 9000 'a's; a
              // healthy iconv drops the first character and returns
              // exactly 9000 bytes.
              $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000));
              if ($r === false) {
                  $code = self::ICONV_UNUSABLE;
              } elseif (($c = strlen($r)) < 9000) {
                  $code = self::ICONV_TRUNCATES;
              } elseif ($c > 9000) {
                  trigger_error('Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: include your iconv version as per phpversion()', E_USER_ERROR);
              } else {
                  $code = self::ICONV_OK;
              }
          }
          return $code;
      }

      /**
       * This expensive function tests whether or not a given character
       * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
       * fail this test, and require special processing. Variable width
       * encodings shouldn't ever fail.
       *
       * @param string $encoding Encoding name to test, as per iconv format
       * @param bool $bypass Whether or not to bypass the precompiled arrays.
       * @return Array of UTF-8 characters to their corresponding ASCII,
       *      which can be used to "undo" any overzealous iconv action.
       *      False is returned instead when iconv cannot convert a plain
       *      'a' into $encoding.
       */
      public static function testEncodingSupportsASCII($encoding, $bypass = false) {
          // All calls to iconv here are unsafe, proof by case analysis:
          // If ICONV_OK, no difference.
          // If ICONV_TRUNCATE, all calls involve one character inputs,
          // so bug is not triggered.
          // If ICONV_UNUSABLE, this call is irrelevant
          static $encodings = array();
          if (!$bypass) {
              if (isset($encodings[$encoding])) return $encodings[$encoding];
              $lenc = strtolower($encoding);
              switch ($lenc) {
                  case 'shift_jis':
                      return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
                  case 'johab':
                      return array("\xE2\x82\xA9" => '\\');
              }
              if (strpos($lenc, 'iso-8859-') === 0) return array();
          }
          $ret = array();
          if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) return false;
          for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
              $c = chr($i); // UTF-8 char
              $r = self::unsafeIconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
              if (
                  $r === '' ||
                  // This line is needed for iconv implementations that do not
                  // omit characters that do not exist in the target character set
                  ($r === $c && self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
              ) {
                  // Reverse engineer: what's the UTF-8 equiv of this byte
                  // sequence? This assumes that there's no variable width
                  // encoding that doesn't support ASCII.
                  $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
              }
          }
          $encodings[$encoding] = $ret;
          return $ret;
      }

  }
  /**
   * Object that provides entity lookup table from entity name to character
   */
  class HTMLPurifier_EntityLookup {

      /**
       * Assoc array of entity name to character represented.
       */
      public $table;

      /**
       * Sets up the entity lookup table from the serialized file contents.
       * @param $file Path of the serialized table; any falsy value selects
       *        the bundled entities.ser below HTMLPURIFIER_PREFIX
       * @note The serialized contents are versioned, but were generated
       *       using the maintenance script generate_entity_file.php
       * @warning This is not in constructor to help enforce the Singleton
       */
      public function setup($file = false) {
          $path = $file
              ? $file
              : HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
          $serialized = file_get_contents($path);
          $this->table = unserialize($serialized);
      }

      /**
       * Retrieves sole instance of the object.
       * @param Optional prototype of custom lookup table to overload with.
       *        A truthy prototype replaces the stored instance; otherwise
       *        the instance is created and set up on first use.
       * @return The stored lookup object
       */
      public static function instance($prototype = false) {
          // no references, since PHP doesn't copy unless modified
          static $shared = null;
          if ($prototype) {
              $shared = $prototype;
              return $shared;
          }
          if (!$shared) {
              $shared = new HTMLPurifier_EntityLookup();
              $shared->setup();
          }
          return $shared;
      }

  }
  // If we ever want to implement error collecting here, we'll need to use
  // some sort of global data (probably trigger_error) because it's
  // impossible to pass $config or $context to the callback functions.

  /**
   * Handles referencing and dereferencing character entities
   */
  class HTMLPurifier_EntityParser
  {
      /**
       * Reference to entity lookup table (loaded lazily on first named
       * non-special entity).
       */
      protected $_entity_lookup;

      /**
       * Callback regex string for parsing entities.
       * Capture groups: 1. hex  2. dec  3. string (XML style)
       */
      protected $_substituteEntitiesRegex =
          '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';

      /**
       * Decimal to parsed string conversion table for special entities.
       */
      protected $_special_dec2str =
          array(
              34 => '"',
              38 => '&',
              39 => "'",
              60 => '<',
              62 => '>'
          );

      /**
       * Stripped entity names to decimal conversion table for special entities.
       */
      protected $_special_ent2dec =
          array(
              'quot' => 34,
              'amp'  => 38,
              'lt'   => 60,
              'gt'   => 62
          );

      /**
       * Substitutes non-special entities with their parsed equivalents.
       * Because doing this for every parsed piece of character data would
       * be expensive, it is run once before everything else.
       *
       * @param $string String to have non-special entities parsed.
       * @returns Parsed string.
       */
      public function substituteNonSpecialEntities($string) {
          // it will try to detect missing semicolons, but don't rely on it
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'nonSpecialEntityCallback'),
              $string
          );
      }

      /**
       * Callback function for substituteNonSpecialEntities() that does the work.
       *
       * @param $matches PCRE matches array, with 0 the entire match, and
       *                 either index 1, 2 or 3 set with a hex value, dec value,
       *                 or string (respectively).
       * @returns Replacement string; special entities are returned untouched.
       */
      protected function nonSpecialEntityCallback($matches) {
          // replaces all but the special entities
          $entity = $matches[0];
          // The regex guarantees at least one character after '&', and at
          // least two for the numeric forms, so these offsets always exist.
          if ($entity[1] === '#') {
              $code = ($entity[2] === 'x') ? hexdec($matches[1]) : (int) $matches[2];
              // abort for special characters
              if (isset($this->_special_dec2str[$code])) return $entity;
              return HTMLPurifier_Encoder::unichr($code);
          }
          $name = $matches[3];
          if (isset($this->_special_ent2dec[$name])) return $entity;
          if (!$this->_entity_lookup) {
              $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
          }
          if (isset($this->_entity_lookup->table[$name])) {
              return $this->_entity_lookup->table[$name];
          }
          // unknown entity name: leave as written
          return $entity;
      }

      /**
       * Substitutes only special entities with their parsed equivalents.
       *
       * @notice We try to avoid calling this function because otherwise, it
       * would have to be called a lot (for every parsed section).
       *
       * @param $string String to have special entities parsed.
       * @returns Parsed string.
       */
      public function substituteSpecialEntities($string) {
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'specialEntityCallback'),
              $string
          );
      }

      /**
       * Callback function for substituteSpecialEntities() that does the work.
       *
       * This callback has same syntax as nonSpecialEntityCallback().
       *
       * @param $matches PCRE-style matches array, with 0 the entire match, and
       *                 either index 1, 2 or 3 set with a hex value, dec value,
       *                 or string (respectively).
       * @returns Replacement string; non-special entities are returned
       *          untouched.
       */
      protected function specialEntityCallback($matches) {
          $entity = $matches[0];
          if ($entity[1] === '#') {
              $code = ($entity[2] === 'x') ? hexdec($matches[1]) : (int) $matches[2];
              return isset($this->_special_dec2str[$code])
                  ? $this->_special_dec2str[$code]
                  : $entity;
          }
          $name = $matches[3];
          if (isset($this->_special_ent2dec[$name])) {
              // Map name -> code point -> character. Returning the code
              // point itself would turn "&quot;" into the text "34".
              return $this->_special_dec2str[$this->_special_ent2dec[$name]];
          }
          return $entity;
      }
  }
  /**
   * Error collection class that enables HTML Purifier to report HTML
   * problems back to the user
   */
  class HTMLPurifier_ErrorCollector
  {
      /**
       * Identifiers for the returned error array. These are purposely numeric
       * so list() can be used.
       */
      const LINENO   = 0;
      const SEVERITY = 1;
      const MESSAGE  = 2;
      const CHILDREN = 3;

      /** Flat list of error arrays; bound by reference to $_stacks[0]. */
      protected $errors;
      /** List new errors are appended to; bound by reference to $_stacks[0]. */
      protected $_current;
      protected $_stacks = array(array());
      /** Object fetched from the context under the name 'Locale'. */
      protected $locale;
      /** HTMLPurifier_Generator created by getHTMLFormatted() for escaping. */
      protected $generator;
      protected $context;
      /**
       * Error structs indexed as [line][col]; the key -1 holds the single
       * struct for errors without an integer line/column position.
       */
      protected $lines = array();

      /**
       * @param $context Context object; must provide get(), which is used
       *                 to fetch 'Locale' here and the current token, line,
       *                 column and attribute in send().
       */
      public function __construct($context) {
          $this->locale   =& $context->get('Locale');
          $this->context  = $context;
          $this->_current =& $this->_stacks[0];
          $this->errors   =& $this->_stacks[0];
      }

      /**
       * Sends an error message to the collector for later use
       * @param $severity int Error severity, PHP error style (don't use E_USER_)
       * @param $msg string Error message text
       * @param $subst1 string First substitution for $msg
       * @param $subst2 string ...
       */
      public function send($severity, $msg) {
          $args = array();
          if (func_num_args() > 2) {
              $args = func_get_args();
              array_shift($args); // drop $severity and reindex from 0
              unset($args[0]);    // drop $msg; substitutions stay keyed from 1
          }

          $token = $this->context->get('CurrentToken', true);
          $line  = $token ? $token->line : $this->context->get('CurrentLine', true);
          $col   = $token ? $token->col  : $this->context->get('CurrentCol',  true);
          $attr  = $this->context->get('CurrentAttr', true);

          // perform special substitutions, also add custom parameters
          $subst = array();
          if (!is_null($token)) {
              $args['CurrentToken'] = $token;
          }
          if (!is_null($attr)) {
              $subst['$CurrentAttr.Name'] = $attr;
              if (isset($token->attr[$attr])) $subst['$CurrentAttr.Value'] = $token->attr[$attr];
          }

          if (empty($args)) {
              $msg = $this->locale->getMessage($msg);
          } else {
              $msg = $this->locale->formatMessage($msg, $args);
          }

          if (!empty($subst)) $msg = strtr($msg, $subst);

          // (numerically indexed)
          $error = array(
              self::LINENO   => $line,
              self::SEVERITY => $severity,
              self::MESSAGE  => $msg,
              self::CHILDREN => array()
          );
          $this->_current[] = $error;

          // Structured bookkeeping, kept alongside the flat list above.
          $struct = null;
          // Top-level errors are either:
          //  TOKEN type, if $value is set appropriately, or
          //  "syntax" type, if $value is null
          $new_struct = new HTMLPurifier_ErrorStruct();
          $new_struct->type = HTMLPurifier_ErrorStruct::TOKEN;
          if ($token) $new_struct->value = clone $token;
          if (is_int($line) && is_int($col)) {
              if (isset($this->lines[$line][$col])) {
                  $struct = $this->lines[$line][$col];
              } else {
                  $struct = $this->lines[$line][$col] = $new_struct;
              }
              // These ksorts may present a performance problem
              ksort($this->lines[$line], SORT_NUMERIC);
          } else {
              if (isset($this->lines[-1])) {
                  $struct = $this->lines[-1];
              } else {
                  $struct = $this->lines[-1] = $new_struct;
              }
          }
          ksort($this->lines, SORT_NUMERIC);

          // Now, check if we need to operate on a lower structure
          if (!empty($attr)) {
              $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
              if (!$struct->value) {
                  $struct->value = array($attr, 'PUT VALUE HERE');
              }
          }
          // Nothing in this method supplies a CSS property, so the branch
          // below is currently inert. The variable is defined explicitly
          // rather than read while undefined.
          $cssprop = null;
          if (!empty($cssprop)) {
              $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
              if (!$struct->value) {
                  // if we tokenize CSS this might be a little more difficult to do
                  $struct->value = array($cssprop, 'PUT VALUE HERE');
              }
          }

          // Ok, structs are all setup, now time to register the error
          $struct->addError($severity, $msg);
      }

      /**
       * Retrieves raw error data for custom formatter to use
       * @return List of arrays in format of array(line of error,
       *         error severity, error message,
       *         recursive sub-errors array)
       */
      public function getRaw() {
          return $this->errors;
      }

      /**
       * Default HTML formatting implementation for error messages
       * @param $config Configuration object handed to the
       *                HTMLPurifier_Generator used for escaping messages.
       * @param $errors Errors array; only consulted to decide whether the
       *                "no errors" message is returned. Defaults to the
       *                collector's own error list.
       * @return HTML string: a list of errors, or a "no errors" paragraph.
       */
      public function getHTMLFormatted($config, $errors = null) {
          $ret = array();

          $this->generator = new HTMLPurifier_Generator($config, $this->context);
          if ($errors === null) $errors = $this->errors;

          // Positioned errors first, in line/column order ...
          foreach ($this->lines as $line => $col_array) {
              if ($line == -1) continue;
              foreach ($col_array as $col => $struct) {
                  $this->_renderStruct($ret, $struct, $line, $col);
              }
          }
          // ... then the unpositioned ones.
          if (isset($this->lines[-1])) {
              $this->_renderStruct($ret, $this->lines[-1]);
          }

          if (empty($errors)) {
              return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
          } else {
              return '<ul><li>' . implode('</li><li>', $ret) . '</li></ul>';
          }
      }

      /**
       * Appends one HTML fragment per error in $struct and in all of its
       * descendant structs to $ret.
       *
       * @param $ret    Array of HTML fragments, appended to by reference.
       * @param $struct Root HTMLPurifier_ErrorStruct to render.
       * @param $line   Line shown in the location label, or null.
       * @param $col    Column shown in the location label, or null.
       */
      private function _renderStruct(&$ret, $struct, $line = null, $col = null) {
          $stack = array($struct);
          $context_stack = array(array());
          while ($current = array_pop($stack)) {
              $context = array_pop($context_stack);
              foreach ($current->errors as $error) {
                  list($severity, $msg) = $error;
                  $string = '';
                  $string .= '<div>';
                  // W3C uses an icon to indicate the severity of the error.
                  $error = $this->locale->getErrorName($severity);
                  $string .= "<span class=\"error e$severity\"><strong>$error</strong></span> ";
                  if (!is_null($line) && !is_null($col)) {
                      $string .= "<em class=\"location\">Line $line, Column $col: </em> ";
                  } else {
                      $string .= '<em class="location">End of Document: </em> ';
                  }
                  $string .= '<strong class="description">' . $this->generator->escape($msg) . '</strong> ';
                  $string .= '</div>';
                  // Here, have a marker for the character on the column appropriate.
                  // Be sure to clip extremely long lines.
                  //$string .= '<pre>';
                  //$string .= '';
                  //$string .= '</pre>';
                  $ret[] = $string;
              }
              foreach ($current->children as $type => $array) {
                  $context[] = $current;
                  // Children are keyed by identifier (e.g. attribute name).
                  // array_merge() overwrites equal string keys, which would
                  // drop a struct still waiting on the stack and leave
                  // $context_stack out of step, so the keys are discarded
                  // before the children are pushed.
                  $stack = array_merge($stack, array_values(array_reverse($array)));
                  for ($i = count($array); $i > 0; $i--) {
                      $context_stack[] = $context;
                  }
              }
          }
      }
  }
  /**
   * Holds the errors recorded against one segment of an HTML document: a
   * token, an attribute or a CSS property. A struct may own child structs
   * for the components of what it represents, but its main job is to hold
   * the errors that apply to itself.
   */
  class HTMLPurifier_ErrorStruct
  {
      /**
       * Possible values for the first key of $children. Top-level
       * structures are automatically token-level.
       */
      const TOKEN   = 0;
      const ATTR    = 1;
      const CSSPROP = 2;

      /**
       * Type of this struct.
       */
      public $type;

      /**
       * Value of the thing errors are being recorded for:
       *  - TOKEN: Instance of HTMLPurifier_Token
       *  - ATTR: array('attr-name', 'value')
       *  - CSSPROP: array('prop-name', 'value')
       */
      public $value;

      /**
       * Errors registered for this structure, as array(severity, message).
       */
      public $errors = array();

      /**
       * Child ErrorStructs belonging to this structure, e.g. a TOKEN struct
       * would contain ATTR structs. Two-dimensional: [TYPE]['identifier'].
       */
      public $children = array();

      /**
       * Returns the child struct of the given type and identifier, creating
       * (and typing) it on first request.
       */
      public function getChild($type, $id) {
          if (isset($this->children[$type][$id])) {
              return $this->children[$type][$id];
          }
          $child = new HTMLPurifier_ErrorStruct();
          $child->type = $type;
          $this->children[$type][$id] = $child;
          return $child;
      }

      /**
       * Records one error against this struct.
       */
      public function addError($severity, $message) {
          $entry = array($severity, $message);
          $this->errors[] = $entry;
      }
  }
  /**
   * Library-wide exception type for HTML Purifier; exceptions the library
   * throws are of this class. Adds nothing to the built-in Exception.
   */
  class HTMLPurifier_Exception extends Exception {}
  /**
   * Represents a pre or post processing filter on HTML Purifier's output
   *
   * Sometimes a little ad-hoc fixing of HTML has to be done before it is
   * sent through HTML Purifier: filters achieve this. For instance,
   * YouTube videos can be preserved this way. A decorator could have been
   * used for this task, but PHP's support for them is not terribly robust,
   * so the filters are simply looped through.
   *
   * Filters should be exited first in, last out. With three filters named
   * 1, 2 and 3, the order of execution should be 1->preFilter,
   * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
   * 1->postFilter.
   *
   * @note Methods are not declared abstract as it is perfectly legitimate
   *       for an implementation not to want anything to happen on a step
   */
  class HTMLPurifier_Filter
  {
      /**
       * Name of the filter for identification purposes
       */
      public $name;

      /**
       * Pre-processor hook, handed the HTML before HTML Purifier runs.
       * This base implementation returns $html unchanged.
       */
      public function preFilter($html, $config, $context)
      {
          return $html;
      }

      /**
       * Post-processor hook, handed the HTML after HTML Purifier has run.
       * This base implementation returns $html unchanged.
       */
      public function postFilter($html, $config, $context)
      {
          return $html;
      }
  }
  /**
   * Generates HTML from tokens.
   * @todo Refactor interface so that configuration/context is determined
   *       upon instantiation, no need for messy generateFromTokens() calls
   * @todo Make some of the more internal functions protected, and have
   *       unit tests work around that
   */
  class HTMLPurifier_Generator
  {
      /**
       * Whether or not generator should produce XML output
       */
      private $_xhtml = true;

      /**
       * :HACK: Whether or not generator should comment the insides of <script> tags
       */
      private $_scriptFix = false;

      /**
       * Cache of HTMLDefinition during HTML output to determine whether or
       * not attributes should be minimized.
       */
      private $_def;

      /**
       * Cache of %Output.SortAttr
       */
      private $_sortAttr;

      /**
       * Cache of %Output.FlashCompat
       */
      private $_flashCompat;

      /**
       * Cache of %Output.FixInnerHTML
       */
      private $_innerHTMLFix;

      /**
       * Stack for keeping track of object information when outputting IE
       * compatibility code. One entry per currently open <object>.
       */
      private $_flashStack = array();

      /**
       * Configuration for the generator
       */
      protected $config;

      /**
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context (not used here)
       */
      public function __construct($config, $context) {
          $this->config = $config;
          $this->_scriptFix    = $config->get('Output.CommentScriptContents');
          $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
          $this->_sortAttr     = $config->get('Output.SortAttr');
          $this->_flashCompat  = $config->get('Output.FlashCompat');
          $this->_def = $config->getHTMLDefinition();
          $this->_xhtml = $this->_def->doctype->xml;
      }

      /**
       * Generates HTML from an array of tokens.
       * @param $tokens Array of HTMLPurifier_Token
       * @return Generated HTML
       */
      public function generateFromTokens($tokens) {
          if (!$tokens) return '';

          // Basic algorithm
          $html = '';
          for ($i = 0, $size = count($tokens); $i < $size; $i++) {
              if ($this->_scriptFix && $tokens[$i]->name === 'script'
                  && $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
                  // script special case
                  // the contents of the script block must be ONE token
                  // for this to work.
                  $html .= $this->generateFromToken($tokens[$i++]);
                  $html .= $this->generateScriptFromToken($tokens[$i++]);
              }
              $html .= $this->generateFromToken($tokens[$i]);
          }

          // Tidy cleanup
          if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
              $tidy = new Tidy();
              $tidy->parseString($html, array(
                 'indent'=> true,
                 'output-xhtml' => $this->_xhtml,
                 'show-body-only' => true,
                 'indent-spaces' => 2,
                 'wrap' => 68,
              ), 'utf8');
              $tidy->cleanRepair();
              $html = (string) $tidy; // explicit cast necessary
          }

          // Normalize newlines to system defined value
          if ($this->config->get('Core.NormalizeNewlines')) {
              $nl = $this->config->get('Output.Newline');
              if ($nl === null) $nl = PHP_EOL;
              if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
          }
          return $html;
      }

      /**
       * Generates HTML from a single token.
       * @param $token HTMLPurifier_Token object.
       * @return Generated HTML; '' (plus an E_USER_WARNING) when $token is
       *         not an HTMLPurifier_Token, and '' for unrecognized token
       *         subclasses.
       */
      public function generateFromToken($token) {
          if (!$token instanceof HTMLPurifier_Token) {
              trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
              return '';

          } elseif ($token instanceof HTMLPurifier_Token_Start) {
              $attr = $this->generateAttributes($token->attr, $token->name);
              if ($this->_flashCompat && $token->name == "object") {
                  // remember this <object> so nested <param>s can be recorded
                  $flash = new stdclass();
                  $flash->attr = $token->attr;
                  $flash->param = array();
                  $this->_flashStack[] = $flash;
              }
              return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';

          } elseif ($token instanceof HTMLPurifier_Token_End) {
              if ($this->_flashCompat && $token->name == "object" && !empty($this->_flashStack)) {
                  // The recorded information is not used for output yet, but
                  // the entry is dropped so that later <param>s are not
                  // attributed to an <object> that has already been closed.
                  array_pop($this->_flashStack);
              }
              return '</' . $token->name . '>';

          } elseif ($token instanceof HTMLPurifier_Token_Empty) {
              if ($this->_flashCompat && $token->name == "param" && !empty($this->_flashStack)
                  && isset($token->attr['name'])) {
                  // A <param> may lack a value attribute; record null
                  // instead of reading an undefined index.
                  $value = isset($token->attr['value']) ? $token->attr['value'] : null;
                  $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $value;
              }
              $attr = $this->generateAttributes($token->attr, $token->name);
              return '<' . $token->name . ($attr ? ' ' : '') . $attr .
                  ( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
                  . '>';

          } elseif ($token instanceof HTMLPurifier_Token_Text) {
              return $this->escape($token->data, ENT_NOQUOTES);

          } elseif ($token instanceof HTMLPurifier_Token_Comment) {
              return '<!--' . $token->data . '-->';
          } else {
              return '';

          }
      }

      /**
       * Special case processor for the contents of script tags
       * @warning This runs into problems if there's already a literal
       *          --> somewhere inside the script contents.
       */
      public function generateScriptFromToken($token) {
          if (!$token instanceof HTMLPurifier_Token_Text) return $this->generateFromToken($token);
          // Thanks <http://lachy.id.au/log/2005/05/script-comments>
          $data = preg_replace('#//\s*$#', '', $token->data);
          return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
      }

      /**
       * Generates attribute declarations from attribute array.
       * @note This does not include the leading or trailing space.
       * @param $assoc_array_of_attributes Attribute array
       * @param $element Name of element attributes are for, used to check
       *        attribute minimization.
       * @return Generated HTML fragment for insertion.
       */
      public function generateAttributes($assoc_array_of_attributes, $element = false) {
          $html = '';
          if ($this->_sortAttr) ksort($assoc_array_of_attributes);
          foreach ($assoc_array_of_attributes as $key => $value) {
              if (!$this->_xhtml) {
                  // Remove namespaced attributes
                  if (strpos($key, ':') !== false) continue;
                  // Check if we should minimize the attribute: val="val" -> val
                  if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
                      $html .= $key . ' ';
                      continue;
                  }
              }
              // Workaround for Internet Explorer innerHTML bug.
              // Essentially, Internet Explorer, when calculating
              // innerHTML, omits quotes if there are no instances of
              // angled brackets, quotes or spaces.  However, when parsing
              // HTML (for example, when you assign to innerHTML), it
              // treats backticks as quotes.  Thus,
              //      <img alt="``" />
              // becomes
              //      <img alt=`` />
              // becomes
              //      <img alt='' />
              // Fortunately, all we need to do is trigger an appropriate
              // quoting style, which we do by adding an extra space.
              // This also is consistent with the W3C spec, which states
              // that user agents may ignore leading or trailing
              // whitespace (in fact, most don't, at least for attributes
              // like alt, but an extra space at the end is barely
              // noticeable).  Still, we have a configuration knob for
              // this, since this transformation is not necessary if you
              // don't process user input with innerHTML or you don't plan
              // on supporting Internet Explorer.
              if ($this->_innerHTMLFix) {
                  if (strpos($value, '`') !== false) {
                      // check if correct quoting style would not already be
                      // triggered
                      if (strcspn($value, '"\' <>') === strlen($value)) {
                          // protect!
                          $value .= ' ';
                      }
                  }
              }
              $html .= $key.'="'.$this->escape($value).'" ';
          }
          return rtrim($html);
      }

      /**
       * Escapes raw text data.
       * @todo This really ought to be protected, but until we have a facility
       *       for properly generating HTML here w/o using tokens, it stays
       *       public.
       * @param $string String data to escape for HTML.
       * @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
       *               permissible for non-attribute output. Null selects
       *               ENT_COMPAT.
       * @return String escaped data.
       */
      public function escape($string, $quote = null) {
          // Workaround for APC bug on Mac Leopard reported by sidepodcast
          // http://htmlpurifier.org/phorum/read.php?3,4823,4846
          if ($quote === null) $quote = ENT_COMPAT;
          return htmlspecialchars($string, $quote, 'UTF-8');
      }
  }
  3776. /**
  3777. * Definition of the purified HTML that describes allowed children,
  3778. * attributes, and many other things.
  3779. *
  3780. * Conventions:
  3781. *
  3782. * All member variables that are prefixed with info
  3783. * (including the main $info array) are used by HTML Purifier internals
  3784. * and should not be directly edited when customizing the HTMLDefinition.
  3785. * They can usually be set via configuration directives or custom
  3786. * modules.
  3787. *
  3788. * On the other hand, member variables without the info prefix are used
  3789. * internally by the HTMLDefinition and MUST NOT be used by other HTML
  3790. * Purifier internals. Many of them, however, are public, and may be
  3791. * edited by userspace code to tweak the behavior of HTMLDefinition.
  3792. *
  3793. * @note This class is inspected by Printer_HTMLDefinition; please
  3794. * update that class if things here change.
  3795. *
  3796. * @warning Directives that change this object's structure must be in
  3797. * the HTML or Attr namespace!
  3798. */
  3799. class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
  3800. {
  3801. // FULLY-PUBLIC VARIABLES ---------------------------------------------
  3802. /**
  3803. * Associative array of element names to HTMLPurifier_ElementDef
  3804. */
  3805. public $info = array();
  3806. /**
  3807. * Associative array of global attribute name to attribute definition.
  3808. */
  3809. public $info_global_attr = array();
  3810. /**
  3811. * String name of parent element HTML will be going into.
  3812. */
  3813. public $info_parent = 'div';
  3814. /**
  3815. * Definition for parent element, allows parent element to be a
  3816. * tag that's not allowed inside the HTML fragment.
  3817. */
  3818. public $info_parent_def;
  3819. /**
  3820. * String name of element used to wrap inline elements in block context
  3821. * @note This is rarely used except for BLOCKQUOTEs in strict mode
  3822. */
  3823. public $info_block_wrapper = 'p';
  3824. /**
  3825. * Associative array of deprecated tag name to HTMLPurifier_TagTransform
  3826. */
  3827. public $info_tag_transform = array();
  3828. /**
  3829. * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
  3830. */
  3831. public $info_attr_transform_pre = array();
  3832. /**
  3833. * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
  3834. */
  3835. public $info_attr_transform_post = array();
  3836. /**
  3837. * Nested lookup array of content set name (Block, Inline) to
  3838. * element name to whether or not it belongs in that content set.
  3839. */
  3840. public $info_content_sets = array();
  3841. /**
  3842. * Indexed list of HTMLPurifier_Injector to be used.
  3843. */
  3844. public $info_injector = array();
  3845. /**
  3846. * Doctype object
  3847. */
  3848. public $doctype;
  3849. // RAW CUSTOMIZATION STUFF --------------------------------------------
  /**
   * Adds a custom attribute to an element of the anonymous module,
   * creating a blank element definition first when the module does not
   * have one under that name yet.
   * @note This is strictly convenience, and does not have a corresponding
   *       method in HTMLPurifier_HTMLModule
   * @param $element_name String element name to add attribute to
   * @param $attr_name String name of attribute
   * @param $def Attribute definition, can be string or object, see
   *             HTMLPurifier_AttrTypes for details
   */
  public function addAttribute($element_name, $attr_name, $def) {
      $module = $this->getAnonymousModule();
      $element = isset($module->info[$element_name])
          ? $module->info[$element_name]
          : $module->addBlankElement($element_name);
      $element->attr[$attr_name] = $def;
  }
  /**
   * Adds a custom element to your HTML definition by delegating to the
   * anonymous module.
   * @note See HTMLPurifier_HTMLModule::addElement for detailed
   *       parameter and return value descriptions.
   */
  public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array()) {
      // Assumes that if the user is calling this, the element is safe.
      // This may not be a good idea.
      return $this->getAnonymousModule()->addElement(
          $element_name,
          $type,
          $contents,
          $attr_collections,
          $attributes
      );
  }
  /**
   * Adds a blank element to your HTML definition, for overriding
   * existing behavior; delegates to the anonymous module.
   * @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
   *       parameter and return value descriptions.
   */
  public function addBlankElement($element_name) {
      return $this->getAnonymousModule()->addBlankElement($element_name);
  }
  /**
   * Retrieves the anonymous module, creating it (named 'Anonymous') on
   * first use, so advanced features can be used without writing a
   * dedicated module.
   */
  public function getAnonymousModule() {
      if ($this->_anonModule) {
          return $this->_anonModule;
      }
      $module = new HTMLPurifier_HTMLModule();
      $module->name = 'Anonymous';
      $this->_anonModule = $module;
      return $this->_anonModule;
  }
  3903. private $_anonModule = null;
  3904. // PUBLIC BUT INTERNAL VARIABLES --------------------------------------
  3905. public $type = 'HTML';
  3906. public $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */
  /**
   * Low-cost, preliminary initialization: only the module manager is
   * created here.
   */
  public function __construct()
  {
      $manager = new HTMLPurifier_HTMLModuleManager();
      $this->manager = $manager;
  }
  /**
   * Builds the definition from the modules and the configuration, then
   * discards the module manager and the content-model fields of every
   * element definition.
   */
  protected function doSetup($config) {
      $this->processModules($config);
      $this->setupConfigStuff($config);
      unset($this->manager);

      // cleanup some of the element definitions
      foreach ($this->info as $name => $unused) {
          unset(
              $this->info[$name]->content_model,
              $this->info[$name]->content_model_type
          );
      }
  }
  /**
   * Extract out the information from the manager
   */
  protected function processModules($config) {
      if ($this->_anonModule) {
          // for user specific changes
          // this is late-loaded so we don't have to deal with PHP4
          // reference wonky-ness
          $this->manager->addModule($this->_anonModule);
          unset($this->_anonModule);
      }

      $this->manager->setup($config);
      $this->doctype = $this->manager->doctype;

      // Per-module tables merged into this definition, in this order.
      // A value of false removes the entry; any other value sets it.
      $tables = array(
          'info_tag_transform',
          'info_attr_transform_pre',
          'info_attr_transform_post',
          'info_injector',
      );
      foreach ($this->manager->modules as $module) {
          foreach ($tables as $table) {
              foreach ($module->$table as $key => $value) {
                  if ($value === false) {
                      unset($this->{$table}[$key]);
                  } else {
                      $this->{$table}[$key] = $value;
                  }
              }
          }
      }

      $this->info = $this->manager->getElements();
      $this->info_content_sets = $this->manager->contentSets->lookup;
  }
  3957. /**
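  3958. * Applies configuration-driven settings: block wrapper, parent element, allowed and forbidden elements/attributes, and injector pruning. We need a better way of doing this.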
  3959. */
  protected function setupConfigStuff($config) {
      // setup block wrapper --------------------------------------------
      // the wrapper must be a member of the 'Block' content set
      $block_wrapper = $config->get('HTML.BlockWrapper');
      if (isset($this->info_content_sets['Block'][$block_wrapper])) {
          $this->info_block_wrapper = $block_wrapper;
      } else {
          trigger_error('Cannot use non-block element as block wrapper',
              E_USER_ERROR);
      }

      // setup parent element -------------------------------------------
      $parent = $config->get('HTML.Parent');
      // second argument true: request the trusted (full) definition
      $def = $this->manager->getElement($parent, true);
      if ($def) {
          $this->info_parent = $parent;
          $this->info_parent_def = $def;
      } else {
          trigger_error('Cannot use unrecognized element as parent',
              E_USER_ERROR);
          // fall back to the definition of the current info_parent value
          $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
      }

      // support template text
      $support = "(for information on implementing this, see the ".
                 "support forums) ";

      // setup allowed elements -----------------------------------------
      $allowed_elements = $config->get('HTML.AllowedElements');
      $allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early
      if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
          // neither list was given explicitly: try the combined
          // TinyMCE-style HTML.Allowed string instead
          $allowed = $config->get('HTML.Allowed');
          if (is_string($allowed)) {
              list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
          }
      }

      if (is_array($allowed_elements)) {
          foreach ($this->info as $name => $d) {
              if(!isset($allowed_elements[$name])) unset($this->info[$name]);
              // whatever is left in $allowed_elements afterwards was
              // requested but is not a known element
              unset($allowed_elements[$name]);
          }
          // emit errors
          foreach ($allowed_elements as $element => $d) {
              $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
              trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
          }
      }

      // setup allowed attributes ---------------------------------------
      $allowed_attributes_mutable = $allowed_attributes; // by copy!
      if (is_array($allowed_attributes)) {

          // This actually doesn't do anything, since we went away from
          // global attributes. It's possible that userland code uses
          // it, but HTMLModuleManager doesn't!
          foreach ($this->info_global_attr as $attr => $x) {
              $keys = array($attr, "*@$attr", "*.$attr");
              $delete = true;
              foreach ($keys as $key) {
                  if ($delete && isset($allowed_attributes[$key])) {
                      $delete = false;
                  }
                  // every recognized key is struck from the mutable copy
                  if (isset($allowed_attributes_mutable[$key])) {
                      unset($allowed_attributes_mutable[$key]);
                  }
              }
              if ($delete) unset($this->info_global_attr[$attr]);
          }

          foreach ($this->info as $tag => $info) {
              foreach ($info->attr as $attr => $x) {
                  // all spellings under which this attribute may be allowed
                  $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
                  $delete = true;
                  foreach ($keys as $key) {
                      if ($delete && isset($allowed_attributes[$key])) {
                          $delete = false;
                      }
                      if (isset($allowed_attributes_mutable[$key])) {
                          unset($allowed_attributes_mutable[$key]);
                      }
                  }
                  if ($delete) {
                      if ($this->info[$tag]->attr[$attr]->required) {
                          trigger_error("Required attribute '$attr' in element '$tag' was not allowed, which means '$tag' will not be allowed either", E_USER_WARNING);
                      }
                      unset($this->info[$tag]->attr[$attr]);
                  }
              }
          }

          // emit errors: anything still in the mutable copy matched nothing
          foreach ($allowed_attributes_mutable as $elattr => $d) {
              $bits = preg_split('/[.@]/', $elattr, 2);
              $c = count($bits);
              switch ($c) {
                  case 2:
                      if ($bits[0] !== '*') {
                          $element = htmlspecialchars($bits[0]);
                          $attribute = htmlspecialchars($bits[1]);
                          if (!isset($this->info[$element])) {
                              // no level given: trigger_error() defaults to E_USER_NOTICE
                              trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support");
                          } else {
                              trigger_error("Attribute '$attribute' in element '$element' not supported $support",
                                  E_USER_WARNING);
                          }
                          break;
                      }
                      // otherwise fall through: "*@attr" / "*.attr" names a
                      // global attribute
                  case 1:
                      // The attribute name is always the last piece. Reading
                      // $bits[0] here would report '*' for "*@attr" keys.
                      $attribute = htmlspecialchars($bits[$c - 1]);
                      trigger_error("Global attribute '$attribute' is not ".
                          "supported in any elements $support",
                          E_USER_WARNING);
                      break;
              }
          }

      }

      // setup forbidden elements ---------------------------------------
      $forbidden_elements   = $config->get('HTML.ForbiddenElements');
      $forbidden_attributes = $config->get('HTML.ForbiddenAttributes');

      foreach ($this->info as $tag => $info) {
          if (isset($forbidden_elements[$tag])) {
              unset($this->info[$tag]);
              continue;
          }
          foreach ($info->attr as $attr => $x) {
              if (
                  isset($forbidden_attributes["$tag@$attr"]) ||
                  isset($forbidden_attributes["*@$attr"]) ||
                  isset($forbidden_attributes[$attr])
              ) {
                  unset($this->info[$tag]->attr[$attr]);
                  continue;
              } // this segment might get removed eventually
              elseif (isset($forbidden_attributes["$tag.$attr"])) {
                  // $tag.$attr are not user supplied, so no worries!
                  trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
              }
          }
      }
      // warn about the unsupported "*.attr" spelling
      foreach ($forbidden_attributes as $key => $v) {
          if (strlen($key) < 2) continue;
          if ($key[0] != '*') continue;
          if ($key[1] == '.') {
              trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
          }
      }

      // setup injectors -----------------------------------------------------
      foreach ($this->info_injector as $i => $injector) {
          if ($injector->checkNeeded($config) !== false) {
              // remove injector that does not have its required
              // elements/attributes present, and is thus not needed.
              unset($this->info_injector[$i]);
          }
      }
  }
  4106. /**
  4107. * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
  4108. * separate lists for processing. Format is element[attr1|attr2],element2...
  4109. * @warning Although it's largely drawn from TinyMCE's implementation,
  4110. * it is different, and you'll probably have to modify your lists
  4111. * @param $list String list to parse
  4112. * @return array($allowed_elements, $allowed_attributes)
  4113. * @todo Give this its own class, probably static interface
  4114. */
  public function parseTinyMCEAllowedList($list) {
      // spaces and tabs carry no meaning in the list
      $list = str_replace(array(' ', "\t"), '', $list);

      $elements = array();
      $attributes = array();

      // entries are separated by commas or line breaks
      $chunks = preg_split('/(,|[\n\r]+)/', $list);
      foreach ($chunks as $chunk) {
          if (empty($chunk)) continue;
          // An entry has an attribute list only when '[' occurs after the
          // first character: strpos() gives false (absent) or 0 (leading
          // bracket) otherwise, and both are treated as "no list".
          if (!strpos($chunk, '[')) {
              $element = $chunk;
              $attr = false;
          } else {
              list($element, $attr) = explode('[', $chunk);
          }
          // '*' only scopes attributes; it is not an element itself
          if ($element !== '*') $elements[$element] = true;
          if (!$attr) continue;
          $attr = substr($attr, 0, strlen($attr) - 1); // remove trailing ]
          foreach (explode('|', $attr) as $key) {
              // "a[]" or a stray '|' yields an empty name; skip it rather
              // than registering a bogus "element." attribute key
              if ($key === '') continue;
              $attributes["$element.$key"] = true;
          }
      }

      return array($elements, $attributes);
  }
  4139. }
  4140. /**
  4141. * Represents an XHTML 1.1 module, with information on elements, tags
  4142. * and attributes.
  4143. * @note Even though this is technically XHTML 1.1, it is also used for
  4144. * regular HTML parsing. We are using modulization as a convenient
  4145. * way to represent the internals of HTMLDefinition, and our
  4146. * implementation is by no means conforming and does not directly
  4147. * use the normative DTDs or XML schemas.
  4148. * @note The public variables in a module should almost directly
  4149. * correspond to the variables in HTMLPurifier_HTMLDefinition.
  4150. * However, the prefix info carries no special meaning in these
  4151. * objects (include it anyway if that's the correspondence though).
  4152. * @todo Consider making some member functions protected
  4153. */
  class HTMLPurifier_HTMLModule
  {

      // -- Overloadable ----------------------------------------------------

      /**
       * Short unique string identifier of the module
       */
      public $name;

      /**
       * Informally, a list of elements this module changes. Not used in
       * any significant way.
       */
      public $elements = array();

      /**
       * Associative array of element names to element definitions.
       * Some definitions may be incomplete, to be merged in later
       * with the full definition.
       */
      public $info = array();

      /**
       * Associative array of content set names to content set additions.
       * This is commonly used to, say, add an A element to the Inline
       * content set. This corresponds to an internal variable $content_sets
       * and NOT info_content_sets member variable of HTMLDefinition.
       */
      public $content_sets = array();

      /**
       * Associative array of attribute collection names to attribute
       * collection additions. More rarely used for adding attributes to
       * the global collections. Example is the StyleAttribute module adding
       * the style attribute to the Core. Corresponds to HTMLDefinition's
       * attr_collections->info, since the object's data is only info,
       * with extra behavior associated with it.
       */
      public $attr_collections = array();

      /**
       * Associative array of deprecated tag name to HTMLPurifier_TagTransform
       */
      public $info_tag_transform = array();

      /**
       * List of HTMLPurifier_AttrTransform to be performed before validation.
       */
      public $info_attr_transform_pre = array();

      /**
       * List of HTMLPurifier_AttrTransform to be performed after validation.
       */
      public $info_attr_transform_post = array();

      /**
       * List of HTMLPurifier_Injector to be performed during well-formedness fixing.
       * An injector will only be invoked if all of its prerequisites are met;
       * if an injector fails setup, there will be no error; it will simply be
       * silently disabled.
       */
      public $info_injector = array();

      /**
       * Boolean flag that indicates whether or not getChildDef is implemented.
       * For optimization reasons: may save a call to a function. Be sure
       * to set it if you do implement getChildDef(), otherwise it will have
       * no effect!
       */
      public $defines_child_def = false;

      /**
       * Boolean flag whether or not this module is safe. If it is not safe, all
       * of its members are unsafe. Modules are safe by default (this might be
       * slightly dangerous, but it doesn't make much sense to force HTML Purifier,
       * which is based off of safe HTML, to explicitly say, "This is safe," even
       * though there are modules which are "unsafe")
       *
       * @note Previously, safety could be applied at an element level granularity.
       *       We've removed this ability, so in order to add "unsafe" elements
       *       or attributes, a dedicated module with this property set to false
       *       must be used.
       */
      public $safe = true;

      /**
       * Retrieves a proper HTMLPurifier_ChildDef subclass based on
       * content_model and content_model_type member variables of
       * the HTMLPurifier_ElementDef class. There is a similar function
       * in HTMLPurifier_HTMLDefinition.
       * @param $def HTMLPurifier_ElementDef instance
       * @return HTMLPurifier_ChildDef subclass; this base implementation
       *         always returns false
       */
      public function getChildDef($def) {return false;}

      // -- Convenience -----------------------------------------------------

      /**
       * Convenience function that sets up a new element
       * @param $element Name of element to add
       * @param $type What content set should element be registered to?
       *              Set as false to skip this step.
       * @param $contents Allowed children in form of:
       *              "$content_model_type: $content_model"
       * @param $attr_includes What attribute collections to register to
       *              element?
       * @param $attr What unique attributes does the element define?
       * @note See ElementDef for in-depth descriptions of these parameters.
       * @return Created element definition object, so you
       *         can set advanced parameters
       */
      public function addElement($element, $type, $contents, $attr_includes = array(), $attr = array()) {
          $this->elements[] = $element;
          // parse content_model
          list($content_model_type, $content_model) = $this->parseContents($contents);
          // merge in attribute inclusions
          $this->mergeInAttrIncludes($attr, $attr_includes);
          // add element to content sets
          if ($type) $this->addElementToContentSet($element, $type);
          // create element
          $this->info[$element] = HTMLPurifier_ElementDef::create(
              $content_model, $content_model_type, $attr
          );
          // literal object $contents means direct child manipulation
          if (!is_string($contents)) $this->info[$element]->child = $contents;
          return $this->info[$element];
      }

      /**
       * Convenience function that creates a totally blank, non-standalone
       * element. If the element is already defined in this module, an error
       * is triggered and the existing definition is returned unchanged.
       * @param $element Name of element to create
       * @return Created element
       */
      public function addBlankElement($element) {
          if (!isset($this->info[$element])) {
              $this->elements[] = $element;
              $this->info[$element] = new HTMLPurifier_ElementDef();
              $this->info[$element]->standalone = false;
          } else {
              trigger_error("Definition for $element already exists in module, cannot redefine");
          }
          return $this->info[$element];
      }

      /**
       * Convenience function that registers an element to a content set
       * @param Element to register
       * @param Name content set (warning: case sensitive, usually upper-case
       *        first letter)
       */
      public function addElementToContentSet($element, $type) {
          // members of a content set are joined with ' | '
          if (!isset($this->content_sets[$type])) $this->content_sets[$type] = '';
          else $this->content_sets[$type] .= ' | ';
          $this->content_sets[$type] .= $element;
      }

      /**
       * Convenience function that transforms single-string contents
       * into separate content model and content model type
       * @param $contents Allowed children in form of:
       *                  "$content_model_type: $content_model"
       * @return array($content_model_type, $content_model); the type is
       *         lower-cased and both parts are trimmed
       * @note If contents is an object, an array of two nulls will be
       *       returned, and the caller needs to take the original $contents
       *       and use it directly.
       */
      public function parseContents($contents) {
          if (!is_string($contents)) return array(null, null); // defer
          switch ($contents) {
              // check for shorthand content model forms
              case 'Empty':
                  return array('empty', '');
              case 'Inline':
                  return array('optional', 'Inline | #PCDATA');
              case 'Flow':
                  return array('optional', 'Flow | #PCDATA');
          }
          $parts = explode(':', $contents);
          $content_model_type = strtolower(trim($parts[0]));
          // A string without ':' has a type but no model: default the model
          // to '' instead of reading a missing array offset.
          $content_model = isset($parts[1]) ? trim($parts[1]) : '';
          return array($content_model_type, $content_model);
      }

      /**
       * Convenience function that merges a list of attribute includes into
       * an attribute array.
       * @param $attr Reference to attr array to modify
       * @param $attr_includes Array of includes / string include to merge in
       */
      public function mergeInAttrIncludes(&$attr, $attr_includes) {
          // normalize a single include (or an empty value) to an array
          if (!is_array($attr_includes)) {
              if (empty($attr_includes)) $attr_includes = array();
              else $attr_includes = array($attr_includes);
          }
          // the includes are stored under index 0 of the attribute array
          $attr[0] = $attr_includes;
      }

      /**
       * Convenience function that generates a lookup table with boolean
       * true as value.
       * @param $list List of values to turn into a lookup
       * @note You can also pass an arbitrary number of arguments in
       *       place of the regular argument
       * @return Lookup array equivalent of list
       */
      public function makeLookup($list) {
          // a string first argument means the values were passed variadically
          if (is_string($list)) $list = func_get_args();
          $ret = array();
          foreach ($list as $value) {
              if (is_null($value)) continue;
              $ret[$value] = true;
          }
          return $ret;
      }

      /**
       * Lazy load construction of the module after determining whether
       * or not it's needed, and also when a finalized configuration object
       * is available.
       * @param $config Instance of HTMLPurifier_Config
       */
      public function setup($config) {}

  }
  /**
   * Keeps track of registered doctypes and HTML modules, activates the
   * modules for the configured doctype, and produces merged element
   * definitions from them.
   */
  class HTMLPurifier_HTMLModuleManager
  {

      /**
       * Instance of HTMLPurifier_DoctypeRegistry
       */
      public $doctypes;

      /**
       * Instance of current doctype
       */
      public $doctype;

      /**
       * Instance of HTMLPurifier_AttrTypes
       */
      public $attrTypes;

      /**
       * Active instances of modules for the specified doctype are
       * indexed, by name, in this array.
       */
      public $modules = array();

      /**
       * Array of recognized HTMLPurifier_Module instances, indexed by
       * module's class name. This array is usually lazy loaded, but a
       * user can overload a module by pre-emptively registering it.
       */
      public $registeredModules = array();

      /**
       * List of extra modules that were added by the user using addModule().
       * These get unconditionally merged into the current doctype, whatever
       * it may be.
       */
      public $userModules = array();

      /**
       * Associative array of element name to list of modules that have
       * definitions for the element; this array is dynamically filled.
       */
      public $elementLookup = array();

      /** List of prefixes we should use for registering small names */
      public $prefixes = array('HTMLPurifier_HTMLModule_');

      /** Instance of HTMLPurifier_ContentSets */
      public $contentSets;

      /** Instance of HTMLPurifier_AttrCollections */
      public $attrCollections;

      /** If set to true, unsafe elements and attributes will be allowed */
      public $trusted = false;

      /**
       * Creates the attribute type and doctype registries and registers
       * the built-in doctypes with their module lists.
       */
      public function __construct() {

          // editable internal objects
          $this->attrTypes = new HTMLPurifier_AttrTypes();
          $this->doctypes  = new HTMLPurifier_DoctypeRegistry();

          // setup basic modules
          $common = array(
              'CommonAttributes', 'Text', 'Hypertext', 'List',
              'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
              'StyleAttribute',
              // Unsafe:
              'Scripting', 'Object', 'Forms',
              // Sorta legacy, but present in strict:
              'Name',
          );
          $transitional = array('Legacy', 'Target', 'Iframe');
          $xml = array('XMLCommonAttributes');
          $non_xml = array('NonXMLCommonAttributes');

          // setup basic doctypes
          $this->doctypes->register(
              'HTML 4.01 Transitional', false,
              array_merge($common, $transitional, $non_xml),
              array('Tidy_Transitional', 'Tidy_Proprietary'),
              array(),
              '-//W3C//DTD HTML 4.01 Transitional//EN',
              'http://www.w3.org/TR/html4/loose.dtd'
          );

          $this->doctypes->register(
              'HTML 4.01 Strict', false,
              array_merge($common, $non_xml),
              array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD HTML 4.01//EN',
              'http://www.w3.org/TR/html4/strict.dtd'
          );

          $this->doctypes->register(
              'XHTML 1.0 Transitional', true,
              array_merge($common, $transitional, $xml, $non_xml),
              array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD XHTML 1.0 Transitional//EN',
              'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
          );

          // NOTE(review): 'Tidy_Strict' is listed twice here and in the
          // XHTML 1.1 list below; presumably harmless, but confirm whether
          // the repetition is intentional.
          $this->doctypes->register(
              'XHTML 1.0 Strict', true,
              array_merge($common, $xml, $non_xml),
              array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD XHTML 1.0 Strict//EN',
              'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
          );

          $this->doctypes->register(
              'XHTML 1.1', true,
              // Iframe is a real XHTML 1.1 module, despite being
              // "transitional"!
              array_merge($common, $xml, array('Ruby', 'Iframe')),
              array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1
              array(),
              '-//W3C//DTD XHTML 1.1//EN',
              'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
          );

      }

      /**
       * Registers a module to the recognized module list, useful for
       * overloading pre-existing modules.
       * @param $module Mixed: string module name, with or without
       *                HTMLPurifier_HTMLModule prefix, or instance of
       *                subclass of HTMLPurifier_HTMLModule.
       * @param $overload Boolean whether or not to overload previous modules.
       *                  If this is not set, and you do overload a module,
       *                  HTML Purifier will complain with a warning.
       * @note This function will not call autoload, you must instantiate
       *       (and thus invoke) autoload outside the method.
       * @note If a string is passed as a module name, different variants
       *       will be tested in this order:
       *          - Check for HTMLPurifier_HTMLModule_$name
       *          - Check all prefixes with $name in order they were added
       *          - Check for literal object name
       *          - Throw fatal error
       *       If your object name collides with an internal class, specify
       *       your module manually. All modules must have been included
       *       externally: registerModule will not perform inclusions for you!
       */
      public function registerModule($module, $overload = false) {
          if (is_string($module)) {
              // attempt to load the module
              $original_module = $module;
              $ok = false;
              foreach ($this->prefixes as $prefix) {
                  $module = $prefix . $original_module;
                  if (class_exists($module)) {
                      $ok = true;
                      break;
                  }
              }
              if (!$ok) {
                  // no prefixed class matched: try the name as given
                  $module = $original_module;
                  if (!class_exists($module)) {
                      trigger_error($original_module . ' module does not exist',
                          E_USER_ERROR);
                      return;
                  }
              }
              $module = new $module();
          }
          if (empty($module->name)) {
              trigger_error('Module instance of ' . get_class($module) . ' must have name');
              return;
          }
          if (!$overload && isset($this->registeredModules[$module->name])) {
              trigger_error('Overloading ' . $module->name . ' without explicit overload parameter', E_USER_WARNING);
          }
          $this->registeredModules[$module->name] = $module;
      }

      /**
       * Adds a module to the current doctype by first registering it,
       * and then tacking it on to the active doctype
       */
      public function addModule($module) {
          $this->registerModule($module);
          // userModules stores names, not instances
          if (is_object($module)) $module = $module->name;
          $this->userModules[] = $module;
      }

      /**
       * Adds a class prefix that registerModule() will use to resolve a
       * string name to a concrete class
       */
      public function addPrefix($prefix) {
          $this->prefixes[] = $prefix;
      }

      /**
       * Performs processing on modules, after being called you may
       * use getElement() and getElements()
       * @param $config Instance of HTMLPurifier_Config
       */
      public function setup($config) {

          $this->trusted = $config->get('HTML.Trusted');

          // generate
          $this->doctype = $this->doctypes->make($config);
          $modules = $this->doctype->modules;

          // take out the default modules that aren't allowed
          $lookup = $config->get('HTML.AllowedModules');
          $special_cases = $config->get('HTML.CoreModules');

          if (is_array($lookup)) {
              foreach ($modules as $k => $m) {
                  // core modules are kept regardless of the allowed list
                  if (isset($special_cases[$m])) continue;
                  if (!isset($lookup[$m])) unset($modules[$k]);
              }
          }

          // custom modules
          if ($config->get('HTML.Proprietary')) {
              $modules[] = 'Proprietary';
          }
          if ($config->get('HTML.SafeObject')) {
              $modules[] = 'SafeObject';
          }
          if ($config->get('HTML.SafeEmbed')) {
              $modules[] = 'SafeEmbed';
          }
          if ($config->get('HTML.Nofollow')) {
              $modules[] = 'Nofollow';
          }
          if ($config->get('HTML.TargetBlank')) {
              $modules[] = 'TargetBlank';
          }

          // merge in custom modules
          $modules = array_merge($modules, $this->userModules);

          foreach ($modules as $module) {
              $this->processModule($module);
              $this->modules[$module]->setup($config);
          }

          foreach ($this->doctype->tidyModules as $module) {
              $this->processModule($module);
              $this->modules[$module]->setup($config);
          }

          // prepare any injectors
          foreach ($this->modules as $module) {
              $n = array();
              foreach ($module->info_injector as $i => $injector) {
                  // string entries are short class names; instantiate them
                  if (!is_object($injector)) {
                      $class = "HTMLPurifier_Injector_$injector";
                      $injector = new $class;
                  }
                  // re-key the list by injector name
                  $n[$injector->name] = $injector;
              }
              $module->info_injector = $n;
          }

          // setup lookup table based on all valid modules
          foreach ($this->modules as $module) {
              foreach ($module->info as $name => $def) {
                  if (!isset($this->elementLookup[$name])) {
                      $this->elementLookup[$name] = array();
                  }
                  $this->elementLookup[$name][] = $module->name;
              }
          }

          // note the different choice
          $this->contentSets = new HTMLPurifier_ContentSets(
              // content set assembly deals with all possible modules,
              // not just ones deemed to be "safe"
              $this->modules
          );
          $this->attrCollections = new HTMLPurifier_AttrCollections(
              $this->attrTypes,
              // there is no way to directly disable a global attribute,
              // but using AllowedAttributes or simply not including
              // the module in your custom doctype should be sufficient
              $this->modules
          );
      }

      /**
       * Takes a module and adds it to the active module collection,
       * registering it if necessary.
       * @param $module Module name, or a module instance (which is always
       *                (re-)registered and then activated under its name)
       */
      public function processModule($module) {
          if (is_object($module)) {
              // An object cannot be used as an array key, so register the
              // instance and continue with its name.
              $this->registerModule($module);
              $module = $module->name;
          } elseif (!isset($this->registeredModules[$module])) {
              $this->registerModule($module);
          }
          $this->modules[$module] = $this->registeredModules[$module];
      }

      /**
       * Retrieves merged element definitions.
       * @return Array of HTMLPurifier_ElementDef
       */
      public function getElements() {

          $elements = array();
          foreach ($this->modules as $module) {
              // unsafe modules contribute nothing unless trusted mode is on
              if (!$this->trusted && !$module->safe) continue;
              foreach ($module->info as $name => $v) {
                  if (isset($elements[$name])) continue;
                  $elements[$name] = $this->getElement($name);
              }
          }

          // remove dud elements, this happens when an element that
          // appeared to be safe actually wasn't
          foreach ($elements as $n => $v) {
              if ($v === false) unset($elements[$n]);
          }

          return $elements;

      }

      /**
       * Retrieves a single merged element definition
       * @param $name Name of element
       * @param $trusted Boolean trusted overriding parameter: set to true
       *                 if you want the full version of an element
       * @return Merged HTMLPurifier_ElementDef, or false if no usable
       *         definition exists for the element
       * @note Modules are iterated over twice: once in getElements() and
       *       once here.
       */
      public function getElement($name, $trusted = null) {

          if (!isset($this->elementLookup[$name])) {
              return false;
          }

          // setup global state variables
          $def = false;
          if ($trusted === null) $trusted = $this->trusted;

          // iterate through each module that has registered itself to this
          // element
          foreach($this->elementLookup[$name] as $module_name) {

              $module = $this->modules[$module_name];

              // refuse to create/merge from a module that is deemed unsafe--
              // pretend the module doesn't exist--when trusted mode is not on.
              if (!$trusted && !$module->safe) {
                  continue;
              }

              // clone is used because, ideally speaking, the original
              // definition should not be modified. Usually, this will
              // make no difference, but for consistency's sake
              $new_def = clone $module->info[$name];

              if (!$def && $new_def->standalone) {
                  $def = $new_def;
              } elseif ($def) {
                  // This will occur even if $new_def is standalone. In practice,
                  // this will usually result in a full replacement.
                  $def->mergeIn($new_def);
              } else {
                  // :TODO:
                  // non-standalone definitions that don't have a standalone
                  // to merge into could be deferred to the end
                  // HOWEVER, it is perfectly valid for a non-standalone
                  // definition to lack a standalone definition, even
                  // after all processing: this allows us to safely
                  // specify extra attributes for elements that may not be
                  // enabled all in one place. In particular, this might
                  // be the case for trusted elements. WARNING: care must
                  // be taken that the /extra/ definitions are all safe.
                  continue;
              }

              // attribute value expansions
              $this->attrCollections->performInclusions($def->attr);
              $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);

              // descendants_are_inline, for ChildDef_Chameleon
              if (is_string($def->content_model) &&
                  strpos($def->content_model, 'Inline') !== false) {
                  if ($name != 'del' && $name != 'ins') {
                      // this is for you, ins/del
                      $def->descendants_are_inline = true;
                  }
              }

              $this->contentSets->generateChildDef($def, $module);
          }

          // This can occur if there is a blank definition, but no base to
          // mix it in with
          if (!$def) return false;

          // add information on required attributes
          foreach ($def->attr as $attr_name => $attr_def) {
              if ($attr_def->required) {
                  $def->required_attr[] = $attr_name;
              }
          }

          return $def;

      }

  }
  /**
   * Component of HTMLPurifier_AttrContext that remembers every ID seen so far
   * so that duplicates ("dupes") can be rejected.
   * @note The constructor takes no $config or $context; use the static
   *       build() factory to obtain an instance that is already seeded with
   *       the configured blacklist.
   */
  class HTMLPurifier_IDAccumulator
  {

      /**
       * Lookup table of accumulated IDs: each known ID is a key mapped to true.
       * @public
       */
      public $ids = array();

      /**
       * Builds an IDAccumulator and seeds it with the %Attr.IDBlacklist
       * configuration value.
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context (not consulted here)
       * @return Fully initialized HTMLPurifier_IDAccumulator
       */
      public static function build($config, $context) {
          $accumulator = new self();
          $blacklist = $config->get('Attr.IDBlacklist');
          $accumulator->load($blacklist);
          return $accumulator;
      }

      /**
       * Registers an ID in the lookup table.
       * @param $id ID to be added.
       * @return Bool status, true if the ID was new, false if it is a dupe
       */
      public function add($id) {
          if (!isset($this->ids[$id])) {
              $this->ids[$id] = true;
              return true;
          }
          return false;
      }

      /**
       * Registers a whole list of IDs in the lookup table.
       * @param $array_of_ids Array of IDs to load
       * @note Duplicates are silently accepted by this function
       */
      public function load($array_of_ids) {
          foreach ($array_of_ids as $known_id) {
              $this->ids[$known_id] = true;
          }
      }

  }
  /**
   * Injects tokens into the document while parsing for well-formedness.
   * This enables "formatter-like" functionality such as auto-paragraphing,
   * smiley-ification and linkification to take place.
   *
   * Handlers make changes by assigning a new value to the $token reference
   * they receive. The accepted forms of that value are described in the
   * documentation of HTMLPurifier_Strategy_MakeWellFormed->processToken().
   *
   * @todo Allow injectors to request a re-run on their output. This
   *       would help if an operation is recursive.
   */
  abstract class HTMLPurifier_Injector
  {

      /**
       * Advisory name of injector, used for friendly error messages
       */
      public $name;

      /**
       * Instance of HTMLPurifier_HTMLDefinition
       */
      protected $htmlDefinition;

      /**
       * Reference to CurrentNesting variable in Context. This is an array
       * list of tokens that we are currently "inside"
       */
      protected $currentNesting;

      /**
       * Reference to InputTokens variable in Context. This is an array
       * list of the input tokens that are being processed.
       */
      protected $inputTokens;

      /**
       * Reference to InputIndex variable in Context. This is an integer
       * array index for $this->inputTokens that indicates what token
       * is currently being processed.
       */
      protected $inputIndex;

      /**
       * Array of elements and attributes this injector creates and therefore
       * need to be allowed by the definition. Takes form of
       * array('element' => array('attr', 'attr2'), 'element2')
       */
      public $needed = array();

      /**
       * Index of inputTokens to rewind to, or false when no rewind is pending.
       */
      protected $rewind = false;

      /**
       * Rewind to a spot to re-perform processing. This is useful if you
       * deleted a node, and now need to see if this change affected any
       * earlier nodes. Rewinding does not affect other injectors, and can
       * result in infinite loops if not used carefully.
       * @param $index Index of inputTokens to rewind to
       * @warning HTML Purifier will prevent you from fast-forwarding with this
       *          function.
       */
      public function rewind($index) {
          $this->rewind = $index;
      }

      /**
       * Hands back the pending rewind index and clears it in the same step.
       * @return Previously requested index, or false if none was requested
       */
      public function getRewind() {
          $pending = $this->rewind;
          $this->rewind = false;
          return $pending;
      }

      /**
       * Prepares the injector by giving it the config and context objects:
       * this allows references to important variables to be made within
       * the injector. This function also checks if the HTML environment
       * will work with the Injector (see checkNeeded()).
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return Boolean false if success, string of missing needed element/attribute if failure
       */
      public function prepare($config, $context) {
          $this->htmlDefinition = $config->getHTMLDefinition();
          // Even though this might fail, some unit tests ignore this and
          // still test checkNeeded, so be careful. Maybe get rid of that
          // dependency.
          $missing = $this->checkNeeded($config);
          if ($missing === false) {
              // only bind to the context variables once the environment is usable
              $this->currentNesting =& $context->get('CurrentNesting');
              $this->inputTokens    =& $context->get('InputTokens');
              $this->inputIndex     =& $context->get('InputIndex');
          }
          return $missing;
      }

      /**
       * This function checks if the HTML environment
       * will work with the Injector: if p tags are not allowed, the
       * Auto-Paragraphing injector should not be enabled.
       * @param $config Instance of HTMLPurifier_Config
       * @return Boolean false if success, string of missing needed element/attribute if failure
       */
      public function checkNeeded($config) {
          $definition = $config->getHTMLDefinition();
          foreach ($this->needed as $element => $attributes) {
              // bare list entries ('element2') carry the element name as the value
              if (is_int($element)) $element = $attributes;
              if (!isset($definition->info[$element])) return $element;
              if (is_array($attributes)) {
                  foreach ($attributes as $attr_name) {
                      if (!isset($definition->info[$element]->attr[$attr_name])) {
                          return $element . '.' . $attr_name;
                      }
                  }
              }
          }
          return false;
      }

      /**
       * Tests if the context node allows a certain element
       * @param $name Name of element to test for
       * @return True if element is allowed, false if it is not
       */
      public function allowsElement($name) {
          if (empty($this->currentNesting)) {
              // not inside any element: fall back to the configured parent
              $parent = $this->htmlDefinition->info_parent_def;
          } else {
              // the innermost open element is the last entry of the nesting list
              $innermost = count($this->currentNesting) - 1;
              $parent_token = $this->currentNesting[$innermost];
              $parent = $this->htmlDefinition->info[$parent_token->name];
          }
          $permitted = isset($parent->child->elements[$name]) && !isset($parent->excludes[$name]);
          if (!$permitted) {
              return false;
          }
          // check the remaining ancestors (everything above the parent) for exclusion
          $i = count($this->currentNesting) - 2;
          while ($i >= 0) {
              $ancestor = $this->currentNesting[$i];
              $ancestor_def = $this->htmlDefinition->info[$ancestor->name];
              if (isset($ancestor_def->excludes[$name])) return false;
              $i--;
          }
          return true;
      }

      /**
       * Iterator function, which starts with the next token and continues until
       * you reach the end of the input tokens.
       * @warning Please prevent previous references from interfering with this
       *          functions by setting $i = null beforehand!
       * @param &$i Current integer index variable for inputTokens
       * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
       * @return True if $current was set to a token, false once past the end
       */
      protected function forward(&$i, &$current) {
          if ($i !== null) {
              $i++;
          } else {
              $i = $this->inputIndex + 1;
          }
          if (isset($this->inputTokens[$i])) {
              $current = $this->inputTokens[$i];
              return true;
          }
          return false;
      }

      /**
       * Similar to forward(), but accepts a third parameter $nesting (which
       * should be initialized at 0) and stops when we hit the end tag
       * for the node $this->inputIndex starts in.
       */
      protected function forwardUntilEndToken(&$i, &$current, &$nesting) {
          if (!$this->forward($i, $current)) {
              return false;
          }
          if ($nesting === null) $nesting = 0;
          if ($current instanceof HTMLPurifier_Token_Start) {
              $nesting++;
              return true;
          }
          if ($current instanceof HTMLPurifier_Token_End) {
              // an end tag at depth zero closes the node we started in
              if ($nesting <= 0) return false;
              $nesting--;
          }
          return true;
      }

      /**
       * Iterator function, starts with the previous token and continues until
       * you reach the beginning of input tokens.
       * @warning Please prevent previous references from interfering with this
       *          functions by setting $i = null beforehand!
       * @param &$i Current integer index variable for inputTokens
       * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
       * @return True if $current was set to a token, false once before the start
       */
      protected function backward(&$i, &$current) {
          if ($i !== null) {
              $i--;
          } else {
              $i = $this->inputIndex - 1;
          }
          if ($i >= 0) {
              $current = $this->inputTokens[$i];
              return true;
          }
          return false;
      }

      /**
       * Initializes the iterator at the current position. Use in a do {} while;
       * loop to force the forward() and backward() functions to start at the
       * current location.
       * @warning Please prevent previous references from interfering with this
       *          functions by setting $i = null beforehand!
       * @param &$i Current integer index variable for inputTokens
       * @param &$current Current token variable. Do NOT use $token, as that variable is also a reference
       */
      protected function current(&$i, &$current) {
          if ($i === null) {
              $i = $this->inputIndex;
          }
          $current = $this->inputTokens[$i];
      }

      /**
       * Handler that is called when a text token is processed
       */
      public function handleText(&$token) {}

      /**
       * Handler that is called when a start or empty token is processed
       */
      public function handleElement(&$token) {}

      /**
       * Handler that is called when an end token is processed; by default it
       * simply forwards the token to notifyEnd().
       */
      public function handleEnd(&$token) {
          $this->notifyEnd($token);
      }

      /**
       * Notifier that is called when an end token is processed
       * @note This differs from handlers in that the token is read-only
       * @deprecated
       */
      public function notifyEnd($token) {}

  }
  /**
   * Represents a language and defines localizable string formatting and
   * other functions, as well as the localized messages for HTML Purifier.
   */
  class HTMLPurifier_Language
  {

      /**
       * ISO 639 language code of language. Prefers shortest possible version
       */
      public $code = 'en';

      /**
       * Fallback language code
       */
      public $fallback = false;

      /**
       * Array of localizable messages
       */
      public $messages = array();

      /**
       * Array of localizable error codes
       */
      public $errorNames = array();

      /**
       * True if no message file was found for this language, so English
       * is being used instead. Check this if you'd like to notify the
       * user that they've used a non-supported language.
       */
      public $error = false;

      /**
       * Has the language object been loaded yet?
       * @todo Make it private, fix usage in HTMLPurifier_LanguageTest
       */
      public $_loaded = false;

      /**
       * Instances of HTMLPurifier_Config and HTMLPurifier_Context
       */
      protected $config, $context;

      /**
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       */
      public function __construct($config, $context) {
          $this->config  = $config;
          $this->context = $context;
      }

      /**
       * Copies this language's data out of the language factory cache into
       * the object's own properties.
       * @note This is a lazy loader: it does nothing once $_loaded is set
       */
      public function load() {
          if ($this->_loaded) {
              return;
          }
          $factory = HTMLPurifier_LanguageFactory::instance();
          $factory->loadLanguage($this->code);
          // every key the factory knows about becomes a property of the same name
          foreach ($factory->keys as $property) {
              $this->$property = $factory->cache[$this->code][$property];
          }
          $this->_loaded = true;
      }

      /**
       * Retrieves a localised message.
       * @param $key string identifier of message
       * @return string localised message, or "[$key]" if there is none
       */
      public function getMessage($key) {
          if (!$this->_loaded) $this->load();
          return isset($this->messages[$key]) ? $this->messages[$key] : "[$key]";
      }

      /**
       * Retrieves a localised error name.
       * @param $int integer error number, corresponding to PHP's error
       *             reporting
       * @return string localised message, or "[Error: $int]" if there is none
       */
      public function getErrorName($int) {
          if (!$this->_loaded) $this->load();
          return isset($this->errorNames[$int]) ? $this->errorNames[$int] : "[Error: $int]";
      }

      /**
       * Converts an array list into a string readable representation,
       * joining items with the 'Item separator' message and using
       * 'Item separator last' before the final item.
       * @param $array List (zero-indexed array) of items to join
       * @return string joined items
       */
      public function listify($array) {
          $sep      = $this->getMessage('Item separator');
          $sep_last = $this->getMessage('Item separator last');
          $total  = count($array);
          $joined = '';
          for ($pos = 0; $pos < $total; $pos++) {
              if ($pos != 0) {
                  // the final gap uses the "last" separator, every other gap the regular one
                  $joined .= ($pos + 1 < $total) ? $sep : $sep_last;
              }
              $joined .= $array[$pos];
          }
          return $joined;
      }

      /**
       * Formats a localised message with passed parameters
       * @param $key string identifier of message
       * @param $args Parameters to substitute in. Scalars replace $N, lists
       *              replace $N with their listified form, associative arrays
       *              provide $N.Keys and $N.Values, and tokens provide
       *              $N.Name, $N.Data, $N.Serialized, $N.Compact and $N.Line.
       * @return string localised message
       * @todo Implement conditionals? Right now, some messages make
       *       reference to line numbers, but those aren't always available
       */
      public function formatMessage($key, $args = array()) {
          if (!$this->_loaded) $this->load();
          if (!isset($this->messages[$key])) return "[$key]";
          $template  = $this->messages[$key];
          $subst     = array();
          $generator = false;
          foreach ($args as $index => $arg) {
              $placeholder = '$' . $index;
              if (is_object($arg)) {
                  // objects other than tokens contribute no substitutions
                  if ($arg instanceof HTMLPurifier_Token) {
                      // factor this out some time
                      if (!$generator) $generator = $this->context->get('Generator');
                      if (isset($arg->name)) $subst[$placeholder . '.Name'] = $arg->name;
                      if (isset($arg->data)) $subst[$placeholder . '.Data'] = $arg->data;
                      $serialized = $generator->generateFromToken($arg);
                      $compact = $serialized;
                      // a more complex algorithm for compact representation
                      // could be introduced for all types of tokens. This
                      // may need to be factored out into a dedicated class
                      if (!empty($arg->attr)) {
                          $bare_token = clone $arg;
                          $bare_token->attr = array();
                          $compact = $generator->generateFromToken($bare_token);
                      }
                      $subst[$placeholder . '.Serialized'] = $serialized;
                      $subst[$placeholder . '.Compact']    = $compact;
                      $subst[$placeholder . '.Line'] = $arg->line ? $arg->line : 'unknown';
                  }
              } elseif (is_array($arg)) {
                  $keys = array_keys($arg);
                  if (array_keys($keys) === $keys) {
                      // list
                      $subst[$placeholder] = $this->listify($arg);
                  } else {
                      // associative array
                      // no $i implementation yet, sorry
                      $subst[$placeholder . '.Keys']   = $this->listify($keys);
                      $subst[$placeholder . '.Values'] = $this->listify(array_values($arg));
                  }
              } else {
                  $subst[$placeholder] = $arg;
              }
          }
          return strtr($template, $subst);
      }

  }
  /**
   * Class responsible for generating HTMLPurifier_Language objects, managing
   * caching and fallbacks.
   * @note Thanks to MediaWiki for the general logic, although this version
   *       has been entirely rewritten
   * @todo Serialized cache for languages
   */
  class HTMLPurifier_LanguageFactory
  {

      /**
       * Cache of language code information used to load HTMLPurifier_Language objects
       * Structure is: $factory->cache[$language_code][$key] = $value
       * @value array map
       */
      public $cache;

      /**
       * Valid keys in the HTMLPurifier_Language object. Designates which
       * variables to slurp out of a message file.
       * @value array list
       */
      public $keys = array('fallback', 'messages', 'errorNames');

      /**
       * Instance of HTMLPurifier_AttrDef_Lang to validate language codes
       * @value object HTMLPurifier_AttrDef_Lang
       */
      protected $validator;

      /**
       * Base directory (HTMLPURIFIER_PREFIX . '/HTMLPurifier') under which
       * the Language/ files are looked up, without trailing slash
       * @value string filename
       */
      protected $dir;

      /**
       * Keys whose contents are a hash map and can be merged
       * @value array lookup
       */
      protected $mergeable_keys_map = array('messages' => true, 'errorNames' => true);

      /**
       * Keys whose contents are a list and can be merged
       * @value array lookup
       */
      protected $mergeable_keys_list = array();

      /**
       * Retrieve sole instance of the factory.
       * @param $prototype Optional prototype to overload sole instance with,
       *                   or bool true to reset to default factory.
       * @return The current sole instance
       */
      public static function instance($prototype = null) {
          static $instance = null;
          if ($prototype !== null && $prototype !== true) {
              $instance = $prototype;
          } elseif ($instance === null || $prototype === true) {
              // Boolean true must be tested before it can be mistaken for a
              // prototype, otherwise the "reset" request would install the
              // value true itself as the singleton.
              $instance = new HTMLPurifier_LanguageFactory();
              $instance->setup();
          }
          return $instance;
      }

      /**
       * Sets up the singleton, much like a constructor
       * @note Prevents people from getting this outside of the singleton
       */
      public function setup() {
          $this->validator = new HTMLPurifier_AttrDef_Lang();
          $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
      }

      /**
       * Creates a language object, handles class fallbacks
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @param $code Code to override configuration with. Private parameter.
       * @return HTMLPurifier_Language (or language-specific subclass) whose
       *         $code is the validated language code; $error is set to true
       *         when no fallback was configured and English had to be used
       */
      public function create($config, $context, $code = false) {

          // validate language code
          $requested = ($code === false) ? $config->get('Core.Language') : $code;
          $code = $this->validator->validate($requested, $config, $context);
          if ($code === false) $code = 'en'; // malformed code becomes English

          $pcode = str_replace('-', '_', $code); // make valid PHP classname

          if ($code == 'en') {
              $lang = new HTMLPurifier_Language($config, $context);
          } else {
              $class = 'HTMLPurifier_Language_' . $pcode;
              $file  = $this->dir . '/Language/classes/' . $code . '.php';
              if (file_exists($file) || class_exists($class, false)) {
                  $lang = new $class($config, $context);
              } else {
                  // Go fallback
                  $raw_fallback = $this->getFallbackFor($code);
                  $fallback = $raw_fallback ? $raw_fallback : 'en';
                  $lang = $this->create($config, $context, $fallback);
                  if (!$raw_fallback) {
                      $lang->error = true;
                  }
              }
          }

          // the object always reports the code that was asked for, even when
          // a fallback class supplied the implementation
          $lang->code = $code;

          return $lang;

      }

      /**
       * Returns the fallback language for language
       * @note Loads the original language into cache
       * @param $code string language code
       */
      public function getFallbackFor($code) {
          $this->loadLanguage($code);
          return $this->cache[$code]['fallback'];
      }

      /**
       * Loads language into the cache, handles message file and fallbacks
       * @param $code string language code
       */
      public function loadLanguage($code) {
          // Languages whose fallback chain is being resolved right now. An
          // entry is removed again once its chain has been loaded, so the
          // guard only fires on a genuine cycle and never on a later,
          // independent load of the same code (e.g. by another factory).
          static $languages_seen = array();

          // abort if we've already loaded it
          if (isset($this->cache[$code])) return;

          // generate filename
          $filename = $this->dir . '/Language/messages/' . $code . '.php';

          // default fallback : may be overwritten by the ensuing include
          $fallback = ($code != 'en') ? 'en' : false;

          // load primary localisation
          if (!file_exists($filename)) {
              // skip the include: will rely solely on fallback
              $cache = array();
          } else {
              // the message file assigns the variables named in $this->keys
              // directly into this scope; compact() then collects them
              include $filename;
              $cache = compact($this->keys);
          }

          // load fallback localisation
          if (!empty($fallback)) {

              // infinite recursion guard
              if (isset($languages_seen[$code])) {
                  trigger_error('Circular fallback reference in language ' .
                      $code, E_USER_ERROR);
                  $fallback = 'en';
              }
              $languages_seen[$code] = true;

              // load the fallback recursively
              $this->loadLanguage($fallback);
              $fallback_cache = $this->cache[$fallback];

              // chain resolved: this language is no longer "in progress"
              unset($languages_seen[$code]);

              // merge fallback with current language
              foreach ( $this->keys as $key ) {
                  if (isset($cache[$key]) && isset($fallback_cache[$key])) {
                      if (isset($this->mergeable_keys_map[$key])) {
                          // own entries win, fallback fills the gaps
                          $cache[$key] = $cache[$key] + $fallback_cache[$key];
                      } elseif (isset($this->mergeable_keys_list[$key])) {
                          $cache[$key] = array_merge( $fallback_cache[$key], $cache[$key] );
                      }
                  } else {
                      $cache[$key] = $fallback_cache[$key];
                  }
              }

          }

          // save to cache for later retrieval
          $this->cache[$code] = $cache;

          return;
      }

  }
  /**
   * Represents a measurable length, with a string numeric magnitude
   * and a unit. Callers should treat the object as immutable; the only
   * internal changes are the normalisations performed during validation.
   */
  class HTMLPurifier_Length
  {

      /**
       * String numeric magnitude.
       */
      protected $n;

      /**
       * String unit. False is permitted if $n = 0.
       */
      protected $unit;

      /**
       * Whether or not this length is valid. Null if not calculated yet.
       */
      protected $isValid;

      /**
       * Lookup array of units recognized by CSS 2.1
       */
      protected static $allowedUnits = array(
          'em' => true, 'ex' => true, 'px' => true, 'in' => true,
          'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true
      );

      /**
       * @param number $n Magnitude
       * @param string $u Unit, or false for a unitless length
       */
      public function __construct($n = '0', $u = false) {
          $this->n = (string) $n;
          $this->unit = $u !== false ? (string) $u : false;
      }

      /**
       * Splits a string into its leading numeric part and the remaining unit.
       * @param string $s Unit string, like '2em' or '3.4in'; an existing
       *                  HTMLPurifier_Length is returned unchanged
       * @return HTMLPurifier_Length
       * @warning Does not perform validation.
       */
      public static function make($s) {
          if ($s instanceof HTMLPurifier_Length) return $s;
          $n_length = strspn($s, '1234567890.+-');
          $n = substr($s, 0, $n_length);
          $unit = substr($s, $n_length);
          if ($unit === '') $unit = false;
          return new HTMLPurifier_Length($n, $unit);
      }

      /**
       * Validates the number and unit, normalising both in the process
       * (signed zero becomes '0', the unit is lower-cased, and the magnitude
       * is replaced by the number validator's output).
       * @return bool true if the length is valid
       */
      protected function validate() {
          // Special case:
          if ($this->n === '+0' || $this->n === '-0') $this->n = '0';
          if ($this->n === '0' && $this->unit === false) return true;
          // Only zero may go without a unit; bail out before handing a
          // boolean to the string functions below.
          if ($this->unit === false) return false;
          if (!ctype_lower($this->unit)) $this->unit = strtolower($this->unit);
          if (!isset(self::$allowedUnits[$this->unit])) return false;
          // Hack:
          $def = new HTMLPurifier_AttrDef_CSS_Number();
          $result = $def->validate($this->n, false, false);
          if ($result === false) return false;
          $this->n = $result;
          return true;
      }

      /**
       * Returns string representation of number.
       * @return string magnitude followed by unit, or false if invalid
       */
      public function toString() {
          if (!$this->isValid()) return false;
          return $this->n . $this->unit;
      }

      /**
       * Retrieves string numeric magnitude.
       */
      public function getN() {return $this->n;}

      /**
       * Retrieves string unit.
       */
      public function getUnit() {return $this->unit;}

      /**
       * Returns true if this length unit is valid. The result is computed
       * once and cached.
       */
      public function isValid() {
          if ($this->isValid === null) $this->isValid = $this->validate();
          return $this->isValid;
      }

      /**
       * Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
       * @param $l HTMLPurifier_Length to compare against, or false
       * @return int 1, -1 or 0; false if $l is false or cannot be converted
       *         to this length's unit
       * @warning If both values are too large or small, this calculation will
       *          not work properly
       */
      public function compareTo($l) {
          if ($l === false) return false;
          if ($l->unit !== $this->unit) {
              $converter = new HTMLPurifier_UnitConverter();
              $l = $converter->convert($l, $this->unit);
              if ($l === false) return false;
          }
          // Reduce the difference to its sign: a raw fractional difference
          // such as 0.3 would be truncated to 0 ("equal") by any caller that
          // casts the result to an integer.
          $difference = $this->n - $l->n;
          if ($difference > 0) return 1;
          if ($difference < 0) return -1;
          return 0;
      }

  }
  5377. /**
  5378. * Forgivingly lexes HTML (SGML-style) markup into tokens.
  5379. *
  5380. * A lexer parses a string of SGML-style markup and converts them into
  5381. * corresponding tokens. It doesn't check for well-formedness, although its
  5382. * internal mechanism may make this automatic (such as the case of
  5383. * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
  5384. * from.
  5385. *
  5386. * A lexer is HTML-oriented: it might work with XML, but it's not
  5387. * recommended, as we adhere to a subset of the specification for optimization
  5388. * reasons. This might change in the future. Also, most tokenizers are not
  5389. * expected to handle DTDs or PIs.
  5390. *
  5391. * This class should not be directly instantiated, but you may use create() to
  5392. * retrieve a default copy of the lexer. Being a supertype, this class
  5393. * does not actually define any implementation, but offers commonly used
  5394. * convenience functions for subclasses.
  5395. *
  5396. * @note The unit tests will instantiate this class for testing purposes, as
  5397. * many of the utility functions require a class to be instantiated.
  5398. * This means that, even though this class is not runnable, it will
  5399. * not be declared abstract.
  5400. *
  5401. * @par
  5402. *
  5403. * @note
  5404. * We use tokens rather than create a DOM representation because DOM would:
  5405. *
  5406. * @par
  5407. * -# Require more processing and memory to create,
  5408. * -# Is not streamable, and
  5409. * -# Has the entire document structure (html and body not needed).
  5410. *
  5411. * @par
  5412. * However, DOM is helpful in that it makes it easy to move around nodes
  5413. * without a lot of lookaheads to see when a tag is closed. This is a
  5414. * limitation of the token system and some workarounds would be nice.
  5415. */
  5416. class HTMLPurifier_Lexer
  5417. {
  5418. /**
  5419. * Whether or not this lexer implements line-number/column-number tracking.
  5420. * If it does, set to true.
  5421. */
  5422. public $tracksLineNumbers = false;
  5423. // -- STATIC ----------------------------------------------------------
  /**
   * Retrieves or sets the default Lexer as a Prototype Factory.
   *
   * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
   * a few exceptions involving special features that only DirectLex
   * implements.
   *
   * @note The behavior of this class has changed, rather than accepting
   *       a prototype object, it now accepts a configuration object.
   *       To specify your own prototype, set %Core.LexerImpl to it.
   *       This change in behavior de-singletonizes the lexer object.
   *
   * @note Passing a prototype is still tolerated but raises an
   *       E_USER_WARNING. In that case there is no configuration to read,
   *       so line-number tracking is treated as not requested.
   *
   * @param $config Instance of HTMLPurifier_Config
   * @return Concrete lexer.
   * @throws HTMLPurifier_Exception if the lexer name is not recognized, or
   *         if line-number tracking is required and the chosen lexer does
   *         not support it
   */
  public static function create($config) {

      $needs_tracking = false;
      if ($config instanceof HTMLPurifier_Config) {
          $lexer = $config->get('Core.LexerImpl');
          $needs_tracking =
              $config->get('Core.MaintainLineNumbers') ||
              $config->get('Core.CollectErrors');
      } else {
          // Deprecated path: the argument is the lexer (or lexer name)
          // itself, so it must not be queried like a configuration object.
          $lexer = $config;
          trigger_error(
              "Passing a prototype to\n" .
              "HTMLPurifier_Lexer::create() is deprecated, please instead\n" .
              "use %Core.LexerImpl",
              E_USER_WARNING
          );
      }

      $inst = null;
      if (is_object($lexer)) {
          $inst = $lexer;
      } else {

          if (is_null($lexer)) {
              // auto-detection algorithm
              if ($needs_tracking) {
                  $lexer = 'DirectLex';
              } elseif (
                  class_exists('DOMDocument') &&
                  method_exists('DOMDocument', 'loadHTML') &&
                  !extension_loaded('domxml')
              ) {
                  // check for DOM support, because while it's part of the
                  // core, it can be disabled compile time. Also, the PECL
                  // domxml extension overrides the default DOM, and is evil
                  // and nasty and we shan't bother to support it
                  $lexer = 'DOMLex';
              } else {
                  $lexer = 'DirectLex';
              }
          }

          // instantiate recognized string names
          switch ($lexer) {
              case 'DOMLex':
                  $inst = new HTMLPurifier_Lexer_DOMLex();
                  break;
              case 'DirectLex':
                  $inst = new HTMLPurifier_Lexer_DirectLex();
                  break;
              case 'PH5P':
                  $inst = new HTMLPurifier_Lexer_PH5P();
                  break;
              default:
                  throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
          }
      }

      if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');

      // once PHP DOM implements native line numbers, or we
      // hack out something using XSLT, remove this stipulation
      if ($needs_tracking && !$inst->tracksLineNumbers) {
          throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
      }

      return $inst;

  }
  5498. // -- CONVENIENCE MEMBERS ---------------------------------------------
  /**
   * Creates an HTMLPurifier_EntityParser and stores it in
   * $this->_entity_parser.
   */
  public function __construct() {
      $entity_parser = new HTMLPurifier_EntityParser();
      $this->_entity_parser = $entity_parser;
  }
  5502. /**
  5503. * Most common entity to raw value conversion table for special entities.
  5504. */
  5505. protected $_special_entity2str =
  5506. array(
  5507. '&quot;' => '"',
  5508. '&amp;' => '&',
  5509. '&lt;' => '<',
  5510. '&gt;' => '>',
  5511. '&#39;' => "'",
  5512. '&#039;' => "'",
  5513. '&#x27;' => "'"
  5514. );
  5515. /**
  5516. * Parses special entities into the proper characters.
  5517. *
  5518. * This string will translate escaped versions of the special characters
  5519. * into the correct ones.
  5520. *
  5521. * @warning
  5522. * You should be able to treat the output of this function as
  5523. * completely parsed, but that's only because all other entities should
  5524. * have been handled previously in substituteNonSpecialEntities()
  5525. *
  5526. * @param $string String character data to be parsed.
  5527. * @returns Parsed character data.
  5528. */
  public function parseData($string)
  {
      // Nothing to translate in an empty string.
      if ($string === '') {
          return '';
      }

      // Count the ampersands that could begin an entity: discount every
      // "& " pair and an ampersand sitting at the very end of the string.
      $trailingAmp   = (substr($string, -1) === '&') ? 1 : 0;
      $candidateAmps = substr_count($string, '&')
          - substr_count($string, '& ')
          - $trailingAmp;
      if (!$candidateAmps) {
          return $string; // no entity can be present
      }

      // Every "&amp;" turns into a literal "&" below, so remember how many
      // of those to expect afterwards.
      $escapedAmps = substr_count($string, '&amp;');

      // Fast path: replace the common entities with one table lookup.
      $string = strtr($string, $this->_special_entity2str);

      // Same count again on the translated string (kept inline on purpose,
      // this method is optimized for speed).
      $trailingAmp   = (substr($string, -1) === '&') ? 1 : 0;
      $remainingAmps = substr_count($string, '&')
          - substr_count($string, '& ')
          - $trailingAmp;
      if ($remainingAmps <= $escapedAmps) {
          return $string;
      }

      // Ampersands remain that the table did not explain: hand the string
      // to the entity parser for the uncommon forms.
      return $this->_entity_parser->substituteSpecialEntities($string);
  }
  5546. /**
  5547. * Lexes an HTML string into tokens.
  5548. *
  5549. * @param $string String HTML.
  5550. * @return HTMLPurifier_Token array representation of HTML.
  5551. */
  public function tokenizeHTML($string, $config, $context)
  {
      // This base implementation performs no lexing at all; it only raises
      // a user-level error and returns nothing.
      $message = 'Call to abstract class';
      trigger_error($message, E_USER_ERROR);
  }
  5555. /**
  5556. * Translates CDATA sections into regular sections (through escaping).
  5557. *
  5558. * @param $string HTML string to process.
  5559. * @returns HTML with CDATA sections escaped.
  5560. */
  protected static function escapeCDATA($string)
  {
      // Non-greedy capture with the "s" flag: each <![CDATA[ ... ]]> section
      // (which may span lines) is matched on its own and must be non-empty.
      $pattern = '/<!\[CDATA\[(.+?)\]\]>/s';
      $handler = array('HTMLPurifier_Lexer', 'CDATACallback');

      return preg_replace_callback($pattern, $handler, $string);
  }
  5568. /**
  5569. * Special CDATA case that is especially convoluted for <script>
  5570. */
  protected static function escapeCommentedCDATA($string)
  {
      // Matches the comment-wrapped CDATA form
      //   <!--//--><![CDATA[//><!-- ... //--><!]]>
      // and passes the inner part to the same callback as escapeCDATA().
      $pattern = '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s';
      $handler = array('HTMLPurifier_Lexer', 'CDATACallback');

      return preg_replace_callback($pattern, $handler, $string);
  }
  5578. /**
  5579. * Special Internet Explorer conditional comments should be removed.
  5580. */
  protected static function removeIEConditional($string)
  {
      // Case-insensitive, dot-matches-newline; the whole conditional comment
      // including its contents is replaced by nothing.
      // (probably should generalize for all strings)
      $pattern = '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si';

      return preg_replace($pattern, '', $string);
  }
  5588. /**
  5589. * Callback function for escapeCDATA() that does the work.
  5590. *
  5591. * @warning This exists only as the preg_replace_callback() target;
  5592. * calling it directly is not recommended.
  5593. * @param $matches PCRE matches array, with index 0 the entire match
  5594. * and 1 the inside of the CDATA section.
  5595. * @returns Escaped internals of the CDATA section.
  5596. */
  protected static function CDATACallback($matches)
  {
      // Index 1 holds the captured inside of the CDATA section.
      $inner = $matches[1];

      // not exactly sure why the character set is needed, but whatever
      return htmlspecialchars($inner, ENT_COMPAT, 'UTF-8');
  }
  5601. /**
  5602. * Takes a piece of HTML and normalizes it by converting entities, fixing
  5603. * encoding, extracting bits, and other good stuff.
  5604. * @todo Consider making protected
  5605. */
  public function normalize($html, $config, $context)
  {
      // Newlines: CRLF first, then any remaining bare CR, both become LF.
      if ($config->get('Core.NormalizeNewlines')) {
          $html = str_replace(array("\r\n", "\r"), "\n", $html);
      }

      // The comment-wrapped CDATA form is only handled for trusted HTML.
      if ($config->get('HTML.Trusted')) {
          $html = $this->escapeCommentedCDATA($html);
      }

      // Plain CDATA sections are escaped, IE conditional comments dropped.
      $html = $this->escapeCDATA($html);
      $html = $this->removeIEConditional($html);

      // Reduce a full document to the contents of its <body>, if asked to.
      if ($config->get('Core.ConvertDocumentToFragment')) {
          $collector = false;
          if ($config->get('Core.CollectErrors')) {
              $collector =& $context->get('ErrorCollector');
          }
          $body = $this->extractBody($html);
          if ($collector && $body != $html) {
              // only report when the extraction actually changed something
              $collector->send(E_WARNING, 'Lexer: Extracted body');
          }
          $html = $body;
      }

      // Expand every entity except the special ones handled by parseData().
      $html = $this->_entity_parser->substituteNonSpecialEntities($html);

      // Clean into well-formed UTF-8 for an SGML context. This must follow
      // entity expansion, because entities sometimes stand for non-SGML
      // characters.
      $html = HTMLPurifier_Encoder::cleanUTF8($html);

      // Strip processing instructions (<? ... ?>) when configured.
      if ($config->get('Core.RemoveProcessingInstructions')) {
          $html = preg_replace('#<\?.+?\?>#s', '', $html);
      }

      return $html;
  }
  5643. /**
  5644. * Takes a string of HTML (fragment or document) and returns the content
  5644. * between <body> and </body>, or the input unchanged if there is no body.
  5645. * @todo Consider making protected
  5646. */
  public function extractBody($html)
  {
      // Greedy, case-insensitive, dot-matches-newline: captures everything
      // between the first <body ...> and the last </body>.
      $found = array();
      if (preg_match('!<body[^>]*>(.*)</body>!is', $html, $found)) {
          return $found[1];
      }

      // No body element: the input is treated as a fragment already.
      return $html;
  }
  5656. }
  5657. /**
  5658. * Class that handles operations involving percent-encoding in URIs.
  5659. *
  5660. * @warning
  5661. * Be careful when reusing instances of PercentEncoder. The object
  5662. * you use for normalize() SHOULD NOT be used for encode(), or
  5663. * vice-versa.
  5664. */
  class HTMLPurifier_PercentEncoder
  {

      /**
       * Lookup of byte values (ord => true) that encode() leaves alone and
       * that normalize() decodes back to the literal character.
       */
      protected $preserve = array();

      /**
       * Builds the preserve table: the unreserved characters are always in
       * it, plus every byte of $preserve when one is given.
       * @param $preserve String of extra characters to preserve, or false
       */
      public function __construct($preserve = false) {
          // unreserved characters: 0-9, A-Z, a-z, then - . _ ~
          $unreserved = array_merge(
              range(48, 57),
              range(65, 90),
              range(97, 122),
              array(45, 46, 95, 126)
          );
          foreach ($unreserved as $ord) {
              $this->preserve[$ord] = true;
          }

          // extra characters requested by the caller
          if ($preserve !== false) {
              for ($pos = 0, $len = strlen($preserve); $pos < $len; $pos++) {
                  $this->preserve[ord($preserve[$pos])] = true;
              }
          }
      }

      /**
       * Replacement for urlencode: percent-encodes every byte that is not
       * in the preserve table.
       * @note
       *      Percent signs are always copied through untouched, so the
       *      input is assumed to be normalized already (all of its escape
       *      sequences valid).
       * @param $string String to be encoded
       * @return Encoded string.
       */
      public function encode($string) {
          $encoded = '';
          for ($pos = 0, $len = strlen($string); $pos < $len; $pos++) {
              $char = $string[$pos];
              if ($char === '%' || isset($this->preserve[ord($char)])) {
                  $encoded .= $char;
              } else {
                  $encoded .= '%' . sprintf('%02X', ord($char));
              }
          }
          return $encoded;
      }

      /**
       * Fixes up percent-encoding: stray percent signs become %25, escapes
       * of preserved characters are decoded, and the remaining escapes get
       * upper-case hex digits.
       * @warning This method reads the same preserve table as encode(), so
       *          extra characters passed to the constructor are decoded
       *          here too. Be careful when reusing instances!
       * @param $string String to normalize
       */
      public function normalize($string) {
          if ($string == '') return '';

          $pieces = explode('%', $string);
          // everything before the first percent sign is copied verbatim
          $result = array_shift($pieces);

          foreach ($pieces as $piece) {
              // A percent sign not followed by two hex digits is a literal
              // percent: escape the sign itself.
              if (strlen($piece) < 2 || !ctype_xdigit(substr($piece, 0, 2))) {
                  $result .= '%25' . $piece;
                  continue;
              }

              $hex  = substr($piece, 0, 2);
              $rest = substr($piece, 2);
              $code = hexdec($hex);

              if (isset($this->preserve[$code])) {
                  // needlessly escaped character: decode it
                  $result .= chr($code) . $rest;
              } else {
                  // valid escape: keep it, with canonical upper-case hex
                  $result .= '%' . strtoupper($hex) . $rest;
              }
          }
          return $result;
      }

  }
  5745. /**
  5746. * Generic property list implementation
  5747. */
  class HTMLPurifier_PropertyList
  {
      /**
       * Properties set directly on this list, keyed by name.
       */
      protected $data = array();

      /**
       * Parent plist, consulted by get() and squash() for names that are
       * not set on this list. May be null.
       */
      protected $parent;

      /**
       * Result of the last squash() call; null until squash() has run.
       */
      protected $cache;

      /**
       * @param $parent Optional parent plist to fall back on
       */
      public function __construct($parent = null) {
          $this->parent = $parent;
      }

      /**
       * Recursively retrieves the value for a key: this list first, then
       * the chain of parents.
       * @throws HTMLPurifier_Exception when no list in the chain has the key
       */
      public function get($name) {
          if ($this->has($name)) return $this->data[$name];
          // possible performance bottleneck, convert to iterative if necessary
          if ($this->parent) return $this->parent->get($name);
          throw new HTMLPurifier_Exception("Key '$name' not found");
      }

      /**
       * Sets the value of a key, for this plist only.
       */
      public function set($name, $value) {
          $this->data[$name] = $value;
      }

      /**
       * Returns true if a given key is set on this plist itself (parents
       * are not consulted). A key whose value is null still counts.
       */
      public function has($name) {
          return array_key_exists($name, $this->data);
      }

      /**
       * Resets a value to the value of its parent, usually the default. If
       * no name is given, the entire plist is reset.
       * @param $name Key to remove from this list, or null for all keys
       */
      public function reset($name = null) {
          // Strict comparison: only an omitted/null name clears everything.
          // A loose "== null" would also match '' and 0, so resetting a key
          // with such a name would silently wipe the whole list.
          if ($name === null) {
              $this->data = array();
          } else {
              unset($this->data[$name]);
          }
      }

      /**
       * Squashes this property list and all of its parents into a single
       * array (values on this list win) and returns it. The result is
       * cached; set() and reset() do not clear that cache, so pass $force
       * after changing values.
       * @param $force If true, ignores the cache and regenerates the array.
       */
      public function squash($force = false) {
          if ($this->cache !== null && !$force) return $this->cache;
          if ($this->parent) {
              return $this->cache = array_merge($this->parent->squash($force), $this->data);
          } else {
              return $this->cache = $this->data;
          }
      }

      /**
       * Returns the parent plist.
       */
      public function getParent() {
          return $this->parent;
      }

      /**
       * Sets the parent plist.
       */
      public function setParent($plist) {
          $this->parent = $plist;
      }
  }
  5817. /**
  5818. * Property list iterator. Do not instantiate this class directly.
  5819. */
  class HTMLPurifier_PropertyListIterator extends FilterIterator
  {

      /**
       * Length of the prefix in $filter; 0 when no filter was given.
       */
      protected $l;

      /**
       * Key prefix an entry must start with to be accepted, or null.
       */
      protected $filter;

      /**
       * @param $iterator Iterator over the data to filter
       * @param $filter Optional prefix; only entries whose key starts with
       *                it are let through. Null accepts every entry.
       */
      public function __construct(Iterator $iterator, $filter = null) {
          parent::__construct($iterator);
          // strlen(null) is deprecated on PHP 8.1+, so the "no filter" case
          // is handled explicitly; it yields the same length of 0.
          $this->l = ($filter === null) ? 0 : strlen($filter);
          $this->filter = $filter;
      }

      /**
       * Accepts the current entry when its key begins with the prefix.
       * @return bool
       */
      #[\ReturnTypeWillChange]
      public function accept() {
          // An empty prefix matches every key (strncmp() over 0 bytes is
          // always equal); returning early avoids handing null to strncmp().
          if ($this->l === 0) {
              return true;
          }
          $key = (string) $this->getInnerIterator()->key();
          return strncmp($key, $this->filter, $this->l) === 0;
      }

  }
  5841. /**
  5842. * Supertype for classes that define a strategy for modifying/purifying tokens.
  5843. *
  5844. * While HTMLPurifier's core purpose is fixing HTML into something proper,
  5845. * strategies provide plug points for extra configuration or even extra
  5846. * features, such as custom tags, custom parsing of text, etc.
  5847. */
  abstract class HTMLPurifier_Strategy
  {

      /**
       * Runs this strategy over a token stream.
       *
       * @param $tokens Array of HTMLPurifier_Token objects to be operated on.
       * @param $config Configuration options
       * @param $context Context object passed along with the configuration
       * @returns Processed array of token objects.
       */
      abstract public function execute($tokens, $config, $context);

  }
  5859. /**
  5860. * This is in almost every respect equivalent to an array except
  5861. * that it keeps track of which keys were accessed.
  5862. *
  5863. * @warning For the sake of backwards compatibility with early versions
  5864. * of PHP 5, you must not use the $hash[$key] syntax; if you do
  5865. * our version of offsetGet is never called.
  5866. */
  class HTMLPurifier_StringHash extends ArrayObject
  {
      /**
       * Lookup of every index read through offsetGet(), as $index => true.
       */
      protected $accessed = array();

      /**
       * Retrieves a value, and logs the access.
       *
       * The attribute below keeps PHP 8.1+ from reporting a deprecation for
       * overriding ArrayObject::offsetGet() without its declared return
       * type; older PHP versions parse the line as a comment.
       *
       * @param $index Index to read
       * @return Value stored at $index
       */
      #[\ReturnTypeWillChange]
      public function offsetGet($index) {
          $this->accessed[$index] = true;
          return parent::offsetGet($index);
      }

      /**
       * Returns a lookup array of all array indexes that have been accessed.
       * @return Array in form array($index => true).
       */
      public function getAccessed() {
          return $this->accessed;
      }

      /**
       * Resets the access array.
       */
      public function resetAccessed() {
          $this->accessed = array();
      }
  }
  5891. /**
  5892. * Parses string hash files. File format is as such:
  5893. *
  5894. * DefaultKeyValue
  5895. * KEY: Value
  5896. * KEY2: Value2
  5897. * --MULTILINE-KEY--
  5898. * Multiline
  5899. * value.
  5900. *
  5901. * Which would output something similar to:
  5902. *
  5903. * array(
  5904. * 'ID' => 'DefaultKeyValue',
  5905. * 'KEY' => 'Value',
  5906. * 'KEY2' => 'Value2',
  5907. * 'MULTILINE-KEY' => "Multiline\nvalue.\n",
  5908. * )
  5909. *
  5910. * We use this as an easy to use file-format for configuration schema
  5911. * files, but the class itself is usage agnostic.
  5912. *
  5913. * You can use ---- to forcibly terminate parsing of a single string-hash;
  5914. * this marker is used in multi string-hashes to delimit boundaries.
  5915. */
  class HTMLPurifier_StringHashParser
  {

      /**
       * Key under which a line without a "KEY:" prefix is stored.
       */
      public $default = 'ID';

      /**
       * Parses a file that contains a single string-hash.
       * @param $file Path of the file to read
       * @return Parsed array, or false if the file is missing or unreadable
       */
      public function parseFile($file) {
          if (!file_exists($file)) {
              return false;
          }
          $handle = fopen($file, 'r');
          if (!$handle) {
              return false;
          }
          $hash = $this->parseHandle($handle);
          fclose($handle);
          return $hash;
      }

      /**
       * Parses a file that contains multiple string-hashes delimited by '----'
       * @param $file Path of the file to read
       * @return List of parsed arrays, or false if the file is missing or
       *         unreadable
       */
      public function parseMultiFile($file) {
          if (!file_exists($file)) {
              return false;
          }
          $hashes = array();
          $handle = fopen($file, 'r');
          if (!$handle) {
              return false;
          }
          // one parseHandle() call per '----'-delimited section
          while (!feof($handle)) {
              $hashes[] = $this->parseHandle($handle);
          }
          fclose($handle);
          return $hashes;
      }

      /**
       * Internal parser that accepts a file handle and reads one
       * string-hash from it, stopping at '----' or at end of file.
       * @note While it's possible to simulate in-memory parsing by using
       *       custom stream wrappers, if such a use-case arises we should
       *       factor out the file handle into its own class.
       * @param $fh File handle with pointer at start of valid string-hash
       *            block.
       */
      protected function parseHandle($fh) {
          // Name of the multiline key currently being filled, or false
          // while reading single-line declarations.
          $current = false;
          $hash    = array();

          do {
              $line = fgets($fh);
              if ($line === false) {
                  break;
              }
              $line = rtrim($line, "\n\r");

              // blank lines only matter inside a multiline value
              if (!$current && $line === '') {
                  continue;
              }
              // explicit end-of-hash marker
              if ($line === '----') {
                  break;
              }
              // "--# ..." is a comment
              if (substr($line, 0, 3) === '--#') {
                  continue;
              }
              // "--KEY--" opens (or re-opens) a multiline value
              if (substr($line, 0, 2) === '--') {
                  $current = trim($line, '- ');
                  if (!isset($hash[$current])) {
                      $hash[$current] = '';
                  }
                  continue;
              }

              if (!$current) {
                  // Single-line declaration: "KEY: value", or a bare value
                  // that is filed under the default key.
                  if (strpos($line, ':') !== false) {
                      list($key, $value) = explode(':', $line, 2);
                      $value = trim($value);
                  } else {
                      $key   = $this->default;
                      $value = $line;
                  }
                  $hash[$key] = $value;
              } else {
                  // continuation of the open multiline value
                  $hash[$current] .= "$line\n";
              }
          } while (!feof($fh));

          return $hash;
      }

  }
  5992. /**
  5993. * Defines a mutation of an obsolete tag into a valid tag.
  5994. */
  abstract class HTMLPurifier_TagTransform
  {

      /**
       * Tag name to transform the tag to.
       */
      public $transform_to;

      /**
       * Transforms the obsolete tag into the valid tag.
       * @param $tag Tag to be transformed.
       * @param $config Mandatory HTMLPurifier_Config object
       * @param $context Mandatory HTMLPurifier_Context object
       */
      abstract public function transform($tag, $config, $context);

      /**
       * Puts CSS in front of whatever the style attribute already holds,
       * creating the attribute if it doesn't exist.
       * @warning Copied over from AttrTransform, be sure to keep in sync
       * @param $attr Attribute array to process (passed by reference)
       * @param $css CSS to prepend
       */
      protected function prependCSS(&$attr, $css) {
          $existing = isset($attr['style']) ? $attr['style'] : '';
          $attr['style'] = $css . $existing;
      }

  }
  6020. /**
  6021. * Abstract base token class that all others inherit from.
  6022. */
  class HTMLPurifier_Token {
      /**
       * Line number node was on in source document. Null if unknown.
       */
      public $line;

      /**
       * Column of line node was on in source document. Null if unknown.
       */
      public $col;

      /**
       * Lookup array of processing that this token is exempt from.
       * Currently, valid values are "ValidateAttributes" and
       * "MakeWellFormed_TagClosedError"
       */
      public $armor = array();

      /**
       * Used during MakeWellFormed.
       */
      public $skip;
      public $rewind;
      public $carryover;

      /**
       * Backs the deprecated read-only "type" property: emits a notice and
       * maps the concrete token class to its old type name. Any other
       * undefined property reads as null.
       */
      public function __get($n) {
          if ($n !== 'type') {
              return;
          }
          trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);

          $typeByClass = array(
              'HTMLPurifier_Token_Start'   => 'start',
              'HTMLPurifier_Token_Empty'   => 'empty',
              'HTMLPurifier_Token_End'     => 'end',
              'HTMLPurifier_Token_Text'    => 'text',
              'HTMLPurifier_Token_Comment' => 'comment',
          );
          $class = get_class($this);
          return isset($typeByClass[$class]) ? $typeByClass[$class] : null;
      }

      /**
       * Sets the position of the token in the source document.
       * @param $l Line number, or null if unknown
       * @param $c Column number, or null if unknown
       */
      public function position($l = null, $c = null) {
          $this->col  = $c;
          $this->line = $l;
      }

      /**
       * Convenience function for DirectLex settings line/col position.
       * A column of exactly -1 moves the recorded line one further down;
       * the column itself is stored as given.
       */
      public function rawPosition($l, $c) {
          $this->line = ($c === -1) ? $l + 1 : $l;
          $this->col  = $c;
      }

  }
  6067. /**
  6068. * Factory for token generation.
  6069. *
  6070. * @note Doing some benchmarking indicates that the new operator is much
  6071. * slower than the clone operator (even discounting the cost of the
  6072. * constructor). This class is for that optimization.
  6073. * Other than that, there's not much point as we don't
  6074. * maintain parallel HTMLPurifier_Token hierarchies (the main reason why
  6075. * you'd want to use an abstract factory).
  6076. * @todo Port DirectLex to use this
  6077. */
  class HTMLPurifier_TokenFactory
  {

      /**
       * Prototypes that will be cloned, one per token class
       * (p stands for prototype).
       * @private
       */
      private $p_start;
      private $p_end;
      private $p_empty;
      private $p_text;
      private $p_comment;

      /**
       * Generates blank prototypes for cloning.
       */
      public function __construct() {
          $noAttributes = array();
          $this->p_start   = new HTMLPurifier_Token_Start('', $noAttributes);
          $this->p_end     = new HTMLPurifier_Token_End('');
          $this->p_empty   = new HTMLPurifier_Token_Empty('', $noAttributes);
          $this->p_text    = new HTMLPurifier_Token_Text('');
          $this->p_comment = new HTMLPurifier_Token_Comment('');
      }

      /**
       * Creates a HTMLPurifier_Token_Start.
       * @param $name Tag name
       * @param $attr Associative array of attributes
       * @return Generated HTMLPurifier_Token_Start
       */
      public function createStart($name, $attr = array()) {
          // clone the prototype, then re-run its constructor on the copy
          $token = clone $this->p_start;
          $token->__construct($name, $attr);
          return $token;
      }

      /**
       * Creates a HTMLPurifier_Token_End.
       * @param $name Tag name
       * @return Generated HTMLPurifier_Token_End
       */
      public function createEnd($name) {
          $token = clone $this->p_end;
          $token->__construct($name);
          return $token;
      }

      /**
       * Creates a HTMLPurifier_Token_Empty.
       * @param $name Tag name
       * @param $attr Associative array of attributes
       * @return Generated HTMLPurifier_Token_Empty
       */
      public function createEmpty($name, $attr = array()) {
          $token = clone $this->p_empty;
          $token->__construct($name, $attr);
          return $token;
      }

      /**
       * Creates a HTMLPurifier_Token_Text.
       * @param $data Data of text token
       * @return Generated HTMLPurifier_Token_Text
       */
      public function createText($data) {
          $token = clone $this->p_text;
          $token->__construct($data);
          return $token;
      }

      /**
       * Creates a HTMLPurifier_Token_Comment.
       * @param $data Data of comment token
       * @return Generated HTMLPurifier_Token_Comment
       */
      public function createComment($data) {
          $token = clone $this->p_comment;
          $token->__construct($data);
          return $token;
      }

  }
  6149. /**
  6150. * HTML Purifier's internal representation of a URI.
  6151. * @note
  6152. * Internal data-structures are completely escaped. If the data needs
  6153. * to be used in a non-URI context (which is very unlikely), be sure
  6154. * to decode it first. The URI may not necessarily be well-formed until
  6155. * validate() is called.
  6156. */
  class HTMLPurifier_URI
  {

      /**
       * URI components. A component that is absent is null.
       */
      public $scheme, $userinfo, $host, $port, $path, $query, $fragment;

      /**
       * @note Automatically normalizes scheme (lower-cased) and port
       *       (cast to integer); null values are kept as null.
       */
      public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
          if ($scheme !== null && !ctype_lower($scheme)) {
              $scheme = strtolower($scheme);
          }
          if ($port !== null) {
              $port = (int) $port;
          }
          $this->scheme   = $scheme;
          $this->userinfo = $userinfo;
          $this->host     = $host;
          $this->port     = $port;
          $this->path     = $path;
          $this->query    = $query;
          $this->fragment = $fragment;
      }

      /**
       * Retrieves a scheme object corresponding to the URI's scheme/default
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return Scheme object appropriate for validating this URI, or false
       *         when none could be obtained
       */
      public function getSchemeObj($config, $context) {
          $registry = HTMLPurifier_URISchemeRegistry::instance();

          if ($this->scheme !== null) {
              $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
              if (!$scheme_obj) {
                  return false; // invalid scheme, clean it out
              }
              return $scheme_obj;
          }

          // no scheme: retrieve the default one
          $def = $config->getDefinition('URI');
          $scheme_obj = $def->getDefaultScheme($config, $context);
          if (!$scheme_obj) {
              // something funky happened to the default scheme object
              trigger_error(
                  'Default scheme object "' . $def->defaultScheme . '" was not readable',
                  E_USER_WARNING
              );
              return false;
          }
          return $scheme_obj;
      }

      /**
       * Generic validation method applicable for all schemes. May modify
       * this URI in order to get it into a compliant form.
       * @param $config Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return True if validation/filtering succeeds, false if failure
       */
      public function validate($config, $context) {

          // ABNF definitions from RFC 3986
          $chars_sub_delims = '!$&\'()*+,;=';
          $chars_pchar = $chars_sub_delims . ':@';

          // validate host; an invalid host is dropped (set to null)
          if ($this->host !== null) {
              $host_def = new HTMLPurifier_AttrDef_URI_Host();
              $this->host = $host_def->validate($this->host, $config, $context);
              if ($this->host === false) {
                  $this->host = null;
              }
          }

          // validate scheme
          // NOTE: It's not appropriate to check whether or not this
          // scheme is in our registry, since a URIFilter may convert a
          // URI that we don't allow into one we do.  So instead, we just
          // check if the scheme can be dropped because there is no host
          // and it is our default scheme.
          $schemeWithoutHost = ($this->scheme !== null && $this->host === null);
          if ($schemeWithoutHost || $this->host === '') {
              // support for relative paths is pretty abysmal when the
              // scheme is present, so axe it when possible
              $def = $config->getDefinition('URI');
              if ($def->defaultScheme === $this->scheme) {
                  $this->scheme = null;
              }
          }

          // validate username
          if ($this->userinfo !== null) {
              $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
              $this->userinfo = $encoder->encode($this->userinfo);
          }

          // validate port: out-of-range ports are dropped
          if ($this->port !== null) {
              if ($this->port < 1 || $this->port > 65535) {
                  $this->port = null;
              }
          }

          // validate path
          $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
          if ($this->host !== null) { // this catches $this->host === ''
              // path-abempty (hier and relative)
              // http://www.example.com/my/path
              // //www.example.com/my/path (looks odd, but works, and
              //                            recognized by most browsers)
              // (this set is valid or invalid on a scheme by scheme
              // basis, so we'll deal with it later)
              // file:///my/path
              // ///my/path
              $this->path = $segments_encoder->encode($this->path);
          } elseif ($this->path === '') {
              // path-empty (hier and relative)
              $this->path = ''; // just to be safe
          } elseif ($this->path[0] === '/') {
              // path-absolute (hier and relative)
              // http:/my/path
              // /my/path
              $doubleSlash = strlen($this->path) >= 2 && $this->path[1] === '/';
              if ($doubleSlash) {
                  // This could happen if both the host gets stripped
                  // out
                  // http://my/path
                  // //my/path
                  $this->path = '';
              } else {
                  $this->path = $segments_encoder->encode($this->path);
              }
          } elseif ($this->scheme !== null) {
              // path-rootless (hier)
              // http:my/path
              // Short circuit evaluation means we don't need to check nz
              $this->path = $segments_encoder->encode($this->path);
          } else {
              // path-noscheme (relative)
              // my/path
              // (once again, not checking nz)
              // The first segment is encoded with a set that lacks ':'.
              $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
              $slash = strpos($this->path, '/');
              if ($slash === false) {
                  $this->path = $segment_nc_encoder->encode($this->path);
              } else {
                  $first = substr($this->path, 0, $slash);
                  $rest  = substr($this->path, $slash);
                  $this->path =
                      $segment_nc_encoder->encode($first) .
                      $segments_encoder->encode($rest);
              }
          }

          // qf = query and fragment
          $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');

          if ($this->query !== null) {
              $this->query = $qf_encoder->encode($this->query);
          }

          if ($this->fragment !== null) {
              $this->fragment = $qf_encoder->encode($this->fragment);
          }

          return true;

      }

      /**
       * Convert URI back to string
       * @return String URI appropriate for output
       */
      public function toString() {
          // Reconstruct authority. There is a rendering difference between
          // a null authority (http:foo-bar) and an empty string authority
          // (http:///foo-bar), so null is kept distinct from ''.
          $authority = null;
          if ($this->host !== null) {
              $authority = '';
              if ($this->userinfo !== null) {
                  $authority .= $this->userinfo . '@';
              }
              $authority .= $this->host;
              if ($this->port !== null) {
                  $authority .= ':' . $this->port;
              }
          }

          // Reconstruct the result
          // One might wonder about parsing quirks from browsers after
          // this reconstruction.  Unfortunately, parsing behavior depends
          // on what *scheme* was employed (file:///foo is handled *very*
          // differently than http:///foo), so unfortunately we have to
          // defer to the schemes to do the right thing.
          $pieces = array();
          if ($this->scheme !== null) {
              $pieces[] = $this->scheme . ':';
          }
          if ($authority !== null) {
              $pieces[] = '//' . $authority;
          }
          $pieces[] = $this->path;
          if ($this->query !== null) {
              $pieces[] = '?' . $this->query;
          }
          if ($this->fragment !== null) {
              $pieces[] = '#' . $this->fragment;
          }

          return implode('', $pieces);
      }

      /**
       * Returns true if this URL might be considered a 'local' URL given
       * the current context.  This is true when the host is null, or
       * when it matches the host supplied to the configuration.
       *
       * Note that this does not do any scheme checking, so it is mostly
       * only appropriate for metadata that doesn't care about protocol
       * security.  isBenign is probably what you actually want.
       */
      public function isLocal($config, $context) {
          if ($this->host === null) {
              return true;
          }
          $uri_def = $config->getDefinition('URI');
          return $uri_def->host === $this->host;
      }

      /**
       * Returns true if this URL should be considered a 'benign' URL,
       * that is:
       *
       *      - It is a local URL (isLocal), and
       *      - It has a equal or better level of security
       */
      public function isBenign($config, $context) {
          if (!$this->isLocal($config, $context)) {
              return false;
          }

          $scheme_obj = $this->getSchemeObj($config, $context);
          if (!$scheme_obj) {
              return false; // conservative approach
          }

          // A non-secure scheme is not benign when the default scheme is
          // a secure one.
          $current_scheme_obj = $config->getDefinition('URI')->getDefaultScheme($config, $context);
          if ($current_scheme_obj->secure && !$scheme_obj->secure) {
              return false;
          }
          return true;
      }

  }
  6364. class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
  6365. {
  6366. public $type = 'URI';
  6367. protected $filters = array();
  6368. protected $postFilters = array();
  6369. protected $registeredFilters = array();
  6370. /**
  6371. * HTMLPurifier_URI object of the base specified at %URI.Base
  6372. */
  6373. public $base;
  6374. /**
  6375. * String host to consider "home" base, derived off of $base
  6376. */
  6377. public $host;
  6378. /**
  6379. * Name of default scheme based on %URI.DefaultScheme and %URI.Base
  6380. */
  6381. public $defaultScheme;
  6382. public function __construct() {
  6383. $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
  6384. $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
  6385. $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
  6386. $this->registerFilter(new HTMLPurifier_URIFilter_SafeIframe());
  6387. $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
  6388. $this->registerFilter(new HTMLPurifier_URIFilter_Munge());
  6389. }
  6390. public function registerFilter($filter) {
  6391. $this->registeredFilters[$filter->name] = $filter;
  6392. }
  6393. public function addFilter($filter, $config) {
  6394. $r = $filter->prepare($config);
  6395. if ($r === false) return; // null is ok, for backwards compat
  6396. if ($filter->post) {
  6397. $this->postFilters[$filter->name] = $filter;
  6398. } else {
  6399. $this->filters[$filter->name] = $filter;
  6400. }
  6401. }
  6402. protected function doSetup($config) {
  6403. $this->setupMemberVariables($config);
  6404. $this->setupFilters($config);
  6405. }
  6406. protected function setupFilters($config) {
  6407. foreach ($this->registeredFilters as $name => $filter) {
  6408. if ($filter->always_load) {
  6409. $this->addFilter($filter, $config);
  6410. } else {
  6411. $conf = $config->get('URI.' . $name);
  6412. if ($conf !== false && $conf !== null) {
  6413. $this->addFilter($filter, $config);
  6414. }
  6415. }
  6416. }
  6417. unset($this->registeredFilters);
  6418. }
  6419. protected function setupMemberVariables($config) {
  6420. $this->host = $config->get('URI.Host');
  6421. $base_uri = $config->get('URI.Base');
  6422. if (!is_null($base_uri)) {
  6423. $parser = new HTMLPurifier_URIParser();
  6424. $this->base = $parser->parse($base_uri);
  6425. $this->defaultScheme = $this->base->scheme;
  6426. if (is_null($this->host)) $this->host = $this->base->host;
  6427. }
  6428. if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI.DefaultScheme');
  6429. }
  6430. public function getDefaultScheme($config, $context) {
  6431. return HTMLPurifier_URISchemeRegistry::instance()->getScheme($this->defaultScheme, $config, $context);
  6432. }
  6433. public function filter(&$uri, $config, $context) {
  6434. foreach ($this->filters as $name => $f) {
  6435. $result = $f->filter($uri, $config, $context);
  6436. if (!$result) return false;
  6437. }
  6438. return true;
  6439. }
  6440. public function postFilter(&$uri, $config, $context) {
  6441. foreach ($this->postFilters as $name => $f) {
  6442. $result = $f->filter($uri, $config, $context);
  6443. if (!$result) return false;
  6444. }
  6445. return true;
  6446. }
  6447. }
  6448. /**
  6449. * Chainable filters for custom URI processing.
  6450. *
  6451. * These filters can perform custom actions on a URI filter object,
  6452. * including transformation or blacklisting. A filter named Foo
  6453. * must have a corresponding configuration directive %URI.Foo,
  6454. * unless always_load is specified to be true.
  6455. *
  6456. * The following contexts may be available while URIFilters are being
  6457. * processed:
  6458. *
  6459. * - EmbeddedURI: true if URI is an embedded resource that will
  6460. * be loaded automatically on page load
  6461. * - CurrentToken: a reference to the token that is currently
  6462. * being processed
  6463. * - CurrentAttr: the name of the attribute that is currently being
  6464. * processed
  6465. * - CurrentCSSProperty: the name of the CSS property that is
  6466. * currently being processed (if applicable)
  6467. *
  6468. * @warning This filter is called before scheme object validation occurs.
  6469. * Make sure, if you require a specific scheme object, you
  6470. * you check that it exists. This allows filters to convert
  6471. * proprietary URI schemes into regular ones.
  6472. */
  6473. abstract class HTMLPurifier_URIFilter
  6474. {
  6475. /**
  6476. * Unique identifier of filter
  6477. */
  6478. public $name;
  6479. /**
  6480. * True if this filter should be run after scheme validation.
  6481. */
  6482. public $post = false;
  6483. /**
  6484. * True if this filter should always be loaded (this permits
  6485. * a filter to be named Foo without the corresponding %URI.Foo
  6486. * directive existing.)
  6487. */
  6488. public $always_load = false;
  6489. /**
  6490. * Performs initialization for the filter. If the filter returns
  6491. * false, this means that it shouldn't be considered active.
  6492. */
  6493. public function prepare($config) {return true;}
  6494. /**
  6495. * Filter a URI object
  6496. * @param $uri Reference to URI object variable
  6497. * @param $config Instance of HTMLPurifier_Config
  6498. * @param $context Instance of HTMLPurifier_Context
  6499. * @return bool Whether or not to continue processing: false indicates
  6500. * URL is no good, true indicates continue processing. Note that
  6501. * all changes are committed directly on the URI object
  6502. */
  6503. abstract public function filter(&$uri, $config, $context);
  6504. }
  6505. /**
  6506. * Parses a URI into the components and fragment identifier as specified
  6507. * by RFC 3986.
  6508. */
  6509. class HTMLPurifier_URIParser
  6510. {
  6511. /**
  6512. * Instance of HTMLPurifier_PercentEncoder to do normalization with.
  6513. */
  6514. protected $percentEncoder;
  6515. public function __construct() {
  6516. $this->percentEncoder = new HTMLPurifier_PercentEncoder();
  6517. }
  6518. /**
  6519. * Parses a URI.
  6520. * @param $uri string URI to parse
  6521. * @return HTMLPurifier_URI representation of URI. This representation has
  6522. * not been validated yet and may not conform to RFC.
  6523. */
  6524. public function parse($uri) {
  6525. $uri = $this->percentEncoder->normalize($uri);
  6526. // Regexp is as per Appendix B.
  6527. // Note that ["<>] are an addition to the RFC's recommended
  6528. // characters, because they represent external delimeters.
  6529. $r_URI = '!'.
  6530. '(([^:/?#"<>]+):)?'. // 2. Scheme
  6531. '(//([^/?#"<>]*))?'. // 4. Authority
  6532. '([^?#"<>]*)'. // 5. Path
  6533. '(\?([^#"<>]*))?'. // 7. Query
  6534. '(#([^"<>]*))?'. // 8. Fragment
  6535. '!';
  6536. $matches = array();
  6537. $result = preg_match($r_URI, $uri, $matches);
  6538. if (!$result) return false; // *really* invalid URI
  6539. // seperate out parts
  6540. $scheme = !empty($matches[1]) ? $matches[2] : null;
  6541. $authority = !empty($matches[3]) ? $matches[4] : null;
  6542. $path = $matches[5]; // always present, can be empty
  6543. $query = !empty($matches[6]) ? $matches[7] : null;
  6544. $fragment = !empty($matches[8]) ? $matches[9] : null;
  6545. // further parse authority
  6546. if ($authority !== null) {
  6547. $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
  6548. $matches = array();
  6549. preg_match($r_authority, $authority, $matches);
  6550. $userinfo = !empty($matches[1]) ? $matches[2] : null;
  6551. $host = !empty($matches[3]) ? $matches[3] : '';
  6552. $port = !empty($matches[4]) ? (int) $matches[5] : null;
  6553. } else {
  6554. $port = $host = $userinfo = null;
  6555. }
  6556. return new HTMLPurifier_URI(
  6557. $scheme, $userinfo, $host, $port, $path, $query, $fragment);
  6558. }
  6559. }
  6560. /**
  6561. * Validator for the components of a URI for a specific scheme
  6562. */
  6563. abstract class HTMLPurifier_URIScheme
  6564. {
  6565. /**
  6566. * Scheme's default port (integer). If an explicit port number is
  6567. * specified that coincides with the default port, it will be
  6568. * elided.
  6569. */
  6570. public $default_port = null;
  6571. /**
  6572. * Whether or not URIs of this schem are locatable by a browser
  6573. * http and ftp are accessible, while mailto and news are not.
  6574. */
  6575. public $browsable = false;
  6576. /**
  6577. * Whether or not data transmitted over this scheme is encrypted.
  6578. * https is secure, http is not.
  6579. */
  6580. public $secure = false;
  6581. /**
  6582. * Whether or not the URI always uses <hier_part>, resolves edge cases
  6583. * with making relative URIs absolute
  6584. */
  6585. public $hierarchical = false;
  6586. /**
  6587. * Whether or not the URI may omit a hostname when the scheme is
  6588. * explicitly specified, ala file:///path/to/file. As of writing,
  6589. * 'file' is the only scheme that browsers support his properly.
  6590. */
  6591. public $may_omit_host = false;
  6592. /**
  6593. * Validates the components of a URI for a specific scheme.
  6594. * @param $uri Reference to a HTMLPurifier_URI object
  6595. * @param $config HTMLPurifier_Config object
  6596. * @param $context HTMLPurifier_Context object
  6597. * @return Bool success or failure
  6598. */
  6599. public abstract function doValidate(&$uri, $config, $context);
  6600. /**
  6601. * Public interface for validating components of a URI. Performs a
  6602. * bunch of default actions. Don't overload this method.
  6603. * @param $uri Reference to a HTMLPurifier_URI object
  6604. * @param $config HTMLPurifier_Config object
  6605. * @param $context HTMLPurifier_Context object
  6606. * @return Bool success or failure
  6607. */
  6608. public function validate(&$uri, $config, $context) {
  6609. if ($this->default_port == $uri->port) $uri->port = null;
  6610. // kludge: browsers do funny things when the scheme but not the
  6611. // authority is set
  6612. if (!$this->may_omit_host &&
  6613. // if the scheme is present, a missing host is always in error
  6614. (!is_null($uri->scheme) && ($uri->host === '' || is_null($uri->host))) ||
  6615. // if the scheme is not present, a *blank* host is in error,
  6616. // since this translates into '///path' which most browsers
  6617. // interpret as being 'http://path'.
  6618. (is_null($uri->scheme) && $uri->host === '')
  6619. ) {
  6620. do {
  6621. if (is_null($uri->scheme)) {
  6622. if (substr($uri->path, 0, 2) != '//') {
  6623. $uri->host = null;
  6624. break;
  6625. }
  6626. // URI is '////path', so we cannot nullify the
  6627. // host to preserve semantics. Try expanding the
  6628. // hostname instead (fall through)
  6629. }
  6630. // first see if we can manually insert a hostname
  6631. $host = $config->get('URI.Host');
  6632. if (!is_null($host)) {
  6633. $uri->host = $host;
  6634. } else {
  6635. // we can't do anything sensible, reject the URL.
  6636. return false;
  6637. }
  6638. } while (false);
  6639. }
  6640. return $this->doValidate($uri, $config, $context);
  6641. }
  6642. }
  6643. /**
  6644. * Registry for retrieving specific URI scheme validator objects.
  6645. */
  6646. class HTMLPurifier_URISchemeRegistry
  6647. {
  6648. /**
  6649. * Retrieve sole instance of the registry.
  6650. * @param $prototype Optional prototype to overload sole instance with,
  6651. * or bool true to reset to default registry.
  6652. * @note Pass a registry object $prototype with a compatible interface and
  6653. * the function will copy it and return it all further times.
  6654. */
  6655. public static function instance($prototype = null) {
  6656. static $instance = null;
  6657. if ($prototype !== null) {
  6658. $instance = $prototype;
  6659. } elseif ($instance === null || $prototype == true) {
  6660. $instance = new HTMLPurifier_URISchemeRegistry();
  6661. }
  6662. return $instance;
  6663. }
  6664. /**
  6665. * Cache of retrieved schemes.
  6666. */
  6667. protected $schemes = array();
  6668. /**
  6669. * Retrieves a scheme validator object
  6670. * @param $scheme String scheme name like http or mailto
  6671. * @param $config HTMLPurifier_Config object
  6672. * @param $config HTMLPurifier_Context object
  6673. */
  6674. public function getScheme($scheme, $config, $context) {
  6675. if (!$config) $config = HTMLPurifier_Config::createDefault();
  6676. // important, otherwise attacker could include arbitrary file
  6677. $allowed_schemes = $config->get('URI.AllowedSchemes');
  6678. if (!$config->get('URI.OverrideAllowedSchemes') &&
  6679. !isset($allowed_schemes[$scheme])
  6680. ) {
  6681. return;
  6682. }
  6683. if (isset($this->schemes[$scheme])) return $this->schemes[$scheme];
  6684. if (!isset($allowed_schemes[$scheme])) return;
  6685. $class = 'HTMLPurifier_URIScheme_' . $scheme;
  6686. if (!class_exists($class)) return;
  6687. $this->schemes[$scheme] = new $class();
  6688. return $this->schemes[$scheme];
  6689. }
  6690. /**
  6691. * Registers a custom scheme to the cache, bypassing reflection.
  6692. * @param $scheme Scheme name
  6693. * @param $scheme_obj HTMLPurifier_URIScheme object
  6694. */
  6695. public function register($scheme, $scheme_obj) {
  6696. $this->schemes[$scheme] = $scheme_obj;
  6697. }
  6698. }
  6699. /**
  6700. * Class for converting between different unit-lengths as specified by
  6701. * CSS.
  6702. */
  6703. class HTMLPurifier_UnitConverter
  6704. {
  6705. const ENGLISH = 1;
  6706. const METRIC = 2;
  6707. const DIGITAL = 3;
  6708. /**
  6709. * Units information array. Units are grouped into measuring systems
  6710. * (English, Metric), and are assigned an integer representing
  6711. * the conversion factor between that unit and the smallest unit in
  6712. * the system. Numeric indexes are actually magical constants that
  6713. * encode conversion data from one system to the next, with a O(n^2)
  6714. * constraint on memory (this is generally not a problem, since
  6715. * the number of measuring systems is small.)
  6716. */
  6717. protected static $units = array(
  6718. self::ENGLISH => array(
  6719. 'px' => 3, // This is as per CSS 2.1 and Firefox. Your mileage may vary
  6720. 'pt' => 4,
  6721. 'pc' => 48,
  6722. 'in' => 288,
  6723. self::METRIC => array('pt', '0.352777778', 'mm'),
  6724. ),
  6725. self::METRIC => array(
  6726. 'mm' => 1,
  6727. 'cm' => 10,
  6728. self::ENGLISH => array('mm', '2.83464567', 'pt'),
  6729. ),
  6730. );
  6731. /**
  6732. * Minimum bcmath precision for output.
  6733. */
  6734. protected $outputPrecision;
  6735. /**
  6736. * Bcmath precision for internal calculations.
  6737. */
  6738. protected $internalPrecision;
  6739. /**
  6740. * Whether or not BCMath is available
  6741. */
  6742. private $bcmath;
  6743. public function __construct($output_precision = 4, $internal_precision = 10, $force_no_bcmath = false) {
  6744. $this->outputPrecision = $output_precision;
  6745. $this->internalPrecision = $internal_precision;
  6746. $this->bcmath = !$force_no_bcmath && function_exists('bcmul');
  6747. }
  6748. /**
  6749. * Converts a length object of one unit into another unit.
  6750. * @param HTMLPurifier_Length $length
  6751. * Instance of HTMLPurifier_Length to convert. You must validate()
  6752. * it before passing it here!
  6753. * @param string $to_unit
  6754. * Unit to convert to.
  6755. * @note
  6756. * About precision: This conversion function pays very special
  6757. * attention to the incoming precision of values and attempts
  6758. * to maintain a number of significant figure. Results are
  6759. * fairly accurate up to nine digits. Some caveats:
  6760. * - If a number is zero-padded as a result of this significant
  6761. * figure tracking, the zeroes will be eliminated.
  6762. * - If a number contains less than four sigfigs ($outputPrecision)
  6763. * and this causes some decimals to be excluded, those
  6764. * decimals will be added on.
  6765. */
  6766. public function convert($length, $to_unit) {
  6767. if (!$length->isValid()) return false;
  6768. $n = $length->getN();
  6769. $unit = $length->getUnit();
  6770. if ($n === '0' || $unit === false) {
  6771. return new HTMLPurifier_Length('0', false);
  6772. }
  6773. $state = $dest_state = false;
  6774. foreach (self::$units as $k => $x) {
  6775. if (isset($x[$unit])) $state = $k;
  6776. if (isset($x[$to_unit])) $dest_state = $k;
  6777. }
  6778. if (!$state || !$dest_state) return false;
  6779. // Some calculations about the initial precision of the number;
  6780. // this will be useful when we need to do final rounding.
  6781. $sigfigs = $this->getSigFigs($n);
  6782. if ($sigfigs < $this->outputPrecision) $sigfigs = $this->outputPrecision;
  6783. // BCMath's internal precision deals only with decimals. Use
  6784. // our default if the initial number has no decimals, or increase
  6785. // it by how ever many decimals, thus, the number of guard digits
  6786. // will always be greater than or equal to internalPrecision.
  6787. $log = (int) floor(log(abs($n), 10));
  6788. $cp = ($log < 0) ? $this->internalPrecision - $log : $this->internalPrecision; // internal precision
  6789. for ($i = 0; $i < 2; $i++) {
  6790. // Determine what unit IN THIS SYSTEM we need to convert to
  6791. if ($dest_state === $state) {
  6792. // Simple conversion
  6793. $dest_unit = $to_unit;
  6794. } else {
  6795. // Convert to the smallest unit, pending a system shift
  6796. $dest_unit = self::$units[$state][$dest_state][0];
  6797. }
  6798. // Do the conversion if necessary
  6799. if ($dest_unit !== $unit) {
  6800. $factor = $this->div(self::$units[$state][$unit], self::$units[$state][$dest_unit], $cp);
  6801. $n = $this->mul($n, $factor, $cp);
  6802. $unit = $dest_unit;
  6803. }
  6804. // Output was zero, so bail out early. Shouldn't ever happen.
  6805. if ($n === '') {
  6806. $n = '0';
  6807. $unit = $to_unit;
  6808. break;
  6809. }
  6810. // It was a simple conversion, so bail out
  6811. if ($dest_state === $state) {
  6812. break;
  6813. }
  6814. if ($i !== 0) {
  6815. // Conversion failed! Apparently, the system we forwarded
  6816. // to didn't have this unit. This should never happen!
  6817. return false;
  6818. }
  6819. // Pre-condition: $i == 0
  6820. // Perform conversion to next system of units
  6821. $n = $this->mul($n, self::$units[$state][$dest_state][1], $cp);
  6822. $unit = self::$units[$state][$dest_state][2];
  6823. $state = $dest_state;
  6824. // One more loop around to convert the unit in the new system.
  6825. }
  6826. // Post-condition: $unit == $to_unit
  6827. if ($unit !== $to_unit) return false;
  6828. // Useful for debugging:
  6829. //echo "<pre>n";
  6830. //echo "$n\nsigfigs = $sigfigs\nnew_log = $new_log\nlog = $log\nrp = $rp\n</pre>\n";
  6831. $n = $this->round($n, $sigfigs);
  6832. if (strpos($n, '.') !== false) $n = rtrim($n, '0');
  6833. $n = rtrim($n, '.');
  6834. return new HTMLPurifier_Length($n, $unit);
  6835. }
  6836. /**
  6837. * Returns the number of significant figures in a string number.
  6838. * @param string $n Decimal number
  6839. * @return int number of sigfigs
  6840. */
  6841. public function getSigFigs($n) {
  6842. $n = ltrim($n, '0+-');
  6843. $dp = strpos($n, '.'); // decimal position
  6844. if ($dp === false) {
  6845. $sigfigs = strlen(rtrim($n, '0'));
  6846. } else {
  6847. $sigfigs = strlen(ltrim($n, '0.')); // eliminate extra decimal character
  6848. if ($dp !== 0) $sigfigs--;
  6849. }
  6850. return $sigfigs;
  6851. }
  6852. /**
  6853. * Adds two numbers, using arbitrary precision when available.
  6854. */
  6855. private function add($s1, $s2, $scale) {
  6856. if ($this->bcmath) return bcadd($s1, $s2, $scale);
  6857. else return $this->scale($s1 + $s2, $scale);
  6858. }
  6859. /**
  6860. * Multiples two numbers, using arbitrary precision when available.
  6861. */
  6862. private function mul($s1, $s2, $scale) {
  6863. if ($this->bcmath) return bcmul($s1, $s2, $scale);
  6864. else return $this->scale($s1 * $s2, $scale);
  6865. }
  6866. /**
  6867. * Divides two numbers, using arbitrary precision when available.
  6868. */
  6869. private function div($s1, $s2, $scale) {
  6870. if ($this->bcmath) return bcdiv($s1, $s2, $scale);
  6871. else return $this->scale($s1 / $s2, $scale);
  6872. }
  6873. /**
  6874. * Rounds a number according to the number of sigfigs it should have,
  6875. * using arbitrary precision when available.
  6876. */
  6877. private function round($n, $sigfigs) {
  6878. $new_log = (int) floor(log(abs($n), 10)); // Number of digits left of decimal - 1
  6879. $rp = $sigfigs - $new_log - 1; // Number of decimal places needed
  6880. $neg = $n < 0 ? '-' : ''; // Negative sign
  6881. if ($this->bcmath) {
  6882. if ($rp >= 0) {
  6883. $n = bcadd($n, $neg . '0.' . str_repeat('0', $rp) . '5', $rp + 1);
  6884. $n = bcdiv($n, '1', $rp);
  6885. } else {
  6886. // This algorithm partially depends on the standardized
  6887. // form of numbers that comes out of bcmath.
  6888. $n = bcadd($n, $neg . '5' . str_repeat('0', $new_log - $sigfigs), 0);
  6889. $n = substr($n, 0, $sigfigs + strlen($neg)) . str_repeat('0', $new_log - $sigfigs + 1);
  6890. }
  6891. return $n;
  6892. } else {
  6893. return $this->scale(round($n, $sigfigs - $new_log - 1), $rp + 1);
  6894. }
  6895. }
  6896. /**
  6897. * Scales a float to $scale digits right of decimal point, like BCMath.
  6898. */
  6899. private function scale($r, $scale) {
  6900. if ($scale < 0) {
  6901. // The f sprintf type doesn't support negative numbers, so we
  6902. // need to cludge things manually. First get the string.
  6903. $r = sprintf('%.0f', (float) $r);
  6904. // Due to floating point precision loss, $r will more than likely
  6905. // look something like 4652999999999.9234. We grab one more digit
  6906. // than we need to precise from $r and then use that to round
  6907. // appropriately.
  6908. $precise = (string) round(substr($r, 0, strlen($r) + $scale), -1);
  6909. // Now we return it, truncating the zero that was rounded off.
  6910. return substr($precise, 0, -1) . str_repeat('0', -$scale + 1);
  6911. }
  6912. return sprintf('%.' . $scale . 'f', (float) $r);
  6913. }
  6914. }
  6915. /**
  6916. * Parses string representations into their corresponding native PHP
  6917. * variable type. The base implementation does a simple type-check.
  6918. */
  6919. class HTMLPurifier_VarParser
  6920. {
  6921. const STRING = 1;
  6922. const ISTRING = 2;
  6923. const TEXT = 3;
  6924. const ITEXT = 4;
  6925. const INT = 5;
  6926. const FLOAT = 6;
  6927. const BOOL = 7;
  6928. const LOOKUP = 8;
  6929. const ALIST = 9;
  6930. const HASH = 10;
  6931. const MIXED = 11;
  6932. /**
  6933. * Lookup table of allowed types. Mainly for backwards compatibility, but
  6934. * also convenient for transforming string type names to the integer constants.
  6935. */
  6936. static public $types = array(
  6937. 'string' => self::STRING,
  6938. 'istring' => self::ISTRING,
  6939. 'text' => self::TEXT,
  6940. 'itext' => self::ITEXT,
  6941. 'int' => self::INT,
  6942. 'float' => self::FLOAT,
  6943. 'bool' => self::BOOL,
  6944. 'lookup' => self::LOOKUP,
  6945. 'list' => self::ALIST,
  6946. 'hash' => self::HASH,
  6947. 'mixed' => self::MIXED
  6948. );
  6949. /**
  6950. * Lookup table of types that are string, and can have aliases or
  6951. * allowed value lists.
  6952. */
  6953. static public $stringTypes = array(
  6954. self::STRING => true,
  6955. self::ISTRING => true,
  6956. self::TEXT => true,
  6957. self::ITEXT => true,
  6958. );
  6959. /**
  6960. * Validate a variable according to type. Throws
  6961. * HTMLPurifier_VarParserException if invalid.
  6962. * It may return NULL as a valid type if $allow_null is true.
  6963. *
  6964. * @param $var Variable to validate
  6965. * @param $type Type of variable, see HTMLPurifier_VarParser->types
  6966. * @param $allow_null Whether or not to permit null as a value
  6967. * @return Validated and type-coerced variable
  6968. */
  6969. final public function parse($var, $type, $allow_null = false) {
  6970. if (is_string($type)) {
  6971. if (!isset(HTMLPurifier_VarParser::$types[$type])) {
  6972. throw new HTMLPurifier_VarParserException("Invalid type '$type'");
  6973. } else {
  6974. $type = HTMLPurifier_VarParser::$types[$type];
  6975. }
  6976. }
  6977. $var = $this->parseImplementation($var, $type, $allow_null);
  6978. if ($allow_null && $var === null) return null;
  6979. // These are basic checks, to make sure nothing horribly wrong
  6980. // happened in our implementations.
  6981. switch ($type) {
  6982. case (self::STRING):
  6983. case (self::ISTRING):
  6984. case (self::TEXT):
  6985. case (self::ITEXT):
  6986. if (!is_string($var)) break;
  6987. if ($type == self::ISTRING || $type == self::ITEXT) $var = strtolower($var);
  6988. return $var;
  6989. case (self::INT):
  6990. if (!is_int($var)) break;
  6991. return $var;
  6992. case (self::FLOAT):
  6993. if (!is_float($var)) break;
  6994. return $var;
  6995. case (self::BOOL):
  6996. if (!is_bool($var)) break;
  6997. return $var;
  6998. case (self::LOOKUP):
  6999. case (self::ALIST):
  7000. case (self::HASH):
  7001. if (!is_array($var)) break;
  7002. if ($type === self::LOOKUP) {
  7003. foreach ($var as $k) if ($k !== true) $this->error('Lookup table contains value other than true');
  7004. } elseif ($type === self::ALIST) {
  7005. $keys = array_keys($var);
  7006. if (array_keys($keys) !== $keys) $this->error('Indices for list are not uniform');
  7007. }
  7008. return $var;
  7009. case (self::MIXED):
  7010. return $var;
  7011. default:
  7012. $this->errorInconsistent(get_class($this), $type);
  7013. }
  7014. $this->errorGeneric($var, $type);
  7015. }
  7016. /**
  7017. * Actually implements the parsing. Base implementation is to not
  7018. * do anything to $var. Subclasses should overload this!
  7019. */
  7020. protected function parseImplementation($var, $type, $allow_null) {
  7021. return $var;
  7022. }
  7023. /**
  7024. * Throws an exception.
  7025. */
  7026. protected function error($msg) {
  7027. throw new HTMLPurifier_VarParserException($msg);
  7028. }
  7029. /**
  7030. * Throws an inconsistency exception.
  7031. * @note This should not ever be called. It would be called if we
  7032. * extend the allowed values of HTMLPurifier_VarParser without
  7033. * updating subclasses.
  7034. */
  7035. protected function errorInconsistent($class, $type) {
  7036. throw new HTMLPurifier_Exception("Inconsistency in $class: ".HTMLPurifier_VarParser::getTypeName($type)." not implemented");
  7037. }
  7038. /**
  7039. * Generic error for if a type didn't work.
  7040. */
  7041. protected function errorGeneric($var, $type) {
  7042. $vtype = gettype($var);
  7043. $this->error("Expected type ".HTMLPurifier_VarParser::getTypeName($type).", got $vtype");
  7044. }
  7045. static public function getTypeName($type) {
  7046. static $lookup;
  7047. if (!$lookup) {
  7048. // Lazy load the alternative lookup table
  7049. $lookup = array_flip(HTMLPurifier_VarParser::$types);
  7050. }
  7051. if (!isset($lookup[$type])) return 'unknown';
  7052. return $lookup[$type];
  7053. }
  7054. }
  7055. /**
  7056. * Exception type for HTMLPurifier_VarParser
  7057. */
  7058. class HTMLPurifier_VarParserException extends HTMLPurifier_Exception
  7059. {
  7060. }
  7061. /**
  7062. * Validates the HTML attribute style, otherwise known as CSS.
  7063. * @note We don't implement the whole CSS specification, so it might be
  7064. * difficult to reuse this component in the context of validating
  7065. * actual stylesheet declarations.
  7066. * @note If we were really serious about validating the CSS, we would
  7067. * tokenize the styles and then parse the tokens. Obviously, we
  7068. * are not doing that. Doing that could seriously harm performance,
  7069. * but would make these components a lot more viable for a CSS
  7070. * filtering solution.
  7071. */
  7072. class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
  7073. {
  7074. public function validate($css, $config, $context) {
  7075. $css = $this->parseCDATA($css);
  7076. $definition = $config->getCSSDefinition();
  7077. // we're going to break the spec and explode by semicolons.
  7078. // This is because semicolon rarely appears in escaped form
  7079. // Doing this is generally flaky but fast
  7080. // IT MIGHT APPEAR IN URIs, see HTMLPurifier_AttrDef_CSSURI
  7081. // for details
  7082. $declarations = explode(';', $css);
  7083. $propvalues = array();
  7084. /**
  7085. * Name of the current CSS property being validated.
  7086. */
  7087. $property = false;
  7088. $context->register('CurrentCSSProperty', $property);
  7089. foreach ($declarations as $declaration) {
  7090. if (!$declaration) continue;
  7091. if (!strpos($declaration, ':')) continue;
  7092. list($property, $value) = explode(':', $declaration, 2);
  7093. $property = trim($property);
  7094. $value = trim($value);
  7095. $ok = false;
  7096. do {
  7097. if (isset($definition->info[$property])) {
  7098. $ok = true;
  7099. break;
  7100. }
  7101. if (ctype_lower($property)) break;
  7102. $property = strtolower($property);
  7103. if (isset($definition->info[$property])) {
  7104. $ok = true;
  7105. break;
  7106. }
  7107. } while(0);
  7108. if (!$ok) continue;
  7109. // inefficient call, since the validator will do this again
  7110. if (strtolower(trim($value)) !== 'inherit') {
  7111. // inherit works for everything (but only on the base property)
  7112. $result = $definition->info[$property]->validate(
  7113. $value, $config, $context );
  7114. } else {
  7115. $result = 'inherit';
  7116. }
  7117. if ($result === false) continue;
  7118. $propvalues[$property] = $result;
  7119. }
  7120. $context->destroy('CurrentCSSProperty');
  7121. // procedure does not write the new CSS simultaneously, so it's
  7122. // slightly inefficient, but it's the only way of getting rid of
  7123. // duplicates. Perhaps config to optimize it, but not now.
  7124. $new_declarations = '';
  7125. foreach ($propvalues as $prop => $value) {
  7126. $new_declarations .= "$prop:$value;";
  7127. }
  7128. return $new_declarations ? $new_declarations : false;
  7129. }
  7130. }
  7131. /**
  7132. * Dummy AttrDef that mimics another AttrDef, BUT it generates clones
  7133. * with make.
  7134. */
  7135. class HTMLPurifier_AttrDef_Clone extends HTMLPurifier_AttrDef
  7136. {
  7137. /**
  7138. * What we're cloning
  7139. */
  7140. protected $clone;
  7141. public function __construct($clone) {
  7142. $this->clone = $clone;
  7143. }
  7144. public function validate($v, $config, $context) {
  7145. return $this->clone->validate($v, $config, $context);
  7146. }
  7147. public function make($string) {
  7148. return clone $this->clone;
  7149. }
  7150. }
  7151. // Enum = Enumerated
  /**
   * Validates a keyword against a list of valid values.
   * @warning Case-insensitive matching relies on PHP's built-in
   *          strtolower and ctype_lower functions, which may cause
   *          problems with international comparisons
   */
  class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
  {

      /**
       * Lookup table whose keys are the valid values.
       * @todo Make protected
       */
      public $valid_values = array();

      /**
       * Bool indicating whether matching is case sensitive.
       * @note In general this is always case insensitive.
       */
      protected $case_sensitive = false; // values according to W3C spec

      /**
       * @param $valid_values List of valid values
       * @param $case_sensitive Bool indicating whether or not case sensitive
       */
      public function __construct(
          $valid_values = array(), $case_sensitive = false
      ) {
          // flip so that membership becomes a cheap isset() on the keys
          $this->valid_values = array_flip($valid_values);
          $this->case_sensitive = $case_sensitive;
      }

      /**
       * Trims the input, lowercases it when matching is case insensitive,
       * and returns it if it is a known value; otherwise returns false.
       */
      public function validate($string, $config, $context) {
          $candidate = trim($string);
          if (!$this->case_sensitive && !ctype_lower($candidate)) {
              // we may want to do full case-insensitive libraries
              $candidate = strtolower($candidate);
          }
          if (isset($this->valid_values[$candidate])) {
              return $candidate;
          }
          return false;
      }

      /**
       * @param $string In form of comma-delimited list of case-insensitive
       *      valid values. Example: "foo,bar,baz". Prepend "s:" to make
       *      case sensitive
       */
      public function make($string) {
          $sensitive = (strlen($string) > 2 && substr($string, 0, 2) === 's:');
          if ($sensitive) {
              // drop the "s:" marker, leaving only the value list
              $string = substr($string, 2);
          }
          return new HTMLPurifier_AttrDef_Enum(explode(',', $string), $sensitive);
      }

  }
  /**
   * Validates an integer.
   * @note While this class was modeled off the CSS definition, no currently
   *       allowed CSS uses this type.  The properties that do are: widows,
   *       orphans, z-index, counter-increment, counter-reset.  Some of the
   *       HTML attributes, however, find use for a non-negative version of this.
   */
  class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
  {

      /**
       * Bool: are negative values accepted?
       */
      protected $negative = true;

      /**
       * Bool: is zero accepted?
       */
      protected $zero = true;

      /**
       * Bool: are positive values accepted?
       */
      protected $positive = true;

      /**
       * @param $negative Bool indicating whether or not negative values are allowed
       * @param $zero Bool indicating whether or not zero is allowed
       * @param $positive Bool indicating whether or not positive values are allowed
       */
      public function __construct(
          $negative = true, $zero = true, $positive = true
      ) {
          $this->negative = $negative;
          $this->zero     = $zero;
          $this->positive = $positive;
      }

      /**
       * Returns the integer as a string (leading "+" removed, "-0" turned
       * into "0"), or false when the input is not an acceptable integer.
       */
      public function validate($integer, $config, $context) {

          $integer = $this->parseCDATA($integer);
          if ($integer === '') return false;

          // A plain typecast would be simpler, but certain fringe cases
          // must not come out as an integer, so the digits are checked
          // by hand after clipping a permitted leading sign.
          $lead = $integer[0];
          if ($this->negative && $lead === '-') {
              $digits = substr($integer, 1);
              if ($digits === '0') {
                  $integer = '0'; // rm minus sign for zero
              }
          } elseif ($this->positive && $lead === '+') {
              $integer = substr($integer, 1); // rm unnecessary plus
              $digits  = $integer;
          } else {
              $digits = $integer;
          }

          // everything after the sign has to be digits
          if (!ctype_digit($digits)) return false;

          // range restrictions
          if ($integer == 0 && !$this->zero)     return false;
          if ($integer > 0  && !$this->positive) return false;
          if ($integer < 0  && !$this->negative) return false;

          return $integer;

      }

  }
  /**
   * Validates the HTML attribute lang, effectively a language code.
   * @note Built according to RFC 3066, which obsoleted RFC 1766
   */
  class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
  {

      /**
       * Returns the longest acceptable prefix of the dash-separated
       * language code, lowercased, or false if even the primary subtag
       * is unacceptable.
       */
      public function validate($string, $config, $context) {

          $string = trim($string);
          if (!$string) return false;

          $subtags = explode('-', $string);
          $num_subtags = count($subtags);

          if ($num_subtags == 0) return false; // sanity check

          // primary subtag: a lone "x" or "i", or two to three letters
          $primary = $subtags[0];
          $length = strlen($primary);
          if ($length == 1) {
              if ($primary != 'x' && $primary != 'i') {
                  return false;
              }
          } elseif ($length == 2 || $length == 3) {
              if (!ctype_alpha($primary)) {
                  return false;
              }
              if (!ctype_lower($primary)) {
                  $primary = strtolower($primary);
              }
          } else {
              // empty, or longer than three characters
              return false;
          }

          $new_string = $primary;

          // remaining subtags: 1-8 alphanumerics each; the first bad one
          // ends processing and whatever was accepted so far is returned
          for ($i = 1; $i < $num_subtags; $i++) {
              $subtag = $subtags[$i];
              $length = strlen($subtag);
              // the second subtag may only be a single character if it is "x"
              $bad_single = ($i == 1 && $length == 1 && $subtag != 'x');
              if ($length == 0 || $bad_single || $length > 8 || !ctype_alnum($subtag)) {
                  return $new_string;
              }
              $new_string .= '-' . (ctype_lower($subtag) ? $subtag : strtolower($subtag));
          }

          return $new_string;

      }

  }
  /**
   * Decorator that picks one of two definitions depending on the token
   * currently registered in the context.
   */
  class HTMLPurifier_AttrDef_Switch
  {

      /** Tag name that selects $withTag */
      protected $tag;
      /** Definition used when the current token's name equals $tag */
      protected $withTag;
      /** Definition used in every other case */
      protected $withoutTag;

      /**
       * @param string $tag Tag name to switch upon
       * @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
       * @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
       */
      public function __construct($tag, $with_tag, $without_tag) {
          $this->tag        = $tag;
          $this->withTag    = $with_tag;
          $this->withoutTag = $without_tag;
      }

      /**
       * Looks up 'CurrentToken' in the context and delegates to the
       * definition selected by its name.
       */
      public function validate($string, $config, $context) {
          $token = $context->get('CurrentToken', true);
          $is_match = ($token && $token->name === $this->tag);
          $delegate = $is_match ? $this->withTag : $this->withoutTag;
          return $delegate->validate($string, $config, $context);
      }

  }
  /**
   * Validates arbitrary text according to the HTML spec.
   */
  class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
  {

      /**
       * Returns the input after it has been run through parseCDATA().
       */
      public function validate($string, $config, $context) {
          $text = $this->parseCDATA($string);
          return $text;
      }

  }
  /**
   * Validates a URI as defined by RFC 3986.
   * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
   */
  class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
  {

      /** Parser that turns the raw string into a URI object */
      protected $parser;
      /** Bool flag registered in the context as 'EmbeddedURI' while validating */
      protected $embedsResource;

      /**
       * @param $embeds_resource Does the URI here result in an extra HTTP request?
       */
      public function __construct($embeds_resource = false) {
          $this->parser = new HTMLPurifier_URIParser();
          $this->embedsResource = (bool) $embeds_resource;
      }

      /**
       * @param $string 'embedded' yields a definition with the embedded
       *      flag switched on; anything else leaves it off
       */
      public function make($string) {
          return new HTMLPurifier_AttrDef_URI($string === 'embedded');
      }

      /**
       * Parses the URI, runs it through every check, and returns it in
       * string form, or false if URIs are disabled, parsing fails, or
       * any check rejects it.
       */
      public function validate($uri, $config, $context) {

          if ($config->get('URI.Disable')) return false;

          $uri = $this->parser->parse($this->parseCDATA($uri));
          if ($uri === false) return false;

          // expose the embedded flag to the validators, but only for the
          // duration of the checks
          $context->register('EmbeddedURI', $this->embedsResource);
          $ok = $this->passesChecks($uri, $config, $context);
          $context->destroy('EmbeddedURI');

          if (!$ok) return false;

          // back to string
          return $uri->toString();

      }

      /**
       * Runs the URI through generic validation, chained filtering,
       * scheme-specific validation and post filtering, in that order,
       * stopping at the first step that fails.
       * @return bool True only if every step succeeded
       */
      private function passesChecks($uri, $config, $context) {

          // generic validation
          if (!$uri->validate($config, $context)) return false;

          // chained filtering
          $uri_def = $config->getDefinition('URI');
          if (!$uri_def->filter($uri, $config, $context)) return false;

          // scheme-specific validation
          $scheme_obj = $uri->getSchemeObj($config, $context);
          if (!$scheme_obj) return false;
          if ($this->embedsResource && !$scheme_obj->browsable) return false;
          if (!$scheme_obj->validate($uri, $config, $context)) return false;

          // post chained filtering
          if (!$uri_def->postFilter($uri, $config, $context)) return false;

          // survived gauntlet
          return true;

      }

  }
  /**
   * Validates a number as defined by the CSS spec.
   */
  class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
  {

      /**
       * Bool: when true, a leading minus sign is rejected.
       */
      protected $non_negative = false;

      /**
       * @param $non_negative Bool indicating whether negatives are forbidden
       */
      public function __construct($non_negative = false) {
          $this->non_negative = $non_negative;
      }

      /**
       * Returns the number in normalized form (no plus sign, no leading
       * zeros before and no trailing zeros after the period), or false.
       * @warning Some contexts do not pass $config, $context. These
       *          variables should not be used without checking HTMLPurifier_Length
       */
      public function validate($number, $config, $context) {

          $number = $this->parseCDATA($number);

          if ($number === '') return false;
          if ($number === '0') return '0';

          // peel off a leading sign; a minus is remembered for the output
          $sign = '';
          $lead = $number[0];
          if ($lead === '-') {
              if ($this->non_negative) return false;
              $sign = '-';
          }
          if ($lead === '-' || $lead === '+') {
              $number = substr($number, 1);
          }

          // plain integer
          if (ctype_digit($number)) {
              $trimmed = ltrim($number, '0');
              return $trimmed ? $sign . $trimmed : '0';
          }

          // Period is the only non-numeric character allowed
          if (strpos($number, '.') === false) return false;

          $halves = explode('.', $number, 2);
          $left   = $halves[0];
          $right  = $halves[1];

          if ($left === '' && $right === '') return false;
          if ($left !== '' && !ctype_digit($left)) return false;

          $left  = ltrim($left, '0');
          $right = rtrim($right, '0');

          if ($right === '') {
              // nothing meaningful after the period: reduce to an integer
              return $left ? $sign . $left : '0';
          }
          if (!ctype_digit($right)) {
              return false;
          }

          return $sign . $left . '.' . $right;

      }

  }
  /**
   * CSS number that is clamped to the range 0 through 1.
   */
  class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
  {

      public function __construct() {
          // opacity is non-negative, but negatives are let through the
          // parent check so that they can be clamped here instead
          parent::__construct(false);
      }

      /**
       * Validates as a CSS number, then replaces anything below 0 with
       * '0' and anything above 1 with '1'.
       */
      public function validate($number, $config, $context) {
          $result = parent::validate($number, $config, $context);
          if ($result === false) {
              return $result;
          }
          $float = (float) $result;
          if ($float < 0.0) {
              $result = '0';
          } elseif ($float > 1.0) {
              $result = '1';
          }
          return $result;
      }

  }
  /**
   * Validates shorthand CSS property background.
   * @warning Does not support url tokens that have internal spaces.
   */
  class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of component validators, keyed by property name.
       * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
       */
      protected $info;

      /**
       * Copies the validators of the five longhand background properties
       * out of the CSS definition supplied by $config.
       */
      public function __construct($config) {
          $def = $config->getCSSDefinition();
          $longhands = array(
              'background-color',
              'background-image',
              'background-repeat',
              'background-attachment',
              'background-position',
          );
          foreach ($longhands as $property) {
              $this->info[$property] = $def->info[$property];
          }
      }

      /**
       * Splits the value on spaces, assigns every piece to the first
       * component that accepts it, and returns the accepted components
       * joined by spaces, or false if nothing usable remains.
       */
      public function validate($string, $config, $context) {

          // regular pre-processing
          $string = $this->parseCDATA($string);
          if ($string === '') return false;

          // munge rgb() decl if necessary
          $string = $this->mungeRgb($string);

          // assumes URI doesn't have spaces in it
          $bits = explode(' ', strtolower($string)); // bits to process

          // slot per component; 'position' comes last so that it only
          // collects the pieces no other component accepted
          $caught = array(
              'color'      => false,
              'image'      => false,
              'repeat'     => false,
              'attachment' => false,
              'position'   => false,
          );

          $i = 0; // number of catches

          foreach ($bits as $bit) {
              if ($bit === '') continue;
              foreach ($caught as $key => $status) {
                  if ($key == 'position') {
                      // taken as-is for now; checked as a whole further down
                      if ($caught[$key] === false) $caught[$key] = '';
                      $caught[$key] .= $bit . ' ';
                  } else {
                      if ($status !== false) continue; // slot already filled
                      $r = $this->info['background-' . $key]->validate($bit, $config, $context);
                      if ($r === false) continue;
                      $caught[$key] = $r;
                  }
                  $i++;
                  break;
              }
          }

          if (!$i) return false;

          if ($caught['position'] !== false) {
              $caught['position'] = $this->info['background-position']->
                  validate($caught['position'], $config, $context);
          }

          // keep the filled slots, in declaration order
          $ret = array();
          foreach ($caught as $value) {
              if ($value !== false) {
                  $ret[] = $value;
              }
          }

          if (empty($ret)) return false;
          return implode(' ', $ret);

      }

  }
  7540. /* W3C says:
  7541. [ // adjective and number must be in correct order, even if
  7542. // you could switch them without introducing ambiguity.
  7543. // some browsers support that syntax
  7544. [
  7545. <percentage> | <length> | left | center | right
  7546. ]
  7547. [
  7548. <percentage> | <length> | top | center | bottom
  7549. ]?
  7550. ] |
  7551. [ // this signifies that the vertical and horizontal adjectives
  7552. // can be arbitrarily ordered, however, there can only be two,
  7553. // one of each, or none at all
  7554. [
  7555. left | center | right
  7556. ] ||
  7557. [
  7558. top | center | bottom
  7559. ]
  7560. ]
  7561. top, left = 0%
  7562. center, (none) = 50%
  7563. bottom, right = 100%
  7564. */
  7565. /* QuirksMode says:
  7566. keyword + length/percentage must be ordered correctly, as per W3C
  7567. Internet Explorer and Opera, however, support arbitrary ordering. We
  7568. should fix it up.
  7569. Minor issue though, not strictly necessary.
  7570. */
  7571. // control freaks may appreciate the ability to convert these to
  7572. // percentages or something, but it's not necessary
  /**
   * Validates the value of background-position.
   */
  class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
  {

      /** Validator applied to each piece to detect a length */
      protected $length;
      /** Validator applied to each piece to detect a percentage */
      protected $percentage;

      public function __construct() {
          $this->length     = new HTMLPurifier_AttrDef_CSS_Length();
          $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
      }

      /**
       * Collects keywords and measures from the space-separated value and
       * returns at most two of them, horizontal part first, or false if
       * nothing was recognised.
       */
      public function validate($string, $config, $context) {

          $string = $this->parseCDATA($string);
          $bits = explode(' ', $string);

          $keywords = array(
              'h'  => false, // left, right
              'v'  => false, // top, bottom
              'ch' => false, // center (first word)
              'cv' => false, // center (second word)
          );
          $measures = array();

          $i = 0; // number of recognised pieces so far

          $lookup = array(
              'top'    => 'v',
              'bottom' => 'v',
              'left'   => 'h',
              'right'  => 'h',
              'center' => 'c'
          );

          $measure_validators = array($this->length, $this->percentage);

          foreach ($bits as $bit) {
              if ($bit === '') continue;

              // test for keyword
              $lbit = ctype_lower($bit) ? $bit : strtolower($bit);
              if (isset($lookup[$lbit])) {
                  $status = $lookup[$lbit];
                  if ($status == 'c') {
                      // a "center" seen first is horizontal, later ones vertical
                      $status = ($i == 0) ? 'ch' : 'cv';
                  }
                  $keywords[$status] = $lbit;
                  $i++;
              }

              // test for length, then for percentage
              foreach ($measure_validators as $validator) {
                  $r = $validator->validate($bit, $config, $context);
                  if ($r !== false) {
                      $measures[] = $r;
                      $i++;
                  }
              }

          }

          if (!$i) return false; // no valid values were caught

          $ret = array();

          // first (horizontal) slot
          if ($keywords['h']) {
              $ret[] = $keywords['h'];
          } elseif ($keywords['ch']) {
              $ret[] = $keywords['ch'];
              $keywords['cv'] = false; // prevent re-use: center = center center
          } elseif (count($measures)) {
              $ret[] = array_shift($measures);
          }

          // second (vertical) slot
          if ($keywords['v']) {
              $ret[] = $keywords['v'];
          } elseif ($keywords['cv']) {
              $ret[] = $keywords['cv'];
          } elseif (count($measures)) {
              $ret[] = array_shift($measures);
          }

          if (empty($ret)) return false;
          return implode(' ', $ret);

      }

  }
  /**
   * Validates the border property as defined by CSS.
   */
  class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of properties this property is shorthand for.
       */
      protected $info = array();

      /**
       * Copies the width, style and (top) color validators out of the
       * CSS definition supplied by $config.
       */
      public function __construct($config) {
          $def = $config->getCSSDefinition();
          $this->info['border-width'] = $def->info['border-width'];
          $this->info['border-style'] = $def->info['border-style'];
          $this->info['border-top-color'] = $def->info['border-top-color'];
      }

      /**
       * Splits the value on spaces and hands each piece to the first
       * component validator that has not matched yet and accepts it.
       * @return Accepted pieces joined by spaces, or false when no piece
       *      was accepted (an empty string is never returned, so that
       *      callers do not mistake a worthless value for a valid one)
       */
      public function validate($string, $config, $context) {
          $string = $this->parseCDATA($string);
          $string = $this->mungeRgb($string);
          $bits = explode(' ', $string);
          $done = array(); // segments we've finished
          $ret = ''; // return value
          foreach ($bits as $bit) {
              foreach ($this->info as $propname => $validator) {
                  // each component may contribute only once
                  if (isset($done[$propname])) continue;
                  $r = $validator->validate($bit, $config, $context);
                  if ($r !== false) {
                      $ret .= $r . ' ';
                      $done[$propname] = true;
                      break;
                  }
              }
          }
          $ret = rtrim($ret);
          // signal failure the same way the other shorthand validators do
          if ($ret === '') return false;
          return $ret;
      }

  }
  /**
   * Validates Color as defined by CSS.
   */
  class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
  {

      /**
       * Accepts a color keyword, an rgb() literal or a three or six digit
       * hexadecimal color (with or without the leading "#").
       * @return The keyword's replacement from Core.ColorKeywords, a
       *      normalized "rgb(r,g,b)" string, or the hex color with "#"
       *      prepended where it was missing; false if invalid
       * @note The keyword table is fetched from $config once and then
       *       kept in a static variable for all later calls.
       */
      public function validate($color, $config, $context) {

          static $colors = null;
          if ($colors === null) $colors = $config->get('Core.ColorKeywords');

          $color = trim($color);
          if ($color === '') return false;

          $lower = strtolower($color);
          if (isset($colors[$lower])) return $colors[$lower];

          // The slicing below assumes "rgb(" occupies the first four
          // characters, so the literal must start with it; anything that
          // merely contains "rgb(" falls through to the hex check and is
          // rejected there.
          if (strpos($color, 'rgb(') === 0) {
              // rgb literal handling
              $length = strlen($color);
              // the first ")" has to be the very last character
              if (strpos($color, ')') !== $length - 1) return false;
              $triad = substr($color, 4, $length - 4 - 1);
              $parts = explode(',', $triad);
              if (count($parts) !== 3) return false;
              $type = false; // to ensure that they're all the same type
              $new_parts = array();
              foreach ($parts as $part) {
                  $part = trim($part);
                  if ($part === '') return false;
                  $length = strlen($part);
                  if ($part[$length - 1] === '%') {
                      // handle percents
                      if (!$type) {
                          $type = 'percentage';
                      } elseif ($type !== 'percentage') {
                          return false;
                      }
                      // clamp to 0%-100%
                      $num = (float) substr($part, 0, $length - 1);
                      if ($num < 0) $num = 0;
                      if ($num > 100) $num = 100;
                      $new_parts[] = "$num%";
                  } else {
                      // handle integers
                      if (!$type) {
                          $type = 'integer';
                      } elseif ($type !== 'integer') {
                          return false;
                      }
                      // clamp to 0-255
                      $num = (int) $part;
                      if ($num < 0) $num = 0;
                      if ($num > 255) $num = 255;
                      $new_parts[] = (string) $num;
                  }
              }
              $new_triad = implode(',', $new_parts);
              $color = "rgb($new_triad)";
          } else {
              // hexadecimal handling
              if ($color[0] === '#') {
                  $hex = substr($color, 1);
              } else {
                  $hex = $color;
                  $color = '#' . $color;
              }
              $length = strlen($hex);
              if ($length !== 3 && $length !== 6) return false;
              if (!ctype_xdigit($hex)) return false;
          }

          return $color;

      }

  }
  /**
   * Allows multiple validators to attempt to validate attribute.
   *
   * A composite of many validators: each HTMLPurifier_AttrDef object in
   * turn gets a go at the string, and the first one that passes decides
   * what is returned.  This is especially useful for CSS values, which
   * often are a choice between an enumerated set of predefined values or
   * a flexible data type.
   */
  class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
  {

      /**
       * List of HTMLPurifier_AttrDef objects that may process strings
       * @todo Make protected
       */
      public $defs;

      /**
       * @param $defs List of HTMLPurifier_AttrDef objects
       */
      public function __construct($defs) {
          $this->defs = $defs;
      }

      /**
       * Returns the result of the first definition that does not answer
       * false, or false if every definition does.
       */
      public function validate($string, $config, $context) {
          foreach ($this->defs as $def) {
              $result = $def->validate($string, $config, $context);
              if ($result === false) {
                  continue;
              }
              return $result;
          }
          return false;
      }

  }
  /**
   * Decorator which enables CSS properties to be disabled for specific elements.
   */
  class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
  {

      /** Wrapped definition */
      public $def;
      /** Name of the element for which validation always fails */
      public $element;

      /**
       * @param $def Definition to wrap
       * @param $element Element to deny
       */
      public function __construct($def, $element) {
          $this->def     = $def;
          $this->element = $element;
      }

      /**
       * Fails outright when CurrentToken is set and named $this->element;
       * otherwise defers to the wrapped definition.
       */
      public function validate($string, $config, $context) {
          $token = $context->get('CurrentToken', true);
          $denied = ($token && $token->name == $this->element);
          if ($denied) {
              return false;
          }
          return $this->def->validate($string, $config, $context);
      }

  }
  /**
   * Microsoft's proprietary filter: CSS property
   * @note Currently supports the alpha filter. In the future, this will
   *       probably need an extensible framework
   */
  class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef
  {

      /** Validator applied to the opacity parameter */
      protected $intValidator;

      public function __construct() {
          $this->intValidator = new HTMLPurifier_AttrDef_Integer();
      }

      /**
       * Accepts "none" or a single alpha filter call and rebuilds the
       * call with only its first valid opacity parameter, clamped to
       * the range 0 through 100.
       * @return "none", the rebuilt "function(params)" string, or false
       *      if the function name is not one of the accepted spellings
       */
      public function validate($value, $config, $context) {
          $value = $this->parseCDATA($value);
          if ($value === 'none') return $value;
          // if we looped this we could support multiple filters
          $function_length = strcspn($value, '(');
          $function = trim(substr($value, 0, $function_length));
          if ($function !== 'alpha' &&
              $function !== 'Alpha' &&
              $function !== 'progid:DXImageTransform.Microsoft.Alpha'
              ) return false;
          // everything between the first "(" and the next ")"
          $cursor = $function_length + 1;
          $parameters_length = strcspn($value, ')', $cursor);
          $parameters = substr($value, $cursor, $parameters_length);
          $params = explode(',', $parameters);
          $ret_params = array();
          $lookup = array(); // keys already emitted
          foreach ($params as $param) {
              $pair = explode('=', $param);
              // A parameter without "=" (e.g. the empty one in "alpha()")
              // has no value to validate; skip it rather than reading a
              // missing array offset.
              if (count($pair) < 2) continue;
              $key = trim($pair[0]);
              $param_value = trim($pair[1]);
              if (isset($lookup[$key])) continue;
              if ($key !== 'opacity') continue;
              $param_value = $this->intValidator->validate($param_value, $config, $context);
              if ($param_value === false) continue;
              $int = (int) $param_value;
              if ($int > 100) $param_value = '100';
              if ($int < 0) $param_value = '0';
              $ret_params[] = "$key=$param_value";
              $lookup[$key] = true;
          }
          $ret_parameters = implode(',', $ret_params);
          $ret_function = "$function($ret_parameters)";
          return $ret_function;
      }

  }
  /**
   * Validates shorthand CSS property font.
   */
  class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of component validators, keyed by property name.
       *
       * @note If we moved specific CSS property definitions to their own
       *       classes instead of having them be assembled at run time by
       *       CSSDefinition, this wouldn't be necessary.  We'd instantiate
       *       our own copies.
       */
      protected $info = array();

      /**
       * Copies the validators of the six longhand properties out of the
       * CSS definition supplied by $config.
       */
      public function __construct($config) {
          $def = $config->getCSSDefinition();
          $longhands = array(
              'font-style',
              'font-variant',
              'font-weight',
              'font-size',
              'line-height',
              'font-family',
          );
          foreach ($longhands as $property) {
              $this->info[$property] = $def->info[$property];
          }
      }

      /**
       * Returns a system font keyword in lowercase, or the validated
       * "[style] [variant] [weight] size[/line-height] family" string;
       * false if the value cannot be validated.
       */
      public function validate($string, $config, $context) {

          static $system_fonts = array(
              'caption' => true,
              'icon' => true,
              'menu' => true,
              'message-box' => true,
              'small-caption' => true,
              'status-bar' => true
          );

          // regular pre-processing
          $string = $this->parseCDATA($string);
          if ($string === '') return false;

          // check if it's one of the keywords
          $lowercase_string = strtolower($string);
          if (isset($system_fonts[$lowercase_string])) {
              return $lowercase_string;
          }

          $bits = explode(' ', $string); // bits to process
          $size = count($bits);
          // what we are looking for: 0 = style/variant/weight,
          // 1 = font-size (and line-height), 2 = font-family
          $stage = 0;
          $caught = array(); // which stage 0 properties have we caught?
          $stage_1 = array('font-style', 'font-variant', 'font-weight');
          $final = ''; // output

          for ($i = 0; $i < $size; $i++) {
              $bit = $bits[$i];
              if ($bit === '') continue;

              // attempting to catch font-style, font-variant or font-weight
              if ($stage == 0) {
                  foreach ($stage_1 as $validator_name) {
                      if (isset($caught[$validator_name])) continue;
                      $r = $this->info[$validator_name]->validate(
                          $bit, $config, $context);
                      if ($r !== false) {
                          $final .= $r . ' ';
                          $caught[$validator_name] = true;
                          break;
                      }
                  }
                  // all three caught, continue on
                  if (count($caught) >= 3) $stage = 1;
                  if ($r !== false) continue;
                  // nobody took this bit: try it as a font-size right away
              }

              // attempting to catch font-size and perhaps line-height
              if ($stage < 2) {
                  $found_slash = false;
                  if (strpos($bit, '/') !== false) {
                      $halves = explode('/', $bit);
                      $font_size   = $halves[0];
                      $line_height = $halves[1];
                      if ($line_height === '') {
                          // ooh, there's a space after the slash!
                          $line_height = false;
                          $found_slash = true;
                      }
                  } else {
                      $font_size   = $bit;
                      $line_height = false;
                  }

                  $r = $this->info['font-size']->validate(
                      $font_size, $config, $context);
                  if ($r === false) {
                      return false;
                  }
                  $final .= $r;

                  // attempt to catch line-height
                  if ($line_height === false) {
                      // we need to scroll forward
                      for ($j = $i + 1; $j < $size; $j++) {
                          if ($bits[$j] === '') continue;
                          if ($bits[$j] === '/') {
                              if ($found_slash) {
                                  return false;
                              }
                              $found_slash = true;
                              continue;
                          }
                          $line_height = $bits[$j];
                          break;
                      }
                  } else {
                      // slash already found
                      $found_slash = true;
                      $j = $i;
                  }

                  if ($found_slash) {
                      // skip over the bits that made up the line-height
                      $i = $j;
                      $r = $this->info['line-height']->validate(
                          $line_height, $config, $context);
                      if ($r !== false) {
                          $final .= '/' . $r;
                      }
                  }

                  $final .= ' ';
                  $stage = 2;
                  continue;
              }

              // attempting to catch font-family: everything that is left
              $font_family =
                  implode(' ', array_slice($bits, $i, $size - $i));
              $r = $this->info['font-family']->validate(
                  $font_family, $config, $context);
              if ($r === false) {
                  return false;
              }
              $final .= $r . ' ';
              // processing completed successfully
              return rtrim($final);
          }

          return false;
      }

  }
  7979. /**
  7980. * Validates a font family list according to CSS spec
  7981. */
  7982. class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
  7983. {
  7984. protected $mask = null;
  7985. public function __construct() {
  7986. $this->mask = '- ';
  7987. for ($c = 'a'; $c <= 'z'; $c++) $this->mask .= $c;
  7988. for ($c = 'A'; $c <= 'Z'; $c++) $this->mask .= $c;
  7989. for ($c = '0'; $c <= '9'; $c++) $this->mask .= $c; // cast-y, but should be fine
  7990. // special bytes used by UTF-8
  7991. for ($i = 0x80; $i <= 0xFF; $i++) {
  7992. // We don't bother excluding invalid bytes in this range,
  7993. // because the our restriction of well-formed UTF-8 will
  7994. // prevent these from ever occurring.
  7995. $this->mask .= chr($i);
  7996. }
  7997. /*
  7998. PHP's internal strcspn implementation is
  7999. O(length of string * length of mask), making it inefficient
  8000. for large masks. However, it's still faster than
  8001. preg_match 8)
  8002. for (p = s1;;) {
  8003. spanp = s2;
  8004. do {
  8005. if (*spanp == c || p == s1_end) {
  8006. return p - s1;
  8007. }
  8008. } while (spanp++ < (s2_end - 1));
  8009. c = *++p;
  8010. }
  8011. */
  8012. // possible optimization: invert the mask.
  8013. }
  8014. public function validate($string, $config, $context) {
  8015. static $generic_names = array(
  8016. 'serif' => true,
  8017. 'sans-serif' => true,
  8018. 'monospace' => true,
  8019. 'fantasy' => true,
  8020. 'cursive' => true
  8021. );
  8022. $allowed_fonts = $config->get('CSS.AllowedFonts');
  8023. // assume that no font names contain commas in them
  8024. $fonts = explode(',', $string);
  8025. $final = '';
  8026. foreach($fonts as $font) {
  8027. $font = trim($font);
  8028. if ($font === '') continue;
  8029. // match a generic name
  8030. if (isset($generic_names[$font])) {
  8031. if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
  8032. $final .= $font . ', ';
  8033. }
  8034. continue;
  8035. }
  8036. // match a quoted name
  8037. if ($font[0] === '"' || $font[0] === "'") {
  8038. $length = strlen($font);
  8039. if ($length <= 2) continue;
  8040. $quote = $font[0];
  8041. if ($font[$length - 1] !== $quote) continue;
  8042. $font = substr($font, 1, $length - 2);
  8043. }
  8044. $font = $this->expandCSSEscape($font);
  8045. // $font is a pure representation of the font name
  8046. if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
  8047. continue;
  8048. }
  8049. if (ctype_alnum($font) && $font !== '') {
  8050. // very simple font, allow it in unharmed
  8051. $final .= $font . ', ';
  8052. continue;
  8053. }
  8054. // bugger out on whitespace. form feed (0C) really
  8055. // shouldn't show up regardless
  8056. $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
  8057. // Here, there are various classes of characters which need
  8058. // to be treated differently:
  8059. // - Alphanumeric characters are essentially safe. We
  8060. // handled these above.
  8061. // - Spaces require quoting, though most parsers will do
  8062. // the right thing if there aren't any characters that
  8063. // can be misinterpreted
  8064. // - Dashes rarely occur, but they fairly unproblematic
  8065. // for parsing/rendering purposes.
  8066. // The above characters cover the majority of Western font
  8067. // names.
  8068. // - Arbitrary Unicode characters not in ASCII. Because
  8069. // most parsers give little thought to Unicode, treatment
  8070. // of these codepoints is basically uniform, even for
  8071. // punctuation-like codepoints. These characters can
  8072. // show up in non-Western pages and are supported by most
  8073. // major browsers, for example: "MS 明朝" is a
  8074. // legitimate font-name
  8075. // <http://ja.wikipedia.org/wiki/MS_明朝>. See
  8076. // the CSS3 spec for more examples:
  8077. // <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
  8078. // You can see live samples of these on the Internet:
  8079. // <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
  8080. // However, most of these fonts have ASCII equivalents:
  8081. // for example, 'MS Mincho', and it's considered
  8082. // professional to use ASCII font names instead of
  8083. // Unicode font names. Thanks Takeshi Terada for
  8084. // providing this information.
  8085. // The following characters, to my knowledge, have not been
  8086. // used to name font names.
  8087. // - Single quote. While theoretically you might find a
  8088. // font name that has a single quote in its name (serving
  8089. // as an apostrophe, e.g. Dave's Scribble), I haven't
  8090. // been able to find any actual examples of this.
  8091. // Internet Explorer's cssText translation (which I
  8092. // believe is invoked by innerHTML) normalizes any
  8093. // quoting to single quotes, and fails to escape single
  8094. // quotes. (Note that this is not IE's behavior for all
  8095. // CSS properties, just some sort of special casing for
  8096. // font-family). So a single quote *cannot* be used
  8097. // safely in the font-family context if there will be an
  8098. // innerHTML/cssText translation. Note that Firefox 3.x
  8099. // does this too.
  8100. // - Double quote. In IE, these get normalized to
  8101. // single-quotes, no matter what the encoding. (Fun
  8102. // fact, in IE8, the 'content' CSS property gained
  8103. // support, where they special cased to preserve encoded
  8104. // double quotes, but still translate unadorned double
  8105. // quotes into single quotes.) So, because their
  8106. // fixpoint behavior is identical to single quotes, they
  8107. // cannot be allowed either. Firefox 3.x displays
  8108. // single-quote style behavior.
  8109. // - Backslashes are reduced by one (so \\ -> \) every
  8110. // iteration, so they cannot be used safely. This shows
  8111. // up in IE7, IE8 and FF3
  8112. // - Semicolons, commas and backticks are handled properly.
  8113. // - The rest of the ASCII punctuation is handled properly.
  8114. // We haven't checked what browsers do to unadorned
  8115. // versions, but this is not important as long as the
  8116. // browser doesn't /remove/ surrounding quotes (as IE does
  8117. // for HTML).
  8118. //
  8119. // With these results in hand, we conclude that there are
  8120. // various levels of safety:
  8121. // - Paranoid: alphanumeric, spaces and dashes(?)
  8122. // - International: Paranoid + non-ASCII Unicode
  8123. // - Edgy: Everything except quotes, backslashes
  8124. // - NoJS: Standards compliance, e.g. sod IE. Note that
  8125. // with some judicious character escaping (since certain
  8126. // types of escaping doesn't work) this is theoretically
  8127. // OK as long as innerHTML/cssText is not called.
  8128. // We believe that international is a reasonable default
  8129. // (that we will implement now), and once we do more
  8130. // extensive research, we may feel comfortable with dropping
  8131. // it down to edgy.
  8132. // Edgy: alphanumeric, spaces, dashes and Unicode. Use of
  8133. // str(c)spn assumes that the string was already well formed
  8134. // Unicode (which of course it is).
  8135. if (strspn($font, $this->mask) !== strlen($font)) {
  8136. continue;
  8137. }
  8138. // Historical:
  8139. // In the absence of innerHTML/cssText, these ugly
  8140. // transforms don't pose a security risk (as \\ and \"
  8141. // might--these escapes are not supported by most browsers).
  8142. // We could try to be clever and use single-quote wrapping
  8143. // when there is a double quote present, but I have choosen
  8144. // not to implement that. (NOTE: you can reduce the amount
  8145. // of escapes by one depending on what quoting style you use)
  8146. // $font = str_replace('\\', '\\5C ', $font);
  8147. // $font = str_replace('"', '\\22 ', $font);
  8148. // $font = str_replace("'", '\\27 ', $font);
  8149. // font possibly with spaces, requires quoting
  8150. $final .= "'$font', ";
  8151. }
  8152. $final = rtrim($final, ', ');
  8153. if ($final === '') return false;
  8154. return $final;
  8155. }
  8156. }
  8157. /**
  8158. * Validates based on {ident} CSS grammar production
  8159. */
  8160. class HTMLPurifier_AttrDef_CSS_Ident extends HTMLPurifier_AttrDef
  8161. {
  8162. public function validate($string, $config, $context) {
  8163. $string = trim($string);
  8164. // early abort: '' and '0' (strings that convert to false) are invalid
  8165. if (!$string) return false;
  8166. $pattern = '/^(-?[A-Za-z_][A-Za-z_\-0-9]*)$/';
  8167. if (!preg_match($pattern, $string)) return false;
  8168. return $string;
  8169. }
  8170. }
  8171. /**
  8172. * Decorator which enables !important to be used in CSS values.
  8173. */
  8174. class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef
  8175. {
  8176. public $def, $allow;
  8177. /**
  8178. * @param $def Definition to wrap
  8179. * @param $allow Whether or not to allow !important
  8180. */
  8181. public function __construct($def, $allow = false) {
  8182. $this->def = $def;
  8183. $this->allow = $allow;
  8184. }
  8185. /**
  8186. * Intercepts and removes !important if necessary
  8187. */
  8188. public function validate($string, $config, $context) {
  8189. // test for ! and important tokens
  8190. $string = trim($string);
  8191. $is_important = false;
  8192. // :TODO: optimization: test directly for !important and ! important
  8193. if (strlen($string) >= 9 && substr($string, -9) === 'important') {
  8194. $temp = rtrim(substr($string, 0, -9));
  8195. // use a temp, because we might want to restore important
  8196. if (strlen($temp) >= 1 && substr($temp, -1) === '!') {
  8197. $string = rtrim(substr($temp, 0, -1));
  8198. $is_important = true;
  8199. }
  8200. }
  8201. $string = $this->def->validate($string, $config, $context);
  8202. if ($this->allow && $is_important) $string .= ' !important';
  8203. return $string;
  8204. }
  8205. }
  8206. /**
  8207. * Represents a Length as defined by CSS.
  8208. */
  8209. class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
  8210. {
  8211. protected $min, $max;
  8212. /**
  8213. * @param HTMLPurifier_Length $max Minimum length, or null for no bound. String is also acceptable.
  8214. * @param HTMLPurifier_Length $max Maximum length, or null for no bound. String is also acceptable.
  8215. */
  8216. public function __construct($min = null, $max = null) {
  8217. $this->min = $min !== null ? HTMLPurifier_Length::make($min) : null;
  8218. $this->max = $max !== null ? HTMLPurifier_Length::make($max) : null;
  8219. }
  8220. public function validate($string, $config, $context) {
  8221. $string = $this->parseCDATA($string);
  8222. // Optimizations
  8223. if ($string === '') return false;
  8224. if ($string === '0') return '0';
  8225. if (strlen($string) === 1) return false;
  8226. $length = HTMLPurifier_Length::make($string);
  8227. if (!$length->isValid()) return false;
  8228. if ($this->min) {
  8229. $c = $length->compareTo($this->min);
  8230. if ($c === false) return false;
  8231. if ($c < 0) return false;
  8232. }
  8233. if ($this->max) {
  8234. $c = $length->compareTo($this->max);
  8235. if ($c === false) return false;
  8236. if ($c > 0) return false;
  8237. }
  8238. return $length->toString();
  8239. }
  8240. }
  8241. /**
  8242. * Validates shorthand CSS property list-style.
  8243. * @warning Does not support url tokens that have internal spaces.
  8244. */
  8245. class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
  8246. {
  8247. /**
  8248. * Local copy of component validators.
  8249. * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
  8250. */
  8251. protected $info;
  8252. public function __construct($config) {
  8253. $def = $config->getCSSDefinition();
  8254. $this->info['list-style-type'] = $def->info['list-style-type'];
  8255. $this->info['list-style-position'] = $def->info['list-style-position'];
  8256. $this->info['list-style-image'] = $def->info['list-style-image'];
  8257. }
  8258. public function validate($string, $config, $context) {
  8259. // regular pre-processing
  8260. $string = $this->parseCDATA($string);
  8261. if ($string === '') return false;
  8262. // assumes URI doesn't have spaces in it
  8263. $bits = explode(' ', strtolower($string)); // bits to process
  8264. $caught = array();
  8265. $caught['type'] = false;
  8266. $caught['position'] = false;
  8267. $caught['image'] = false;
  8268. $i = 0; // number of catches
  8269. $none = false;
  8270. foreach ($bits as $bit) {
  8271. if ($i >= 3) return; // optimization bit
  8272. if ($bit === '') continue;
  8273. foreach ($caught as $key => $status) {
  8274. if ($status !== false) continue;
  8275. $r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
  8276. if ($r === false) continue;
  8277. if ($r === 'none') {
  8278. if ($none) continue;
  8279. else $none = true;
  8280. if ($key == 'image') continue;
  8281. }
  8282. $caught[$key] = $r;
  8283. $i++;
  8284. break;
  8285. }
  8286. }
  8287. if (!$i) return false;
  8288. $ret = array();
  8289. // construct type
  8290. if ($caught['type']) $ret[] = $caught['type'];
  8291. // construct image
  8292. if ($caught['image']) $ret[] = $caught['image'];
  8293. // construct position
  8294. if ($caught['position']) $ret[] = $caught['position'];
  8295. if (empty($ret)) return false;
  8296. return implode(' ', $ret);
  8297. }
  8298. }
  8299. /**
  8300. * Framework class for strings that involve multiple values.
  8301. *
  8302. * Certain CSS properties such as border-width and margin allow multiple
  8303. * lengths to be specified. This class can take a vanilla border-width
  8304. * definition and multiply it, usually into a max of four.
  8305. *
  8306. * @note Even though the CSS specification isn't clear about it, inherit
  8307. * can only be used alone: it will never manifest as part of a multi
  8308. * shorthand declaration. Thus, this class does not allow inherit.
  8309. */
  8310. class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
  8311. {
  8312. /**
  8313. * Instance of component definition to defer validation to.
  8314. * @todo Make protected
  8315. */
  8316. public $single;
  8317. /**
  8318. * Max number of values allowed.
  8319. * @todo Make protected
  8320. */
  8321. public $max;
  8322. /**
  8323. * @param $single HTMLPurifier_AttrDef to multiply
  8324. * @param $max Max number of values allowed (usually four)
  8325. */
  8326. public function __construct($single, $max = 4) {
  8327. $this->single = $single;
  8328. $this->max = $max;
  8329. }
  8330. public function validate($string, $config, $context) {
  8331. $string = $this->parseCDATA($string);
  8332. if ($string === '') return false;
  8333. $parts = explode(' ', $string); // parseCDATA replaced \r, \t and \n
  8334. $length = count($parts);
  8335. $final = '';
  8336. for ($i = 0, $num = 0; $i < $length && $num < $this->max; $i++) {
  8337. if (ctype_space($parts[$i])) continue;
  8338. $result = $this->single->validate($parts[$i], $config, $context);
  8339. if ($result !== false) {
  8340. $final .= $result . ' ';
  8341. $num++;
  8342. }
  8343. }
  8344. if ($final === '') return false;
  8345. return rtrim($final);
  8346. }
  8347. }
  8348. /**
  8349. * Validates a Percentage as defined by the CSS spec.
  8350. */
  8351. class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
  8352. {
  8353. /**
  8354. * Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
  8355. */
  8356. protected $number_def;
  8357. /**
  8358. * @param Bool indicating whether to forbid negative values
  8359. */
  8360. public function __construct($non_negative = false) {
  8361. $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
  8362. }
  8363. public function validate($string, $config, $context) {
  8364. $string = $this->parseCDATA($string);
  8365. if ($string === '') return false;
  8366. $length = strlen($string);
  8367. if ($length === 1) return false;
  8368. if ($string[$length - 1] !== '%') return false;
  8369. $number = substr($string, 0, $length - 1);
  8370. $number = $this->number_def->validate($number, $config, $context);
  8371. if ($number === false) return false;
  8372. return "$number%";
  8373. }
  8374. }
  8375. /**
  8376. * Validates the value for the CSS property text-decoration
  8377. * @note This class could be generalized into a version that acts sort of
  8378. * like Enum except you can compound the allowed values.
  8379. */
  8380. class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
  8381. {
  8382. public function validate($string, $config, $context) {
  8383. static $allowed_values = array(
  8384. 'line-through' => true,
  8385. 'overline' => true,
  8386. 'underline' => true,
  8387. );
  8388. $string = strtolower($this->parseCDATA($string));
  8389. if ($string === 'none') return $string;
  8390. $parts = explode(' ', $string);
  8391. $final = '';
  8392. foreach ($parts as $part) {
  8393. if (isset($allowed_values[$part])) {
  8394. $final .= $part . ' ';
  8395. }
  8396. }
  8397. $final = rtrim($final);
  8398. if ($final === '') return false;
  8399. return $final;
  8400. }
  8401. }
  8402. /**
  8403. * Validates a URI in CSS syntax, which uses url('http://example.com')
  8404. * @note While theoretically speaking a URI in a CSS document could
  8405. * be non-embedded, as of CSS2 there is no such usage so we're
  8406. * generalizing it. This may need to be changed in the future.
  8407. * @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
  8408. * the separator, you cannot put a literal semicolon in
  8409. * in the URI. Try percent encoding it, in that case.
  8410. */
  8411. class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
  8412. {
  8413. public function __construct() {
  8414. parent::__construct(true); // always embedded
  8415. }
  8416. public function validate($uri_string, $config, $context) {
  8417. // parse the URI out of the string and then pass it onto
  8418. // the parent object
  8419. $uri_string = $this->parseCDATA($uri_string);
  8420. if (strpos($uri_string, 'url(') !== 0) return false;
  8421. $uri_string = substr($uri_string, 4);
  8422. $new_length = strlen($uri_string) - 1;
  8423. if ($uri_string[$new_length] != ')') return false;
  8424. $uri = trim(substr($uri_string, 0, $new_length));
  8425. if (!empty($uri) && ($uri[0] == "'" || $uri[0] == '"')) {
  8426. $quote = $uri[0];
  8427. $new_length = strlen($uri) - 1;
  8428. if ($uri[$new_length] !== $quote) return false;
  8429. $uri = substr($uri, 1, $new_length - 1);
  8430. }
  8431. $uri = $this->expandCSSEscape($uri);
  8432. $result = parent::validate($uri, $config, $context);
  8433. if ($result === false) return false;
  8434. // extra sanity check; should have been done by URI
  8435. $result = str_replace(array('"', "\\", "\n", "\x0c", "\r"), "", $result);
  8436. // suspicious characters are ()'; we're going to percent encode
  8437. // them for safety.
  8438. $result = str_replace(array('(', ')', "'"), array('%28', '%29', '%27'), $result);
  8439. // there's an extra bug where ampersands lose their escaping on
  8440. // an innerHTML cycle, so a very unlucky query parameter could
  8441. // then change the meaning of the URL. Unfortunately, there's
  8442. // not much we can do about that...
  8443. return "url(\"$result\")";
  8444. }
  8445. }
  8446. /**
  8447. * Validates a boolean attribute
  8448. */
  8449. class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
  8450. {
  8451. protected $name;
  8452. public $minimized = true;
  8453. public function __construct($name = false) {$this->name = $name;}
  8454. public function validate($string, $config, $context) {
  8455. if (empty($string)) return false;
  8456. return $this->name;
  8457. }
  8458. /**
  8459. * @param $string Name of attribute
  8460. */
  8461. public function make($string) {
  8462. return new HTMLPurifier_AttrDef_HTML_Bool($string);
  8463. }
  8464. }
  8465. /**
  8466. * Validates contents based on NMTOKENS attribute type.
  8467. */
  8468. class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
  8469. {
  8470. public function validate($string, $config, $context) {
  8471. $string = trim($string);
  8472. // early abort: '' and '0' (strings that convert to false) are invalid
  8473. if (!$string) return false;
  8474. $tokens = $this->split($string, $config, $context);
  8475. $tokens = $this->filter($tokens, $config, $context);
  8476. if (empty($tokens)) return false;
  8477. return implode(' ', $tokens);
  8478. }
  8479. /**
  8480. * Splits a space separated list of tokens into its constituent parts.
  8481. */
  8482. protected function split($string, $config, $context) {
  8483. // OPTIMIZABLE!
  8484. // do the preg_match, capture all subpatterns for reformulation
  8485. // we don't support U+00A1 and up codepoints or
  8486. // escaping because I don't know how to do that with regexps
  8487. // and plus it would complicate optimization efforts (you never
  8488. // see that anyway).
  8489. $pattern = '/(?:(?<=\s)|\A)'. // look behind for space or string start
  8490. '((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)'.
  8491. '(?:(?=\s)|\z)/'; // look ahead for space or string end
  8492. preg_match_all($pattern, $string, $matches);
  8493. return $matches[1];
  8494. }
  8495. /**
  8496. * Template method for removing certain tokens based on arbitrary criteria.
  8497. * @note If we wanted to be really functional, we'd do an array_filter
  8498. * with a callback. But... we're not.
  8499. */
  8500. protected function filter($tokens, $config, $context) {
  8501. return $tokens;
  8502. }
  8503. }
  8504. /**
  8505. * Implements special behavior for class attribute (normally NMTOKENS)
  8506. */
  8507. class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
  8508. {
  8509. protected function split($string, $config, $context) {
  8510. // really, this twiddle should be lazy loaded
  8511. $name = $config->getDefinition('HTML')->doctype->name;
  8512. if ($name == "XHTML 1.1" || $name == "XHTML 2.0") {
  8513. return parent::split($string, $config, $context);
  8514. } else {
  8515. return preg_split('/\s+/', $string);
  8516. }
  8517. }
  8518. protected function filter($tokens, $config, $context) {
  8519. $allowed = $config->get('Attr.AllowedClasses');
  8520. $forbidden = $config->get('Attr.ForbiddenClasses');
  8521. $ret = array();
  8522. foreach ($tokens as $token) {
  8523. if (
  8524. ($allowed === null || isset($allowed[$token])) &&
  8525. !isset($forbidden[$token]) &&
  8526. // We need this O(n) check because of PHP's array
  8527. // implementation that casts -0 to 0.
  8528. !in_array($token, $ret, true)
  8529. ) {
  8530. $ret[] = $token;
  8531. }
  8532. }
  8533. return $ret;
  8534. }
  8535. }
  8536. /**
  8537. * Validates a color according to the HTML spec.
  8538. */
  8539. class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
  8540. {
  8541. public function validate($string, $config, $context) {
  8542. static $colors = null;
  8543. if ($colors === null) $colors = $config->get('Core.ColorKeywords');
  8544. $string = trim($string);
  8545. if (empty($string)) return false;
  8546. if (isset($colors[strtolower($string)])) return $colors[$string];
  8547. if ($string[0] === '#') $hex = substr($string, 1);
  8548. else $hex = $string;
  8549. $length = strlen($hex);
  8550. if ($length !== 3 && $length !== 6) return false;
  8551. if (!ctype_xdigit($hex)) return false;
  8552. if ($length === 3) $hex = $hex[0].$hex[0].$hex[1].$hex[1].$hex[2].$hex[2];
  8553. return "#$hex";
  8554. }
  8555. }
  8556. /**
  8557. * Special-case enum attribute definition that lazy loads allowed frame targets
  8558. */
  8559. class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
  8560. {
  8561. public $valid_values = false; // uninitialized value
  8562. protected $case_sensitive = false;
  8563. public function __construct() {}
  8564. public function validate($string, $config, $context) {
  8565. if ($this->valid_values === false) $this->valid_values = $config->get('Attr.AllowedFrameTargets');
  8566. return parent::validate($string, $config, $context);
  8567. }
  8568. }
  8569. /**
  8570. * Validates the HTML attribute ID.
  8571. * @warning Even though this is the id processor, it
  8572. * will ignore the directive Attr:IDBlacklist, since it will only
  8573. * go according to the ID accumulator. Since the accumulator is
  8574. * automatically generated, it will have already absorbed the
  8575. * blacklist. If you're hacking around, make sure you use load()!
  8576. */
  8577. class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
  8578. {
  8579. // selector is NOT a valid thing to use for IDREFs, because IDREFs
  8580. // *must* target IDs that exist, whereas selector #ids do not.
  8581. /**
  8582. * Determines whether or not we're validating an ID in a CSS
  8583. * selector context.
  8584. */
  8585. protected $selector;
  8586. public function __construct($selector = false) {
  8587. $this->selector = $selector;
  8588. }
  8589. public function validate($id, $config, $context) {
  8590. if (!$this->selector && !$config->get('Attr.EnableID')) return false;
  8591. $id = trim($id); // trim it first
  8592. if ($id === '') return false;
  8593. $prefix = $config->get('Attr.IDPrefix');
  8594. if ($prefix !== '') {
  8595. $prefix .= $config->get('Attr.IDPrefixLocal');
  8596. // prevent re-appending the prefix
  8597. if (strpos($id, $prefix) !== 0) $id = $prefix . $id;
  8598. } elseif ($config->get('Attr.IDPrefixLocal') !== '') {
  8599. trigger_error('%Attr.IDPrefixLocal cannot be used unless '.
  8600. '%Attr.IDPrefix is set', E_USER_WARNING);
  8601. }
  8602. if (!$this->selector) {
  8603. $id_accumulator =& $context->get('IDAccumulator');
  8604. if (isset($id_accumulator->ids[$id])) return false;
  8605. }
  8606. // we purposely avoid using regex, hopefully this is faster
  8607. if (ctype_alpha($id)) {
  8608. $result = true;
  8609. } else {
  8610. if (!ctype_alpha(@$id[0])) return false;
  8611. $trim = trim( // primitive style of regexps, I suppose
  8612. $id,
  8613. 'A..Za..z0..9:-._'
  8614. );
  8615. $result = ($trim === '');
  8616. }
  8617. $regexp = $config->get('Attr.IDBlacklistRegexp');
  8618. if ($regexp && preg_match($regexp, $id)) {
  8619. return false;
  8620. }
  8621. if (!$this->selector && $result) $id_accumulator->add($id);
  8622. // if no change was made to the ID, return the result
  8623. // else, return the new id if stripping whitespace made it
  8624. // valid, or return false.
  8625. return $result ? $id : false;
  8626. }
  8627. }
  8628. /**
  8629. * Validates an integer representation of pixels according to the HTML spec.
  8630. */
  8631. class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
  8632. {
  8633. protected $max;
  8634. public function __construct($max = null) {
  8635. $this->max = $max;
  8636. }
  8637. public function validate($string, $config, $context) {
  8638. $string = trim($string);
  8639. if ($string === '0') return $string;
  8640. if ($string === '') return false;
  8641. $length = strlen($string);
  8642. if (substr($string, $length - 2) == 'px') {
  8643. $string = substr($string, 0, $length - 2);
  8644. }
  8645. if (!is_numeric($string)) return false;
  8646. $int = (int) $string;
  8647. if ($int < 0) return '0';
  8648. // upper-bound value, extremely high values can
  8649. // crash operating systems, see <http://ha.ckers.org/imagecrash.html>
  8650. // WARNING, above link WILL crash you if you're using Windows
  8651. if ($this->max !== null && $int > $this->max) return (string) $this->max;
  8652. return (string) $int;
  8653. }
  8654. public function make($string) {
  8655. if ($string === '') $max = null;
  8656. else $max = (int) $string;
  8657. $class = get_class($this);
  8658. return new $class($max);
  8659. }
  8660. }
  8661. /**
  8662. * Validates the HTML type length (not to be confused with CSS's length).
  8663. *
  8664. * This accepts integer pixels or percentages as lengths for certain
  8665. * HTML attributes.
  8666. */
  8667. class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
  8668. {
  8669. public function validate($string, $config, $context) {
  8670. $string = trim($string);
  8671. if ($string === '') return false;
  8672. $parent_result = parent::validate($string, $config, $context);
  8673. if ($parent_result !== false) return $parent_result;
  8674. $length = strlen($string);
  8675. $last_char = $string[$length - 1];
  8676. if ($last_char !== '%') return false;
  8677. $points = substr($string, 0, $length - 1);
  8678. if (!is_numeric($points)) return false;
  8679. $points = (int) $points;
  8680. if ($points < 0) return '0%';
  8681. if ($points > 100) return '100%';
  8682. return ((string) $points) . '%';
  8683. }
  8684. }
  8685. /**
  8686. * Validates a rel/rev link attribute against a directive of allowed values
  8687. * @note We cannot use Enum because link types allow multiple
  8688. * values.
  8689. * @note Assumes link types are ASCII text
  8690. */
  8691. class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
  8692. {
  8693. /** Name config attribute to pull. */
  8694. protected $name;
  8695. public function __construct($name) {
  8696. $configLookup = array(
  8697. 'rel' => 'AllowedRel',
  8698. 'rev' => 'AllowedRev'
  8699. );
  8700. if (!isset($configLookup[$name])) {
  8701. trigger_error('Unrecognized attribute name for link '.
  8702. 'relationship.', E_USER_ERROR);
  8703. return;
  8704. }
  8705. $this->name = $configLookup[$name];
  8706. }
  8707. public function validate($string, $config, $context) {
  8708. $allowed = $config->get('Attr.' . $this->name);
  8709. if (empty($allowed)) return false;
  8710. $string = $this->parseCDATA($string);
  8711. $parts = explode(' ', $string);
  8712. // lookup to prevent duplicates
  8713. $ret_lookup = array();
  8714. foreach ($parts as $part) {
  8715. $part = strtolower(trim($part));
  8716. if (!isset($allowed[$part])) continue;
  8717. $ret_lookup[$part] = true;
  8718. }
  8719. if (empty($ret_lookup)) return false;
  8720. $string = implode(' ', array_keys($ret_lookup));
  8721. return $string;
  8722. }
  8723. }
  8724. /**
  8725. * Validates a MultiLength as defined by the HTML spec.
  8726. *
  8727. * A multilength is either a integer (pixel count), a percentage, or
  8728. * a relative number.
  8729. */
  8730. class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
  8731. {
  8732. public function validate($string, $config, $context) {
  8733. $string = trim($string);
  8734. if ($string === '') return false;
  8735. $parent_result = parent::validate($string, $config, $context);
  8736. if ($parent_result !== false) return $parent_result;
  8737. $length = strlen($string);
  8738. $last_char = $string[$length - 1];
  8739. if ($last_char !== '*') return false;
  8740. $int = substr($string, 0, $length - 1);
  8741. if ($int == '') return '*';
  8742. if (!is_numeric($int)) return false;
  8743. $int = (int) $int;
  8744. if ($int < 0) return false;
  8745. if ($int == 0) return '0';
  8746. if ($int == 1) return '*';
  8747. return ((string) $int) . '*';
  8748. }
  8749. }
  8750. abstract class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
  8751. {
  8752. /**
  8753. * Unpacks a mailbox into its display-name and address
  8754. */
  8755. function unpack($string) {
  8756. // needs to be implemented
  8757. }
  8758. }
  8759. // sub-implementations
  8760. /**
  8761. * Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
  8762. */
  8763. class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
  8764. {
  8765. /**
  8766. * Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
  8767. */
  8768. protected $ipv4;
  8769. /**
  8770. * Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
  8771. */
  8772. protected $ipv6;
  8773. public function __construct() {
  8774. $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
  8775. $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
  8776. }
  8777. public function validate($string, $config, $context) {
  8778. $length = strlen($string);
  8779. // empty hostname is OK; it's usually semantically equivalent:
  8780. // the default host as defined by a URI scheme is used:
  8781. //
  8782. // If the URI scheme defines a default for host, then that
  8783. // default applies when the host subcomponent is undefined
  8784. // or when the registered name is empty (zero length).
  8785. if ($string === '') return '';
  8786. if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
  8787. //IPv6
  8788. $ip = substr($string, 1, $length - 2);
  8789. $valid = $this->ipv6->validate($ip, $config, $context);
  8790. if ($valid === false) return false;
  8791. return '['. $valid . ']';
  8792. }
  8793. // need to do checks on unusual encodings too
  8794. $ipv4 = $this->ipv4->validate($string, $config, $context);
  8795. if ($ipv4 !== false) return $ipv4;
  8796. // A regular domain name.
  8797. // This doesn't match I18N domain names, but we don't have proper IRI support,
  8798. // so force users to insert Punycode.
  8799. // The productions describing this are:
  8800. $a = '[a-z]'; // alpha
  8801. $an = '[a-z0-9]'; // alphanum
  8802. $and = '[a-z0-9-]'; // alphanum | "-"
  8803. // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
  8804. $domainlabel = "$an($and*$an)?";
  8805. // toplabel = alpha | alpha *( alphanum | "-" ) alphanum
  8806. $toplabel = "$a($and*$an)?";
  8807. // hostname = *( domainlabel "." ) toplabel [ "." ]
  8808. if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) {
  8809. return $string;
  8810. }
  8811. // If we have Net_IDNA2 support, we can support IRIs by
  8812. // punycoding them. (This is the most portable thing to do,
  8813. // since otherwise we have to assume browsers support
  8814. if ($config->get('Core.EnableIDNA')) {
  8815. $idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true));
  8816. // we need to encode each period separately
  8817. $parts = explode('.', $string);
  8818. try {
  8819. $new_parts = array();
  8820. foreach ($parts as $part) {
  8821. $encodable = false;
  8822. for ($i = 0, $c = strlen($part); $i < $c; $i++) {
  8823. if (ord($part[$i]) > 0x7a) {
  8824. $encodable = true;
  8825. break;
  8826. }
  8827. }
  8828. if (!$encodable) {
  8829. $new_parts[] = $part;
  8830. } else {
  8831. $new_parts[] = $idna->encode($part);
  8832. }
  8833. }
  8834. $string = implode('.', $new_parts);
  8835. if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) {
  8836. return $string;
  8837. }
  8838. } catch (Exception $e) {
  8839. // XXX error reporting
  8840. }
  8841. }
  8842. return false;
  8843. }
  8844. }
  8845. /**
  8846. * Validates an IPv4 address
  8847. * @author Feyd @ forums.devnetwork.net (public domain)
  8848. */
  8849. class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
  8850. {
  8851. /**
  8852. * IPv4 regex, protected so that IPv6 can reuse it
  8853. */
  8854. protected $ip4;
  8855. public function validate($aIP, $config, $context) {
  8856. if (!$this->ip4) $this->_loadRegex();
  8857. if (preg_match('#^' . $this->ip4 . '$#s', $aIP))
  8858. {
  8859. return $aIP;
  8860. }
  8861. return false;
  8862. }
  8863. /**
  8864. * Lazy load function to prevent regex from being stuffed in
  8865. * cache.
  8866. */
  8867. protected function _loadRegex() {
  8868. $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
  8869. $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
  8870. }
  8871. }
  8872. /**
  8873. * Validates an IPv6 address.
  8874. * @author Feyd @ forums.devnetwork.net (public domain)
  8875. * @note This function requires brackets to have been removed from address
  8876. * in URI.
  8877. */
  8878. class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
  8879. {
  8880. public function validate($aIP, $config, $context) {
  8881. if (!$this->ip4) $this->_loadRegex();
  8882. $original = $aIP;
  8883. $hex = '[0-9a-fA-F]';
  8884. $blk = '(?:' . $hex . '{1,4})';
  8885. $pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))'; // /0 - /128
  8886. // prefix check
  8887. if (strpos($aIP, '/') !== false)
  8888. {
  8889. if (preg_match('#' . $pre . '$#s', $aIP, $find))
  8890. {
  8891. $aIP = substr($aIP, 0, 0-strlen($find[0]));
  8892. unset($find);
  8893. }
  8894. else
  8895. {
  8896. return false;
  8897. }
  8898. }
  8899. // IPv4-compatiblity check
  8900. if (preg_match('#(?<=:'.')' . $this->ip4 . '$#s', $aIP, $find))
  8901. {
  8902. $aIP = substr($aIP, 0, 0-strlen($find[0]));
  8903. $ip = explode('.', $find[0]);
  8904. $ip = array_map('dechex', $ip);
  8905. $aIP .= $ip[0] . $ip[1] . ':' . $ip[2] . $ip[3];
  8906. unset($find, $ip);
  8907. }
  8908. // compression check
  8909. $aIP = explode('::', $aIP);
  8910. $c = count($aIP);
  8911. if ($c > 2)
  8912. {
  8913. return false;
  8914. }
  8915. elseif ($c == 2)
  8916. {
  8917. list($first, $second) = $aIP;
  8918. $first = explode(':', $first);
  8919. $second = explode(':', $second);
  8920. if (count($first) + count($second) > 8)
  8921. {
  8922. return false;
  8923. }
  8924. while(count($first) < 8)
  8925. {
  8926. array_push($first, '0');
  8927. }
  8928. array_splice($first, 8 - count($second), 8, $second);
  8929. $aIP = $first;
  8930. unset($first,$second);
  8931. }
  8932. else
  8933. {
  8934. $aIP = explode(':', $aIP[0]);
  8935. }
  8936. $c = count($aIP);
  8937. if ($c != 8)
  8938. {
  8939. return false;
  8940. }
  8941. // All the pieces should be 16-bit hex strings. Are they?
  8942. foreach ($aIP as $piece)
  8943. {
  8944. if (!preg_match('#^[0-9a-fA-F]{4}$#s', sprintf('%04s', $piece)))
  8945. {
  8946. return false;
  8947. }
  8948. }
  8949. return $original;
  8950. }
  8951. }
  8952. /**
  8953. * Primitive email validation class based on the regexp found at
  8954. * http://www.regular-expressions.info/email.html
  8955. */
  8956. class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
  8957. {
  8958. public function validate($string, $config, $context) {
  8959. // no support for named mailboxes i.e. "Bob <bob@example.com>"
  8960. // that needs more percent encoding to be done
  8961. if ($string == '') return false;
  8962. $string = trim($string);
  8963. $result = preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$/i', $string);
  8964. return $result ? $string : false;
  8965. }
  8966. }
  8967. /**
  8968. * Pre-transform that changes proprietary background attribute to CSS.
  8969. */
  8970. class HTMLPurifier_AttrTransform_Background extends HTMLPurifier_AttrTransform {
  8971. public function transform($attr, $config, $context) {
  8972. if (!isset($attr['background'])) return $attr;
  8973. $background = $this->confiscateAttr($attr, 'background');
  8974. // some validation should happen here
  8975. $this->prependCSS($attr, "background-image:url($background);");
  8976. return $attr;
  8977. }
  8978. }
  8979. // this MUST be placed in post, as it assumes that any value in dir is valid
  8980. /**
  8981. * Post-trasnform that ensures that bdo tags have the dir attribute set.
  8982. */
  8983. class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
  8984. {
  8985. public function transform($attr, $config, $context) {
  8986. if (isset($attr['dir'])) return $attr;
  8987. $attr['dir'] = $config->get('Attr.DefaultTextDir');
  8988. return $attr;
  8989. }
  8990. }
  8991. /**
  8992. * Pre-transform that changes deprecated bgcolor attribute to CSS.
  8993. */
  8994. class HTMLPurifier_AttrTransform_BgColor extends HTMLPurifier_AttrTransform {
  8995. public function transform($attr, $config, $context) {
  8996. if (!isset($attr['bgcolor'])) return $attr;
  8997. $bgcolor = $this->confiscateAttr($attr, 'bgcolor');
  8998. // some validation should happen here
  8999. $this->prependCSS($attr, "background-color:$bgcolor;");
  9000. return $attr;
  9001. }
  9002. }
  9003. /**
  9004. * Pre-transform that changes converts a boolean attribute to fixed CSS
  9005. */
  9006. class HTMLPurifier_AttrTransform_BoolToCSS extends HTMLPurifier_AttrTransform {
  9007. /**
  9008. * Name of boolean attribute that is trigger
  9009. */
  9010. protected $attr;
  9011. /**
  9012. * CSS declarations to add to style, needs trailing semicolon
  9013. */
  9014. protected $css;
  9015. /**
  9016. * @param $attr string attribute name to convert from
  9017. * @param $css string CSS declarations to add to style (needs semicolon)
  9018. */
  9019. public function __construct($attr, $css) {
  9020. $this->attr = $attr;
  9021. $this->css = $css;
  9022. }
  9023. public function transform($attr, $config, $context) {
  9024. if (!isset($attr[$this->attr])) return $attr;
  9025. unset($attr[$this->attr]);
  9026. $this->prependCSS($attr, $this->css);
  9027. return $attr;
  9028. }
  9029. }
  9030. /**
  9031. * Pre-transform that changes deprecated border attribute to CSS.
  9032. */
  9033. class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform {
  9034. public function transform($attr, $config, $context) {
  9035. if (!isset($attr['border'])) return $attr;
  9036. $border_width = $this->confiscateAttr($attr, 'border');
  9037. // some validation should happen here
  9038. $this->prependCSS($attr, "border:{$border_width}px solid;");
  9039. return $attr;
  9040. }
  9041. }
  9042. /**
  9043. * Generic pre-transform that converts an attribute with a fixed number of
  9044. * values (enumerated) to CSS.
  9045. */
  9046. class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform {
  9047. /**
  9048. * Name of attribute to transform from
  9049. */
  9050. protected $attr;
  9051. /**
  9052. * Lookup array of attribute values to CSS
  9053. */
  9054. protected $enumToCSS = array();
  9055. /**
  9056. * Case sensitivity of the matching
  9057. * @warning Currently can only be guaranteed to work with ASCII
  9058. * values.
  9059. */
  9060. protected $caseSensitive = false;
  9061. /**
  9062. * @param $attr String attribute name to transform from
  9063. * @param $enumToCSS Lookup array of attribute values to CSS
  9064. * @param $case_sensitive Boolean case sensitivity indicator, default false
  9065. */
  9066. public function __construct($attr, $enum_to_css, $case_sensitive = false) {
  9067. $this->attr = $attr;
  9068. $this->enumToCSS = $enum_to_css;
  9069. $this->caseSensitive = (bool) $case_sensitive;
  9070. }
  9071. public function transform($attr, $config, $context) {
  9072. if (!isset($attr[$this->attr])) return $attr;
  9073. $value = trim($attr[$this->attr]);
  9074. unset($attr[$this->attr]);
  9075. if (!$this->caseSensitive) $value = strtolower($value);
  9076. if (!isset($this->enumToCSS[$value])) {
  9077. return $attr;
  9078. }
  9079. $this->prependCSS($attr, $this->enumToCSS[$value]);
  9080. return $attr;
  9081. }
  9082. }
  9083. // must be called POST validation
  9084. /**
  9085. * Transform that supplies default values for the src and alt attributes
  9086. * in img tags, as well as prevents the img tag from being removed
  9087. * because of a missing alt tag. This needs to be registered as both
  9088. * a pre and post attribute transform.
  9089. */
  9090. class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
  9091. {
  9092. public function transform($attr, $config, $context) {
  9093. $src = true;
  9094. if (!isset($attr['src'])) {
  9095. if ($config->get('Core.RemoveInvalidImg')) return $attr;
  9096. $attr['src'] = $config->get('Attr.DefaultInvalidImage');
  9097. $src = false;
  9098. }
  9099. if (!isset($attr['alt'])) {
  9100. if ($src) {
  9101. $alt = $config->get('Attr.DefaultImageAlt');
  9102. if ($alt === null) {
  9103. // truncate if the alt is too long
  9104. $attr['alt'] = substr(basename($attr['src']),0,40);
  9105. } else {
  9106. $attr['alt'] = $alt;
  9107. }
  9108. } else {
  9109. $attr['alt'] = $config->get('Attr.DefaultInvalidImageAlt');
  9110. }
  9111. }
  9112. return $attr;
  9113. }
  9114. }
  9115. /**
  9116. * Pre-transform that changes deprecated hspace and vspace attributes to CSS
  9117. */
  9118. class HTMLPurifier_AttrTransform_ImgSpace extends HTMLPurifier_AttrTransform {
  9119. protected $attr;
  9120. protected $css = array(
  9121. 'hspace' => array('left', 'right'),
  9122. 'vspace' => array('top', 'bottom')
  9123. );
  9124. public function __construct($attr) {
  9125. $this->attr = $attr;
  9126. if (!isset($this->css[$attr])) {
  9127. trigger_error(htmlspecialchars($attr) . ' is not valid space attribute');
  9128. }
  9129. }
  9130. public function transform($attr, $config, $context) {
  9131. if (!isset($attr[$this->attr])) return $attr;
  9132. $width = $this->confiscateAttr($attr, $this->attr);
  9133. // some validation could happen here
  9134. if (!isset($this->css[$this->attr])) return $attr;
  9135. $style = '';
  9136. foreach ($this->css[$this->attr] as $suffix) {
  9137. $property = "margin-$suffix";
  9138. $style .= "$property:{$width}px;";
  9139. }
  9140. $this->prependCSS($attr, $style);
  9141. return $attr;
  9142. }
  9143. }
  9144. /**
  9145. * Performs miscellaneous cross attribute validation and filtering for
  9146. * input elements. This is meant to be a post-transform.
  9147. */
  9148. class HTMLPurifier_AttrTransform_Input extends HTMLPurifier_AttrTransform {
  9149. protected $pixels;
  9150. public function __construct() {
  9151. $this->pixels = new HTMLPurifier_AttrDef_HTML_Pixels();
  9152. }
  9153. public function transform($attr, $config, $context) {
  9154. if (!isset($attr['type'])) $t = 'text';
  9155. else $t = strtolower($attr['type']);
  9156. if (isset($attr['checked']) && $t !== 'radio' && $t !== 'checkbox') {
  9157. unset($attr['checked']);
  9158. }
  9159. if (isset($attr['maxlength']) && $t !== 'text' && $t !== 'password') {
  9160. unset($attr['maxlength']);
  9161. }
  9162. if (isset($attr['size']) && $t !== 'text' && $t !== 'password') {
  9163. $result = $this->pixels->validate($attr['size'], $config, $context);
  9164. if ($result === false) unset($attr['size']);
  9165. else $attr['size'] = $result;
  9166. }
  9167. if (isset($attr['src']) && $t !== 'image') {
  9168. unset($attr['src']);
  9169. }
  9170. if (!isset($attr['value']) && ($t === 'radio' || $t === 'checkbox')) {
  9171. $attr['value'] = '';
  9172. }
  9173. return $attr;
  9174. }
  9175. }
  9176. /**
  9177. * Post-transform that copies lang's value to xml:lang (and vice-versa)
  9178. * @note Theoretically speaking, this could be a pre-transform, but putting
  9179. * post is more efficient.
  9180. */
  9181. class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
  9182. {
  9183. public function transform($attr, $config, $context) {
  9184. $lang = isset($attr['lang']) ? $attr['lang'] : false;
  9185. $xml_lang = isset($attr['xml:lang']) ? $attr['xml:lang'] : false;
  9186. if ($lang !== false && $xml_lang === false) {
  9187. $attr['xml:lang'] = $lang;
  9188. } elseif ($xml_lang !== false) {
  9189. $attr['lang'] = $xml_lang;
  9190. }
  9191. return $attr;
  9192. }
  9193. }
  9194. /**
  9195. * Class for handling width/height length attribute transformations to CSS
  9196. */
  9197. class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform
  9198. {
  9199. protected $name;
  9200. protected $cssName;
  9201. public function __construct($name, $css_name = null) {
  9202. $this->name = $name;
  9203. $this->cssName = $css_name ? $css_name : $name;
  9204. }
  9205. public function transform($attr, $config, $context) {
  9206. if (!isset($attr[$this->name])) return $attr;
  9207. $length = $this->confiscateAttr($attr, $this->name);
  9208. if(ctype_digit($length)) $length .= 'px';
  9209. $this->prependCSS($attr, $this->cssName . ":$length;");
  9210. return $attr;
  9211. }
  9212. }
  9213. /**
  9214. * Pre-transform that changes deprecated name attribute to ID if necessary
  9215. */
  9216. class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform
  9217. {
  9218. public function transform($attr, $config, $context) {
  9219. // Abort early if we're using relaxed definition of name
  9220. if ($config->get('HTML.Attr.Name.UseCDATA')) return $attr;
  9221. if (!isset($attr['name'])) return $attr;
  9222. $id = $this->confiscateAttr($attr, 'name');
  9223. if ( isset($attr['id'])) return $attr;
  9224. $attr['id'] = $id;
  9225. return $attr;
  9226. }
  9227. }
  9228. /**
  9229. * Post-transform that performs validation to the name attribute; if
  9230. * it is present with an equivalent id attribute, it is passed through;
  9231. * otherwise validation is performed.
  9232. */
  9233. class HTMLPurifier_AttrTransform_NameSync extends HTMLPurifier_AttrTransform
  9234. {
  9235. public function __construct() {
  9236. $this->idDef = new HTMLPurifier_AttrDef_HTML_ID();
  9237. }
  9238. public function transform($attr, $config, $context) {
  9239. if (!isset($attr['name'])) return $attr;
  9240. $name = $attr['name'];
  9241. if (isset($attr['id']) && $attr['id'] === $name) return $attr;
  9242. $result = $this->idDef->validate($name, $config, $context);
  9243. if ($result === false) unset($attr['name']);
  9244. else $attr['name'] = $result;
  9245. return $attr;
  9246. }
  9247. }
  9248. // must be called POST validation
  9249. /**
  9250. * Adds rel="nofollow" to all outbound links. This transform is
  9251. * only attached if Attr.Nofollow is TRUE.
  9252. */
  9253. class HTMLPurifier_AttrTransform_Nofollow extends HTMLPurifier_AttrTransform
  9254. {
  9255. private $parser;
  9256. public function __construct() {
  9257. $this->parser = new HTMLPurifier_URIParser();
  9258. }
  9259. public function transform($attr, $config, $context) {
  9260. if (!isset($attr['href'])) {
  9261. return $attr;
  9262. }
  9263. // XXX Kind of inefficient
  9264. $url = $this->parser->parse($attr['href']);
  9265. $scheme = $url->getSchemeObj($config, $context);
  9266. if ($scheme->browsable && !$url->isLocal($config, $context)) {
  9267. if (isset($attr['rel'])) {
  9268. $rels = explode(' ', $attr);
  9269. if (!in_array('nofollow', $rels)) {
  9270. $rels[] = 'nofollow';
  9271. }
  9272. $attr['rel'] = implode(' ', $rels);
  9273. } else {
  9274. $attr['rel'] = 'nofollow';
  9275. }
  9276. }
  9277. return $attr;
  9278. }
  9279. }
  9280. class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform
  9281. {
  9282. public $name = "SafeEmbed";
  9283. public function transform($attr, $config, $context) {
  9284. $attr['allowscriptaccess'] = 'never';
  9285. $attr['allownetworking'] = 'internal';
  9286. $attr['type'] = 'application/x-shockwave-flash';
  9287. return $attr;
  9288. }
  9289. }
  9290. /**
  9291. * Writes default type for all objects. Currently only supports flash.
  9292. */
  9293. class HTMLPurifier_AttrTransform_SafeObject extends HTMLPurifier_AttrTransform
  9294. {
  9295. public $name = "SafeObject";
  9296. function transform($attr, $config, $context) {
  9297. if (!isset($attr['type'])) $attr['type'] = 'application/x-shockwave-flash';
  9298. return $attr;
  9299. }
  9300. }
  9301. /**
  9302. * Validates name/value pairs in param tags to be used in safe objects. This
  9303. * will only allow name values it recognizes, and pre-fill certain attributes
  9304. * with required values.
  9305. *
  9306. * @note
  9307. * This class only supports Flash. In the future, Quicktime support
  9308. * may be added.
  9309. *
  9310. * @warning
  9311. * This class expects an injector to add the necessary parameters tags.
  9312. */
  9313. class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
  9314. {
  9315. public $name = "SafeParam";
  9316. private $uri;
  9317. public function __construct() {
  9318. $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
  9319. $this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent'));
  9320. }
  9321. public function transform($attr, $config, $context) {
  9322. // If we add support for other objects, we'll need to alter the
  9323. // transforms.
  9324. switch ($attr['name']) {
  9325. // application/x-shockwave-flash
  9326. // Keep this synchronized with Injector/SafeObject.php
  9327. case 'allowScriptAccess':
  9328. $attr['value'] = 'never';
  9329. break;
  9330. case 'allowNetworking':
  9331. $attr['value'] = 'internal';
  9332. break;
  9333. case 'allowFullScreen':
  9334. if ($config->get('HTML.FlashAllowFullScreen')) {
  9335. $attr['value'] = ($attr['value'] == 'true') ? 'true' : 'false';
  9336. } else {
  9337. $attr['value'] = 'false';
  9338. }
  9339. break;
  9340. case 'wmode':
  9341. $attr['value'] = $this->wmode->validate($attr['value'], $config, $context);
  9342. break;
  9343. case 'movie':
  9344. case 'src':
  9345. $attr['name'] = "movie";
  9346. $attr['value'] = $this->uri->validate($attr['value'], $config, $context);
  9347. break;
  9348. case 'flashvars':
  9349. // we're going to allow arbitrary inputs to the SWF, on
  9350. // the reasoning that it could only hack the SWF, not us.
  9351. break;
  9352. // add other cases to support other param name/value pairs
  9353. default:
  9354. $attr['name'] = $attr['value'] = null;
  9355. }
  9356. return $attr;
  9357. }
  9358. }
  9359. /**
  9360. * Implements required attribute stipulation for <script>
  9361. */
  9362. class HTMLPurifier_AttrTransform_ScriptRequired extends HTMLPurifier_AttrTransform
  9363. {
  9364. public function transform($attr, $config, $context) {
  9365. if (!isset($attr['type'])) {
  9366. $attr['type'] = 'text/javascript';
  9367. }
  9368. return $attr;
  9369. }
  9370. }
  9371. // must be called POST validation
  9372. /**
  9373. * Adds target="blank" to all outbound links. This transform is
  9374. * only attached if Attr.TargetBlank is TRUE. This works regardless
  9375. * of whether or not Attr.AllowedFrameTargets
  9376. */
  9377. class HTMLPurifier_AttrTransform_TargetBlank extends HTMLPurifier_AttrTransform
  9378. {
  9379. private $parser;
  9380. public function __construct() {
  9381. $this->parser = new HTMLPurifier_URIParser();
  9382. }
  9383. public function transform($attr, $config, $context) {
  9384. if (!isset($attr['href'])) {
  9385. return $attr;
  9386. }
  9387. // XXX Kind of inefficient
  9388. $url = $this->parser->parse($attr['href']);
  9389. $scheme = $url->getSchemeObj($config, $context);
  9390. if ($scheme->browsable && !$url->isBenign($config, $context)) {
  9391. $attr['target'] = 'blank';
  9392. }
  9393. return $attr;
  9394. }
  9395. }
  9396. /**
  9397. * Sets height/width defaults for <textarea>
  9398. */
  9399. class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform
  9400. {
  9401. public function transform($attr, $config, $context) {
  9402. // Calculated from Firefox
  9403. if (!isset($attr['cols'])) $attr['cols'] = '22';
  9404. if (!isset($attr['rows'])) $attr['rows'] = '3';
  9405. return $attr;
  9406. }
  9407. }
  9408. /**
  9409. * Definition that uses different definitions depending on context.
  9410. *
  9411. * The del and ins tags are notable because they allow different types of
  9412. * elements depending on whether or not they're in a block or inline context.
  9413. * Chameleon allows this behavior to happen by using two different
  9414. * definitions depending on context. While this somewhat generalized,
  9415. * it is specifically intended for those two tags.
  9416. */
  9417. class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
  9418. {
  9419. /**
  9420. * Instance of the definition object to use when inline. Usually stricter.
  9421. */
  9422. public $inline;
  9423. /**
  9424. * Instance of the definition object to use when block.
  9425. */
  9426. public $block;
  9427. public $type = 'chameleon';
  9428. /**
  9429. * @param $inline List of elements to allow when inline.
  9430. * @param $block List of elements to allow when block.
  9431. */
  9432. public function __construct($inline, $block) {
  9433. $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
  9434. $this->block = new HTMLPurifier_ChildDef_Optional($block);
  9435. $this->elements = $this->block->elements;
  9436. }
  9437. public function validateChildren($tokens_of_children, $config, $context) {
  9438. if ($context->get('IsInline') === false) {
  9439. return $this->block->validateChildren(
  9440. $tokens_of_children, $config, $context);
  9441. } else {
  9442. return $this->inline->validateChildren(
  9443. $tokens_of_children, $config, $context);
  9444. }
  9445. }
  9446. }
  9447. /**
  9448. * Custom validation class, accepts DTD child definitions
  9449. *
  9450. * @warning Currently this class is an all or nothing proposition, that is,
  9451. * it will only give a bool return value.
  9452. */
  9453. class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
  9454. {
  9455. public $type = 'custom';
  9456. public $allow_empty = false;
  9457. /**
  9458. * Allowed child pattern as defined by the DTD
  9459. */
  9460. public $dtd_regex;
  9461. /**
  9462. * PCRE regex derived from $dtd_regex
  9463. * @private
  9464. */
  9465. private $_pcre_regex;
  9466. /**
  9467. * @param $dtd_regex Allowed child pattern from the DTD
  9468. */
  9469. public function __construct($dtd_regex) {
  9470. $this->dtd_regex = $dtd_regex;
  9471. $this->_compileRegex();
  9472. }
  9473. /**
  9474. * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
  9475. */
  9476. protected function _compileRegex() {
  9477. $raw = str_replace(' ', '', $this->dtd_regex);
  9478. if ($raw{0} != '(') {
  9479. $raw = "($raw)";
  9480. }
  9481. $el = '[#a-zA-Z0-9_.-]+';
  9482. $reg = $raw;
  9483. // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
  9484. // DOING! Seriously: if there's problems, please report them.
  9485. // collect all elements into the $elements array
  9486. preg_match_all("/$el/", $reg, $matches);
  9487. foreach ($matches[0] as $match) {
  9488. $this->elements[$match] = true;
  9489. }
  9490. // setup all elements as parentheticals with leading commas
  9491. $reg = preg_replace("/$el/", '(,\\0)', $reg);
  9492. // remove commas when they were not solicited
  9493. $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);
  9494. // remove all non-paranthetical commas: they are handled by first regex
  9495. $reg = preg_replace("/,\(/", '(', $reg);
  9496. $this->_pcre_regex = $reg;
  9497. }
  9498. public function validateChildren($tokens_of_children, $config, $context) {
  9499. $list_of_children = '';
  9500. $nesting = 0; // depth into the nest
  9501. foreach ($tokens_of_children as $token) {
  9502. if (!empty($token->is_whitespace)) continue;
  9503. $is_child = ($nesting == 0); // direct
  9504. if ($token instanceof HTMLPurifier_Token_Start) {
  9505. $nesting++;
  9506. } elseif ($token instanceof HTMLPurifier_Token_End) {
  9507. $nesting--;
  9508. }
  9509. if ($is_child) {
  9510. $list_of_children .= $token->name . ',';
  9511. }
  9512. }
  9513. // add leading comma to deal with stray comma declarations
  9514. $list_of_children = ',' . rtrim($list_of_children, ',');
  9515. $okay =
  9516. preg_match(
  9517. '/^,?'.$this->_pcre_regex.'$/',
  9518. $list_of_children
  9519. );
  9520. return (bool) $okay;
  9521. }
  9522. }
  9523. /**
  9524. * Definition that disallows all elements.
  9525. * @warning validateChildren() in this class is actually never called, because
  9526. * empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
  9527. * before child definitions are parsed in earnest by
  9528. * HTMLPurifier_Strategy_FixNesting.
  9529. */
  9530. class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
  9531. {
  9532. public $allow_empty = true;
  9533. public $type = 'empty';
  9534. public function __construct() {}
  9535. public function validateChildren($tokens_of_children, $config, $context) {
  9536. return array();
  9537. }
  9538. }
/**
 * Definition for list containers ul and ol.
 *
 * Rewrites the children of a list so that every direct child ends up
 * inside an li: nested ul/ol elements are tucked into the preceding li
 * (or into a newly created one), and any other stray direct child is
 * wrapped in a new li.
 */
class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
{
    public $type = 'list';
    // lying a little bit, so that we can handle ul and ol ourselves
    // XXX: This whole business with 'wrap' is all a bit unsatisfactory
    public $elements = array('li' => true, 'ul' => true, 'ol' => true);
    /**
     * @param $tokens_of_children Array of child tokens
     * @return false when there is nothing but whitespace (or nothing at
     *         all), or when an internal invariant is violated; true when
     *         the children need no change; otherwise the replacement
     *         array of tokens.
     */
    public function validateChildren($tokens_of_children, $config, $context) {
        // Flag for subclasses
        // NOTE(review): no $whitespace property is declared in this class;
        // confirm whether the parent declares it.
        $this->whitespace = false;

        // if there are no tokens, delete parent node
        if (empty($tokens_of_children)) return false;

        // the new set of children
        $result = array();

        // current depth into the nest
        $nesting = 0;

        // a little sanity check to make sure it's not ALL whitespace
        $all_whitespace = true;

        // whether an li has been emitted or seen among the direct children
        $seen_li = false;
        // whether an li opened (or re-opened) by this method is still
        // awaiting its closing tag
        $need_close_li = false;

        foreach ($tokens_of_children as $token) {
            // whitespace is passed through and never counts as a child
            if (!empty($token->is_whitespace)) {
                $result[] = $token;
                continue;
            }
            $all_whitespace = false; // phew, we're not talking about whitespace

            // back at depth 1 with a pending li: the content that was
            // placed inside it is complete, so close it now
            if ($nesting == 1 && $need_close_li) {
                $result[] = new HTMLPurifier_Token_End('li');
                $nesting--;
                $need_close_li = false;
            }

            // direct children are the tokens seen at depth zero
            $is_child = ($nesting == 0);

            if ($token instanceof HTMLPurifier_Token_Start) {
                $nesting++;
            } elseif ($token instanceof HTMLPurifier_Token_End) {
                $nesting--;
            }

            if ($is_child) {
                if ($token->name === 'li') {
                    // good
                    $seen_li = true;
                } elseif ($token->name === 'ul' || $token->name === 'ol') {
                    // we want to tuck this into the previous li
                    $need_close_li = true;
                    // account for the extra level contributed by the
                    // enclosing li
                    $nesting++;
                    if (!$seen_li) {
                        // create a new li element
                        $result[] = new HTMLPurifier_Token_Start('li');
                    } else {
                        // backtrack until </li> found
                        // (tokens popped here are dropped from $result,
                        // which removes the </li> so the nested list lands
                        // inside that li)
                        while(true) {
                            $t = array_pop($result);
                            if ($t instanceof HTMLPurifier_Token_End) {
                                // XXX actually, these invariants could very plausibly be violated
                                // if we are doing silly things with modifying the set of allowed elements.
                                // FORTUNATELY, it doesn't make a difference, since the allowed
                                // elements are hard-coded here!
                                if ($t->name !== 'li') {
                                    trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
                                    return false;
                                }
                                break;
                            } elseif ($t instanceof HTMLPurifier_Token_Empty) { // bleagh
                                if ($t->name !== 'li') {
                                    trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
                                    return false;
                                }
                                // XXX this should have a helper for it...
                                // an empty <li /> is replaced by a start
                                // tag carrying the same attributes and
                                // position, so the list can go inside it
                                $result[] = new HTMLPurifier_Token_Start('li', $t->attr, $t->line, $t->col, $t->armor);
                                break;
                            } else {
                                // anything else between the </li> and the
                                // nested list must be whitespace
                                if (!$t->is_whitespace) {
                                    trigger_error("Only whitespace present invariant violated in List ChildDef", E_USER_ERROR);
                                    return false;
                                }
                            }
                        }
                    }
                } else {
                    // start wrapping (this doesn't precisely mimic
                    // browser behavior, but what browsers do is kind of
                    // hard to mimic in a standards compliant way
                    // XXX Actually, this has no impact in practice,
                    // because this gets handled earlier. Arguably,
                    // we should rip out all of that processing
                    $result[] = new HTMLPurifier_Token_Start('li');
                    $nesting++;
                    $seen_li = true;
                    $need_close_li = true;
                }
            }
            $result[] = $token;
        }
        // close an li that was still open when the tokens ran out
        if ($need_close_li) {
            $result[] = new HTMLPurifier_Token_End('li');
        }
        if (empty($result)) return false;
        if ($all_whitespace) {
            return false;
        }
        // loose comparison: true signals "no changes needed"
        if ($tokens_of_children == $result) return true;
        return $result;
    }
}
  9645. /**
  9646. * Definition that allows a set of elements, but disallows empty children.
  9647. */
  9648. class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
  9649. {
  9650. /**
  9651. * Lookup table of allowed elements.
  9652. * @public
  9653. */
  9654. public $elements = array();
  9655. /**
  9656. * Whether or not the last passed node was all whitespace.
  9657. */
  9658. protected $whitespace = false;
  9659. /**
  9660. * @param $elements List of allowed element names (lowercase).
  9661. */
  9662. public function __construct($elements) {
  9663. if (is_string($elements)) {
  9664. $elements = str_replace(' ', '', $elements);
  9665. $elements = explode('|', $elements);
  9666. }
  9667. $keys = array_keys($elements);
  9668. if ($keys == array_keys($keys)) {
  9669. $elements = array_flip($elements);
  9670. foreach ($elements as $i => $x) {
  9671. $elements[$i] = true;
  9672. if (empty($i)) unset($elements[$i]); // remove blank
  9673. }
  9674. }
  9675. $this->elements = $elements;
  9676. }
  9677. public $allow_empty = false;
  9678. public $type = 'required';
  /**
   * Filters a node's children against the lookup in $this->elements.
   *
   * Whitespace tokens are always kept. Every other token is judged by the
   * top-level child it belongs to: when that child's name is not in the
   * lookup, the child and everything nested in it is either kept as text
   * (text tokens, if #PCDATA is allowed), turned into escaped text (if
   * #PCDATA is allowed and Core.EscapeInvalidChildren is on), or dropped.
   *
   * @param $tokens_of_children Array of child tokens
   * @param $config Config object
   * @param $context Context object
   * @return false if nothing usable remains (input empty, result empty or
   *         only whitespace), true if the result equals the input,
   *         otherwise the filtered array of tokens.
   */
  public function validateChildren($tokens_of_children, $config, $context) {
      // Flag for subclasses; only raised again for all-whitespace input.
      $this->whitespace = false;

      // no tokens at all: the parent node should be deleted
      if (empty($tokens_of_children)) return false;

      // when text is allowed, invalid tags may become text instead of vanishing
      $text_ok = isset($this->elements['#PCDATA']);
      $escape = $config->get('Core.EscapeInvalidChildren');
      $generator = new HTMLPurifier_Generator($config, $context);

      $kept = array();          // the new set of children
      $depth = 0;               // current depth into the nest
      $dropping = false;        // is the current top-level child being removed?
      $only_whitespace = true;  // becomes false on the first non-whitespace token

      foreach ($tokens_of_children as $token) {
          if (!empty($token->is_whitespace)) {
              $kept[] = $token;
              continue;
          }
          $only_whitespace = false;

          // depth is sampled before this token adjusts it
          $top_level = ($depth == 0);
          if ($token instanceof HTMLPurifier_Token_Start) {
              $depth++;
          } elseif ($token instanceof HTMLPurifier_Token_End) {
              $depth--;
          }

          // the verdict on a top-level child applies to all of its descendants
          if ($top_level) {
              $dropping = !isset($this->elements[$token->name]);
          }

          if (!$dropping) {
              $kept[] = $token;
          } elseif ($text_ok) {
              if ($token instanceof HTMLPurifier_Token_Text) {
                  $kept[] = $token;
              } elseif ($escape) {
                  $kept[] = new HTMLPurifier_Token_Text(
                      $generator->generateFromToken($token)
                  );
              }
              // otherwise: drop silently
          }
      }

      if (empty($kept)) return false;
      if ($only_whitespace) {
          $this->whitespace = true;
          return false;
      }
      return ($tokens_of_children == $kept) ? true : $kept;
  }
  9745. }
  /**
   * Definition that allows a set of elements, and allows no children.
   * @note This reuses the code of HTMLPurifier_ChildDef_Required through
   *       inheritance, which is a hack. The only difference is that a
   *       false result from the parent is replaced, so this definition
   *       never returns false.
   */
  class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
  {
      public $allow_empty = true;
      public $type = 'optional';

      /**
       * Runs the parent validation and replaces a false result.
       *
       * @return true if there were no children to begin with, the untouched
       *         input if it was all whitespace, an empty array if every
       *         child was rejected; otherwise whatever the parent returned.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $outcome = parent::validateChildren($tokens_of_children, $config, $context);
          if ($outcome !== false) {
              return $outcome;
          }
          // we assume that $tokens_of_children is not modified
          if (empty($tokens_of_children)) {
              return true;
          }
          if ($this->whitespace) {
              return $tokens_of_children;
          }
          return array();
      }
  }
  /**
   * Takes the contents of blockquote when in strict and reformats for validation.
   *
   * Validation runs against the 'Flow' content set plus #PCDATA; afterwards
   * any top-level run that is not in the real element lookup is wrapped in
   * the definition's block wrapper element.
   */
  class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
  {
      /** Lookup this definition was constructed with. */
      protected $real_elements;
      /** Wider lookup ('Flow' content set plus #PCDATA) used while validating. */
      protected $fake_elements;
      public $allow_empty = true;
      public $type = 'strictblockquote';
      /** Whether init() has already filled the two lookups. */
      protected $init = false;

      /**
       * @note We don't want MakeWellFormed to auto-close inline elements since
       *       they might be allowed.
       */
      public function getAllowedElements($config) {
          $this->init($config);
          return $this->fake_elements;
      }

      /**
       * Validates the children against the wide lookup, then wraps stray
       * top-level inline content in the block wrapper.
       *
       * @return array of tokens; empty when the parent validation fails.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          $this->init($config);

          // trick the parent class into thinking it allows more
          $this->elements = $this->fake_elements;
          $validated = parent::validateChildren($tokens_of_children, $config, $context);
          $this->elements = $this->real_elements;

          if ($validated === false) return array();
          if ($validated === true) $validated = $tokens_of_children;

          $wrapper_name = $config->getHTMLDefinition()->info_block_wrapper;
          $wrap_open = new HTMLPurifier_Token_Start($wrapper_name);
          $wrap_close = new HTMLPurifier_Token_End($wrapper_name);

          $wrapping = false; // currently inside an inserted wrapper?
          $level = 0;        // nesting depth relative to the blockquote
          $out = array();

          // assuming that there are no comment tokens
          foreach ($validated as $token) {
              if (!$level) {
                  if (!$wrapping) {
                      // non-whitespace text or a non-block tag opens a wrapper
                      $needs_wrapper = ($token instanceof HTMLPurifier_Token_Text)
                          ? !$token->is_whitespace
                          : !isset($this->elements[$token->name]);
                      if ($needs_wrapper) {
                          $wrapping = true;
                          $out[] = $wrap_open;
                      }
                  } elseif (
                      ($token instanceof HTMLPurifier_Token_Start ||
                       $token instanceof HTMLPurifier_Token_Empty) &&
                      isset($this->elements[$token->name])
                  ) {
                      // a real block element ends the inline run
                      $out[] = $wrap_close;
                      $wrapping = false;
                  }
              }
              $out[] = $token;
              if ($token instanceof HTMLPurifier_Token_Start) $level++;
              if ($token instanceof HTMLPurifier_Token_End) $level--;
          }
          if ($wrapping) $out[] = $wrap_close;
          return $out;
      }

      /**
       * Fills the real and fake lookups the first time it is called.
       */
      private function init($config) {
          if ($this->init) {
              return;
          }
          $definition = $config->getHTMLDefinition();
          // allow all inline elements
          $this->real_elements = $this->elements;
          $this->fake_elements = $definition->info_content_sets['Flow'];
          $this->fake_elements['#PCDATA'] = true;
          $this->init = true;
      }
  }
  9844. /**
  9845. * Definition for tables. The general idea is to extract out all of the
  9846. * essential bits, and then reconstruct it later.
  9847. *
  9848. * This is a bit confusing, because the DTDs and the W3C
  9849. * validators seem to disagree on the appropriate definition. The
  9850. * DTD claims:
  9851. *
  9852. * (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
  9853. *
  9854. * But actually, the HTML4 spec then has this to say:
  9855. *
  9856. * The TBODY start tag is always required except when the table
  9857. * contains only one table body and no table head or foot sections.
  9858. * The TBODY end tag may always be safely omitted.
  9859. *
  9860. * So the DTD is kind of wrong. The validator is, unfortunately, kind
  9861. * of on crack.
  9862. *
  9863. * The definition changed again in XHTML1.1; and in my opinion, this
  9864. * formulation makes the most sense.
  9865. *
  9866. * caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))
  9867. *
  9868. * Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode.
  9869. * If we encounter a thead, tfoot or tbody, we are placed in the former
  9870. * mode, and we *must* wrap any stray tr segments with a tbody. But if
  9871. * we don't run into any of them, just have tr tags is OK.
  9872. */
  class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
  {
      public $allow_empty = false;
      public $type = 'table';
      public $elements = array('tr' => true, 'tbody' => true, 'thead' => true,
          'tfoot' => true, 'caption' => true, 'colgroup' => true, 'col' => true);
      public function __construct() {}

      /**
       * Splits the table's children into caption, column, head, foot and
       * body/row groups and reassembles them in the order
       * caption, cols, thead, tfoot, content.
       *
       * Only the first caption, thead and tfoot are kept in their slots; an
       * extra thead/tfoot is renamed to tbody and treated as content. Once
       * a thead, tfoot or tbody has been seen, stray rows are wrapped in a
       * generated tbody.
       *
       * @param $tokens_of_children Array of child tokens
       * @param $config Config object (unused here)
       * @param $context Context object (unused here)
       * @return false if there are no children or no tr/tbody content,
       *         true if the rebuilt list is identical to the input,
       *         otherwise the rebuilt array of tokens.
       */
      public function validateChildren($tokens_of_children, $config, $context) {
          if (empty($tokens_of_children)) return false;

          // this ensures that the loop gets run one last time before closing
          // up. It's a little bit of a hack, but it works! The phantom token
          // is removed again before the final comparison.
          $tokens_of_children[] = false;

          // only one of these elements is allowed in a table
          $caption = false;
          $thead   = false;
          $tfoot   = false;

          // as many of these as you want
          $cols    = array();
          $content = array();

          $nesting = 0;           // current depth so we can determine nodes
          $is_collecting = false; // are we globbing together tokens to package
                                  // into one of the collectors?
          $collection = array();  // collected nodes
          $tag_index = 0;         // the first node might be whitespace,
                                  // so this tells us where the start tag is
          $tbody_mode = false;    // if true, then we need to wrap any stray
                                  // <tr>s with a <tbody>.

          foreach ($tokens_of_children as $token) {
              // depth is sampled before this token adjusts it
              $is_child = ($nesting == 0);

              if ($token === false) {
                  // terminating sequence started
              } elseif ($token instanceof HTMLPurifier_Token_Start) {
                  $nesting++;
              } elseif ($token instanceof HTMLPurifier_Token_End) {
                  $nesting--;
              }

              // handle node collection
              if ($is_collecting) {
                  if ($is_child) {
                      // okay, let's stash the tokens away
                      // first token tells us the type of the collection
                      switch ($collection[$tag_index]->name) {
                          case 'tbody':
                              $tbody_mode = true;
                              // intentional fall-through: a tbody is content too
                          case 'tr':
                              $content[] = $collection;
                              break;
                          case 'caption':
                              // any caption after the first is discarded
                              if ($caption !== false) break;
                              $caption = $collection;
                              break;
                          case 'thead':
                          case 'tfoot':
                              $tbody_mode = true;
                              // XXX This breaks rendering properties with
                              // Firefox, which never floats a <thead> to
                              // the top. Ever.  (Our scheme will float the
                              // first <thead> to the top.)  So maybe
                              // <thead>s that are not first should be
                              // turned into <tbody>? Very tricky, indeed.

                              // access the appropriate variable, $thead or $tfoot
                              $var = $collection[$tag_index]->name;
                              if ($$var === false) {
                                  $$var = $collection;
                              } else {
                                  // Oops, there's a second one! Current
                                  // behavior is to rename its first tag and
                                  // last token to tbody and file it under
                                  // content. Attaching it to the existing
                                  // thead/tfoot is not done, because Firefox
                                  // doesn't float an extra tfoot to the
                                  // bottom like it does for the first one.
                                  $collection[$tag_index]->name = 'tbody';
                                  $collection[count($collection)-1]->name = 'tbody';
                                  $content[] = $collection;
                              }
                              break;
                          case 'colgroup':
                              $cols[] = $collection;
                              break;
                      }
                      $collection = array();
                      $is_collecting = false;
                      $tag_index = 0;
                  } else {
                      // add the node to the collection
                      $collection[] = $token;
                  }
              }

              // terminate
              if ($token === false) break;

              if ($is_child) {
                  // determine what we're dealing with
                  if ($token->name == 'col') {
                      // the only empty tag in the possie, we can handle it
                      // immediately
                      $cols[] = array_merge($collection, array($token));
                      $collection = array();
                      $tag_index = 0;
                      continue;
                  }
                  // This switch is the last statement of the loop body, so
                  // leaving it with "break" moves on to the next token. The
                  // original used "continue" here, which inside a switch
                  // behaves like "break" and raises a warning on PHP 7.3+.
                  switch ($token->name) {
                      case 'caption':
                      case 'colgroup':
                      case 'thead':
                      case 'tfoot':
                      case 'tbody':
                      case 'tr':
                          $is_collecting = true;
                          $collection[] = $token;
                          break;
                      default:
                          // leading whitespace is kept with the next group;
                          // anything else at this level is dropped
                          if (!empty($token->is_whitespace)) {
                              $collection[] = $token;
                              $tag_index++;
                          }
                          break;
                  }
              }
          }

          if (empty($content)) return false;

          $ret = array();
          if ($caption !== false) $ret = array_merge($ret, $caption);
          // $cols is always an array, so no guard is needed
          foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
          if ($thead !== false) $ret = array_merge($ret, $thead);
          if ($tfoot !== false) $ret = array_merge($ret, $tfoot);

          if ($tbody_mode) {
              // a little tricky, since the start of the collection may be
              // whitespace
              $inside_tbody = false;
              foreach ($content as $token_array) {
                  // find the starting token
                  foreach ($token_array as $t) {
                      if ($t->name === 'tr' || $t->name === 'tbody') {
                          break;
                      }
                  } // iterator variable carries over
                  if ($t->name === 'tr') {
                      if ($inside_tbody) {
                          $ret = array_merge($ret, $token_array);
                      } else {
                          $ret[] = new HTMLPurifier_Token_Start('tbody');
                          $ret = array_merge($ret, $token_array);
                          $inside_tbody = true;
                      }
                  } elseif ($t->name === 'tbody') {
                      if ($inside_tbody) {
                          // close the generated tbody before the real one
                          $ret[] = new HTMLPurifier_Token_End('tbody');
                          $inside_tbody = false;
                          $ret = array_merge($ret, $token_array);
                      } else {
                          $ret = array_merge($ret, $token_array);
                      }
                  } else {
                      trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR);
                  }
              }
              if ($inside_tbody) {
                  $ret[] = new HTMLPurifier_Token_End('tbody');
              }
          } else {
              foreach ($content as $token_array) {
                  // invariant: everything in here is <tr>s
                  $ret = array_merge($ret, $token_array);
              }
          }

          if (!empty($collection) && $is_collecting == false){
              // grab the trailing space
              $ret = array_merge($ret, $collection);
          }

          array_pop($tokens_of_children); // remove phantom token

          return ($ret === $tokens_of_children) ? true : $ret;
      }
  }
  /**
   * Base class for definition cache decorators: holds a wrapped cache
   * object and forwards every cache operation to it unchanged.
   */
  class HTMLPurifier_DefinitionCache_Decorator extends HTMLPurifier_DefinitionCache
  {
      /**
       * Cache object we are decorating
       */
      public $cache;

      public function __construct() {}

      /**
       * Lazy decorator function
       * @param $cache Reference to cache object to decorate
       * @return A fresh decorator (from copy()) wrapping $cache and
       *         carrying the same type as $cache.
       */
      public function decorate(&$cache) {
          $wrapper = $this->copy();
          // reference is necessary for mocks in PHP 4
          $wrapper->cache =& $cache;
          $wrapper->type = $cache->type;
          return $wrapper;
      }

      /**
       * Cross-compatible clone substitute
       */
      public function copy() {
          return new HTMLPurifier_DefinitionCache_Decorator();
      }

      /** Forwards to the decorated cache's add(). */
      public function add($def, $config) {
          $inner = $this->cache;
          return $inner->add($def, $config);
      }

      /** Forwards to the decorated cache's set(). */
      public function set($def, $config) {
          $inner = $this->cache;
          return $inner->set($def, $config);
      }

      /** Forwards to the decorated cache's replace(). */
      public function replace($def, $config) {
          $inner = $this->cache;
          return $inner->replace($def, $config);
      }

      /** Forwards to the decorated cache's get(). */
      public function get($config) {
          $inner = $this->cache;
          return $inner->get($config);
      }

      /** Forwards to the decorated cache's remove(). */
      public function remove($config) {
          $inner = $this->cache;
          return $inner->remove($config);
      }

      /** Forwards to the decorated cache's flush(). */
      public function flush($config) {
          $inner = $this->cache;
          return $inner->flush($config);
      }

      /** Forwards to the decorated cache's cleanup(). */
      public function cleanup($config) {
          $inner = $this->cache;
          return $inner->cleanup($config);
      }
  }
  /**
   * Null cache object to use when no caching is on.
   *
   * Every operation does nothing and returns false.
   */
  class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
  {
      /** @return false always; nothing is stored. */
      public function add($def, $config) {
          return false;
      }

      /** @return false always; nothing is stored. */
      public function set($def, $config) {
          return false;
      }

      /** @return false always; nothing is stored. */
      public function replace($def, $config) {
          return false;
      }

      /** @return false always; there is nothing to retrieve. */
      public function get($config) {
          return false;
      }

      /** @return false always; there is nothing to remove. */
      public function remove($config) {
          return false;
      }

      /** @return false always; there is nothing to flush. */
      public function flush($config) {
          return false;
      }

      /** @return false always; there is nothing to clean up. */
      public function cleanup($config) {
          return false;
      }
  }
  /**
   * Definition cache that stores each definition as a serialized '.ser'
   * file inside a per-type directory below the serializer base path.
   */
  class HTMLPurifier_DefinitionCache_Serializer extends
      HTMLPurifier_DefinitionCache
  {
      /**
       * Writes the definition only if no cache file exists yet.
       * @return Bytes written, false on failure or if the file already
       *         exists, nothing if checkDefType() rejects $def.
       */
      public function add($def, $config) {
          if (!$this->checkDefType($def)) return;
          $file = $this->generateFilePath($config);
          if (file_exists($file)) return false;
          if (!$this->_prepareDir($config)) return false;
          return $this->_write($file, serialize($def), $config);
      }

      /**
       * Writes the definition, overwriting any existing cache file.
       * @return Bytes written, false on failure, nothing if checkDefType()
       *         rejects $def.
       */
      public function set($def, $config) {
          if (!$this->checkDefType($def)) return;
          $file = $this->generateFilePath($config);
          if (!$this->_prepareDir($config)) return false;
          return $this->_write($file, serialize($def), $config);
      }

      /**
       * Writes the definition only if a cache file already exists.
       * @return Bytes written, false on failure or if there is no file to
       *         replace, nothing if checkDefType() rejects $def.
       */
      public function replace($def, $config) {
          if (!$this->checkDefType($def)) return;
          $file = $this->generateFilePath($config);
          if (!file_exists($file)) return false;
          if (!$this->_prepareDir($config)) return false;
          return $this->_write($file, serialize($def), $config);
      }

      /**
       * Loads the cached definition for this configuration.
       * @return The unserialized file contents, or false if no file exists.
       */
      public function get($config) {
          $file = $this->generateFilePath($config);
          if (!file_exists($file)) return false;
          // NOTE(review): unserialize() trusts the file contents; the cache
          // directory must not be writable by untrusted parties.
          return unserialize(file_get_contents($file));
      }

      /**
       * Deletes the cache file for this configuration.
       * @return Result of unlink(), or false if no file exists.
       */
      public function remove($config) {
          $file = $this->generateFilePath($config);
          if (!file_exists($file)) return false;
          return unlink($file);
      }

      /**
       * Deletes every non-dot file in this type's cache directory.
       * @return True once the directory has been walked, false if it could
       *         not be prepared or opened.
       */
      public function flush($config) {
          if (!$this->_prepareDir($config)) return false;
          $dir = $this->generateDirectoryPath($config);
          $dh = opendir($dir);
          // never hand a failed handle to readdir()
          if ($dh === false) return false;
          while (false !== ($filename = readdir($dh))) {
              if (empty($filename)) continue;
              if ($filename[0] === '.') continue;
              unlink($dir . '/' . $filename);
          }
          closedir($dh);
          return true;
      }

      /**
       * Deletes the cache files whose key isOld() reports as outdated.
       * @return True once the directory has been walked, false if it could
       *         not be prepared or opened.
       */
      public function cleanup($config) {
          if (!$this->_prepareDir($config)) return false;
          $dir = $this->generateDirectoryPath($config);
          $dh = opendir($dir);
          // never hand a failed handle to readdir()
          if ($dh === false) return false;
          while (false !== ($filename = readdir($dh))) {
              if (empty($filename)) continue;
              if ($filename[0] === '.') continue;
              // strip the four-character '.ser' suffix to recover the key
              $key = substr($filename, 0, strlen($filename) - 4);
              if ($this->isOld($key, $config)) unlink($dir . '/' . $filename);
          }
          closedir($dh);
          return true;
      }

      /**
       * Generates the file path to the serial file corresponding to
       * the configuration and definition name
       * @todo Make protected
       */
      public function generateFilePath($config) {
          $key = $this->generateKey($config);
          return $this->generateDirectoryPath($config) . '/' . $key . '.ser';
      }

      /**
       * Generates the path to the directory contain this cache's serial files
       * @note No trailing slash
       * @todo Make protected
       */
      public function generateDirectoryPath($config) {
          $base = $this->generateBaseDirectoryPath($config);
          return $base . '/' . $this->type;
      }

      /**
       * Generates path to base directory that contains all definition type
       * serials: %Cache.SerializerPath, or a directory below
       * HTMLPURIFIER_PREFIX when that setting is null.
       * @todo Make protected
       */
      public function generateBaseDirectoryPath($config) {
          $base = $config->get('Cache.SerializerPath');
          $base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
          return $base;
      }

      /**
       * Convenience wrapper function for file_put_contents
       * @param $file File name to write to
       * @param $data Data to write into file
       * @param $config Config object
       * @return Number of bytes written if success, or false if failure.
       */
      private function _write($file, $data, $config) {
          $result = file_put_contents($file, $data);
          if ($result !== false) {
              // set permissions of the new file (no execute)
              $chmod = $config->get('Cache.SerializerPermissions');
              if (!$chmod) {
                  $chmod = 0644; // invalid config or simpletest
              }
              $chmod = $chmod & 0666;
              chmod($file, $chmod);
          }
          return $result;
      }

      /**
       * Prepares the directory that this type stores the serials in
       * @param $config Config object
       * @return True if successful
       */
      private function _prepareDir($config) {
          $directory = $this->generateDirectoryPath($config);
          $chmod = $config->get('Cache.SerializerPermissions');
          if (!$chmod) {
              $chmod = 0755; // invalid config or simpletest
          }
          if (!is_dir($directory)) {
              $base = $this->generateBaseDirectoryPath($config);
              if (!is_dir($base)) {
                  trigger_error('Base directory ' . $base . " does not exist,\nplease create or change using %Cache.SerializerPath",
                      E_USER_WARNING);
                  return false;
              } elseif (!$this->_testPermissions($base, $chmod)) {
                  return false;
              }
              // clear the umask so the directory gets exactly $chmod
              $old = umask(0000);
              $created = mkdir($directory, $chmod);
              umask($old);
              // a failed mkdir() is only fatal if the directory is still
              // missing (another process may have created it meanwhile)
              if (!$created && !is_dir($directory)) {
                  return false;
              }
          } elseif (!$this->_testPermissions($directory, $chmod)) {
              return false;
          }
          return true;
      }

      /**
       * Tests permissions on a directory and throws out friendly
       * error messages and attempts to chmod it itself if possible
       * @param $dir Directory path
       * @param $chmod Permissions
       * @return True if directory writable
       */
      private function _testPermissions($dir, $chmod) {
          // early abort, if it is writable, everything is hunky-dory
          if (is_writable($dir)) return true;
          if (!is_dir($dir)) {
              // generally, you'll want to handle this beforehand
              // so a more specific error message can be given
              trigger_error('Directory '.$dir.' does not exist',
                  E_USER_WARNING);
              return false;
          }
          if (function_exists('posix_getuid')) {
              // POSIX system, we can give more specific advice
              if (fileowner($dir) === posix_getuid()) {
                  // we can chmod it ourselves
                  $chmod = $chmod | 0700;
                  if (chmod($dir, $chmod)) return true;
              } elseif (filegroup($dir) === posix_getgid()) {
                  $chmod = $chmod | 0070;
              } else {
                  // PHP's probably running as nobody, so we'll
                  // need to give global permissions
                  $chmod = $chmod | 0777;
              }
              trigger_error('Directory '.$dir.' not writable, '.
                  'please chmod to ' . decoct($chmod),
                  E_USER_WARNING);
          } else {
              // generic error message
              trigger_error('Directory '.$dir.' not writable, '.
                  'please alter file permissions',
                  E_USER_WARNING);
          }
          return false;
      }
  }
  /**
   * Definition cache decorator class that cleans up the cache
   * whenever there is a cache miss.
   *
   * A falsy result from add(), set(), replace() or get() triggers the
   * parent's cleanup(); the original result is returned either way.
   */
  class HTMLPurifier_DefinitionCache_Decorator_Cleanup extends
      HTMLPurifier_DefinitionCache_Decorator
  {
      public $name = 'Cleanup';

      /** @return A new, unattached instance of this decorator. */
      public function copy() {
          return new HTMLPurifier_DefinitionCache_Decorator_Cleanup();
      }

      /** Adds via the parent; cleans up when that reports failure. */
      public function add($def, $config) {
          $outcome = parent::add($def, $config);
          if ($outcome) {
              return $outcome;
          }
          parent::cleanup($config);
          return $outcome;
      }

      /** Sets via the parent; cleans up when that reports failure. */
      public function set($def, $config) {
          $outcome = parent::set($def, $config);
          if ($outcome) {
              return $outcome;
          }
          parent::cleanup($config);
          return $outcome;
      }

      /** Replaces via the parent; cleans up when that reports failure. */
      public function replace($def, $config) {
          $outcome = parent::replace($def, $config);
          if ($outcome) {
              return $outcome;
          }
          parent::cleanup($config);
          return $outcome;
      }

      /** Fetches via the parent; cleans up when nothing was found. */
      public function get($config) {
          $found = parent::get($config);
          if ($found) {
              return $found;
          }
          parent::cleanup($config);
          return $found;
      }
  }
  /**
   * Definition cache decorator class that saves all cache retrievals
   * to PHP's memory; good for unit tests or circumstances where
   * there are lots of configuration objects floating around.
   */
  class HTMLPurifier_DefinitionCache_Decorator_Memory extends
      HTMLPurifier_DefinitionCache_Decorator
  {
      /** In-memory definitions, keyed by generateKey($config). */
      protected $definitions;
      public $name = 'Memory';

      /** @return A new, unattached instance of this decorator. */
      public function copy() {
          return new HTMLPurifier_DefinitionCache_Decorator_Memory();
      }

      /** Adds via the parent and remembers $def when that succeeds. */
      public function add($def, $config) {
          $stored = parent::add($def, $config);
          if ($stored) {
              $key = $this->generateKey($config);
              $this->definitions[$key] = $def;
          }
          return $stored;
      }

      /** Sets via the parent and remembers $def when that succeeds. */
      public function set($def, $config) {
          $stored = parent::set($def, $config);
          if ($stored) {
              $key = $this->generateKey($config);
              $this->definitions[$key] = $def;
          }
          return $stored;
      }

      /** Replaces via the parent and remembers $def when that succeeds. */
      public function replace($def, $config) {
          $stored = parent::replace($def, $config);
          if ($stored) {
              $key = $this->generateKey($config);
              $this->definitions[$key] = $def;
          }
          return $stored;
      }

      /**
       * Returns the remembered definition for this configuration, asking
       * the parent (and remembering its answer) when none is set yet.
       */
      public function get($config) {
          $key = $this->generateKey($config);
          if (!isset($this->definitions[$key])) {
              $this->definitions[$key] = parent::get($config);
          }
          return $this->definitions[$key];
      }
  }
  /**
   * XHTML 1.1 Bi-directional Text Module, defines elements that
   * declare directionality of content. Text Extension Module.
   */
  class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
  {
      public $name = 'Bdo';

      // 'dir' starts out disabled in I18N; setup() replaces it with the enum
      public $attr_collections = array(
          'I18N' => array('dir' => false)
      );

      /**
       * Registers the bdo element and defines the I18N 'dir' attribute.
       */
      public function setup($config) {
          $direction = 'Enum#ltr,rtl';

          // The Abstract Module specification has the attribute
          // inclusions wrong for bdo: bdo allows Lang
          $collections = array('Core', 'Lang');
          $attributes = array(
              'dir' => $direction, // required
          );

          $bdo = $this->addElement('bdo', 'Inline', 'Inline', $collections, $attributes);
          $bdo->attr_transform_post['required-dir'] = new HTMLPurifier_AttrTransform_BdoDir();

          $this->attr_collections['I18N']['dir'] = $direction;
      }
  }
  /**
   * Module that declares the shared attribute collections
   * Core, Lang, I18N and Common. It registers no elements.
   */
  class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
  {
      public $name = 'CommonAttributes';

      /**
       * Attribute collections, keyed by collection name. The entry at
       * index 0 of a collection lists other collections by name.
       */
      public $attr_collections = array(
          'Core' => array(
              0 => array('Style'),
              // 'xml:space' => false,
              'class' => 'Class',
              'id' => 'ID',
              'title' => 'CDATA',
          ),
          'Lang' => array(),
          'I18N' => array(
              0 => array('Lang'), // proprietary, for xml:lang/lang
          ),
          'Common' => array(
              0 => array('Core', 'I18N')
          )
      );
  }
  /**
   * XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
   * Module.
   */
  class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
  {
      public $name = 'Edit';

      // HTML 4.01 specifies that ins/del must not contain block
      // elements when used in an inline context, chameleon is
      // a complicated workaround to achieve this effect.
      // This module therefore supplies its own child definition.
      public $defines_child_def = true;

      /**
       * Registers del and ins with the chameleon content model.
       */
      public function setup($config) {
          // Inline context ! Block context (exclamation mark is
          // separator, see getChildDef for parsing)
          $contents = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
          $attr = array(
              'cite' => 'URI',
              // 'datetime' => 'Datetime', // not implemented
          );
          foreach (array('del', 'ins') as $element) {
              $this->addElement($element, 'Inline', $contents, 'Common', $attr);
          }
      }

      /**
       * Builds the chameleon child definition from a content model of the
       * form "inline part ! block part".
       *
       * @return HTMLPurifier_ChildDef_Chameleon, or false if the content
       *         model type is not 'chameleon'.
       */
      public function getChildDef($def) {
          if ($def->content_model_type != 'chameleon') {
              return false;
          }
          $parts = explode('!', $def->content_model);
          return new HTMLPurifier_ChildDef_Chameleon($parts[0], $parts[1]);
      }
  }
  /**
   * XHTML 1.1 Forms module, defines all form-related elements found in HTML 4.
   */
  class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule
  {
      public $name = 'Forms';
      public $safe = false;

      public $content_sets = array(
          'Block' => 'Form',
          'Inline' => 'Formctrl',
      );

      /**
       * Registers form, input, select, option, textarea, button, fieldset,
       * label, legend and optgroup, in that order.
       */
      public function setup($config) {
          // --- form ---------------------------------------------------
          $form_attr = array(
              'accept' => 'ContentTypes',
              'accept-charset' => 'Charsets',
              'action*' => 'URI',
              'method' => 'Enum#get,post',
              // really ContentType, but these two are the only ones used today
              'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data',
          );
          $form = $this->addElement(
              'form', 'Form',
              'Required: Heading | List | Block | fieldset', 'Common',
              $form_attr
          );
          $form->excludes = array('form' => true);

          // --- input --------------------------------------------------
          $input_attr = array(
              'accept' => 'ContentTypes',
              'accesskey' => 'Character',
              'alt' => 'Text',
              'checked' => 'Bool#checked',
              'disabled' => 'Bool#disabled',
              'maxlength' => 'Number',
              'name' => 'CDATA',
              'readonly' => 'Bool#readonly',
              'size' => 'Number',
              'src' => 'URI#embedded',
              'tabindex' => 'Number',
              'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
              'value' => 'CDATA',
          );
          $input = $this->addElement('input', 'Formctrl', 'Empty', 'Common', $input_attr);
          $input->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input();

          // --- select / option ----------------------------------------
          $select_attr = array(
              'disabled' => 'Bool#disabled',
              'multiple' => 'Bool#multiple',
              'name' => 'CDATA',
              'size' => 'Number',
              'tabindex' => 'Number',
          );
          $this->addElement('select', 'Formctrl', 'Required: optgroup | option', 'Common', $select_attr);

          $option_attr = array(
              'disabled' => 'Bool#disabled',
              'label' => 'Text',
              'selected' => 'Bool#selected',
              'value' => 'CDATA',
          );
          $this->addElement('option', false, 'Optional: #PCDATA', 'Common', $option_attr);
          // It's illegal for there to be more than one selected, but not
          // be multiple. Also, no selected means undefined behavior. This might
          // be difficult to implement; perhaps an injector, or a context variable.

          // --- textarea -----------------------------------------------
          $textarea_attr = array(
              'accesskey' => 'Character',
              'cols*' => 'Number',
              'disabled' => 'Bool#disabled',
              'name' => 'CDATA',
              'readonly' => 'Bool#readonly',
              'rows*' => 'Number',
              'tabindex' => 'Number',
          );
          $textarea = $this->addElement('textarea', 'Formctrl', 'Optional: #PCDATA', 'Common', $textarea_attr);
          $textarea->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea();

          // --- button -------------------------------------------------
          $button_attr = array(
              'accesskey' => 'Character',
              'disabled' => 'Bool#disabled',
              'name' => 'CDATA',
              'tabindex' => 'Number',
              'type' => 'Enum#button,submit,reset',
              'value' => 'CDATA',
          );
          $button = $this->addElement(
              'button', 'Formctrl',
              'Optional: #PCDATA | Heading | List | Block | Inline', 'Common',
              $button_attr
          );
          // For exclusions, ideally we'd specify content sets, not literal elements
          $button->excludes = $this->makeLookup(
              'form', 'fieldset', // Form
              'input', 'select', 'textarea', 'label', 'button', // Formctrl
              'a', // as per HTML 4.01 spec, this is omitted by modularization
              'isindex', 'iframe' // legacy items
          );
          // Extra exclusion: img usemap="" is not permitted within this element.
          // We'll omit this for now, since we don't have any good way of
          // indicating it yet.

          // --- fieldset -----------------------------------------------
          // This is HIGHLY user-unfriendly; we need a custom child-def for this
          $this->addElement('fieldset', 'Form', 'Custom: (#WS?,legend,(Flow|#PCDATA)*)', 'Common');

          // --- label / legend -----------------------------------------
          $label_attr = array(
              'accesskey' => 'Character',
              // 'for' => 'IDREF', // IDREF not implemented, cannot allow
          );
          $label = $this->addElement('label', 'Formctrl', 'Optional: #PCDATA | Inline', 'Common', $label_attr);
          $label->excludes = array('label' => true);

          $legend_attr = array(
              'accesskey' => 'Character',
          );
          $this->addElement('legend', false, 'Optional: #PCDATA | Inline', 'Common', $legend_attr);

          // --- optgroup -----------------------------------------------
          $optgroup_attr = array(
              'disabled' => 'Bool#disabled',
              'label*' => 'Text',
          );
          $this->addElement('optgroup', false, 'Required: option', 'Common', $optgroup_attr);

          // Don't forget an injector for <isindex>. This one's a little complex
          // because it maps to multiple elements.
      }
  }
  10534. /**
  10535. * XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
  10536. */
  10537. class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
  10538. {
  10539. public $name = 'Hypertext';
  10540. public function setup($config) {
  10541. $a = $this->addElement(
  10542. 'a', 'Inline', 'Inline', 'Common',
  10543. array(
  10544. // 'accesskey' => 'Character',
  10545. // 'charset' => 'Charset',
  10546. 'href' => 'URI',
  10547. // 'hreflang' => 'LanguageCode',
  10548. 'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
  10549. 'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
  10550. // 'tabindex' => 'Number',
  10551. // 'type' => 'ContentType',
  10552. )
  10553. );
  10554. $a->formatting = true;
  10555. $a->excludes = array('a' => true);
  10556. }
  10557. }
  10558. /**
  10559. * XHTML 1.1 Iframe Module provides inline frames.
  10560. *
  10561. * @note This module is not considered safe unless an Iframe
  10562. * whitelisting mechanism is specified. Currently, the only
  10563. * such mechanism is %URL.SafeIframeRegexp
  10564. */
  10565. class HTMLPurifier_HTMLModule_Iframe extends HTMLPurifier_HTMLModule
  10566. {
  10567. public $name = 'Iframe';
  10568. public $safe = false;
  10569. public function setup($config) {
  10570. if ($config->get('HTML.SafeIframe')) {
  10571. $this->safe = true;
  10572. }
  10573. $this->addElement(
  10574. 'iframe', 'Inline', 'Flow', 'Common',
  10575. array(
  10576. 'src' => 'URI#embedded',
  10577. 'width' => 'Length',
  10578. 'height' => 'Length',
  10579. 'name' => 'ID',
  10580. 'scrolling' => 'Enum#yes,no,auto',
  10581. 'frameborder' => 'Enum#0,1',
  10582. 'longdesc' => 'URI',
  10583. 'marginheight' => 'Pixels',
  10584. 'marginwidth' => 'Pixels',
  10585. )
  10586. );
  10587. }
  10588. }
  10589. /**
  10590. * XHTML 1.1 Image Module provides basic image embedding.
  10591. * @note There is specialized code for removing empty images in
  10592. * HTMLPurifier_Strategy_RemoveForeignElements
  10593. */
  10594. class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
  10595. {
  10596. public $name = 'Image';
  10597. public function setup($config) {
  10598. $max = $config->get('HTML.MaxImgLength');
  10599. $img = $this->addElement(
  10600. 'img', 'Inline', 'Empty', 'Common',
  10601. array(
  10602. 'alt*' => 'Text',
  10603. // According to the spec, it's Length, but percents can
  10604. // be abused, so we allow only Pixels.
  10605. 'height' => 'Pixels#' . $max,
  10606. 'width' => 'Pixels#' . $max,
  10607. 'longdesc' => 'URI',
  10608. 'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
  10609. )
  10610. );
  10611. if ($max === null || $config->get('HTML.Trusted')) {
  10612. $img->attr['height'] =
  10613. $img->attr['width'] = 'Length';
  10614. }
  10615. // kind of strange, but splitting things up would be inefficient
  10616. $img->attr_transform_pre[] =
  10617. $img->attr_transform_post[] =
  10618. new HTMLPurifier_AttrTransform_ImgRequired();
  10619. }
  10620. }
  10621. /**
  10622. * XHTML 1.1 Legacy module defines elements that were previously
  10623. * deprecated.
  10624. *
  10625. * @note Not all legacy elements have been implemented yet, which
  10626. * is a bit of a reverse problem as compared to browsers! In
  10627. * addition, this legacy module may implement a bit more than
  10628. * mandated by XHTML 1.1.
  10629. *
  10630. * This module can be used in combination with TransformToStrict in order
  10631. * to transform as many deprecated elements as possible, but retain
  10632. * questionably deprecated elements that do not have good alternatives
  10633. * as well as transform elements that don't have an implementation.
  10634. * See docs/ref-strictness.txt for more details.
  10635. */
  10636. class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
  10637. {
  10638. public $name = 'Legacy';
  10639. public function setup($config) {
  10640. $this->addElement('basefont', 'Inline', 'Empty', false, array(
  10641. 'color' => 'Color',
  10642. 'face' => 'Text', // extremely broad, we should
  10643. 'size' => 'Text', // tighten it
  10644. 'id' => 'ID'
  10645. ));
  10646. $this->addElement('center', 'Block', 'Flow', 'Common');
  10647. $this->addElement('dir', 'Block', 'Required: li', 'Common', array(
  10648. 'compact' => 'Bool#compact'
  10649. ));
  10650. $this->addElement('font', 'Inline', 'Inline', array('Core', 'I18N'), array(
  10651. 'color' => 'Color',
  10652. 'face' => 'Text', // extremely broad, we should
  10653. 'size' => 'Text', // tighten it
  10654. ));
  10655. $this->addElement('menu', 'Block', 'Required: li', 'Common', array(
  10656. 'compact' => 'Bool#compact'
  10657. ));
  10658. $s = $this->addElement('s', 'Inline', 'Inline', 'Common');
  10659. $s->formatting = true;
  10660. $strike = $this->addElement('strike', 'Inline', 'Inline', 'Common');
  10661. $strike->formatting = true;
  10662. $u = $this->addElement('u', 'Inline', 'Inline', 'Common');
  10663. $u->formatting = true;
  10664. // setup modifications to old elements
  10665. $align = 'Enum#left,right,center,justify';
  10666. $address = $this->addBlankElement('address');
  10667. $address->content_model = 'Inline | #PCDATA | p';
  10668. $address->content_model_type = 'optional';
  10669. $address->child = false;
  10670. $blockquote = $this->addBlankElement('blockquote');
  10671. $blockquote->content_model = 'Flow | #PCDATA';
  10672. $blockquote->content_model_type = 'optional';
  10673. $blockquote->child = false;
  10674. $br = $this->addBlankElement('br');
  10675. $br->attr['clear'] = 'Enum#left,all,right,none';
  10676. $caption = $this->addBlankElement('caption');
  10677. $caption->attr['align'] = 'Enum#top,bottom,left,right';
  10678. $div = $this->addBlankElement('div');
  10679. $div->attr['align'] = $align;
  10680. $dl = $this->addBlankElement('dl');
  10681. $dl->attr['compact'] = 'Bool#compact';
  10682. for ($i = 1; $i <= 6; $i++) {
  10683. $h = $this->addBlankElement("h$i");
  10684. $h->attr['align'] = $align;
  10685. }
  10686. $hr = $this->addBlankElement('hr');
  10687. $hr->attr['align'] = $align;
  10688. $hr->attr['noshade'] = 'Bool#noshade';
  10689. $hr->attr['size'] = 'Pixels';
  10690. $hr->attr['width'] = 'Length';
  10691. $img = $this->addBlankElement('img');
  10692. $img->attr['align'] = 'IAlign';
  10693. $img->attr['border'] = 'Pixels';
  10694. $img->attr['hspace'] = 'Pixels';
  10695. $img->attr['vspace'] = 'Pixels';
  10696. // figure out this integer business
  10697. $li = $this->addBlankElement('li');
  10698. $li->attr['value'] = new HTMLPurifier_AttrDef_Integer();
  10699. $li->attr['type'] = 'Enum#s:1,i,I,a,A,disc,square,circle';
  10700. $ol = $this->addBlankElement('ol');
  10701. $ol->attr['compact'] = 'Bool#compact';
  10702. $ol->attr['start'] = new HTMLPurifier_AttrDef_Integer();
  10703. $ol->attr['type'] = 'Enum#s:1,i,I,a,A';
  10704. $p = $this->addBlankElement('p');
  10705. $p->attr['align'] = $align;
  10706. $pre = $this->addBlankElement('pre');
  10707. $pre->attr['width'] = 'Number';
  10708. // script omitted
  10709. $table = $this->addBlankElement('table');
  10710. $table->attr['align'] = 'Enum#left,center,right';
  10711. $table->attr['bgcolor'] = 'Color';
  10712. $tr = $this->addBlankElement('tr');
  10713. $tr->attr['bgcolor'] = 'Color';
  10714. $th = $this->addBlankElement('th');
  10715. $th->attr['bgcolor'] = 'Color';
  10716. $th->attr['height'] = 'Length';
  10717. $th->attr['nowrap'] = 'Bool#nowrap';
  10718. $th->attr['width'] = 'Length';
  10719. $td = $this->addBlankElement('td');
  10720. $td->attr['bgcolor'] = 'Color';
  10721. $td->attr['height'] = 'Length';
  10722. $td->attr['nowrap'] = 'Bool#nowrap';
  10723. $td->attr['width'] = 'Length';
  10724. $ul = $this->addBlankElement('ul');
  10725. $ul->attr['compact'] = 'Bool#compact';
  10726. $ul->attr['type'] = 'Enum#square,disc,circle';
  10727. // "safe" modifications to "unsafe" elements
  10728. // WARNING: If you want to add support for an unsafe, legacy
  10729. // attribute, make a new TrustedLegacy module with the trusted
  10730. // bit set appropriately
  10731. $form = $this->addBlankElement('form');
  10732. $form->content_model = 'Flow | #PCDATA';
  10733. $form->content_model_type = 'optional';
  10734. $form->attr['target'] = 'FrameTarget';
  10735. $input = $this->addBlankElement('input');
  10736. $input->attr['align'] = 'IAlign';
  10737. $legend = $this->addBlankElement('legend');
  10738. $legend->attr['align'] = 'LAlign';
  10739. }
  10740. }
  10741. /**
  10742. * XHTML 1.1 List Module, defines list-oriented elements. Core Module.
  10743. */
  10744. class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
  10745. {
  10746. public $name = 'List';
  10747. // According to the abstract schema, the List content set is a fully formed
  10748. // one or more expr, but it invariably occurs in an optional declaration
  10749. // so we're not going to do that subtlety. It might cause trouble
  10750. // if a user defines "List" and expects that multiple lists are
  10751. // allowed to be specified, but then again, that's not very intuitive.
  10752. // Furthermore, the actual XML Schema may disagree. Regardless,
  10753. // we don't have support for such nested expressions without using
  10754. // the incredibly inefficient and draconic Custom ChildDef.
  10755. public $content_sets = array('Flow' => 'List');
  10756. public function setup($config) {
  10757. $ol = $this->addElement('ol', 'List', new HTMLPurifier_ChildDef_List(), 'Common');
  10758. $ul = $this->addElement('ul', 'List', new HTMLPurifier_ChildDef_List(), 'Common');
  10759. // XXX The wrap attribute is handled by MakeWellFormed. This is all
  10760. // quite unsatisfactory, because we generated this
  10761. // *specifically* for lists, and now a big chunk of the handling
  10762. // is done properly by the List ChildDef. So actually, we just
  10763. // want enough information to make autoclosing work properly,
  10764. // and then hand off the tricky stuff to the ChildDef.
  10765. $ol->wrap = 'li';
  10766. $ul->wrap = 'li';
  10767. $this->addElement('dl', 'List', 'Required: dt | dd', 'Common');
  10768. $this->addElement('li', false, 'Flow', 'Common');
  10769. $this->addElement('dd', false, 'Flow', 'Common');
  10770. $this->addElement('dt', false, 'Inline', 'Common');
  10771. }
  10772. }
  10773. class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule
  10774. {
  10775. public $name = 'Name';
  10776. public function setup($config) {
  10777. $elements = array('a', 'applet', 'form', 'frame', 'iframe', 'img', 'map');
  10778. foreach ($elements as $name) {
  10779. $element = $this->addBlankElement($name);
  10780. $element->attr['name'] = 'CDATA';
  10781. if (!$config->get('HTML.Attr.Name.UseCDATA')) {
  10782. $element->attr_transform_post['NameSync'] = new HTMLPurifier_AttrTransform_NameSync();
  10783. }
  10784. }
  10785. }
  10786. }
  10787. /**
  10788. * Module adds the nofollow attribute transformation to a tags. It
  10789. * is enabled by HTML.Nofollow
  10790. */
  10791. class HTMLPurifier_HTMLModule_Nofollow extends HTMLPurifier_HTMLModule
  10792. {
  10793. public $name = 'Nofollow';
  10794. public function setup($config) {
  10795. $a = $this->addBlankElement('a');
  10796. $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_Nofollow();
  10797. }
  10798. }
  10799. class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
  10800. {
  10801. public $name = 'NonXMLCommonAttributes';
  10802. public $attr_collections = array(
  10803. 'Lang' => array(
  10804. 'lang' => 'LanguageCode',
  10805. )
  10806. );
  10807. }
  10808. /**
  10809. * XHTML 1.1 Object Module, defines elements for generic object inclusion
  10810. * @warning Users will commonly use <embed> to cater to legacy browsers: this
  10811. * module does not allow this sort of behavior
  10812. */
  10813. class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
  10814. {
  10815. public $name = 'Object';
  10816. public $safe = false;
  10817. public function setup($config) {
  10818. $this->addElement('object', 'Inline', 'Optional: #PCDATA | Flow | param', 'Common',
  10819. array(
  10820. 'archive' => 'URI',
  10821. 'classid' => 'URI',
  10822. 'codebase' => 'URI',
  10823. 'codetype' => 'Text',
  10824. 'data' => 'URI',
  10825. 'declare' => 'Bool#declare',
  10826. 'height' => 'Length',
  10827. 'name' => 'CDATA',
  10828. 'standby' => 'Text',
  10829. 'tabindex' => 'Number',
  10830. 'type' => 'ContentType',
  10831. 'width' => 'Length'
  10832. )
  10833. );
  10834. $this->addElement('param', false, 'Empty', false,
  10835. array(
  10836. 'id' => 'ID',
  10837. 'name*' => 'Text',
  10838. 'type' => 'Text',
  10839. 'value' => 'Text',
  10840. 'valuetype' => 'Enum#data,ref,object'
  10841. )
  10842. );
  10843. }
  10844. }
  10845. /**
  10846. * XHTML 1.1 Presentation Module, defines simple presentation-related
  10847. * markup. Text Extension Module.
  10848. * @note The official XML Schema and DTD specs further divide this into
  10849. * two modules:
  10850. * - Block Presentation (hr)
  10851. * - Inline Presentation (b, big, i, small, sub, sup, tt)
  10852. * We have chosen not to heed this distinction, as content_sets
  10853. * provides satisfactory disambiguation.
  10854. */
  10855. class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
  10856. {
  10857. public $name = 'Presentation';
  10858. public function setup($config) {
  10859. $this->addElement('hr', 'Block', 'Empty', 'Common');
  10860. $this->addElement('sub', 'Inline', 'Inline', 'Common');
  10861. $this->addElement('sup', 'Inline', 'Inline', 'Common');
  10862. $b = $this->addElement('b', 'Inline', 'Inline', 'Common');
  10863. $b->formatting = true;
  10864. $big = $this->addElement('big', 'Inline', 'Inline', 'Common');
  10865. $big->formatting = true;
  10866. $i = $this->addElement('i', 'Inline', 'Inline', 'Common');
  10867. $i->formatting = true;
  10868. $small = $this->addElement('small', 'Inline', 'Inline', 'Common');
  10869. $small->formatting = true;
  10870. $tt = $this->addElement('tt', 'Inline', 'Inline', 'Common');
  10871. $tt->formatting = true;
  10872. }
  10873. }
  10874. /**
  10875. * Module defines proprietary tags and attributes in HTML.
  10876. * @warning If this module is enabled, standards-compliance is off!
  10877. */
  10878. class HTMLPurifier_HTMLModule_Proprietary extends HTMLPurifier_HTMLModule
  10879. {
  10880. public $name = 'Proprietary';
  10881. public function setup($config) {
  10882. $this->addElement('marquee', 'Inline', 'Flow', 'Common',
  10883. array(
  10884. 'direction' => 'Enum#left,right,up,down',
  10885. 'behavior' => 'Enum#alternate',
  10886. 'width' => 'Length',
  10887. 'height' => 'Length',
  10888. 'scrolldelay' => 'Number',
  10889. 'scrollamount' => 'Number',
  10890. 'loop' => 'Number',
  10891. 'bgcolor' => 'Color',
  10892. 'hspace' => 'Pixels',
  10893. 'vspace' => 'Pixels',
  10894. )
  10895. );
  10896. }
  10897. }
  10898. /**
  10899. * XHTML 1.1 Ruby Annotation Module, defines elements that indicate
  10900. * short runs of text alongside base text for annotation or pronounciation.
  10901. */
  10902. class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
  10903. {
  10904. public $name = 'Ruby';
  10905. public function setup($config) {
  10906. $this->addElement('ruby', 'Inline',
  10907. 'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))',
  10908. 'Common');
  10909. $this->addElement('rbc', false, 'Required: rb', 'Common');
  10910. $this->addElement('rtc', false, 'Required: rt', 'Common');
  10911. $rb = $this->addElement('rb', false, 'Inline', 'Common');
  10912. $rb->excludes = array('ruby' => true);
  10913. $rt = $this->addElement('rt', false, 'Inline', 'Common', array('rbspan' => 'Number'));
  10914. $rt->excludes = array('ruby' => true);
  10915. $this->addElement('rp', false, 'Optional: #PCDATA', 'Common');
  10916. }
  10917. }
  10918. /**
  10919. * A "safe" embed module. See SafeObject. This is a proprietary element.
  10920. */
  10921. class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
  10922. {
  10923. public $name = 'SafeEmbed';
  10924. public function setup($config) {
  10925. $max = $config->get('HTML.MaxImgLength');
  10926. $embed = $this->addElement(
  10927. 'embed', 'Inline', 'Empty', 'Common',
  10928. array(
  10929. 'src*' => 'URI#embedded',
  10930. 'type' => 'Enum#application/x-shockwave-flash',
  10931. 'width' => 'Pixels#' . $max,
  10932. 'height' => 'Pixels#' . $max,
  10933. 'allowscriptaccess' => 'Enum#never',
  10934. 'allownetworking' => 'Enum#internal',
  10935. 'flashvars' => 'Text',
  10936. 'wmode' => 'Enum#window,transparent,opaque',
  10937. 'name' => 'ID',
  10938. )
  10939. );
  10940. $embed->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeEmbed();
  10941. }
  10942. }
  10943. /**
  10944. * A "safe" object module. In theory, objects permitted by this module will
  10945. * be safe, and untrusted users can be allowed to embed arbitrary flash objects
  10946. * (maybe other types too, but only Flash is supported as of right now).
  10947. * Highly experimental.
  10948. */
  10949. class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
  10950. {
  10951. public $name = 'SafeObject';
  10952. public function setup($config) {
  10953. // These definitions are not intrinsically safe: the attribute transforms
  10954. // are a vital part of ensuring safety.
  10955. $max = $config->get('HTML.MaxImgLength');
  10956. $object = $this->addElement(
  10957. 'object',
  10958. 'Inline',
  10959. 'Optional: param | Flow | #PCDATA',
  10960. 'Common',
  10961. array(
  10962. // While technically not required by the spec, we're forcing
  10963. // it to this value.
  10964. 'type' => 'Enum#application/x-shockwave-flash',
  10965. 'width' => 'Pixels#' . $max,
  10966. 'height' => 'Pixels#' . $max,
  10967. 'data' => 'URI#embedded',
  10968. 'codebase' => new HTMLPurifier_AttrDef_Enum(array(
  10969. 'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0')),
  10970. )
  10971. );
  10972. $object->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();
  10973. $param = $this->addElement('param', false, 'Empty', false,
  10974. array(
  10975. 'id' => 'ID',
  10976. 'name*' => 'Text',
  10977. 'value' => 'Text'
  10978. )
  10979. );
  10980. $param->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeParam();
  10981. $this->info_injector[] = 'SafeObject';
  10982. }
  10983. }
  10984. /*
  10985. WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
  10986. INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
  10987. */
  10988. /**
  10989. * XHTML 1.1 Scripting module, defines elements that are used to contain
  10990. * information pertaining to executable scripts or the lack of support
  10991. * for executable scripts.
  10992. * @note This module does not contain inline scripting elements
  10993. */
  10994. class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
  10995. {
  10996. public $name = 'Scripting';
  10997. public $elements = array('script', 'noscript');
  10998. public $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');
  10999. public $safe = false;
  11000. public function setup($config) {
  11001. // TODO: create custom child-definition for noscript that
  11002. // auto-wraps stray #PCDATA in a similar manner to
  11003. // blockquote's custom definition (we would use it but
  11004. // blockquote's contents are optional while noscript's contents
  11005. // are required)
  11006. // TODO: convert this to new syntax, main problem is getting
  11007. // both content sets working
  11008. // In theory, this could be safe, but I don't see any reason to
  11009. // allow it.
  11010. $this->info['noscript'] = new HTMLPurifier_ElementDef();
  11011. $this->info['noscript']->attr = array( 0 => array('Common') );
  11012. $this->info['noscript']->content_model = 'Heading | List | Block';
  11013. $this->info['noscript']->content_model_type = 'required';
  11014. $this->info['script'] = new HTMLPurifier_ElementDef();
  11015. $this->info['script']->attr = array(
  11016. 'defer' => new HTMLPurifier_AttrDef_Enum(array('defer')),
  11017. 'src' => new HTMLPurifier_AttrDef_URI(true),
  11018. 'type' => new HTMLPurifier_AttrDef_Enum(array('text/javascript'))
  11019. );
  11020. $this->info['script']->content_model = '#PCDATA';
  11021. $this->info['script']->content_model_type = 'optional';
  11022. $this->info['script']->attr_transform_pre['type'] =
  11023. $this->info['script']->attr_transform_post['type'] =
  11024. new HTMLPurifier_AttrTransform_ScriptRequired();
  11025. }
  11026. }
  11027. /**
  11028. * XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
  11029. * Module.
  11030. */
  11031. class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
  11032. {
  11033. public $name = 'StyleAttribute';
  11034. public $attr_collections = array(
  11035. // The inclusion routine differs from the Abstract Modules but
  11036. // is in line with the DTD and XML Schemas.
  11037. 'Style' => array('style' => false), // see constructor
  11038. 'Core' => array(0 => array('Style'))
  11039. );
  11040. public function setup($config) {
  11041. $this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS();
  11042. }
  11043. }
  11044. /**
  11045. * XHTML 1.1 Tables Module, fully defines accessible table elements.
  11046. */
  11047. class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
  11048. {
  11049. public $name = 'Tables';
  11050. public function setup($config) {
  11051. $this->addElement('caption', false, 'Inline', 'Common');
  11052. $this->addElement('table', 'Block',
  11053. new HTMLPurifier_ChildDef_Table(), 'Common',
  11054. array(
  11055. 'border' => 'Pixels',
  11056. 'cellpadding' => 'Length',
  11057. 'cellspacing' => 'Length',
  11058. 'frame' => 'Enum#void,above,below,hsides,lhs,rhs,vsides,box,border',
  11059. 'rules' => 'Enum#none,groups,rows,cols,all',
  11060. 'summary' => 'Text',
  11061. 'width' => 'Length'
  11062. )
  11063. );
  11064. // common attributes
  11065. $cell_align = array(
  11066. 'align' => 'Enum#left,center,right,justify,char',
  11067. 'charoff' => 'Length',
  11068. 'valign' => 'Enum#top,middle,bottom,baseline',
  11069. );
  11070. $cell_t = array_merge(
  11071. array(
  11072. 'abbr' => 'Text',
  11073. 'colspan' => 'Number',
  11074. 'rowspan' => 'Number',
  11075. // Apparently, as of HTML5 this attribute only applies
  11076. // to 'th' elements.
  11077. 'scope' => 'Enum#row,col,rowgroup,colgroup',
  11078. ),
  11079. $cell_align
  11080. );
  11081. $this->addElement('td', false, 'Flow', 'Common', $cell_t);
  11082. $this->addElement('th', false, 'Flow', 'Common', $cell_t);
  11083. $this->addElement('tr', false, 'Required: td | th', 'Common', $cell_align);
  11084. $cell_col = array_merge(
  11085. array(
  11086. 'span' => 'Number',
  11087. 'width' => 'MultiLength',
  11088. ),
  11089. $cell_align
  11090. );
  11091. $this->addElement('col', false, 'Empty', 'Common', $cell_col);
  11092. $this->addElement('colgroup', false, 'Optional: col', 'Common', $cell_col);
  11093. $this->addElement('tbody', false, 'Required: tr', 'Common', $cell_align);
  11094. $this->addElement('thead', false, 'Required: tr', 'Common', $cell_align);
  11095. $this->addElement('tfoot', false, 'Required: tr', 'Common', $cell_align);
  11096. }
  11097. }
  11098. /**
  11099. * XHTML 1.1 Target Module, defines target attribute in link elements.
  11100. */
  11101. class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
  11102. {
  11103. public $name = 'Target';
  11104. public function setup($config) {
  11105. $elements = array('a');
  11106. foreach ($elements as $name) {
  11107. $e = $this->addBlankElement($name);
  11108. $e->attr = array(
  11109. 'target' => new HTMLPurifier_AttrDef_HTML_FrameTarget()
  11110. );
  11111. }
  11112. }
  11113. }
  11114. /**
  11115. * Module adds the target=blank attribute transformation to a tags. It
  11116. * is enabled by HTML.TargetBlank
  11117. */
  11118. class HTMLPurifier_HTMLModule_TargetBlank extends HTMLPurifier_HTMLModule
  11119. {
  11120. public $name = 'TargetBlank';
  11121. public function setup($config) {
  11122. $a = $this->addBlankElement('a');
  11123. $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_TargetBlank();
  11124. }
  11125. }
  11126. /**
  11127. * XHTML 1.1 Text Module, defines basic text containers. Core Module.
  11128. * @note In the normative XML Schema specification, this module
  11129. * is further abstracted into the following modules:
  11130. * - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
  11131. * - Block Structural (div, p)
  11132. * - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
  11133. * - Inline Structural (br, span)
  11134. * This module, functionally, does not distinguish between these
  11135. * sub-modules, but the code is internally structured to reflect
  11136. * these distinctions.
  11137. */
  11138. class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
  11139. {
  11140. public $name = 'Text';
  11141. public $content_sets = array(
  11142. 'Flow' => 'Heading | Block | Inline'
  11143. );
  11144. public function setup($config) {
  11145. // Inline Phrasal -------------------------------------------------
  11146. $this->addElement('abbr', 'Inline', 'Inline', 'Common');
  11147. $this->addElement('acronym', 'Inline', 'Inline', 'Common');
  11148. $this->addElement('cite', 'Inline', 'Inline', 'Common');
  11149. $this->addElement('dfn', 'Inline', 'Inline', 'Common');
  11150. $this->addElement('kbd', 'Inline', 'Inline', 'Common');
  11151. $this->addElement('q', 'Inline', 'Inline', 'Common', array('cite' => 'URI'));
  11152. $this->addElement('samp', 'Inline', 'Inline', 'Common');
  11153. $this->addElement('var', 'Inline', 'Inline', 'Common');
  11154. $em = $this->addElement('em', 'Inline', 'Inline', 'Common');
  11155. $em->formatting = true;
  11156. $strong = $this->addElement('strong', 'Inline', 'Inline', 'Common');
  11157. $strong->formatting = true;
  11158. $code = $this->addElement('code', 'Inline', 'Inline', 'Common');
  11159. $code->formatting = true;
  11160. // Inline Structural ----------------------------------------------
  11161. $this->addElement('span', 'Inline', 'Inline', 'Common');
  11162. $this->addElement('br', 'Inline', 'Empty', 'Core');
  11163. // Block Phrasal --------------------------------------------------
  11164. $this->addElement('address', 'Block', 'Inline', 'Common');
  11165. $this->addElement('blockquote', 'Block', 'Optional: Heading | Block | List', 'Common', array('cite' => 'URI') );
  11166. $pre = $this->addElement('pre', 'Block', 'Inline', 'Common');
  11167. $pre->excludes = $this->makeLookup(
  11168. 'img', 'big', 'small', 'object', 'applet', 'font', 'basefont' );
  11169. $this->addElement('h1', 'Heading', 'Inline', 'Common');
  11170. $this->addElement('h2', 'Heading', 'Inline', 'Common');
  11171. $this->addElement('h3', 'Heading', 'Inline', 'Common');
  11172. $this->addElement('h4', 'Heading', 'Inline', 'Common');
  11173. $this->addElement('h5', 'Heading', 'Inline', 'Common');
  11174. $this->addElement('h6', 'Heading', 'Inline', 'Common');
  11175. // Block Structural -----------------------------------------------
  11176. $p = $this->addElement('p', 'Block', 'Inline', 'Common');
  11177. $p->autoclose = array_flip(array("address", "blockquote", "center", "dir", "div", "dl", "fieldset", "ol", "p", "ul"));
  11178. $this->addElement('div', 'Block', 'Flow', 'Common');
  11179. }
  11180. }
  11181. /**
  11182. * Abstract class for a set of proprietary modules that clean up (tidy)
  11183. * poorly written HTML.
  11184. * @todo Figure out how to protect some of these methods/properties
  11185. */
  11186. class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
  11187. {
  11188. /**
  11189. * List of supported levels. Index zero is a special case "no fixes"
  11190. * level.
  11191. */
  11192. public $levels = array(0 => 'none', 'light', 'medium', 'heavy');
  11193. /**
  11194. * Default level to place all fixes in. Disabled by default
  11195. */
  11196. public $defaultLevel = null;
  11197. /**
  11198. * Lists of fixes used by getFixesForLevel(). Format is:
  11199. * HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2');
  11200. */
  11201. public $fixesForLevel = array(
  11202. 'light' => array(),
  11203. 'medium' => array(),
  11204. 'heavy' => array()
  11205. );
  11206. /**
  11207. * Lazy load constructs the module by determining the necessary
  11208. * fixes to create and then delegating to the populate() function.
  11209. * @todo Wildcard matching and error reporting when an added or
  11210. * subtracted fix has no effect.
  11211. */
  11212. public function setup($config) {
  11213. // create fixes, initialize fixesForLevel
  11214. $fixes = $this->makeFixes();
  11215. $this->makeFixesForLevel($fixes);
  11216. // figure out which fixes to use
  11217. $level = $config->get('HTML.TidyLevel');
  11218. $fixes_lookup = $this->getFixesForLevel($level);
  11219. // get custom fix declarations: these need namespace processing
  11220. $add_fixes = $config->get('HTML.TidyAdd');
  11221. $remove_fixes = $config->get('HTML.TidyRemove');
  11222. foreach ($fixes as $name => $fix) {
  11223. // needs to be refactored a little to implement globbing
  11224. if (
  11225. isset($remove_fixes[$name]) ||
  11226. (!isset($add_fixes[$name]) && !isset($fixes_lookup[$name]))
  11227. ) {
  11228. unset($fixes[$name]);
  11229. }
  11230. }
  11231. // populate this module with necessary fixes
  11232. $this->populate($fixes);
  11233. }
  11234. /**
  11235. * Retrieves all fixes per a level, returning fixes for that specific
  11236. * level as well as all levels below it.
  11237. * @param $level String level identifier, see $levels for valid values
  11238. * @return Lookup up table of fixes
  11239. */
  11240. public function getFixesForLevel($level) {
  11241. if ($level == $this->levels[0]) {
  11242. return array();
  11243. }
  11244. $activated_levels = array();
  11245. for ($i = 1, $c = count($this->levels); $i < $c; $i++) {
  11246. $activated_levels[] = $this->levels[$i];
  11247. if ($this->levels[$i] == $level) break;
  11248. }
  11249. if ($i == $c) {
  11250. trigger_error(
  11251. 'Tidy level ' . htmlspecialchars($level) . ' not recognized',
  11252. E_USER_WARNING
  11253. );
  11254. return array();
  11255. }
  11256. $ret = array();
  11257. foreach ($activated_levels as $level) {
  11258. foreach ($this->fixesForLevel[$level] as $fix) {
  11259. $ret[$fix] = true;
  11260. }
  11261. }
  11262. return $ret;
  11263. }
  /**
   * Registers every fix name found in $fixes under the level named by
   * $this->defaultLevel, overwriting whatever $fixesForLevel held for that
   * level. Does nothing when no default level is set; raises E_USER_ERROR
   * when the default level has no entry in $fixesForLevel.
   *
   * @param $fixes Associative array keyed by fix name
   */
  public function makeFixesForLevel($fixes) {
      if (!isset($this->defaultLevel)) {
          return;
      }
      if (isset($this->fixesForLevel[$this->defaultLevel])) {
          $this->fixesForLevel[$this->defaultLevel] = array_keys($fixes);
          return;
      }
      trigger_error(
          'Default level ' . $this->defaultLevel . ' does not exist',
          E_USER_ERROR
      );
  }
  /**
   * Populates the module with transforms and other special-case code
   * based on the fixes passed to it. Each fix name is decoded with
   * getFixType() to decide where the fix implementation is stored.
   *
   * @param $fixes Associative array of fix name => fix implementation
   */
  public function populate($fixes) {
      foreach ($fixes as $name => $fix) {
          // work out what kind of fix this is and what it targets
          list($type, $params) = $this->getFixType($name);

          if ($type == 'attr_transform_pre' || $type == 'attr_transform_post') {
              $attr = $params['attr'];
              if (isset($params['element'])) {
                  // element-specific transform: stored on the element
                  $element = $params['element'];
                  $e = empty($this->info[$element])
                      ? $this->addBlankElement($element)
                      : $this->info[$element];
              } else {
                  // global transform: stored on the module itself
                  $type = "info_$type";
                  $e = $this;
              }
              // A reference is taken first because older PHP versions
              // parse $e->$type[$attr] in an unexpected way.
              $slot =& $e->$type;
              $slot[$attr] = $fix;
          } elseif ($type == 'tag_transform') {
              $this->info_tag_transform[$params['element']] = $fix;
          } elseif ($type == 'child' || $type == 'content_model_type') {
              $element = $params['element'];
              $e = empty($this->info[$element])
                  ? $this->addBlankElement($element)
                  : $this->info[$element];
              $e->$type = $fix;
          } else {
              trigger_error("Fix type $type not supported", E_USER_ERROR);
          }
      }
  }
  /**
   * Decodes a fix name of the form "element@attr#property" (every part is
   * optional) into a fix type plus the parameters of that fix.
   *
   * - With an attribute, the type is 'attr_transform_' followed by the
   *   property, which defaults to 'pre'.
   * - With neither attribute nor property, the type is 'tag_transform'.
   * - Otherwise the property itself is the type.
   *
   * @param $name String name of fix
   * @return array(string $fix_type, array $fix_parameters)
   * @note $fix_parameters may hold the keys 'element' and 'attr'; see
   *       populate() for how they are used
   */
  public function getFixType($name) {
      $property = null;
      $attr = null;

      // peel off "#property" first, then "@attr"
      if (strpos($name, '#') !== false) {
          $pieces = explode('#', $name);
          $name = $pieces[0];
          $property = $pieces[1];
      }
      if (strpos($name, '@') !== false) {
          $pieces = explode('@', $name);
          $name = $pieces[0];
          $attr = $pieces[1];
      }

      // whatever is left of the name is the element
      $params = array();
      if ($name !== '') {
          $params['element'] = $name;
      }

      if ($attr !== null) {
          // attribute transform
          $params['attr'] = $attr;
          $phase = ($property === null) ? 'pre' : $property;
          return array('attr_transform_' . $phase, $params);
      }

      if ($property === null) {
          // tag transform
          return array('tag_transform', $params);
      }

      return array($property, $params);
  }
  /**
   * Hook in which a module defines all fixes it will perform, as a
   * compact associative array of fix name to fix implementation. This
   * base implementation is deliberately empty and returns nothing.
   */
  public function makeFixes()
  {
  }
  11362. }
  /**
   * Module that contributes the xml:lang attribute to the 'Lang'
   * attribute collection.
   */
  class HTMLPurifier_HTMLModule_XMLCommonAttributes extends HTMLPurifier_HTMLModule
  {

      // module identifier
      public $name = 'XMLCommonAttributes';

      // 'Lang' collection: xml:lang uses the 'LanguageCode' attribute type name
      public $attr_collections = array(
          'Lang' => array('xml:lang' => 'LanguageCode'),
      );

  }
  /**
   * Name is deprecated, but allowed in strict doctypes, so this fix is
   * only applied at the 'heavy' tidy level.
   */
  class HTMLPurifier_HTMLModule_Tidy_Name extends HTMLPurifier_HTMLModule_Tidy
  {

      public $name = 'Tidy_Name';
      public $defaultLevel = 'heavy';

      /**
       * Builds the fixes for the name attribute of <img> and <a>.
       *
       * @return Associative array of fix name => transform; both entries
       *         share a single HTMLPurifier_AttrTransform_Name instance
       */
      public function makeFixes() {
          // @name is technically allowed even on strict, so authors may
          // use it. However, it is deprecated in future versions of XHTML.
          $nameTransform = new HTMLPurifier_AttrTransform_Name();

          $fixes = array();
          $fixes['img@name'] = $fixes['a@name'] = $nameTransform;
          return $fixes;
      }

  }
  /**
   * Tidy module holding fixes for the background attribute of table
   * elements and the height attribute of <table>.
   */
  class HTMLPurifier_HTMLModule_Tidy_Proprietary extends HTMLPurifier_HTMLModule_Tidy
  {

      public $name = 'Tidy_Proprietary';
      public $defaultLevel = 'light';

      /**
       * @return Associative array of fix name => transform object
       */
      public function makeFixes() {
          $fixes = array();

          // @background: every table-related element gets its own transform
          $withBackground = array('table', 'td', 'th', 'tr', 'thead', 'tfoot', 'tbody');
          foreach ($withBackground as $element) {
              $fixes[$element . '@background'] = new HTMLPurifier_AttrTransform_Background();
          }

          // @height for table
          $fixes['table@height'] = new HTMLPurifier_AttrTransform_Length('height');

          return $fixes;
      }

  }
  /**
   * Fixes shared by the XHTML and HTML 4 tidy modules: deprecated
   * presentational elements and attributes are mapped to transforms that
   * express them with other elements and inline CSS.
   */
  class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends HTMLPurifier_HTMLModule_Tidy
  {

      /**
       * @return Associative array of fix name => transform object
       */
      public function makeFixes() {
          $fixes = array();

          // == deprecated tag transforms ===================================

          $underline   = 'text-decoration:underline;';
          $lineThrough = 'text-decoration:line-through;';

          $fixes['font']   = new HTMLPurifier_TagTransform_Font();
          $fixes['menu']   = new HTMLPurifier_TagTransform_Simple('ul');
          $fixes['dir']    = new HTMLPurifier_TagTransform_Simple('ul');
          $fixes['center'] = new HTMLPurifier_TagTransform_Simple('div', 'text-align:center;');
          $fixes['u']      = new HTMLPurifier_TagTransform_Simple('span', $underline);
          $fixes['s']      = new HTMLPurifier_TagTransform_Simple('span', $lineThrough);
          $fixes['strike'] = new HTMLPurifier_TagTransform_Simple('span', $lineThrough);

          // == deprecated attribute transforms =============================

          // @align for caption: this follows IE's behavior, not Firefox's,
          // because no one supports caption-side:right, W3C included (with
          // CSS 2.1). This is a slightly unreasonable attribute!
          $captionAlign = array(
              'left'   => 'text-align:left;',
              'right'  => 'text-align:right;',
              'top'    => 'caption-side:top;',
              'bottom' => 'caption-side:bottom;', // not supported by IE
          );
          $fixes['caption@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $captionAlign);

          // @align for img
          $imgAlign = array(
              'left'   => 'float:left;',
              'right'  => 'float:right;',
              'top'    => 'vertical-align:top;',
              'middle' => 'vertical-align:middle;',
              'bottom' => 'vertical-align:baseline;',
          );
          $fixes['img@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $imgAlign);

          // @align for table
          $tableAlign = array(
              'left'   => 'float:left;',
              'center' => 'margin-left:auto;margin-right:auto;',
              'right'  => 'float:right;',
          );
          $fixes['table@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $tableAlign);

          // @align for hr: both text-align and margin are emitted because
          // they work for different browsers (IE and Firefox, respectively)
          // and the combination is a pretty cross-compatible solution
          $hrAlign = array(
              'left'   => 'margin-left:0;margin-right:auto;text-align:left;',
              'center' => 'margin-left:auto;margin-right:auto;text-align:center;',
              'right'  => 'margin-left:auto;margin-right:0;text-align:right;',
          );
          $fixes['hr@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $hrAlign);

          // @align for h1, h2, h3, h4, h5, h6, p, div (one shared transform)
          $textAlign = array(
              'left'    => 'text-align:left;',
              'right'   => 'text-align:right;',
              'center'  => 'text-align:center;',
              'justify' => 'text-align:justify;',
          );
          $fixes['h1@align'] =
          $fixes['h2@align'] =
          $fixes['h3@align'] =
          $fixes['h4@align'] =
          $fixes['h5@align'] =
          $fixes['h6@align'] =
          $fixes['p@align'] =
          $fixes['div@align'] =
              new HTMLPurifier_AttrTransform_EnumToCSS('align', $textAlign);

          // @bgcolor for table, td, th (one shared transform)
          $fixes['table@bgcolor'] =
          $fixes['td@bgcolor'] =
          $fixes['th@bgcolor'] =
              new HTMLPurifier_AttrTransform_BgColor();

          // @border for img
          $fixes['img@border'] = new HTMLPurifier_AttrTransform_Border();

          // @clear for br
          $brClear = array(
              'left'  => 'clear:left;',
              'right' => 'clear:right;',
              'all'   => 'clear:both;',
              'none'  => 'clear:none;',
          );
          $fixes['br@clear'] = new HTMLPurifier_AttrTransform_EnumToCSS('clear', $brClear);

          // @height for td, th (one shared transform)
          $fixes['td@height'] =
          $fixes['th@height'] =
              new HTMLPurifier_AttrTransform_Length('height');

          // @hspace for img
          $fixes['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace');

          // @noshade for hr: this transformation is not precise but often
          // good enough; different browsers use different styles to
          // designate noshade
          $fixes['hr@noshade'] = new HTMLPurifier_AttrTransform_BoolToCSS(
              'noshade',
              'color:#808080;background-color:#808080;border:0;'
          );

          // @nowrap for td, th (one shared transform)
          $fixes['td@nowrap'] =
          $fixes['th@nowrap'] =
              new HTMLPurifier_AttrTransform_BoolToCSS('nowrap', 'white-space:nowrap;');

          // @size for hr
          $fixes['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height');

          // @type for li, ol, ul; li accepts the values of both list kinds
          $ulTypes = array(
              'disc'   => 'list-style-type:disc;',
              'square' => 'list-style-type:square;',
              'circle' => 'list-style-type:circle;',
          );
          $olTypes = array(
              '1' => 'list-style-type:decimal;',
              'i' => 'list-style-type:lower-roman;',
              'I' => 'list-style-type:upper-roman;',
              'a' => 'list-style-type:lower-alpha;',
              'A' => 'list-style-type:upper-alpha;',
          );
          $liTypes = $ulTypes + $olTypes;
          $fixes['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ulTypes);
          $fixes['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $olTypes, true);
          $fixes['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $liTypes, true);

          // @vspace for img
          $fixes['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace');

          // @width for td, th, hr (one shared transform)
          $fixes['td@width'] =
          $fixes['th@width'] =
          $fixes['hr@width'] =
              new HTMLPurifier_AttrTransform_Length('width');

          return $fixes;
      }

  }
  /**
   * Tidy module that adds a 'strictblockquote' content model for
   * <blockquote> on top of the shared XHTML/HTML 4 fixes.
   */
  class HTMLPurifier_HTMLModule_Tidy_Strict extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
  {

      public $name = 'Tidy_Strict';
      public $defaultLevel = 'light';
      public $defines_child_def = true;

      /**
       * @return The parent's fixes plus the blockquote content model fix
       */
      public function makeFixes() {
          $fixes = parent::makeFixes();
          $fixes['blockquote#content_model_type'] = 'strictblockquote';
          return $fixes;
      }

      /**
       * Supplies the child definition for the 'strictblockquote' content
       * model type and defers to the parent for every other type.
       *
       * @param $def Element definition exposing content_model_type and
       *             content_model
       */
      public function getChildDef($def) {
          if ($def->content_model_type == 'strictblockquote') {
              return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
          }
          return parent::getChildDef($def);
      }

  }
  /**
   * Tidy module that reuses the shared XHTML/HTML 4 fixes unchanged and
   * registers them under the 'heavy' default level.
   */
  class HTMLPurifier_HTMLModule_Tidy_Transitional extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
  {

      public $name = 'Tidy_Transitional';
      public $defaultLevel = 'heavy';

  }
  /**
   * Tidy module with a single, element-independent fix for the lang
   * attribute.
   */
  class HTMLPurifier_HTMLModule_Tidy_XHTML extends HTMLPurifier_HTMLModule_Tidy
  {

      public $name = 'Tidy_XHTML';
      public $defaultLevel = 'medium';

      /**
       * @return Associative array holding the '@lang' fix only
       */
      public function makeFixes() {
          return array(
              '@lang' => new HTMLPurifier_AttrTransform_Lang(),
          );
      }

  }
  /**
   * Injector that auto paragraphs text in the root node based on
   * double-spacing.
   * @todo Ensure all states are unit tested, including variations as well.
   * @todo Make a graph of the flow control for this Injector.
   */
  class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
  {

      public $name = 'AutoParagraph';
      public $needed = array('p');

      /**
       * Creates a <p> start token flagged with the
       * 'MakeWellFormed_TagClosedError' armor entry.
       */
      private function _pStart() {
          $start = new HTMLPurifier_Token_Start('p');
          $start->armor['MakeWellFormed_TagClosedError'] = true;
          return $start;
      }

      /**
       * Decides whether a text token has to be wrapped in, or split into,
       * paragraphs. $token is replaced in place by an array of tokens when
       * a change is needed and left alone otherwise.
       */
      public function handleText(&$token) {
          $text = $token->data;

          if (!$this->allowsElement('p')) {
              // The current parent does not allow <p>. Is it a <p> itself?
              $insideParagraph =
                  !empty($this->currentNesting) &&
                  $this->currentNesting[count($this->currentNesting) - 1]->name == 'p';
              if ($insideParagraph) {
                  // State 3.1: ...<p>PAR1
                  // State 3.2: ...<p>PAR1\n\nPAR2
                  $token = array();
                  $this->_splitText($text, $token);
              }
              // Otherwise abort:
              // State 4.1: ...<b>PAR1
              // State 4.2: ...<b>PAR1\n\nPAR2
              return;
          }

          if (!empty($this->currentNesting) && strpos($text, "\n\n") === false) {
              // State 2: <div>PAR1... (similar to 1.4)
              // We are inside an element that allows paragraph tags, but it
              // is not yet known whether they will be needed.
              if ($this->_pLookAhead()) {
                  // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                  // Note: this will always be the first child, since any
                  // previous inline element would have triggered this very
                  // same routine and found the double newline. One possible
                  // exception would be a comment.
                  $token = array($this->_pStart(), $token);
              }
              // Otherwise nothing to do:
              // State 2.2.1: <div>PAR1<div>
              // State 2.2.2: <div>PAR1<b>PAR1</b></div>
              return;
          }

          // From here on the text sits in the anonymous root node or holds
          // a double newline; both cases are treated the same way.
          $i = $nesting = null;
          $hasFollower = $this->forwardUntilEndToken($i, $current, $nesting);

          if (!$hasFollower && $token->is_whitespace) {
              // State 1.1: ...    ^ (whitespace, then document end)
              // This is a degenerate case
              return;
          }

          if (!$token->is_whitespace || $this->_isInline($current)) {
              // State 1.2: PAR1
              // State 1.3: PAR1\n\nPAR2
              // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
              $token = array($this->_pStart());
              $this->_splitText($text, $token);
          }
          // Otherwise State 1.5: \n<hr />
      }

      /**
       * Decides whether a start/empty element token needs a <p> (and, in
       * the root node, a separating double newline) in front of it. $token
       * is replaced in place by an array of tokens when a change is needed.
       */
      public function handleElement(&$token) {
          // There is no need to check whether a <p> is already open for
          // block tokens, because the tag would have been autoclosed by
          // MakeWellFormed.
          if (!$this->allowsElement('p')) {
              // State 2.2: <ul><li>
              // State 2.4: <p><b>
              return;
          }

          if (!empty($this->currentNesting)) {
              if (!$this->_isInline($token)) {
                  // State 2.3: ...<div>
                  return;
              }

              // State 1: <div>...<b>
              // Check if this token is adjacent to the parent token
              // (seek backwards until token isn't whitespace)
              $i = null;
              $this->backward($i, $prev);

              if ($prev instanceof HTMLPurifier_Token_Start) {
                  // State 1.2.1: <div><b>
                  // Look ahead to see if <p> is needed.
                  if ($this->_pLookAhead()) {
                      // State 1.3.1: <div><b>PAR1\n\nPAR2
                      $token = array($this->_pStart(), $token);
                  }
                  // Otherwise:
                  // State 1.3.2: <div><b>PAR1</b></div>
                  // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                  return;
              }

              // Token wasn't adjacent
              $afterBlankLine =
                  $prev instanceof HTMLPurifier_Token_Text &&
                  substr($prev->data, -2) === "\n\n";
              if ($afterBlankLine) {
                  // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                  // Quite frankly, this should be handled by splitText
                  $token = array($this->_pStart(), $token);
              }
              // Otherwise:
              // State 1.1.1: <div><p>PAR1</p><b>
              // State 1.1.2: <div><br /><b>
              // State 1.1.3: <div>PAR<b>
              return;
          }

          // We are in the root node.
          if ($this->_isInline($token)) {
              // State 3.1: <b>
              // This is where the {p} tag is inserted, not reflected in
              // inputTokens yet, however.
              $token = array($this->_pStart(), $token);
          }
          // Otherwise State 3.2: <div>

          $i = null;
          if ($this->backward($i, $prev) && !($prev instanceof HTMLPurifier_Token_Text)) {
              // State 3.1.1: ...</p>{p}<b>
              // State 3.2.1: ...</p><div>
              if (!is_array($token)) {
                  $token = array($token);
              }
              array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
          }
          // Otherwise:
          // State 3.1.2: ...</p>\n\n{p}<b>
          // State 3.2.2: ...</p>\n\n<div>
          // Note: PAR<ELEM> cannot occur because PAR would have been
          // wrapped in <p> tags.
      }

      /**
       * Splits up a text in paragraph tokens and appends them
       * to the result stream that will replace the original
       * @param $data String text data that will be processed
       *    into paragraphs
       * @param $result Reference to array of tokens that the
       *    tags will be appended onto
       */
      private function _splitText($data, &$result) {
          $chunks = explode("\n\n", $data);
          $count = count($chunks);

          if ($count == 1) {
              // There were no double-newlines, abort quickly. In theory this
              // should never happen.
              $result[] = new HTMLPurifier_Token_Text($data);
              return;
          }

          $paragraphs = array(); // chunks that are not blank
          $needsStart = false;
          $needsEnd = false;

          foreach ($chunks as $index => $chunk) {
              if (trim($chunk) !== '') {
                  $paragraphs[] = $chunk;
                  continue;
              }
              if ($index == 0) {
                  // Double newline at the front
                  if (empty($result)) {
                      // The empty result indicates that the AutoParagraph
                      // injector did not add any start paragraph tokens.
                      // This means that we have been in a paragraph for
                      // a while, and the newline means we should start a
                      // new one.
                      $result[] = new HTMLPurifier_Token_End('p');
                      $result[] = new HTMLPurifier_Token_Text("\n\n");
                      // However, the start token should only be added if
                      // there is more processing to be done (i.e. there are
                      // real paragraphs in here). If there are none, the
                      // next start paragraph tag will be handled by the
                      // next call to the injector
                      $needsStart = true;
                  } else {
                      // We just started a new paragraph!
                      // Reinstate a double-newline for presentation's sake,
                      // since it was in the source code.
                      array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                  }
              } elseif ($index + 1 == $count) {
                  // Double newline at the end
                  // There should be a trailing </p> when we're finally done.
                  $needsEnd = true;
              }
          }

          // Check if this was just a giant blob of whitespace.
          if (empty($paragraphs)) {
              return;
          }

          // Add the start tag indicated by \n\n at the beginning of $data
          if ($needsStart) {
              $result[] = $this->_pStart();
          }

          // Append the paragraphs onto the result
          foreach ($paragraphs as $paragraph) {
              $result[] = new HTMLPurifier_Token_Text($paragraph);
              $result[] = new HTMLPurifier_Token_End('p');
              $result[] = new HTMLPurifier_Token_Text("\n\n");
              $result[] = $this->_pStart();
          }

          // The trailing start token is always removed; the injector will
          // handle this later if it was indeed needed. This avoids a
          // lookahead here, at the cost of a lookbehind later. When no end
          // tag is needed, the trailing "\n\n" and </p> are removed as well
          // and MakeWellFormed is left to close the paragraph.
          $surplus = $needsEnd ? 1 : 3;
          while ($surplus-- > 0) {
              array_pop($result);
          }
      }

      /**
       * Returns true if passed token is inline (and, ergo, allowed in
       * paragraph tags)
       */
      private function _isInline($token) {
          return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
      }

      /**
       * Looks ahead in the token list and determines whether or not we need
       * to insert a <p> tag.
       */
      private function _pLookAhead() {
          $this->current($i, $current);
          $nesting = ($current instanceof HTMLPurifier_Token_Start) ? 1 : 0;

          while ($this->forwardUntilEndToken($i, $current, $nesting)) {
              $verdict = $this->_checkNeedsP($current);
              if ($verdict !== null) {
                  return $verdict;
              }
          }
          return false;
      }

      /**
       * Determines if a particular token requires an earlier inline token
       * to get a paragraph. This should be used with forwardUntilEndToken.
       * @return true when a paragraph is needed, false when it is not, null
       *         when this token does not settle the question
       */
      private function _checkNeedsP($current) {
          if ($current instanceof HTMLPurifier_Token_Start) {
              // <div>PAR1<div>
              // Terminate early when we hit a block element
              return $this->_isInline($current) ? null : false;
          }
          if (
              $current instanceof HTMLPurifier_Token_Text &&
              strpos($current->data, "\n\n") !== false
          ) {
              // <div>PAR1<b>PAR1\n\nPAR2
              return true;
          }
          // <div>PAR1<b>PAR1...
          return null;
      }

  }
  /**
   * Injector that displays the URL of an anchor instead of linking to it,
   * in addition to showing the text of the link.
   */
  class HTMLPurifier_Injector_DisplayLinkURI extends HTMLPurifier_Injector
  {

      public $name = 'DisplayLinkURI';
      public $needed = array('a');

      /**
       * Intentionally empty; all work is done in handleEnd().
       */
      public function handleElement(&$token) {
      }

      /**
       * When the matching start tag carries an href, removes that href
       * and places " (url)" as a text token right after the end tag.
       */
      public function handleEnd(&$token) {
          if (!isset($token->start->attr['href'])) {
              // nothing to display
              return;
          }
          $url = $token->start->attr['href'];
          unset($token->start->attr['href']);
          $urlText = new HTMLPurifier_Token_Text(" ($url)");
          $token = array($token, $urlText);
      }

  }
  /**
   * Injector that converts http, https and ftp text URLs to actual links.
   */
  class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
  {

      public $name = 'Linkify';
      public $needed = array('a' => array('href'));

      /**
       * Replaces a text token that contains URLs by an array of tokens in
       * which every URL is wrapped in an <a href="..."> element. The token
       * is left untouched when links are not allowed here, when it holds no
       * URL, or when the text could not be split.
       */
      public function handleText(&$token) {
          if (!$this->allowsElement('a')) return;

          if (strpos($token->data, '://') === false) {
              // our really quick heuristic failed, abort
              // this may not work so well if we want to match things like
              // "google.com", but then again, most people don't
              return;
          }

          // there may be URL(s). Let's split the string:
          // Note: this regex is extremely permissive
          $bits = preg_split('#((?:https?|ftp)://[^\s\'"<>()]+)#S', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

          // preg_split() returns false on failure and a single-element array
          // when nothing matched (e.g. "foo://bar"). In both cases there is
          // nothing to linkify, so keep the original token instead of
          // destroying it or swapping it for an identical copy.
          if ($bits === false || count($bits) < 2) {
              return;
          }

          // With PREG_SPLIT_DELIM_CAPTURE the pieces alternate between plain
          // text (even index) and a captured URL (odd index).
          $replacement = array();
          foreach ($bits as $i => $bit) {
              if ($i % 2 === 0) {
                  if ($bit === '') continue;
                  $replacement[] = new HTMLPurifier_Token_Text($bit);
              } else {
                  $replacement[] = new HTMLPurifier_Token_Start('a', array('href' => $bit));
                  $replacement[] = new HTMLPurifier_Token_Text($bit);
                  $replacement[] = new HTMLPurifier_Token_End('a');
              }
          }
          $token = $replacement;
      }

  }
  /**
   * Injector that converts configuration directive syntax %Namespace.Directive
   * to links
   */
  class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector
  {

      public $name = 'PurifierLinkify';

      // URL template read from AutoFormat.PurifierLinkify.DocURL; every
      // '%s' in it is replaced by the directive name
      public $docURL;

      public $needed = array('a' => array('href'));

      /**
       * Reads the documentation URL template from the configuration and
       * then defers to the parent's prepare().
       */
      public function prepare($config, $context) {
          $this->docURL = $config->get('AutoFormat.PurifierLinkify.DocURL');
          return parent::prepare($config, $context);
      }

      /**
       * Replaces a text token that mentions %Namespace.Directive by an
       * array of tokens in which every mention links to its documentation.
       * The token is left untouched when links are not allowed here, when
       * it mentions no directive, or when the text could not be split.
       */
      public function handleText(&$token) {
          if (!$this->allowsElement('a')) return;
          if (strpos($token->data, '%') === false) return;

          $bits = preg_split('#%([a-z0-9]+\.[a-z0-9]+)#Si', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

          // preg_split() returns false on failure and a single-element array
          // when nothing matched (e.g. "50% off"). In both cases there is
          // nothing to linkify, so keep the original token instead of
          // destroying it or swapping it for an identical copy.
          if ($bits === false || count($bits) < 2) {
              return;
          }

          // With PREG_SPLIT_DELIM_CAPTURE the pieces alternate between plain
          // text (even index) and a captured directive name (odd index).
          $replacement = array();
          foreach ($bits as $i => $bit) {
              if ($i % 2 === 0) {
                  if ($bit === '') continue;
                  $replacement[] = new HTMLPurifier_Token_Text($bit);
              } else {
                  $replacement[] = new HTMLPurifier_Token_Start('a',
                      array('href' => str_replace('%s', $bit, $this->docURL)));
                  $replacement[] = new HTMLPurifier_Token_Text('%' . $bit);
                  $replacement[] = new HTMLPurifier_Token_End('a');
              }
          }
          $token = $replacement;
      }

  }
  11958. class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
  11959. {
  11960. private $context, $config, $attrValidator, $removeNbsp, $removeNbspExceptions;
  11961. public function prepare($config, $context) {
  11962. parent::prepare($config, $context);
  11963. $this->config = $config;
  11964. $this->context = $context;
  11965. $this->removeNbsp = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp');
  11966. $this->removeNbspExceptions = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions');
  11967. $this->attrValidator = new HTMLPurifier_AttrValidator();
  11968. }
  11969. publi