PageRenderTime 44ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/framework/vendors/htmlpurifier/standalone/HTMLPurifier/Filter/ExtractStyleBlocks.php

https://gitlab.com/zenfork/vektor
PHP | 289 lines | 161 code | 11 blank | 117 comment | 37 complexity | f5656c71d81cc104346422bcd6e3c51c MD5 | raw file
  1. <?php
  // This handler is intentionally a free-standing function instead of a
  // static method: PHP 5.2.0 does not seem to interpret it correctly when
  // it is supplied as a static method. The arrangement is admittedly
  // clumsy; if more helpers of this kind are ever needed, it would be
  // reasonable to fold them all into a single function.

  /**
   * Error handler that deliberately does nothing.
   *
   * HTMLPurifier_Filter_ExtractStyleBlocks::cleanCSS() installs it with
   * set_error_handler() around the CSSTidy parse call so that anything
   * raised while parsing is silenced. It never returns false, so PHP's
   * built-in handler is not invoked afterwards.
   *
   * @return void
   */
  function htmlpurifier_filter_extractstyleblocks_muteerrorhandler()
  {
      // Nothing to do: swallowing the error is the whole point.
      return;
  }
  /**
   * This filter extracts <style> blocks from input HTML, cleans them up
   * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks')
   * so they can be used elsewhere in the document.
   *
   * @note
   *      See tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php for
   *      sample usage.
   *
   * @note
   *      This filter can also be used on stylesheets not included in the
   *      document--something purists would probably prefer. Just directly
   *      call HTMLPurifier_Filter_ExtractStyleBlocks->cleanCSS()
   */
  class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
  {

      /** Name of this filter. */
      public $name = 'ExtractStyleBlocks';

      /** Contents of the <style> blocks collected during the current preFilter() run. */
      private $_styleMatches = array();

      /** CSSTidy instance (or the configured replacement) used to parse and print CSS. */
      private $_tidy;

      /** Validator applied to "#id" selector components. */
      private $_id_attrdef;

      /** Validator applied to ".class" selector components. */
      private $_class_attrdef;

      /** Validator applied to ":pseudo" selector components (fixed whitelist). */
      private $_enum_attrdef;

      /**
       * Sets up the default CSSTidy instance and the validators used for
       * the individual parts of a selector.
       */
      public function __construct()
      {
          $this->_tidy = new csstidy();
          $this->_tidy->set_cfg('lowercase_s', false);
          $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
          $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
          $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(
              array('first-child', 'link', 'visited', 'active', 'hover', 'focus')
          );
      }

      /**
       * Save the contents of CSS blocks to style matches
       * @param $matches preg_replace style $matches array
       * @return string Always the empty string, so the matched <style>
       *         element is removed from the HTML.
       */
      protected function styleCallback($matches)
      {
          $this->_styleMatches[] = $matches[1];
          // Explicit replacement text; previously this relied on the
          // implicit null return being treated as an empty string.
          return '';
      }

      /**
       * Removes inline <style> tags from HTML, saves them for later use
       *
       * The collected blocks are registered in $context under 'StyleBlocks'
       * and, when a tidy implementation is available, cleaned in place with
       * cleanCSS().
       *
       * @param $html    HTML to filter
       * @param $config  Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return HTML with the matched <style> elements removed
       * @todo Extend to indicate non-text/css style blocks
       */
      public function preFilter($html, $config, $context)
      {
          $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
          if ($tidy !== null) {
              $this->_tidy = $tidy;
          }

          // The pattern is ungreedy (U) so that two adjacent <style>
          // elements are captured separately instead of as one big match.
          $filtered = preg_replace_callback(
              '#<style(?:\s.*)?>(.+)</style>#isU',
              array($this, 'styleCallback'),
              $html
          );

          $style_blocks = $this->_styleMatches;
          $this->_styleMatches = array(); // reset

          if ($filtered === null) {
              // preg_replace_callback() yields null on a PCRE error (for
              // example when the backtrack limit is hit on large input).
              // Returning that null would silently discard the whole
              // document, so keep the HTML untouched and drop whatever
              // blocks were collected before the failure: they are still
              // present in the unmodified HTML.
              $style_blocks = array();
          } else {
              $html = $filtered;
          }

          // HTMLPurifier_Context::register() takes its value by reference,
          // which is why the blocks can still be cleaned after registering.
          $context->register('StyleBlocks', $style_blocks); // $context must not be reused

          if ($this->_tidy) {
              foreach ($style_blocks as &$style) {
                  $style = $this->cleanCSS($style, $config, $context);
              }
              // Break the reference left behind by the by-reference foreach.
              unset($style);
          }

          return $html;
      }

      /**
       * Takes CSS (the stuff found in <style>) and cleans it.
       *
       * Selectors are reduced to a conservative subset (see cleanSelector()),
       * optionally prefixed with every configured scope, and declarations
       * are validated against the CSS definition. Imports, charset and
       * namespace information from the parsed stylesheet are dropped.
       *
       * @warning Requires CSSTidy <http://csstidy.sourceforge.net/>
       * @param $css     CSS styling to clean
       * @param $config  Instance of HTMLPurifier_Config
       * @param $context Instance of HTMLPurifier_Context
       * @return Cleaned CSS
       * @throws HTMLPurifier_Exception if selector splitting breaks its own invariant
       */
      public function cleanCSS($css, $config, $context)
      {
          // prepare scope
          $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
          if ($scope !== null) {
              $scopes = array_map('trim', explode(',', $scope));
          } else {
              $scopes = array();
          }

          // Strip an HTML comment wrapper (<!-- ... -->) around the CSS;
          // this does not touch CSS comments themselves.
          $css = trim($css);
          if (strncmp('<!--', $css, 4) === 0) {
              $css = substr($css, 4);
          }
          if (strlen($css) > 3 && substr($css, -3) === '-->') {
              $css = substr($css, 0, -3);
          }
          $css = trim($css);

          // Silence errors raised while parsing. The handler must be
          // restored even if parse() throws, otherwise every later error
          // in the request would be swallowed as well. try/finally is
          // avoided to stay compatible with the old PHP versions this
          // file targets.
          set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
          try {
              $this->_tidy->parse($css);
          } catch (Exception $e) {
              restore_error_handler();
              throw $e;
          }
          restore_error_handler();

          $css_definition = $config->getDefinition('CSS');
          $html_definition = $config->getDefinition('HTML');

          $new_css = array();
          foreach ($this->_tidy->css as $k => $decls) {
              // $decls are all CSS declarations inside an @ selector
              $new_decls = array();
              foreach ($decls as $selector => $style) {
                  $selector = trim($selector);
                  if ($selector === '') {
                      continue; // should not happen
                  }

                  // handle ruleset: clean each comma-separated selector
                  $new_selectors = array();
                  foreach (array_map('trim', explode(',', $selector)) as $sel) {
                      $nsel = $this->cleanSelector($sel, $html_definition, $config, $context);
                      if ($nsel === null) {
                          continue;
                      }
                      if (!empty($scopes)) {
                          foreach ($scopes as $s) {
                              $new_selectors[] = "$s $nsel";
                          }
                      } else {
                          $new_selectors[] = $nsel;
                      }
                  }

                  // a ruleset without any acceptable selector is dropped
                  if (empty($new_selectors)) {
                      continue;
                  }

                  $selector = implode(', ', $new_selectors);
                  $new_decls[$selector] = $this->cleanDeclarations(
                      $style,
                      $css_definition,
                      $config,
                      $context
                  );
              }
              $new_css[$k] = $new_decls;
          }

          // remove stuff that shouldn't be used, could be reenabled
          // after security risks are analyzed
          $this->_tidy->css = $new_css;
          $this->_tidy->import = array();
          $this->_tidy->charset = null;
          $this->_tidy->namespace = null;
          $css = $this->_tidy->print->plain();

          // we are going to escape any special characters <>& to ensure
          // that no funny business occurs (i.e. </style> in a font-family prop).
          if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
              $css = str_replace(
                  array('<', '>', '&'),
                  array('\3C ', '\3E ', '\26 '),
                  $css
              );
          }

          return $css;
      }

      /**
       * Cleans a single selector (one entry of a comma-separated list).
       *
       * @param $sel             Trimmed selector text
       * @param $html_definition HTML definition; its info array lists allowed elements
       * @param $config          Instance of HTMLPurifier_Config
       * @param $context         Instance of HTMLPurifier_Context
       * @return Cleaned selector, or null if no valid content remained
       */
      private function cleanSelector($sel, $html_definition, $config, $context)
      {
          // Parse the selector
          // Here is the relevant part of the CSS grammar:
          //
          // ruleset
          //   : selector [ ',' S* selector ]* '{' ...
          // selector
          //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
          // combinator
          //   : '+' S*
          //   | '>' S*
          // simple_selector
          //   : element_name [ HASH | class | attrib | pseudo ]*
          //   | [ HASH | class | attrib | pseudo ]+
          // element_name
          //   : IDENT | '*'
          //   ;
          // class
          //   : '.' IDENT
          //   ;
          // attrib
          //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
          //     [ IDENT | STRING ] S* ]? ']'
          //   ;
          // pseudo
          //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
          //   ;
          //
          // For reference, here are the relevant tokens:
          //
          // HASH         #{name}
          // IDENT        {ident}
          // INCLUDES     ~=
          // DASHMATCH    |=
          // STRING       {string}
          // FUNCTION     {ident}\(
          //
          // And the lexical scanner tokens
          //
          // name         {nmchar}+
          // nmchar       [_a-z0-9-]|{nonascii}|{escape}
          // nonascii     [\240-\377]
          // escape       {unicode}|\\[^\r\n\f0-9a-f]
          // unicode      \\{h}{1,6}(\r\n|[ \t\r\n\f])?
          // ident        -?{nmstart}{nmchar}*
          // nmstart      [_a-z]|{nonascii}|{escape}
          // string       {string1}|{string2}
          // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
          // string2      \'([^\n\r\f\\']|\\{nl}|{escape})*\'
          //
          // We'll implement a subset (in order to reduce attack
          // surface); in particular:
          //
          //      - No Unicode support
          //      - No escapes support
          //      - No string support (by proxy no attrib support)
          //      - element_name is matched against allowed
          //        elements (some people might find this
          //        annoying...)
          //      - Pseudo-elements one of :first-child, :link,
          //        :visited, :active, :hover, :focus

          // split on +, > and spaces
          $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);

          // even indices are chunks, odd indices are delimiters
          $nsel = null;
          $delim = null; // guaranteed to be non-null after two loop iterations
          for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
              $x = $basic_selectors[$i];
              if ($i % 2) {
                  // delimiter: a descendant combinator stays a single
                  // space, '+' and '>' are padded with spaces
                  $delim = ($x === ' ') ? ' ' : ' ' . $x . ' ';
                  continue;
              }

              // simple selector
              $nx = $this->cleanSimpleSelector($x, $html_definition, $config, $context);
              if ($nx === null) {
                  // delimiters to the left of invalid basic selector ignored
                  continue;
              }
              if ($nsel === null) {
                  $nsel = $nx;
              } else {
                  $nsel .= $delim . $nx;
              }
          }

          return $nsel;
      }

      /**
       * Cleans one simple selector: an optional element name followed by
       * any number of "#id", ".class" and ":pseudo" components.
       *
       * Components that fail validation are dropped individually.
       *
       * @param $x               Simple selector text (no combinators)
       * @param $html_definition HTML definition; its info array lists allowed elements
       * @param $config          Instance of HTMLPurifier_Config
       * @param $context         Instance of HTMLPurifier_Context
       * @return Cleaned simple selector, or null if nothing valid was found
       * @throws HTMLPurifier_Exception if a component has no recognised delimiter
       */
      private function cleanSimpleSelector($x, $html_definition, $config, $context)
      {
          // index 0 is the element name (possibly empty), odd indices are
          // delimiters, the remaining even indices are component values
          $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
          $sdelim = null;
          $nx = null;
          for ($j = 0, $cc = count($components); $j < $cc; $j++) {
              $y = $components[$j];
              if ($j === 0) {
                  // element name: '*' or an element known to the HTML
                  // definition (compared and emitted in lower case)
                  if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
                      $nx = $y;
                  }
                  // otherwise $nx stays null; this matters if we don't
                  // manage to find any valid selector content, in which
                  // case the caller ignores the outer delimiter
              } elseif ($j % 2) {
                  // set delimiter
                  $sdelim = $y;
              } else {
                  if ($sdelim === '#') {
                      $attrdef = $this->_id_attrdef;
                  } elseif ($sdelim === '.') {
                      $attrdef = $this->_class_attrdef;
                  } elseif ($sdelim === ':') {
                      $attrdef = $this->_enum_attrdef;
                  } else {
                      throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
                  }
                  $r = $attrdef->validate($y, $config, $context);
                  if ($r !== false) {
                      if ($r !== true) {
                          // the validator returned a replacement value
                          $y = $r;
                      }
                      if ($nx === null) {
                          $nx = '';
                      }
                      $nx .= $sdelim . $y;
                  }
              }
          }

          return $nx;
      }

      /**
       * Validates the declarations of one ruleset against the CSS definition.
       *
       * Properties unknown to the definition and values rejected by their
       * validator are removed; accepted values are replaced by the
       * validator's result.
       *
       * @param $style          Array of property name => value
       * @param $css_definition CSS definition; its info array maps properties to validators
       * @param $config         Instance of HTMLPurifier_Config
       * @param $context        Instance of HTMLPurifier_Context
       * @return Array of the surviving property name => cleaned value pairs
       */
      private function cleanDeclarations($style, $css_definition, $config, $context)
      {
          foreach ($style as $name => $value) {
              if (!isset($css_definition->info[$name])) {
                  unset($style[$name]);
                  continue;
              }
              $def = $css_definition->info[$name];
              $ret = $def->validate($value, $config, $context);
              if ($ret === false) {
                  unset($style[$name]);
              } else {
                  $style[$name] = $ret;
              }
          }

          return $style;
      }

  }
  278. // vim: et sw=4 sts=4