PageRenderTime 422ms CodeModel.GetById 10ms RepoModel.GetById 1ms app.codeStats 0ms

/core/lib/Drupal/Component/Utility/Html.php

http://github.com/drupal/drupal
PHP | 484 lines | 145 code | 34 blank | 305 comment | 13 complexity | c300df8c5da62b3d558f9650c331f2bb MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1
  1. <?php
  2. namespace Drupal\Component\Utility;
  3. /**
  4. * Provides DOMDocument helpers for parsing and serializing HTML strings.
  5. *
  6. * @ingroup utility
  7. */
  8. class Html {
  /**
   * An array of previously cleaned HTML classes.
   *
   * Keyed by the raw class string passed to getClass(); each value is the
   * cleaned class name that was computed for that string.
   *
   * @var array
   */
  protected static $classes = [];

  /**
   * An array of the initial IDs used in one request.
   *
   * getUniqueId() initializes this to an empty array when it is unset and
   * copies it into $seenIds whenever $seenIds is unset.
   *
   * @var array
   */
  protected static $seenIdsInit;

  /**
   * An array of IDs, including incremented versions when an ID is duplicated.
   *
   * Keyed by cleaned ID; each value is the counter getUniqueId() keeps for
   * that ID (1 after the first occurrence, incremented on each repeat).
   *
   * @var array
   */
  protected static $seenIds;

  /**
   * Stores whether the current request was sent via AJAX.
   *
   * Written by setIsAjax() and read by getUniqueId().
   *
   * @var bool
   */
  protected static $isAjax = FALSE;

  /**
   * All attributes that may contain URIs.
   *
   * - The attributes 'code' and 'codebase' are omitted, because they only exist
   *   for the <applet> tag. The time of Java applets has passed.
   * - The attribute 'icon' is omitted, because no browser implements the
   *   <command> tag anymore.
   *   See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command.
   * - The 'manifest' attribute is omitted because it only exists for the <html>
   *   tag. That tag only makes sense in a HTML-served-as-HTML context, in which
   *   case relative URLs are guaranteed to work.
   *
   * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
   * @see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
   *
   * @var string[]
   */
  protected static $uriAttributes = ['href', 'poster', 'src', 'cite', 'data', 'action', 'formaction', 'srcset', 'about'];
  /**
   * Turns an arbitrary value into a single valid, lowercase class name.
   *
   * Results are memoized per input string in static::$classes, so repeated
   * calls with the same value do not clean it again.
   *
   * Pass exactly one class per call: whitespace is treated as part of the
   * name and becomes a dash, i.e. "one two" will become "one-two".
   *
   * @param mixed $class
   *   The class name to clean. It can be a string or anything that can be cast
   *   to string.
   *
   * @return string
   *   The cleaned class name.
   */
  public static function getClass($class) {
    $key = (string) $class;
    // Serve the memoized result when this exact string was cleaned before.
    if (isset(static::$classes[$key])) {
      return static::$classes[$key];
    }
    $cleaned = static::cleanCssIdentifier(mb_strtolower($key));
    static::$classes[$key] = $cleaned;
    return $cleaned;
  }
  /**
   * Prepares a string for use as a CSS identifier (element, class, or ID name).
   *
   * The string replacements in $filter are applied first, then every character
   * that is not allowed in a CSS identifier is removed, and finally an invalid
   * leading sequence (digit, hyphen + digit, or two hyphens) is rewritten.
   *
   * A double underscore in the input is preserved unless $filter itself
   * defines a replacement for '__'.
   *
   * @see http://www.w3.org/TR/CSS21/syndata.html#characters
   *
   * @param string $identifier
   *   The identifier to clean.
   * @param array $filter
   *   An array of string replacements to use on the identifier, keyed by the
   *   string to search for.
   *
   * @return string
   *   The cleaned identifier.
   */
  public static function cleanCssIdentifier($identifier, array $filter = [
    ' ' => '-',
    '_' => '-',
    '/' => '-',
    '[' => '-',
    ']' => '',
  ]) {
    // str_replace() is used rather than strtr() for speed. To stop the '_'
    // filter from breaking up '__', park every '__' behind the '##'
    // placeholder first, unless the caller explicitly filters '__'.
    $had_double_underscores = FALSE;
    if (!isset($filter['__'])) {
      $replaced = 0;
      $identifier = str_replace('__', '##', $identifier, $replaced);
      $had_double_underscores = $replaced > 0;
    }

    $identifier = str_replace(array_keys($filter), array_values($filter), $identifier);

    // Only turn '##' back into '__' when the placeholder was actually
    // introduced above.
    if ($had_double_underscores) {
      $identifier = str_replace('##', '__', $identifier);
    }

    // Characters kept by the pattern below:
    // - the hyphen (U+002D)
    // - 0-9 (U+0030 - U+0039)
    // - A-Z (U+0041 - U+005A)
    // - the underscore (U+005F)
    // - a-z (U+0061 - U+007A)
    // - characters from U+00A1 up to U+FFFF
    // Everything else is stripped.
    $identifier = preg_replace('/[^\x{002D}\x{0030}-\x{0039}\x{0041}-\x{005A}\x{005F}\x{0061}-\x{007A}\x{00A1}-\x{FFFF}]/u', '', $identifier);

    // Identifiers cannot start with a digit, two hyphens, or a hyphen followed
    // by a digit: a leading digit becomes '_', the two-character cases become
    // '__'.
    $identifier = preg_replace('/^[0-9]/', '_', $identifier);
    return preg_replace('/^(-[0-9])|^(--)/', '__', $identifier);
  }
  /**
   * Sets if this request is an Ajax request.
   *
   * The flag is stored in static::$isAjax and is consulted by getUniqueId(),
   * which appends a random suffix instead of a counter when it is TRUE.
   *
   * @param bool $is_ajax
   *   TRUE if this request is an Ajax request, FALSE otherwise.
   */
  public static function setIsAjax($is_ajax) {
    static::$isAjax = $is_ajax;
  }
  /**
   * Prepares a string for use as a valid HTML ID and guarantees uniqueness.
   *
   * Every ID returned during a request is tracked so that the same base ID can
   * be requested many times (forms, blocks and other content rendered more
   * than once on a page) without producing duplicate IDs in the markup.
   *
   * The first request for an ID returns the cleaned ID unchanged. Later
   * requests return the cleaned ID followed by two hyphens and a counter.
   * During Ajax requests the suffix is random instead of a counter. Because
   * the suffix is not predictable, JavaScript and CSS should not depend on IDs
   * generated here and should use explicitly added classes instead.
   *
   * Runs of hyphens in the passed $id are collapsed to a single hyphen by
   * getId(), so the two-hyphen delimiter never occurs in the base ID.
   *
   * @param string $id
   *   The ID to clean.
   *
   * @return string
   *   The cleaned ID.
   */
  public static function getUniqueId($id) {
    // Markup produced by an Ajax request is merged into a page that already
    // contains IDs, so a per-request counter is not enough: use a random
    // suffix to keep the merged document free of duplicates.
    if (static::$isAjax) {
      return static::getId($id) . '--' . Crypt::randomBytesBase64(8);
    }

    // @todo Remove all that code once we switch over to random IDs only,
    // see https://www.drupal.org/node/1090592.
    if (!isset(static::$seenIdsInit)) {
      static::$seenIdsInit = [];
    }
    if (!isset(static::$seenIds)) {
      static::$seenIds = static::$seenIdsInit;
    }

    $base_id = static::getId($id);

    // First occurrence: remember it and hand back the plain ID.
    if (!isset(static::$seenIds[$base_id])) {
      static::$seenIds[$base_id] = 1;
      return $base_id;
    }

    // Repeat occurrence: bump the counter and append it after a delimiter
    // that cannot appear in the base ID.
    $counter = ++static::$seenIds[$base_id];
    return $base_id . '--' . $counter;
  }
  /**
   * Prepares a string for use as a valid HTML ID.
   *
   * Unlike self::getUniqueId(), this does not track previously returned IDs,
   * so calling it twice with the same input yields the same output. Only use
   * it when skipping the uniqueness guarantee is intentional.
   *
   * @param string $id
   *   The ID to clean.
   *
   * @return string
   *   The cleaned ID.
   *
   * @see self::getUniqueId()
   */
  public static function getId($id) {
    // Lowercase first, then map separators to hyphens and drop closing
    // brackets.
    $search = [' ', '_', '[', ']'];
    $replace = ['-', '-', '-', ''];
    $clean = str_replace($search, $replace, mb_strtolower($id));

    // http://www.w3.org/TR/html4/types.html#type-name allows letters, digits,
    // hyphens, underscores, colons and periods in HTML IDs. Colons and periods
    // are not valid in CSS identifiers
    // (http://www.w3.org/TR/CSS21/syndata.html#characters), so they are
    // stripped together with every other disallowed character.
    $clean = preg_replace('/[^A-Za-z0-9\-_]/', '', $clean);

    // Collapse runs of hyphens into a single hyphen.
    return preg_replace('/\-+/', '-', $clean);
  }
  /**
   * Resets the list of seen IDs.
   *
   * Clears static::$seenIds only; the next getUniqueId() call re-creates it
   * from static::$seenIdsInit.
   */
  public static function resetSeenIds() {
    static::$seenIds = NULL;
  }
  /**
   * Normalizes an HTML snippet.
   *
   * The snippet is parsed into a \DOMDocument with load() and immediately
   * written back out with serialize(). This is essentially
   * \DOMDocument::normalizeDocument(), but operating on an HTML string instead
   * of a \DOMDocument.
   *
   * @param string $html
   *   The HTML string to normalize.
   *
   * @return string
   *   The normalized HTML string.
   */
  public static function normalize($html) {
    return static::serialize(static::load($html));
  }
  /**
   * Parses an HTML snippet and returns it as a DOM object.
   *
   * The snippet is placed inside the <body> of a minimal XHTML 1.0 Strict
   * wrapper document declaring a UTF-8 charset, and that full document is
   * loaded into a \DOMDocument.
   *
   * Use \Drupal\Component\Utility\Html::serialize() to serialize this
   * \DOMDocument back to a string.
   *
   * @param string $html
   *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
   *   import.
   *
   * @return \DOMDocument
   *   A \DOMDocument that represents the loaded (X)HTML snippet.
   */
  public static function load($html) {
    // PHP's \DOMDocument serialization adds extra whitespace when the markup
    // of the wrapping document contains newlines, so the wrapper is assembled
    // as one line. Newlines inside $html itself are left untouched.
    $wrapper = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
      . '<html xmlns="http://www.w3.org/1999/xhtml">'
      . '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>'
      . '<body>!html</body>'
      . '</html>';
    $markup = str_replace('!html', $html, $wrapper);

    $dom = new \DOMDocument();
    // Ignore warnings during HTML soup loading.
    @$dom->loadHTML($markup);

    return $dom;
  }
  /**
   * Converts the body of a \DOMDocument back to an HTML snippet.
   *
   * Before serializing, CDATA sections inside <script> and <style> elements
   * are wrapped in comment markers via escapeCdataElement(). Each child of the
   * first <body> element is then written out with \DOMDocument::saveXML() and
   * the pieces are concatenated.
   *
   * @param \DOMDocument $document
   *   A \DOMDocument object to serialize, only the tags below the first <body>
   *   node will be converted.
   *
   * @return string
   *   A valid (X)HTML snippet, as a string. Empty when the document has no
   *   <body> element.
   */
  public static function serialize(\DOMDocument $document) {
    $body = $document->getElementsByTagName('body')->item(0);
    if ($body === NULL) {
      return '';
    }

    // Scripts use the default '//' line-comment markers.
    foreach ($body->getElementsByTagName('script') as $script) {
      static::escapeCdataElement($script);
    }
    // Styles need block-comment markers.
    foreach ($body->getElementsByTagName('style') as $style) {
      static::escapeCdataElement($style, '/*', '*/');
    }

    $pieces = [];
    foreach ($body->childNodes as $child) {
      $pieces[] = $document->saveXML($child);
    }
    return implode('', $pieces);
  }
  /**
   * Adds comments around a <!CDATA section in a \DOMNode.
   *
   * \DOMDocument::loadHTML() in \Drupal\Component\Utility\Html::load() makes
   * CDATA sections from the contents of inline script and style tags. This can
   * cause HTML4 browsers to throw exceptions.
   *
   * This function attempts to solve the problem by creating a
   * \DOMDocumentFragment to comment the CDATA tag. For each CDATA child that
   * is visited, a replacement fragment is appended to $node and the original
   * CDATA child is removed.
   *
   * @param \DOMNode $node
   *   The element potentially containing a CDATA node.
   * @param string $comment_start
   *   (optional) A string to use as a comment start marker to escape the CDATA
   *   declaration. Defaults to '//'.
   * @param string $comment_end
   *   (optional) A string to use as a comment end marker to escape the CDATA
   *   declaration. Defaults to an empty string.
   */
  public static function escapeCdataElement(\DOMNode $node, $comment_start = '//', $comment_end = '') {
    // NOTE(review): $node->childNodes is modified (append + remove) while it
    // is being iterated; how iteration continues after the removal depends on
    // the DOM extension - confirm before restructuring this loop.
    foreach ($node->childNodes as $child_node) {
      if ($child_node instanceof \DOMCdataSection) {
        // The prefix opens the CDATA section and the suffix closes it, with
        // the CDATA markers themselves preceded by the comment start marker.
        $embed_prefix = "\n<!--{$comment_start}--><![CDATA[{$comment_start} ><!--{$comment_end}\n";
        $embed_suffix = "\n{$comment_start}--><!]]>{$comment_end}\n";
        // Prevent invalid cdata escaping as this would throw a DOM error.
        // This is the same behavior as found in libxml2.
        // Related W3C standard: http://www.w3.org/TR/REC-xml/#dt-cdsection
        // Fix explanation: http://wikipedia.org/wiki/CDATA#Nesting
        $data = str_replace(']]>', ']]]]><![CDATA[>', $child_node->data);
        // Parse the wrapped data as XML into a fragment owned by the same
        // document, append it to the element, then drop the original CDATA
        // child.
        $fragment = $node->ownerDocument->createDocumentFragment();
        $fragment->appendXML($embed_prefix . $data . $embed_suffix);
        $node->appendChild($fragment);
        $node->removeChild($child_node);
      }
    }
  }
  /**
   * Decodes all HTML entities including numerical ones to regular UTF-8 bytes.
   *
   * Decoding happens exactly once, so double-escaped entities lose only one
   * level of escaping ("&amp;lt;" becomes "&lt;", not "<"). Be careful when
   * using this function, as it will revert previous sanitization efforts
   * (&lt;script&gt; will become <script>).
   *
   * This method is not the opposite of Html::escape(). For example, this method
   * will convert "&eacute;" to "é", whereas Html::escape() will not convert "é"
   * to "&eacute;".
   *
   * @param string $text
   *   The text to decode entities in.
   *
   * @return string
   *   The input $text, with all HTML entities decoded once.
   *
   * @see html_entity_decode()
   * @see \Drupal\Component\Utility\Html::escape()
   */
  public static function decodeEntities($text) {
    // ENT_QUOTES makes both double- and single-quote entities decode.
    $flags = ENT_QUOTES;
    $encoding = 'UTF-8';
    return html_entity_decode($text, $flags, $encoding);
  }
  /**
   * Escapes text by converting special characters to HTML entities.
   *
   * This method escapes HTML for sanitization purposes by replacing the
   * following special characters with their HTML entity equivalents:
   * - & (ampersand) becomes &amp;
   * - " (double quote) becomes &quot;
   * - ' (single quote) becomes &#039;
   * - < (less than) becomes &lt;
   * - > (greater than) becomes &gt;
   * Text that is already escaped is escaped again (for example, "&lt;" becomes
   * "&amp;lt;"), and invalid UTF-8 sequences are replaced with the Unicode
   * replacement character (U+FFFD).
   *
   * This method is not the opposite of Html::decodeEntities(). For example,
   * this method will not encode "é" to "&eacute;", whereas
   * Html::decodeEntities() will convert all HTML entities to UTF-8 bytes,
   * including "&eacute;" and "&lt;" to "é" and "<".
   *
   * When constructing @link theme_render render arrays @endlink passing the
   * output of Html::escape() to '#markup' is not recommended. Use the
   * '#plain_text' key instead and the renderer will autoescape the text.
   *
   * @param string $text
   *   The input text.
   *
   * @return string
   *   The text with all HTML special characters converted.
   *
   * @see htmlspecialchars()
   * @see \Drupal\Component\Utility\Html::decodeEntities()
   *
   * @ingroup sanitization
   */
  public static function escape($text) {
    // ENT_QUOTES escapes both quote styles; ENT_SUBSTITUTE swaps invalid byte
    // sequences for U+FFFD instead of returning an empty string.
    $flags = ENT_QUOTES | ENT_SUBSTITUTE;
    $encoding = 'UTF-8';
    return htmlspecialchars($text, $flags, $encoding);
  }
  /**
   * Converts all root-relative URLs to absolute URLs.
   *
   * Does not change any existing protocol-relative or absolute URLs. Does not
   * change other relative URLs because they would result in different absolute
   * URLs depending on the current path. For example: when the same content
   * containing such a relative URL (for example 'image.png'), is served from
   * its canonical URL (for example 'http://example.com/some-article') or from
   * a listing or feed (for example 'http://example.com/all-articles') their
   * "current path" differs, resulting in different absolute URLs:
   * 'http://example.com/some-article/image.png' versus
   * 'http://example.com/all-articles/image.png'. Only one can be correct.
   * Therefore relative URLs that are not root-relative cannot be safely
   * transformed and should generally be avoided.
   *
   * Necessary for HTML that is served outside of a website, for example, RSS
   * and e-mail.
   *
   * Every attribute listed in static::$uriAttributes is handled as a single
   * URL. The 'srcset' attribute is additionally split on commas so that each
   * image candidate string is converted on its own.
   *
   * @param string $html
   *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
   *   import.
   * @param string $scheme_and_host
   *   The root URL, which has a URI scheme, host and optional port.
   *
   * @return string
   *   The updated (X)HTML snippet.
   */
  public static function transformRootRelativeUrlsToAbsolute($html, $scheme_and_host) {
    assert(empty(array_diff(array_keys(parse_url($scheme_and_host)), ["scheme", "host", "port"])), '$scheme_and_host contains scheme, host and port at most.');
    assert(isset(parse_url($scheme_and_host)["scheme"]), '$scheme_and_host is absolute and hence has a scheme.');
    assert(isset(parse_url($scheme_and_host)["host"]), '$scheme_and_host is absolute and hence has a host.');

    $html_dom = Html::load($html);
    $xpath = new \DOMXpath($html_dom);

    // Update all root-relative URLs to absolute URLs in the given HTML. A
    // value is root-relative when it starts with exactly one slash.
    foreach (static::$uriAttributes as $attr) {
      foreach ($xpath->query("//*[starts-with(@$attr, '/') and not(starts-with(@$attr, '//'))]") as $node) {
        $node->setAttribute($attr, $scheme_and_host . $node->getAttribute($attr));
      }
    }

    // A srcset holds a comma-separated list of image candidate strings, so
    // the pass above only reaches its first URL. Convert every candidate here.
    // This runs once, after the attribute loop, rather than once per attribute.
    // @see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
    // @see https://html.spec.whatwg.org/multipage/embedded-content.html#image-candidate-string
    foreach ($xpath->query("//*[@srcset]") as $node) {
      $image_candidate_strings = array_map('trim', explode(',', $node->getAttribute('srcset')));
      foreach ($image_candidate_strings as $key => $image_candidate_string) {
        // strncmp() is safe on empty and one-character candidates (empty
        // srcset, trailing comma, lone "/"), where reading string offsets
        // 0 and 1 directly would raise an uninitialized-offset error.
        $is_root_relative = strncmp($image_candidate_string, '/', 1) === 0
          && strncmp($image_candidate_string, '//', 2) !== 0;
        if ($is_root_relative) {
          $image_candidate_strings[$key] = $scheme_and_host . $image_candidate_string;
        }
      }
      $node->setAttribute('srcset', implode(', ', $image_candidate_strings));
    }

    return Html::serialize($html_dom);
  }
  450. }