PageRenderTime 51ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/Jyxo/Html.php

http://github.com/jyxo/php
PHP | 796 lines | 450 code | 89 blank | 257 comment | 34 complexity | f5715f95f8b37405d936547cbf36a2ea MD5 | raw file
  1. <?php declare(strict_types = 1);
  2. /**
  3. * Jyxo PHP Library
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file license.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * https://github.com/jyxo/php/blob/master/license.txt
  11. */
  12. namespace Jyxo;
  13. use LogicException;
  14. use function array_pop;
  15. use function array_push;
  16. use function count;
  17. use function end;
  18. use function html_entity_decode;
  19. use function htmlspecialchars;
  20. use function mb_strtoupper;
  21. use function mb_substr;
  22. use function nl2br;
  23. use function preg_match;
  24. use function preg_match_all;
  25. use function preg_replace;
  26. use function preg_replace_callback;
  27. use function sprintf;
  28. use function str_ireplace;
  29. use function str_repeat;
  30. use function str_replace;
  31. use function strip_tags;
  32. use function stripos;
  33. use function strlen;
  34. use function strtolower;
  35. use function strtr;
  36. use function substr;
  37. use function substr_count;
  38. use function tidy_repair_string;
  39. use function trim;
  40. use const ENT_QUOTES;
  41. /**
  42. * Functions for HTML processing.
  43. *
  44. * @copyright Copyright (c) 2005-2011 Jyxo, s.r.o.
  45. * @license https://github.com/jyxo/php/blob/master/license.txt
  46. * @author Jaroslav Hanslík
  47. */
  48. class Html
  49. {
  50. /**
  51. * Constructor preventing from creating instances of a static class.
  52. *
  53. * @throws LogicException If trying to create an instance
  54. */
  55. final public function __construct()
  56. {
  57. throw new LogicException(sprintf('Cannot create an instance of a static class %s.', static::class));
  58. }
  59. /**
  60. * Tests if the given text contains at least one HTML tag.
  61. * It is just an estimation.
  62. *
  63. * @param string $text Input text to be tested
  64. * @return bool
  65. */
  66. public static function is(string $text): bool
  67. {
  68. return (bool) preg_match('~<[a-z][a-z0-9]*(\\s[^<]*)?>~i', $text);
  69. }
    /**
     * Fixes an invalid HTML source, unifies quotes and removes unnecessary whitespace.
     * Requires the Tidy PHP extension (tidy_repair_string() does the actual repair).
     *
     * After Tidy has run, several regular-expression passes post-process the output:
     * MS Word namespace declarations are dropped, line breaks are stripped (those inside
     * <pre> are turned into <br /> first), single-quoted attribute values are re-quoted
     * with double quotes and whitespace at the edges of attribute values is trimmed.
     *
     * NOTE(review): the result of tidy_repair_string() is passed straight to preg_replace();
     * confirm whether it can return a non-string on failure with this configuration.
     *
     * @param string $html Input HTML source
     * @return string Repaired HTML on a single line
     */
    public static function repair(string $html): string
    {
        // Tidy configuration, kept in a static so the array is built only once
        static $config = [
            // Uses LF line endings
            'newline' => 'LF',
            // Removes indent
            'indent' => false,
            // Output will be in XHTML format
            'output-xhtml' => true,
            // No BOM
            'output-bom' => false,
            // Automatic doctype
            'doctype' => 'auto',
            // 'clean' => true, // Removes presentation tags (inline styles would be moved into <style> elements)
            // Cleans MS HTML mess
            'bare' => true,
            // No wrapping
            'wrap' => 0,
            // No <![ ... ]> wrapping
            'wrap-sections' => false,
            // 'quote-marks' => true, // Replaces quotes with appropriate entities (causes problems with later regular expression processing)
            // 'logical-emphasis' => true, // Replaces all <i> and <b> tags with <em> and <strong> (styles cannot be parsed after)
            // Text inside <body> encapsulates with a <p> tag
            'enclose-text' => true,
            // Disables <div> merging
            'merge-divs' => false,
            // Disables <span> merging
            'merge-spans' => false,
            // 'hide-comments' => true, // Removes comments (it would remove conditional comments used when inserting Flash)
            // Makes output even on error
            'force-output' => true,
            // Don't show any errors
            'show-errors' => 0,
            // Don't show any warnings
            'show-warnings' => false,
            // Makes an ordinary text from CDATA blocks
            'escape-cdata' => true,
            // Preserves correctly formatted entities
            'preserve-entities' => true,
            // 'drop-proprietary-attributes' => true, // Removes proprietary attributes (it would remove e.g. the background attribute)
            // 'drop-font-tags' => true // Removes <FONT> and <CENTER> tags
        ];
        $html = tidy_repair_string($html, $config, 'utf8');
        // Removes namespace <?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /? > generated by MS Word
        $html = preg_replace('~<\?xml:namespace[^>]*>~i', '', $html);
        // Removes unnecessary line breaks and keeps them inside <pre> elements
        // Tidy adds one extra line break right after <pre> and right before </pre>
        $html = preg_replace("~(<pre[^>]*>)\n~", '\\1', $html);
        $html = preg_replace("~\n</pre>~", '</pre>', $html);
        // Inside <pre> the remaining line breaks become <br /> so they survive the stripping below;
        // escaped double quotes (\") in the content are unescaped as well
        $html = preg_replace_callback('~(<pre[^>]*>)(.+?)(</pre>)~s', static function ($matches) {
            return $matches[1] . strtr(nl2br($matches[2]), ['\"' => '"']) . $matches[3];
        }, $html);
        // Strip line breaks
        $html = strtr($html, ["\r" => '', "\n" => '']);
        // Replace single quotes with double quotes (for easier processing later)
        $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+=)\'([^\']*)\'~i', '\\1"\\2"', $html);
        // Remove unnecessary spaces inside elements (for easier processing later):
        // first leading whitespace of an attribute value, then trailing whitespace
        $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+=")\\s+([^"]*")~i', '\\1\\2', $html);
        $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+="[^"]*)\s+(")~i', '\\1\\2', $html);
        return $html;
    }
  139. /**
  140. * Removes given tags from the HTML source.
  141. * If no tags are given, the default set is used.
  142. * Expects valid HTML code.
  143. *
  144. * @param string $html HTML source code
  145. * @param array $tags Tags to be removed
  146. * @return string
  147. */
  148. public static function removeTags(string $html, array $tags = []): string
  149. {
  150. // Default set of tags
  151. static $default = [
  152. 'frameset',
  153. 'frame',
  154. 'noframes',
  155. 'iframe',
  156. 'script',
  157. 'noscript',
  158. 'style',
  159. 'link',
  160. 'object',
  161. 'embed',
  162. 'form',
  163. 'input',
  164. 'select',
  165. 'textarea',
  166. 'button',
  167. ];
  168. // If no tags are set, the default set will be used
  169. if (empty($tags)) {
  170. $tags = $default;
  171. }
  172. // Remove given tags
  173. foreach ($tags as $tag) {
  174. switch ($tag) {
  175. // Embed
  176. case 'embed':
  177. // Second variant is because of Tidy that processes <embed> this way
  178. $pattern = ['~\s*<embed[^>]*>.*?</embed>~is', '~\s*<embed[^>]*>~is'];
  179. break;
  180. // Self closing tags
  181. case 'link':
  182. case 'meta':
  183. case 'br':
  184. case 'hr':
  185. case 'img':
  186. case 'input':
  187. $pattern = ['~\s*<' . $tag . '[^>]*>~is'];
  188. break;
  189. // Pair tags
  190. default:
  191. $pattern = ['~\s*<' . $tag . '(?:\s+[^>]*)?>.*?</' . $tag . '>~is'];
  192. break;
  193. }
  194. $html = preg_replace($pattern, '', $html);
  195. }
  196. return $html;
  197. }
  198. /**
  199. * Removes tags of the same type nested into each other from the HTML source.
  200. * Expects valid HTML source
  201. *
  202. * @param string $html HTML source code
  203. * @param string $tag Tags to be processed
  204. * @return string
  205. */
  206. public static function removeInnerTags(string $html, string $tag): string
  207. {
  208. if (preg_match_all('~(?:<' . $tag . '>)|(?:</' . $tag . '>)|(?:<[^>]+>)|(?:[^<]+)~i', $html, $matches)) {
  209. $html = '';
  210. $level = 0;
  211. foreach ($matches[0] as $htmlPart) {
  212. if (stripos($htmlPart, '<' . $tag) === 0) {
  213. $level++;
  214. if ($level === 1) {
  215. $html .= $htmlPart;
  216. }
  217. } elseif (stripos($htmlPart, '</' . $tag) === 0) {
  218. if ($level === 1) {
  219. $html .= $htmlPart;
  220. }
  221. $level--;
  222. } else {
  223. $html .= $htmlPart;
  224. }
  225. }
  226. }
  227. return $html;
  228. }
  229. /**
  230. * Removes given attributes from the HTML source.
  231. * If no attributes are given, the default set will be used.
  232. * Expects valid HTML source.
  233. *
  234. * @param string $html HTML source code
  235. * @param array $attributes Attributes to be removed
  236. * @return string
  237. */
  238. public static function removeAttributes(string $html, array $attributes = []): string
  239. {
  240. // Default set of attributes
  241. static $default = ['id', 'class'];
  242. // If no attributes are given, the default set will be used
  243. if (empty($attributes)) {
  244. $attributes = $default;
  245. }
  246. // Remove given attributes
  247. foreach ($attributes as $attribute) {
  248. $html = preg_replace('~(<[a-z][a-z0-9]*[^>]*?)\\s+' . $attribute . '="[^"]*"~is', '\\1', $html);
  249. }
  250. return $html;
  251. }
  252. /**
  253. * Removes all javascript events from the HTML source.
  254. * If it is necessary to remove only certain events, the removeAttributes() method can be used.
  255. * Expects valid HTML source.
  256. *
  257. * @param string $html HTML source code
  258. * @return string
  259. */
  260. public static function removeJavascriptEvents(string $html): string
  261. {
  262. // A tag can have multiple events, therefore it is necessary to process the source multiple times
  263. while (preg_match('~<[a-z][a-z0-9]*[^>]*?\\s+on[a-z]+="[^"]*"~is', $html)) {
  264. $html = preg_replace('~(<[a-z][a-z0-9]*[^>]*?)\\s+on[a-z]+="[^"]*"~is', '\\1', $html);
  265. }
  266. return $html;
  267. }
  268. /**
  269. * Removes foreign images from the HTML source.
  270. * Keeps <img> tags (only set the value about:blank into its src attribute), because removing the tag entirely could affect
  271. * the page layout.
  272. * Expects valid HTML source.
  273. *
  274. * @param string $html HTML source code
  275. * @return string
  276. */
  277. public static function removeRemoteImages(string $html): string
  278. {
  279. static $remoteImages = [
  280. '~(<img[^>]+src=")http(?:s)?://[^"]+(")~i',
  281. '~(<[a-z][a-z0-9]*[^>]+background=")http(?:s)?://[^"]+(")~i',
  282. '~(<[a-z][a-z0-9]*[^>]+style="[^"]*background\\s*[:])([\-a-z0-9#%\\s]*)url\([^)]+\)(;)?~is',
  283. '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)background-image\\s*[:]([\-a-z0-9#%\\s]*)url\([^)]+\)(;)?~is',
  284. '~(<[a-z][a-z0-9]*[^>]+style="[^"]*list-style\\s*[:])([\-a-z0-9\\s]*)url\([^)]+\)(;)?~is',
  285. '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)list-style-image\\s*[:]([\-a-z0-9\\s]*)url\([^)]+\)(;)?~is',
  286. ];
  287. // We use value about:blank for the <img> tag's src attribute, because removing the tag entirely could affect the page layout
  288. static $remoteImagesReplacement = [
  289. '\\1about:blank\\2',
  290. '\\1\\2',
  291. '\\1\\2\\3',
  292. '\\1',
  293. '\\1\\2\\3',
  294. '\\1',
  295. ];
  296. return preg_replace($remoteImages, $remoteImagesReplacement, $html);
  297. }
  298. /**
  299. * Removes possibly dangerous attributes that could contain XSS code from the HTML source.
  300. *
  301. * @param string $html HTML source code
  302. * @return string
  303. */
  304. public static function removeDangerous(string $html): string
  305. {
  306. static $dangerous = [
  307. '~\\s+href="javascript[^"]*"~i',
  308. '~\\s+src="javascript[^"]*"~i',
  309. // See http://www.soom.cz/index.php?name=projects/testmail/main
  310. '~\\s+href="data:[^"]*"~i',
  311. '~\\s+src="data:[^"]*"~i',
  312. ];
  313. return preg_replace($dangerous, '', $html);
  314. }
  315. /**
  316. * Returns <body> contents from the given HTML source.
  317. * Expects valid HTML source.
  318. *
  319. * @param string $html HTML source code
  320. * @return string
  321. */
  322. public static function getBody(string $html): string
  323. {
  324. // If the source code contains <body>, return this element's contents
  325. if (preg_match('~<body([^>]*)>(.*?)</body>~is', $html, $matches)) {
  326. $body = trim($matches[2]);
  327. // Converts <body> inline styles to a newly created <div> element
  328. if (preg_match('~style="[^"]+"~i', $matches[1], $style)) {
  329. $body = '<div ' . $style[0] . '>' . $body . '</div>';
  330. }
  331. return $body;
  332. }
  333. // Return everything otherwise
  334. return $html;
  335. }
  336. /**
  337. * Converts text to HTML source code.
  338. *
  339. * @param string $text Input text
  340. * @param bool $convertLinks Convert urls and emails to links
  341. * @return string
  342. */
  343. public static function fromText(string $text, bool $convertLinks = true): string
  344. {
  345. // Trimming whitespace (except spaces)
  346. $text = trim($text, "\r\n");
  347. // Two empty lines max
  348. $text = preg_replace("~\n\\s+\n~", "\n\n", $text);
  349. // Special chars
  350. $html = htmlspecialchars($text, ENT_QUOTES, 'utf-8', false);
  351. // Two spaces mean an indent, convert to non-breaking spaces
  352. $html = str_replace(' ', '&nbsp;&nbsp;', $html);
  353. // Convert tabs to four non-breaking spaces
  354. $html = str_replace("\t", '&nbsp;&nbsp;&nbsp;&nbsp;', $html);
  355. // Paragraph
  356. $html = '<p>' . preg_replace("~\n\n[^\\n]?~", '</p><p>\\0', $html) . '</p>';
  357. $html = str_replace("\n", "<br />\n", $html);
  358. $html = str_ireplace('<p><br />', "<p>\n", $html);
  359. // Citation
  360. preg_match_all('~(?:(^(?:<p>)?\\s*&gt;(?:&gt;|\\s)*)(.*)$)|(?:.+)~im', $html, $matches);
  361. $html = '';
  362. $offset = 0;
  363. for ($i = 0; $i < count($matches[0]); $i++) {
  364. $currentOffset = substr_count($matches[1][$i], '&gt;');
  365. if ($currentOffset > 0) {
  366. if ($currentOffset > $offset) {
  367. $html .= str_repeat('<blockquote type="cite">', $currentOffset - $offset) . '<p>';
  368. $offset = $currentOffset;
  369. } elseif ($currentOffset < $offset) {
  370. $html .= '</p>' . str_repeat('</blockquote>', $offset - $currentOffset) . '<p>';
  371. $offset = $currentOffset;
  372. }
  373. $html .= $matches[2][$i];
  374. } else {
  375. if ($offset > 0) {
  376. $html .= '</p>' . str_repeat('</blockquote>', $offset) . '<p>';
  377. $offset = 0;
  378. }
  379. $html .= $matches[0][$i];
  380. }
  381. }
  382. if ($offset > 0) {
  383. $html .= '</p>' . str_repeat('</blockquote>', $offset);
  384. }
  385. // Removes empty lines that were created during previous processing
  386. $html = preg_replace('~(?:<br />)+</p></blockquote>~i', '</p></blockquote>', $html);
  387. $html = str_ireplace('<p><br /></p>', '', $html);
  388. $html = str_ireplace('<p><br />', '<p>', $html);
  389. // Emails and urls
  390. if ($convertLinks) {
  391. $html = self::linkFromText($html);
  392. }
  393. return $html;
  394. }
  395. /**
  396. * Converts text to a link to an url or email.
  397. *
  398. * @param string $text Input text
  399. * @return string
  400. */
  401. public static function linkFromText(string $text): string
  402. {
  403. $patternGenericTld = '(?:tld|aero|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|asia|post|geo)';
  404. $patternTld = '(?-i:' . $patternGenericTld . '|[a-z]{2})';
  405. $patternDomain = '(?:(?:[a-z]|[a-z0-9](?:[\-a-z0-9]{0,61}[a-z0-9]))[.])*(?:[a-z0-9](?:[\-a-z0-9]{0,61}[a-z0-9])[.]' . $patternTld . ')';
  406. $pattern8bit = '(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])';
  407. $patternIPv4 = '(?:' . $pattern8bit . '(?:[.]' . $pattern8bit . '){3})';
  408. // a:b:c:d:e:f:g:h
  409. $patternIpV6Variant8Hex = '(?:(?:[0-9a-f]{1,4}:){7}[0-9a-f]{1,4})';
  410. // Compressed a::b
  411. $patternIpV6VariantCompressedHex = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?))';
  412. // IPv4 mapped to IPv6 a:b:c:d:e:f:w.x.y.z
  413. $patternIpV6VariantHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}:){6})' . $patternIPv4 . ')';
  414. // Compressed IPv4 mapped to IPv6 a::b:w.x.y.z
  415. $patternIpV6VariantCompressedHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}:)*)' . $patternIPv4 . ')';
  416. $patternIpV6 = '(?:' . $patternIpV6Variant8Hex . '|' . $patternIpV6VariantCompressedHex . '|' . $patternIpV6VariantHex4Dec . '|' . $patternIpV6VariantCompressedHex4Dec . ')';
  417. // mailto:username
  418. $patternEmail = '(?:mailto:)?(?:[\-\\w!#\$%&\'*+/=?^`{|}\~]+(?:[.][\-\\w!#\$%&\'*+/=?^`{|}\~]+)*)';
  419. // @domain.tld
  420. $patternEmail .= '(?:@' . $patternDomain . ')';
  421. // protocol://user:password@
  422. $patternUrl = '(?:(?:http|ftp)s?://(?:[\\S]+(?:[:][\\S]*)?@)?)?';
  423. // domain.tld, IPv4 or IPv6
  424. $patternUrl .= '(?:' . $patternDomain . '|' . $patternIPv4 . '|' . $patternIpV6 . ')';
  425. // :port/path/file.extension
  426. $patternUrl .= '(?::[0-9]+)?(?:(?:/[-\\w\\pL\\pN\~.:!%]+)*(?:/|[.][a-z0-9]{2,4})?)?';
  427. // ?query#hash
  428. $patternUrl .= '(?:[?][\]\[\-\\w\\pL\\pN.,?!\~%#@&;:/\'\=+]*)?(?:#[\]\[\-\\w\\pL\\pN.,?!\~%@&;:/\'\=+]*)?';
  429. return preg_replace_callback(
  430. '~(^|[^\\pL\\pN])(?:(' . $patternEmail . ')|(' . $patternUrl . '))(?=$|\\W)~iu',
  431. static function ($matches) {
  432. // Url
  433. if (isset($matches[3])) {
  434. $url = $matches[3];
  435. // Remove special chars at the end
  436. if (preg_match('~(([.,:;?!>)\]}]|(&gt;))+)$~i', $url, $matches2)) {
  437. $punctuation = $matches2[1];
  438. // strlen is necessary because of &gt;
  439. $url = mb_substr($url, 0, -strlen($matches2[1]), 'utf-8');
  440. } else {
  441. $punctuation = '';
  442. }
  443. // Add missing http://
  444. $linkUrl = !preg_match('~^(http|ftp)s?://~i', $url) ? 'http://' . $url : $url;
  445. // Create a link
  446. return $matches[1] . '<a href="' . $linkUrl . '">' . $url . '</a>' . $punctuation;
  447. }
  448. // Emails
  449. if (!isset($matches[2])) {
  450. return;
  451. }
  452. $email = $matches[2];
  453. if (stripos($email, 'mailto:') !== false) {
  454. $email = substr($matches[2], 7);
  455. $protocol = 'mailto:';
  456. } else {
  457. $protocol = '';
  458. }
  459. return $matches[1] . '<a href="mailto:' . $email . '">' . $protocol . $email . '</a>';
  460. },
  461. $text
  462. );
  463. }
  464. /**
  465. * Converts HTML source code to plaintext.
  466. *
  467. * @param string $html HTML source code
  468. * @return string
  469. */
  470. public static function toText(string $html): string
  471. {
  472. $text = $html;
  473. // Remove styles a scripts
  474. $text = self::removeTags($text, ['style', 'script']);
  475. // Re-format lines
  476. // <pre>
  477. $text = preg_replace_callback('~<pre[^>]*>(.+?)</pre>~is', static function ($matches) {
  478. // Line breaks are converted to <br />, that are removed later
  479. return nl2br($matches[1]);
  480. }, $text);
  481. // \r, redundant line breaks, tabs and <br />
  482. $text = preg_replace(
  483. ["~\r~", "~[\n\t]+~", '~<br[^>]*>~i'],
  484. ['', ' ', "\n"],
  485. $text
  486. );
  487. // Processing of most tags and entities
  488. static $search = [
  489. // <h3> to <h6>
  490. '~<h[3-6][^>]*>(.+?)</h[3-6]>~is',
  491. // <div> and </div>
  492. '~(<div[^>]*>)|(</div>)~i',
  493. // <p> and </p>
  494. '~(<p(?:\s+[^>]+)?>)|(</p>)~i',
  495. // <table> and </table>
  496. '~(<table[^>]*>)|(</table>)~i',
  497. // </tr>
  498. '~</tr>*~i',
  499. // <td> and </td>
  500. '~<td[^>]*>(.+?)</td>~is',
  501. // '~(<code[^>]*>)|(</code>)~i', // <code> and </code>
  502. // Ellipsis
  503. '~(&hellip;)~i',
  504. // Quotes
  505. '~(&#8220;)|(&#8221;)~i',
  506. // Apostrophe
  507. '~(&apos;)~i',
  508. // Copyright
  509. '~(&copy;)|(&#169;)~i',
  510. // Trademark
  511. '~&trade;~i',
  512. // Registered trademark
  513. '~&reg;~i',
  514. // Dash and hyphen
  515. '~(&mdash;)|(&ndash;)~i',
  516. ];
  517. static $replace = [
  518. // <h3> to <h6>
  519. "\n\n\\1\n\n",
  520. // <div> and </div>
  521. "\n\n",
  522. // <p> and </p>
  523. "\n\n",
  524. // <table> and </table>
  525. "\n\n",
  526. // </tr>
  527. "\n",
  528. // <td> and </td>
  529. "\\1\t",
  530. // "\n\n", // <code> and </code>
  531. // Ellipsis
  532. '...',
  533. // Quotes
  534. '"',
  535. // Apostrophe
  536. '\'',
  537. // Copyright
  538. '(c)',
  539. // Trademark
  540. '(tm)',
  541. // Registered trademark
  542. '(R)',
  543. // Dash and hyphen
  544. '-',
  545. ];
  546. $text = preg_replace($search, $replace, $text);
  547. // <h1> and <h2>
  548. $text = preg_replace_callback('~<h[12][^>]*>(.+?)</h[12]>~is', static function ($matches) {
  549. return "\n\n\n" . mb_strtoupper($matches[1], 'utf-8') . "\n\n";
  550. }, $text);
  551. // <strong>
  552. $text = preg_replace_callback('~<strong[^>]*>(.+?)</strong>~is', static function ($matches) {
  553. return mb_strtoupper($matches[1], 'utf-8');
  554. }, $text);
  555. // <hr />
  556. $text = preg_replace_callback('~<hr[^>]*>~i', static function ($matches) {
  557. return "\n" . str_repeat('-', 50) . "\n";
  558. }, $text);
  559. // <th>
  560. $text = preg_replace_callback('~<th[^>]*>(.+?)</th>~is', static function ($matches) {
  561. return mb_strtoupper($matches[1], 'utf-8') . "\t";
  562. }, $text);
  563. // <a>
  564. $text = self::linkToText($text);
  565. // <ul> and <ol>
  566. $text = self::listToText($text);
  567. // Two empty lines at most
  568. $text = trim($text, "\n ");
  569. $text = preg_replace("~\n\\s+\n~", "\n\n", $text);
  570. // Process <blockquote> (empty lines are removed before <blockquote> processing on purpose)
  571. $text = self::blockquoteToText($text);
  572. // Remove all left tags
  573. $text = strip_tags($text);
  574. // Replacing [textlink] for <> (must be done after strip_tags)
  575. $text = preg_replace('~\[textlink\]\\s*~s', '<', $text);
  576. $text = preg_replace('~\\s*\[/textlink\]~s', '>', $text);
  577. // Replaces non-breaking spaces
  578. $text = preg_replace(['~&nbsp;&nbsp;&nbsp;&nbsp;~i', '~&nbsp;~i'], ["\t", ' '], $text);
  579. // Remove other entities (must not be performed before)
  580. // After previous processing some entities are upper case, that is why we have to use strtolower
  581. $text = preg_replace_callback('~(&#?[a-z0-9]+;)~i', static function ($matches) {
  582. return html_entity_decode(strtolower($matches[1]), ENT_QUOTES, 'utf-8');
  583. }, $text);
  584. // Two empty lines at most (performed second times on purpose)
  585. $text = trim($text, "\n ");
  586. $text = preg_replace("~\n\\s+\n~", "\n\n", $text);
  587. // Because of <blockquote> converting
  588. $text = preg_replace("~(\n>\\s*)+\n~", "\n>\n", $text);
  589. // One space at most
  590. $text = preg_replace("~(\n|\t)( )+~", '\1', $text);
  591. $text = preg_replace('~( ){2,}~', ' ', $text);
  592. // No space at line ends
  593. $text = preg_replace("~[ \t]+\n~", "\n", $text);
  594. return $text;
  595. }
  596. /**
  597. * Converts HTML links into plaintext.
  598. *
  599. * @param string $text Text with HTML fragments
  600. * @return string
  601. */
  602. private static function linkToText(string $text): string
  603. {
  604. return preg_replace_callback('~(<a\\s+[^>]*>)(.+?)</a>~is', static function ($matches) {
  605. $url = preg_match('~\\shref="([^"]+)"~i', $matches[1], $submatches) ? trim($submatches[1]) : '';
  606. $content = $matches[2];
  607. $clearContent = trim(strip_tags($content));
  608. // Some urls have no real meaning
  609. if (empty($url) || ($url[0] === '#') || (substr($url, 0, 2) === '/?')) {
  610. return $content;
  611. }
  612. // Invalid url gets ignored
  613. if (!Input\Validator\IsUrl::validate($url)) {
  614. return $content;
  615. }
  616. // If the link text and target are the same, use only one of them
  617. return $url === $clearContent ? '[textlink]' . $content . '[/textlink]' : $content . ' [textlink]' . $url . '[/textlink]';
  618. }, $text);
  619. }
  620. /**
  621. * Converts HTML lists to plaintext.
  622. *
  623. * @param string $text Text with HTML fragments
  624. * @return string
  625. */
  626. private static function listToText(string $text): string
  627. {
  628. static $symbols = ['#', '*', 'o', '+'];
  629. preg_match_all('~(?:<[a-z][a-z0-9]*[^>]*(?: /)?>)|(?:</[a-z][a-z0-9]*>)|(?:<![^>]+>)|(?:[^<]+)~i', $text, $matches);
  630. $text = '';
  631. $ulLevel = 0;
  632. $olLevel = 0;
  633. $olLiCount = [];
  634. $path = [];
  635. foreach ($matches[0] as $textPart) {
  636. if (stripos($textPart, '<ol') === 0) {
  637. array_push($path, 'ol');
  638. $olLevel++;
  639. $olLiCount[$olLevel] = 1;
  640. $textPart = "\n\n";
  641. } elseif (strtolower($textPart) === '</ol>') {
  642. array_pop($path);
  643. $olLevel--;
  644. $textPart = "\n\n";
  645. } elseif (stripos($textPart, '<ul') === 0) {
  646. array_push($path, 'ul');
  647. $ulLevel++;
  648. $textPart = "\n\n";
  649. } elseif (strtolower($textPart) === '</ul>') {
  650. array_pop($path);
  651. $ulLevel--;
  652. $textPart = "\n\n";
  653. } elseif (stripos($textPart, '<li') === 0) {
  654. $textPart = str_repeat("\t", $olLevel + $ulLevel);
  655. if (end($path) === 'ul') {
  656. $textPart .= $symbols[$ulLevel % 4] . ' ';
  657. } elseif (end($path) === 'ol') {
  658. $textPart .= $olLiCount[$olLevel] . '. ';
  659. $olLiCount[$olLevel]++;
  660. }
  661. } elseif (strtolower($textPart) === '</li>') {
  662. $textPart = "\n";
  663. }
  664. $text .= $textPart;
  665. }
  666. return $text;
  667. }
  668. /**
  669. * Converts citations into plaintext.
  670. *
  671. * @param string $text Text with HTML fragments
  672. * @return string
  673. */
  674. private static function blockquoteToText(string $text): string
  675. {
  676. if (preg_match_all('~(?:<blockquote[^>]*>\\s*)|(?:\\s*</blockquote>)|(?:.+?(?=</?blockquote)|(?:.+))~is', $text, $matches) > 0) {
  677. $text = '';
  678. $offset = 0;
  679. foreach ($matches[0] as $textPart) {
  680. $currentOffset = substr_count(strtolower($textPart), '<blockquote');
  681. if ($currentOffset > 0) {
  682. $offset += $currentOffset;
  683. // Adds a line to the beginning
  684. $text .= ($offset === 1 ? "\n" : '');
  685. continue;
  686. }
  687. $currentOffset = substr_count(strtolower($textPart), '</blockquote>');
  688. if ($currentOffset > 0) {
  689. $offset -= $currentOffset;
  690. $text .= '';
  691. continue;
  692. }
  693. if ($offset > 0) {
  694. // Opening tag
  695. $text .= "\n" . str_repeat('>', $offset) . ' '
  696. // Beginning of all lines
  697. . str_replace("\n", "\n" . str_repeat('>', $offset) . ' ', trim($textPart))
  698. // Closing tag
  699. . "\n" . str_repeat('>', $offset);
  700. continue;
  701. }
  702. $text .= $textPart;
  703. }
  704. }
  705. return $text;
  706. }
  707. }