/Jyxo/Html.php
PHP | 796 lines | 450 code | 89 blank | 257 comment | 34 complexity | f5715f95f8b37405d936547cbf36a2ea MD5 | raw file
- <?php declare(strict_types = 1);
- /**
- * Jyxo PHP Library
- *
- * LICENSE
- *
- * This source file is subject to the new BSD license that is bundled
- * with this package in the file license.txt.
- * It is also available through the world-wide-web at this URL:
- * https://github.com/jyxo/php/blob/master/license.txt
- */
- namespace Jyxo;
- use LogicException;
- use function array_pop;
- use function array_push;
- use function count;
- use function end;
- use function html_entity_decode;
- use function htmlspecialchars;
- use function mb_strtoupper;
- use function mb_substr;
- use function nl2br;
- use function preg_match;
- use function preg_match_all;
- use function preg_replace;
- use function preg_replace_callback;
- use function sprintf;
- use function str_ireplace;
- use function str_repeat;
- use function str_replace;
- use function strip_tags;
- use function stripos;
- use function strlen;
- use function strtolower;
- use function strtr;
- use function substr;
- use function substr_count;
- use function tidy_repair_string;
- use function trim;
- use const ENT_QUOTES;
- /**
- * Functions for HTML processing.
- *
- * @copyright Copyright (c) 2005-2011 Jyxo, s.r.o.
- * @license https://github.com/jyxo/php/blob/master/license.txt
- * @author Jaroslav Hanslík
- */
- class Html
- {
/**
 * Blocks instantiation: the class only groups static helper methods.
 *
 * @throws LogicException Always, whenever an instance is being created
 */
final public function __construct()
{
    $message = sprintf('Cannot create an instance of a static class %s.', static::class);

    throw new LogicException($message);
}
/**
 * Heuristically detects HTML markup in the given text.
 * Reports true as soon as anything shaped like an opening tag is found, so the result
 * is an estimate only.
 *
 * @param string $text Input text to be tested
 * @return bool
 */
public static function is(string $text): bool
{
    // A letter-led tag name, optionally followed by whitespace-separated attributes
    $openingTagPattern = '~<[a-z][a-z0-9]*(\\s[^<]*)?>~i';

    return preg_match($openingTagPattern, $text) === 1;
}
/**
 * Repairs an invalid HTML source and normalizes it for later regex-based processing.
 * The source is run through Tidy (the Tidy PHP extension is required); afterwards line breaks
 * are stripped (inside <pre> elements they are turned into <br /> first), single-quoted
 * attribute values become double-quoted and whitespace at the edges of attribute values is removed.
 *
 * @param string $html Input HTML source
 * @return string
 */
public static function repair(string $html): string
{
    // Options deliberately left disabled:
    // - clean: would move inline styles into <style> elements
    // - quote-marks: entity-encoded quotes break the regular expressions applied later
    // - logical-emphasis: <i>/<b> would become <em>/<strong>, styles could not be parsed afterwards
    // - hide-comments: would drop conditional comments used when inserting Flash
    // - drop-proprietary-attributes: would drop e.g. the background attribute
    // - drop-font-tags: would drop <font> and <center> tags
    static $tidyOptions = [
        // LF line endings
        'newline' => 'LF',
        // No indentation
        'indent' => false,
        // XHTML output
        'output-xhtml' => true,
        // No BOM
        'output-bom' => false,
        // Doctype chosen automatically
        'doctype' => 'auto',
        // Clean up MS HTML mess
        'bare' => true,
        // No line wrapping
        'wrap' => 0,
        // No wrapping of <![ ... ]> sections
        'wrap-sections' => false,
        // Text placed directly in <body> gets wrapped into <p>
        'enclose-text' => true,
        // Keep nested <div> elements as they are
        'merge-divs' => false,
        // Keep nested <span> elements as they are
        'merge-spans' => false,
        // Produce output even when errors were found
        'force-output' => true,
        // Report no errors
        'show-errors' => 0,
        // Report no warnings
        'show-warnings' => false,
        // CDATA blocks become ordinary text
        'escape-cdata' => true,
        // Well-formed entities stay untouched
        'preserve-entities' => true,
    ];

    $repaired = tidy_repair_string($html, $tidyOptions, 'utf8');

    // 1) Drop the xml:namespace processing instruction generated by MS Word.
    // 2) + 3) Tidy adds an extra line break right inside <pre> and before </pre>; remove both.
    $repaired = preg_replace(
        ['~<\?xml:namespace[^>]*>~i', "~(<pre[^>]*>)\n~", "~\n</pre>~"],
        ['', '\\1', '</pre>'],
        $repaired
    );

    // Line breaks inside <pre> must survive the stripping below, so they become <br />
    $repaired = preg_replace_callback(
        '~(<pre[^>]*>)(.+?)(</pre>)~s',
        static function (array $matches): string {
            $content = str_replace('\"', '"', nl2br($matches[2]));

            return $matches[1] . $content . $matches[3];
        },
        $repaired
    );

    // All remaining line breaks are unnecessary
    $repaired = str_replace(["\r", "\n"], '', $repaired);

    // 1) Single-quoted attribute values become double-quoted.
    // 2) + 3) Whitespace at the beginning and at the end of attribute values is removed.
    return preg_replace(
        [
            '~(<[a-z][a-z0-9]*[^>]+[a-z]+=)\'([^\']*)\'~i',
            '~(<[a-z][a-z0-9]*[^>]+[a-z]+=")\\s+([^"]*")~i',
            '~(<[a-z][a-z0-9]*[^>]+[a-z]+="[^"]*)\s+(")~i',
        ],
        ['\\1"\\2"', '\\1\\2', '\\1\\2'],
        $repaired
    );
}
/**
 * Strips the given elements (including their contents) from the HTML source.
 * Falls back to a built-in list of frame, script, style, embedding and form elements
 * when no tags are passed. Expects valid HTML code.
 *
 * @param string $html HTML source code
 * @param array $tags Tags to be removed
 * @return string
 */
public static function removeTags(string $html, array $tags = []): string
{
    // Used when the caller passes no tags
    static $default = [
        'frameset',
        'frame',
        'noframes',
        'iframe',
        'script',
        'noscript',
        'style',
        'link',
        'object',
        'embed',
        'form',
        'input',
        'select',
        'textarea',
        'button',
    ];
    // Elements that never have a closing tag
    static $selfClosing = ['link', 'meta', 'br', 'hr', 'img', 'input'];

    $tagsToRemove = $tags === [] ? $default : $tags;

    foreach ($tagsToRemove as $tag) {
        // Comparisons are intentionally loose, the same way a switch statement compares
        if ($tag == 'embed') {
            // The second variant is needed because Tidy may leave <embed> without a closing tag
            $patterns = ['~\s*<embed[^>]*>.*?</embed>~is', '~\s*<embed[^>]*>~is'];
        } elseif (in_array($tag, $selfClosing)) {
            $patterns = ['~\s*<' . $tag . '[^>]*>~is'];
        } else {
            // Pair tag: remove everything up to the nearest closing tag
            $patterns = ['~\s*<' . $tag . '(?:\s+[^>]*)?>.*?</' . $tag . '>~is'];
        }

        $html = preg_replace($patterns, '', $html);
    }

    return $html;
}
/**
 * Removes tags of the given type that are nested inside a tag of the same type.
 * Only the outermost opening and closing tag are kept; the contents of the nested tags stay.
 * Expects valid HTML source.
 *
 * The tag name is compared exactly (case-insensitively), so processing "b" does not touch
 * <br> or <blockquote>, and processing "p" does not touch <pre>.
 *
 * @param string $html HTML source code
 * @param string $tag Tags to be processed
 * @return string
 */
public static function removeInnerTags(string $html, string $tag): string
{
    // Split into tags and text; the last alternative keeps a stray "<" instead of dropping it
    if (!preg_match_all('~<[^>]+>|[^<]+|<~', $html, $matches)) {
        return $html;
    }

    $wantedName = strtolower($tag);
    $result = '';
    $level = 0;

    foreach ($matches[0] as $htmlPart) {
        // Group 1 tells a closing tag from an opening one, group 2 is the full tag name
        $isWantedTag = preg_match('~^<(/?)([^\s/>]+)~', $htmlPart, $nameMatch) === 1
            && strtolower($nameMatch[2]) === $wantedName;

        if (!$isWantedTag) {
            $result .= $htmlPart;
            continue;
        }

        if ($nameMatch[1] === '') {
            // Opening tag: only the outermost one is kept
            $level++;
            if ($level === 1) {
                $result .= $htmlPart;
            }
            continue;
        }

        // Closing tag: only the one matching the outermost opening tag is kept
        if ($level === 1) {
            $result .= $htmlPart;
        }
        // An unmatched closing tag must not push the level below zero
        if ($level > 0) {
            $level--;
        }
    }

    return $result;
}
/**
 * Strips the given attributes from all tags in the HTML source.
 * When no attributes are passed, "id" and "class" are removed.
 * Only double-quoted attribute values are recognized. Expects valid HTML source.
 *
 * @param string $html HTML source code
 * @param array $attributes Attributes to be removed
 * @return string
 */
public static function removeAttributes(string $html, array $attributes = []): string
{
    // Used when the caller passes no attributes
    static $default = ['id', 'class'];

    $names = $attributes === [] ? $default : $attributes;

    // One pattern per attribute; preg_replace() applies them one after another
    $patterns = [];
    foreach ($names as $name) {
        $patterns[] = '~(<[a-z][a-z0-9]*[^>]*?)\\s+' . $name . '="[^"]*"~is';
    }

    return preg_replace($patterns, '\\1', $html);
}
/**
 * Strips every on* event handler attribute from the HTML source.
 * To remove only selected events, use removeAttributes() instead.
 * Only double-quoted attribute values are recognized. Expects valid HTML source.
 *
 * @param string $html HTML source code
 * @return string
 */
public static function removeJavascriptEvents(string $html): string
{
    $detectPattern = '~<[a-z][a-z0-9]*[^>]*?\\s+on[a-z]+="[^"]*"~is';
    $stripPattern = '~(<[a-z][a-z0-9]*[^>]*?)\\s+on[a-z]+="[^"]*"~is';

    // One pass removes a single handler per tag, so repeat until none is left
    while (true) {
        if (!preg_match($detectPattern, $html)) {
            break;
        }

        $html = preg_replace($stripPattern, '\\1', $html);
    }

    return $html;
}
/**
 * Neutralizes images loaded from remote (http/https) locations.
 * <img> tags are kept and only get about:blank as their src, because dropping the whole tag
 * could affect the page layout. Remote background attributes are emptied and url(...) references
 * are cut out of background, background-image, list-style and list-style-image inline styles.
 * Expects valid HTML source.
 *
 * @param string $html HTML source code
 * @return string
 */
public static function removeRemoteImages(string $html): string
{
    // Pattern => replacement, applied in this order
    static $rules = [
        // <img src="http://...">
        '~(<img[^>]+src=")http(?:s)?://[^"]+(")~i' => '\\1about:blank\\2',
        // background="http://..."
        '~(<[a-z][a-z0-9]*[^>]+background=")http(?:s)?://[^"]+(")~i' => '\\1\\2',
        // style="background: ... url(...)"
        '~(<[a-z][a-z0-9]*[^>]+style="[^"]*background\\s*[:])([\-a-z0-9#%\\s]*)url\([^)]+\)(;)?~is' => '\\1\\2\\3',
        // style="background-image: url(...)"
        '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)background-image\\s*[:]([\-a-z0-9#%\\s]*)url\([^)]+\)(;)?~is' => '\\1',
        // style="list-style: ... url(...)"
        '~(<[a-z][a-z0-9]*[^>]+style="[^"]*list-style\\s*[:])([\-a-z0-9\\s]*)url\([^)]+\)(;)?~is' => '\\1\\2\\3',
        // style="list-style-image: url(...)"
        '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)list-style-image\\s*[:]([\-a-z0-9\\s]*)url\([^)]+\)(;)?~is' => '\\1',
    ];

    foreach ($rules as $pattern => $replacement) {
        $html = preg_replace($pattern, $replacement, $html);
    }

    return $html;
}
/**
 * Removes possibly dangerous attributes that could contain XSS code from the HTML source.
 * Drops href and src attributes whose (double-quoted) value starts with "javascript" or "data:".
 * Whitespace in front of the scheme is tolerated, because browsers strip it before resolving the URL.
 *
 * This is a narrow filter, not a complete sanitizer: other attributes, other schemes and
 * unquoted or single-quoted values are left untouched.
 *
 * @param string $html HTML source code
 * @return string
 */
public static function removeDangerous(string $html): string
{
    // For data: URIs see http://www.soom.cz/index.php?name=projects/testmail/main
    static $dangerous = '~\\s+(?:href|src)="\\s*(?:javascript|data:)[^"]*"~i';

    return preg_replace($dangerous, '', $html);
}
/**
 * Extracts the contents of the <body> element from the HTML source.
 * An inline style set on <body> is preserved by wrapping the contents into a <div>
 * carrying the same style attribute. A source without <body> is returned unchanged.
 * Expects valid HTML source.
 *
 * @param string $html HTML source code
 * @return string
 */
public static function getBody(string $html): string
{
    // No <body> element: there is nothing to extract
    if (!preg_match('~<body([^>]*)>(.*?)</body>~is', $html, $matches)) {
        return $html;
    }

    [, $bodyAttributes, $bodyContents] = $matches;
    $body = trim($bodyContents);

    // Without an inline style the trimmed contents are the result
    if (!preg_match('~style="[^"]+"~i', $bodyAttributes, $style)) {
        return $body;
    }

    return '<div ' . $style[0] . '>' . $body . '</div>';
}
/**
 * Converts text to HTML source code.
 * Special characters are escaped, indentation is kept using non-breaking spaces, blank lines
 * split the text into paragraphs, single line breaks become <br /> and lines quoted with ">"
 * are wrapped into <blockquote type="cite"> elements (one level per ">").
 *
 * @param string $text Input text
 * @param bool $convertLinks Convert urls and emails to links
 * @return string
 */
public static function fromText(string $text, bool $convertLinks = true): string
{
    // Trimming whitespace (except spaces)
    $text = trim($text, "\r\n");

    // Two empty lines max
    $text = preg_replace("~\n\\s+\n~", "\n\n", $text);

    // Special chars; from here on a ">" of the input text is present as "&gt;"
    $html = htmlspecialchars($text, ENT_QUOTES, 'utf-8', false);

    // Two spaces mean an indent, convert to non-breaking spaces
    $html = str_replace('  ', '&nbsp;&nbsp;', $html);

    // Convert tabs to four non-breaking spaces
    $html = str_replace("\t", '&nbsp;&nbsp;&nbsp;&nbsp;', $html);

    // Paragraphs
    $html = '<p>' . preg_replace("~\n\n[^\\n]?~", '</p><p>\\0', $html) . '</p>';
    $html = str_replace("\n", "<br />\n", $html);
    $html = str_ireplace('<p><br />', "<p>\n", $html);

    // Citations: group 1 holds the quote markers of a line, group 2 the rest of the line.
    // The markers have to be matched in their escaped form because of htmlspecialchars() above.
    preg_match_all('~(?:(^(?:<p>)?\\s*&gt;(?:&gt;|\\s)*)(.*)$)|(?:.+)~im', $html, $matches);

    $html = '';
    $depth = 0;
    foreach ($matches[0] as $i => $line) {
        $lineDepth = substr_count($matches[1][$i], '&gt;');

        if ($lineDepth === 0) {
            // Ordinary line: close all citations that are still open
            if ($depth > 0) {
                $html .= '</p>' . str_repeat('</blockquote>', $depth) . '<p>';
                $depth = 0;
            }
            $html .= $line;
            continue;
        }

        if ($lineDepth > $depth) {
            // Deeper citation level
            $html .= str_repeat('<blockquote type="cite">', $lineDepth - $depth) . '<p>';
        } elseif ($lineDepth < $depth) {
            // Shallower citation level
            $html .= '</p>' . str_repeat('</blockquote>', $depth - $lineDepth) . '<p>';
        }
        $depth = $lineDepth;

        // Quoted line without its markers
        $html .= $matches[2][$i];
    }

    // Close citations left open at the end of the text
    if ($depth > 0) {
        $html .= '</p>' . str_repeat('</blockquote>', $depth);
    }

    // Removes empty lines that were created during previous processing
    $html = preg_replace('~(?:<br />)+</p></blockquote>~i', '</p></blockquote>', $html);
    $html = str_ireplace('<p><br /></p>', '', $html);
    $html = str_ireplace('<p><br />', '<p>', $html);

    // Emails and urls
    if ($convertLinks) {
        $html = self::linkFromText($html);
    }

    return $html;
}
/**
 * Converts urls and emails found in the text to HTML links.
 * Urls without a protocol get "http://" prepended in the link target. Punctuation directly
 * following an url (including an escaped ">", i.e. "&gt;") is kept outside of the link.
 * The text is expected to be HTML-escaped already (as produced by fromText()); the matched
 * urls and emails are inserted into the markup without further escaping.
 *
 * @param string $text Input text
 * @return string
 */
public static function linkFromText(string $text): string
{
    $patternGenericTld = '(?:tld|aero|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|asia|post|geo)';
    // The TLD is matched case-sensitively even though the whole pattern is case-insensitive
    $patternTld = '(?-i:' . $patternGenericTld . '|[a-z]{2})';
    $patternDomain = '(?:(?:[a-z]|[a-z0-9](?:[\-a-z0-9]{0,61}[a-z0-9]))[.])*(?:[a-z0-9](?:[\-a-z0-9]{0,61}[a-z0-9])[.]' . $patternTld . ')';

    $pattern8bit = '(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])';
    $patternIPv4 = '(?:' . $pattern8bit . '(?:[.]' . $pattern8bit . '){3})';

    // a:b:c:d:e:f:g:h
    $patternIpV6Variant8Hex = '(?:(?:[0-9a-f]{1,4}:){7}[0-9a-f]{1,4})';
    // Compressed a::b
    $patternIpV6VariantCompressedHex = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?))';
    // IPv4 mapped to IPv6 a:b:c:d:e:f:w.x.y.z
    $patternIpV6VariantHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}:){6})' . $patternIPv4 . ')';
    // Compressed IPv4 mapped to IPv6 a::b:w.x.y.z
    $patternIpV6VariantCompressedHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}:)*)' . $patternIPv4 . ')';
    $patternIpV6 = '(?:' . $patternIpV6Variant8Hex . '|' . $patternIpV6VariantCompressedHex . '|' . $patternIpV6VariantHex4Dec . '|' . $patternIpV6VariantCompressedHex4Dec . ')';

    // mailto:username
    $patternEmail = '(?:mailto:)?(?:[\-\\w!#\$%&\'*+/=?^`{|}\~]+(?:[.][\-\\w!#\$%&\'*+/=?^`{|}\~]+)*)';
    // @domain.tld
    $patternEmail .= '(?:@' . $patternDomain . ')';

    // protocol://user:password@
    $patternUrl = '(?:(?:http|ftp)s?://(?:[\\S]+(?:[:][\\S]*)?@)?)?';
    // domain.tld, IPv4 or IPv6
    $patternUrl .= '(?:' . $patternDomain . '|' . $patternIPv4 . '|' . $patternIpV6 . ')';
    // :port/path/file.extension
    $patternUrl .= '(?::[0-9]+)?(?:(?:/[-\\w\\pL\\pN\~.:!%]+)*(?:/|[.][a-z0-9]{2,4})?)?';
    // ?query#hash
    $patternUrl .= '(?:[?][\]\[\-\\w\\pL\\pN.,?!\~%#@&;:/\'\=+]*)?(?:#[\]\[\-\\w\\pL\\pN.,?!\~%@&;:/\'\=+]*)?';

    // Group 1: character in front of the match, group 2: email, group 3: url
    return preg_replace_callback(
        '~(^|[^\\pL\\pN])(?:(' . $patternEmail . ')|(' . $patternUrl . '))(?=$|\\W)~iu',
        static function (array $matches): string {
            // Url
            if (isset($matches[3])) {
                $url = $matches[3];

                // Trailing punctuation does not belong to the url. The query part of the url pattern
                // accepts "&" and ";", so an escaped ">" ("&gt;") may have been swallowed as well.
                if (preg_match('~(([.,:;?!>)\]}]|(&gt;))+)$~i', $url, $matches2)) {
                    $punctuation = $matches2[1];
                    // The punctuation is pure ASCII, so its byte length equals its character length
                    $url = mb_substr($url, 0, -strlen($punctuation), 'utf-8');
                } else {
                    $punctuation = '';
                }

                // Add missing http://
                $linkUrl = !preg_match('~^(http|ftp)s?://~i', $url) ? 'http://' . $url : $url;

                return $matches[1] . '<a href="' . $linkUrl . '">' . $url . '</a>' . $punctuation;
            }

            // Neither an url nor an email: leave the matched text as it is
            if (!isset($matches[2])) {
                return $matches[0];
            }

            // Email; an explicit "mailto:" prefix stays in the link text only
            $email = $matches[2];
            if (stripos($email, 'mailto:') === 0) {
                $email = substr($email, 7);
                $protocol = 'mailto:';
            } else {
                $protocol = '';
            }

            return $matches[1] . '<a href="mailto:' . $email . '">' . $protocol . $email . '</a>';
        },
        $text
    );
}
/**
 * Converts HTML source code to plaintext.
 * Styles and scripts are dropped, block elements become blank lines, headings, <strong> and <th>
 * are upper-cased, links, lists and citations get a plaintext notation, common typographic
 * entities are replaced with ASCII equivalents and all remaining entities are decoded.
 *
 * @param string $html HTML source code
 * @return string
 */
public static function toText(string $html): string
{
    // Remove styles and scripts
    $text = self::removeTags($html, ['style', 'script']);

    // Re-format lines
    // <pre>: line breaks are converted to <br /> so that they survive the next step
    $text = preg_replace_callback('~<pre[^>]*>(.+?)</pre>~is', static function (array $matches): string {
        return nl2br($matches[1]);
    }, $text);

    // \r is dropped, other line breaks and tabs collapse to a space, <br /> becomes a line break
    $text = preg_replace(
        ["~\r~", "~[\n\t]+~", '~<br[^>]*>~i'],
        ['', ' ', "\n"],
        $text
    );

    // Processing of most tags and entities
    static $search = [
        // <h3> to <h6>
        '~<h[3-6][^>]*>(.+?)</h[3-6]>~is',
        // <div> and </div>
        '~(<div[^>]*>)|(</div>)~i',
        // <p> and </p>
        '~(<p(?:\s+[^>]+)?>)|(</p>)~i',
        // <table> and </table>
        '~(<table[^>]*>)|(</table>)~i',
        // </tr>
        '~</tr\s*>~i',
        // <td> and </td>
        '~<td[^>]*>(.+?)</td>~is',
        // Ellipsis
        '~&hellip;~i',
        // Quotes (named and numeric form)
        '~&(?:ldquo|rdquo|#8220|#8221);~i',
        // Apostrophe
        '~&apos;~i',
        // Copyright (named and numeric form)
        '~&(?:copy|#169);~i',
        // Trademark
        '~&trade;~i',
        // Registered trademark
        '~&reg;~i',
        // Dash and hyphen
        '~&(?:mdash|ndash);~i',
    ];
    static $replace = [
        // <h3> to <h6>
        "\n\n\\1\n\n",
        // <div> and </div>
        "\n\n",
        // <p> and </p>
        "\n\n",
        // <table> and </table>
        "\n\n",
        // </tr>
        "\n",
        // <td> and </td>
        "\\1\t",
        // Ellipsis
        '...',
        // Quotes
        '"',
        // Apostrophe
        '\'',
        // Copyright
        '(c)',
        // Trademark
        '(tm)',
        // Registered trademark
        '(R)',
        // Dash and hyphen
        '-',
    ];
    $text = preg_replace($search, $replace, $text);

    // <h1> and <h2>
    $text = preg_replace_callback('~<h[12][^>]*>(.+?)</h[12]>~is', static function (array $matches): string {
        return "\n\n\n" . mb_strtoupper($matches[1], 'utf-8') . "\n\n";
    }, $text);

    // <strong>
    $text = preg_replace_callback('~<strong[^>]*>(.+?)</strong>~is', static function (array $matches): string {
        return mb_strtoupper($matches[1], 'utf-8');
    }, $text);

    // <hr />
    $text = preg_replace('~<hr[^>]*>~i', "\n" . str_repeat('-', 50) . "\n", $text);

    // <th>
    $text = preg_replace_callback('~<th[^>]*>(.+?)</th>~is', static function (array $matches): string {
        return mb_strtoupper($matches[1], 'utf-8') . "\t";
    }, $text);

    // <a>
    $text = self::linkToText($text);

    // <ul> and <ol>
    $text = self::listToText($text);

    // Two empty lines at most
    $text = trim($text, "\n ");
    $text = preg_replace("~\n\\s+\n~", "\n\n", $text);

    // Process <blockquote> (empty lines are removed before <blockquote> processing on purpose)
    $text = self::blockquoteToText($text);

    // Remove all left tags
    $text = strip_tags($text);

    // Replacing [textlink] for <> (must be done after strip_tags)
    $text = preg_replace('~\[textlink\]\\s*~s', '<', $text);
    $text = preg_replace('~\\s*\[/textlink\]~s', '>', $text);

    // Replaces non-breaking spaces: four in a row are an indent (tab), a single one is a space
    $text = preg_replace(['~&nbsp;&nbsp;&nbsp;&nbsp;~i', '~&nbsp;~i'], ["\t", ' '], $text);

    // Remove other entities (must not be performed before)
    // After previous processing some entities are upper case, that is why we have to use strtolower
    $text = preg_replace_callback('~(&#?[a-z0-9]+;)~i', static function (array $matches): string {
        return html_entity_decode(strtolower($matches[1]), ENT_QUOTES, 'utf-8');
    }, $text);

    // Two empty lines at most (performed second times on purpose)
    $text = trim($text, "\n ");
    $text = preg_replace("~\n\\s+\n~", "\n\n", $text);

    // Because of <blockquote> converting
    $text = preg_replace("~(\n>\\s*)+\n~", "\n>\n", $text);

    // One space at most
    $text = preg_replace("~(\n|\t)( )+~", '\1', $text);
    $text = preg_replace('~( ){2,}~', ' ', $text);

    // No space at line ends
    $text = preg_replace("~[ \t]+\n~", "\n", $text);

    return $text;
}
/**
 * Rewrites HTML links to a plaintext notation.
 * The link target is appended to the link text inside [textlink]...[/textlink] markers; when the
 * target equals the link text, the text itself is wrapped instead. Links without a target, with an
 * in-page or "/?" target and links whose target fails url validation are reduced to their text.
 *
 * @param string $text Text with HTML fragments
 * @return string
 */
private static function linkToText(string $text): string
{
    $convertLink = static function (array $matches): string {
        [, $openingTag, $content] = $matches;

        $url = '';
        if (preg_match('~\\shref="([^"]+)"~i', $openingTag, $hrefMatch)) {
            $url = trim($hrefMatch[1]);
        }

        // Some urls have no real meaning
        $meaningless = empty($url) || ($url[0] === '#') || (substr($url, 0, 2) === '/?');

        // Meaningless and invalid urls are ignored, only the link text stays
        if ($meaningless || !Input\Validator\IsUrl::validate($url)) {
            return $content;
        }

        // If the link text and target are the same, use only one of them
        if ($url === trim(strip_tags($content))) {
            return '[textlink]' . $content . '[/textlink]';
        }

        return $content . ' [textlink]' . $url . '[/textlink]';
    };

    return preg_replace_callback('~(<a\\s+[^>]*>)(.+?)</a>~is', $convertLink, $text);
}
/**
 * Rewrites HTML lists to a plaintext notation.
 * Every item is indented with one tab per open list; items of unordered lists get a bullet
 * chosen by the nesting depth, items of ordered lists get a running number. List tags become
 * blank lines, closing item tags become line breaks; everything else passes through untouched.
 *
 * @param string $text Text with HTML fragments
 * @return string
 */
private static function listToText(string $text): string
{
    // Bullets, selected by the unordered nesting depth modulo four
    static $symbols = ['#', '*', 'o', '+'];

    // Split into opening tags, closing tags, declarations/comments and text
    preg_match_all('~(?:<[a-z][a-z0-9]*[^>]*(?: /)?>)|(?:</[a-z][a-z0-9]*>)|(?:<![^>]+>)|(?:[^<]+)~i', $text, $matches);

    $output = '';
    $unorderedDepth = 0;
    $orderedDepth = 0;
    // Next item number for every ordered list depth
    $itemNumbers = [];
    // Types of the currently open lists, innermost last
    $openLists = [];

    foreach ($matches[0] as $token) {
        $lowerToken = strtolower($token);

        switch (true) {
            case stripos($token, '<ol') === 0:
                $openLists[] = 'ol';
                $orderedDepth++;
                $itemNumbers[$orderedDepth] = 1;
                $token = "\n\n";
                break;
            case $lowerToken === '</ol>':
                array_pop($openLists);
                $orderedDepth--;
                $token = "\n\n";
                break;
            case stripos($token, '<ul') === 0:
                $openLists[] = 'ul';
                $unorderedDepth++;
                $token = "\n\n";
                break;
            case $lowerToken === '</ul>':
                array_pop($openLists);
                $unorderedDepth--;
                $token = "\n\n";
                break;
            case stripos($token, '<li') === 0:
                $innermost = end($openLists);
                $token = str_repeat("\t", $orderedDepth + $unorderedDepth);
                if ($innermost === 'ul') {
                    $token .= $symbols[$unorderedDepth % 4] . ' ';
                } elseif ($innermost === 'ol') {
                    $token .= $itemNumbers[$orderedDepth] . '. ';
                    $itemNumbers[$orderedDepth]++;
                }
                break;
            case $lowerToken === '</li>':
                $token = "\n";
                break;
        }

        $output .= $token;
    }

    return $output;
}
/**
 * Rewrites <blockquote> citations to the plaintext notation with ">" markers.
 * Every line of a quoted part is prefixed with one ">" per nesting level and the part is
 * followed by a line holding the markers only. Text outside citations passes through untouched.
 *
 * @param string $text Text with HTML fragments
 * @return string
 */
private static function blockquoteToText(string $text): string
{
    // Split into opening tags, closing tags and the text in between
    $partCount = preg_match_all(
        '~(?:<blockquote[^>]*>\\s*)|(?:\\s*</blockquote>)|(?:.+?(?=</?blockquote)|(?:.+))~is',
        $text,
        $matches
    );
    if (!($partCount > 0)) {
        return $text;
    }

    $output = '';
    $depth = 0;

    foreach ($matches[0] as $part) {
        $loweredPart = strtolower($part);

        // Opening tag(s): go deeper, the outermost citation starts on a new line
        $opened = substr_count($loweredPart, '<blockquote');
        if ($opened > 0) {
            $depth += $opened;
            if ($depth === 1) {
                $output .= "\n";
            }
            continue;
        }

        // Closing tag(s): go back up, nothing is printed
        $closed = substr_count($loweredPart, '</blockquote>');
        if ($closed > 0) {
            $depth -= $closed;
            continue;
        }

        // Text outside of any citation
        if ($depth <= 0) {
            $output .= $part;
            continue;
        }

        // Quoted text: a marker line start in front of every line, a bare marker line at the end
        $markerLine = "\n" . str_repeat('>', $depth);
        $output .= $markerLine . ' ' . str_replace("\n", $markerLine . ' ', trim($part)) . $markerLine;
    }

    return $output;
}
- }