/public_html/wire/modules/Textformatter/TextformatterSmartypants/Michelf/SmartyPants.php
PHP | 513 lines | 380 code | 45 blank | 88 comment | 72 complexity | cfdec7339434f641e275a294538e805e MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, LGPL-2.1, MPL-2.0-no-copyleft-exception
- <?php
- #
- # SmartyPants - Smart typography for web sites
- #
- # PHP SmartyPants
- # Copyright (c) 2004-2016 Michel Fortin
- # <https://michelf.ca/>
- #
- # Original SmartyPants
- # Copyright (c) 2003-2004 John Gruber
- # <https://daringfireball.net/>
- #
- namespace Michelf;
- #
- # SmartyPants Parser Class
- #
class SmartyPants {

	### Version ###

	const  SMARTYPANTSLIB_VERSION  =  "1.7.1";


	### Presets

	# SmartyPants does nothing at all
	const  ATTR_DO_NOTHING             =  0;
	# "--" for em-dashes; no en-dash support
	const  ATTR_EM_DASH                =  1;
	# "---" for em-dashes; "--" for en-dashes
	const  ATTR_LONG_EM_DASH_SHORT_EN  =  2;
	# "--" for em-dashes; "---" for en-dashes
	const  ATTR_SHORT_EM_DASH_LONG_EN  =  3;
	# "stupefy" mode: turn SmartyPants' numeric entities back into ASCII
	const  ATTR_STUPEFY                = -1;

	# The default preset: ATTR_EM_DASH
	const  ATTR_DEFAULT  =  SmartyPants::ATTR_EM_DASH;


	### Standard Function Interface ###

	public static function defaultTransform($text, $attr = SmartyPants::ATTR_DEFAULT) {
	#
	# Initialize the parser and return the result of its transform method.
	# This will work fine for derived classes too.
	#
	#   Parameters: $text - string of HTML/text to transform.
	#               $attr - preset constant or option string (see __construct).
	#   Returns:    The transformed string.
	#
		# Take parser class on which this function was called.
		$parser_class = \get_called_class();

		# Parsers are cached per class and per attribute value, so repeated
		# calls with the same configuration reuse one instance.
		static $parser_list;
		$parser =& $parser_list[$parser_class][$attr];

		# create the parser if not already set
		if (!$parser)
			$parser = new $parser_class($attr);

		# Transform text using parser.
		return $parser->transform($text);
	}


	### Configuration Variables ###

	# Partial regex for matching tags to skip
	public $tags_to_skip = 'pre|code|kbd|script|style|math';

	# Options to specify which transformations to make:
	public $do_nothing   = 0; # disable all transforms
	public $do_quotes    = 0;
	public $do_backticks = 0; # 1 => double only, 2 => double & single
	public $do_dashes    = 0; # 1, 2, or 3 for the three modes described above
	public $do_ellipses  = 0;
	public $do_stupefy   = 0;
	public $convert_quot = 0; # should we translate &quot; entities into normal quotes?


	### Parser Implementation ###

	public function __construct($attr = SmartyPants::ATTR_DEFAULT) {
	#
	# Initialize a parser with certain attributes.
	#
	# Parser attributes:
	# 0 : do nothing
	# 1 : set all
	# 2 : set all, using old school en- and em- dash shortcuts
	# 3 : set all, using inverted old school en and em- dash shortcuts
	# -1: stupefy mode (entities back to ASCII)
	#
	# q : quotes
	# b : backtick quotes (``double'' only)
	# B : backtick quotes (``double'' and `single')
	# d : dashes
	# D : old school dashes
	# i : inverted old school dashes
	# e : ellipses
	# w : convert &quot; entities to " for Dreamweaver users
	#
		# Loose comparisons are intentional: presets may be given either as
		# integers (the ATTR_* constants) or as their string form.
		if ($attr == "0") {
			$this->do_nothing   = 1;
		}
		else if ($attr == "1") {
			# Do everything, turn all options on.
			$this->do_quotes    = 1;
			$this->do_backticks = 1;
			$this->do_dashes    = 1;
			$this->do_ellipses  = 1;
		}
		else if ($attr == "2") {
			# Do everything, turn all options on, use old school dash shorthand.
			$this->do_quotes    = 1;
			$this->do_backticks = 1;
			$this->do_dashes    = 2;
			$this->do_ellipses  = 1;
		}
		else if ($attr == "3") {
			# Do everything, turn all options on, use inverted old school dash shorthand.
			$this->do_quotes    = 1;
			$this->do_backticks = 1;
			$this->do_dashes    = 3;
			$this->do_ellipses  = 1;
		}
		else if ($attr == "-1") {
			# Special "stupefy" mode.
			$this->do_stupefy   = 1;
		}
		else {
			# Anything else is read as a string of single-letter option flags.
			$chars = preg_split('//', $attr);
			foreach ($chars as $c){
				if      ($c == "q") { $this->do_quotes    = 1; }
				else if ($c == "b") { $this->do_backticks = 1; }
				else if ($c == "B") { $this->do_backticks = 2; }
				else if ($c == "d") { $this->do_dashes    = 1; }
				else if ($c == "D") { $this->do_dashes    = 2; }
				else if ($c == "i") { $this->do_dashes    = 3; }
				else if ($c == "e") { $this->do_ellipses  = 1; }
				else if ($c == "w") { $this->convert_quot = 1; }
				else {
					# Unknown attribute option, ignore.
				}
			}
		}
	}

	public function transform($text) {
	#
	#   Parameter:  String containing HTML markup.
	#   Returns:    The string with the configured typographic transforms
	#               applied to text outside of tags and outside of the
	#               elements listed in $tags_to_skip.
	#
		if ($this->do_nothing) {
			return $text;
		}

		$tokens = $this->tokenizeHTML($text);
		$result = '';
		$in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.

		$prev_token_last_char = ""; # This is a cheat, used to get some context
									# for one-character tokens that consist of
									# just a quote char. What we do is remember
									# the last character of the previous text
									# token, to use as context to curl single-
									# character quote tokens correctly.

		foreach ($tokens as $cur_token) {
			if ($cur_token[0] == "tag") {
				# Don't mess with quotes inside tags.
				$result .= $cur_token[1];
				# An opening skip-tag switches education off, its closing
				# counterpart switches it back on.
				if (preg_match('@<(/?)(?:'.$this->tags_to_skip.')[\s>]@', $cur_token[1], $matches)) {
					$in_pre = isset($matches[1]) && $matches[1] == '/' ? 0 : 1;
				}
			} else {
				$t = $cur_token[1];
				$last_char = substr($t, -1); # Remember last char of this token before processing.
				if (! $in_pre) {
					$t = $this->educate($t, $prev_token_last_char);
				}
				$prev_token_last_char = $last_char;
				$result .= $t;
			}
		}

		return $result;
	}

	protected function educate($t, $prev_token_last_char) {
	#
	#   Parameters: $t                    - a run of text between tags.
	#               $prev_token_last_char - last character of the previous
	#                                       text token ("" if none), used to
	#                                       decide the direction of a lone
	#                                       quote character.
	#   Returns:    The text with every enabled transform applied.
	#
		$t = $this->processEscapes($t);

		if ($this->convert_quot) {
			$t = preg_replace('/&quot;/', '"', $t);
		}

		if ($this->do_dashes) {
			if ($this->do_dashes == 1) $t = $this->educateDashes($t);
			if ($this->do_dashes == 2) $t = $this->educateDashesOldSchool($t);
			if ($this->do_dashes == 3) $t = $this->educateDashesOldSchoolInverted($t);
		}

		if ($this->do_ellipses) $t = $this->educateEllipses($t);

		# Note: backticks need to be processed before quotes.
		if ($this->do_backticks) {
			$t = $this->educateBackticks($t);
			if ($this->do_backticks == 2) $t = $this->educateSingleBackticks($t);
		}

		if ($this->do_quotes) {
			if ($t == "'") {
				# Special case: single-character ' token
				if (preg_match('/\S/', $prev_token_last_char)) {
					$t = "&#8217;";
				}
				else {
					$t = "&#8216;";
				}
			}
			else if ($t == '"') {
				# Special case: single-character " token
				if (preg_match('/\S/', $prev_token_last_char)) {
					$t = "&#8221;";
				}
				else {
					$t = "&#8220;";
				}
			}
			else {
				# Normal case:
				$t = $this->educateQuotes($t);
			}
		}

		if ($this->do_stupefy) $t = $this->stupefyEntities($t);

		return $t;
	}

	protected function educateQuotes($_) {
	#
	#   Parameter:  String.
	#
	#   Returns:    The string, with "educated" curly quote HTML entities.
	#
	#   Example input:  "Isn't this fun?"
	#   Example output: &#8220;Isn&#8217;t this fun?&#8221;
	#
		# Make our own "punctuation" character class, because the POSIX-style
		# [:PUNCT:] is only available in Perl 5.6 or later:
		$punct_class = "[!\"#\\$\\%'()*+,-.\\/:;<=>?\\@\\[\\\\\]\\^_`{|}~]";

		# Special case if the very first character is a quote
		# followed by punctuation at a non-word-break. Close the quotes by brute force:
		$_ = preg_replace(
			array("/^'(?=$punct_class\\B)/", "/^\"(?=$punct_class\\B)/"),
			array('&#8217;',                 '&#8221;'), $_);

		# Special case for double sets of quotes, e.g.:
		#   <p>He said, "'Quoted' words in a larger quote."</p>
		$_ = preg_replace(
			array("/\"'(?=\w)/",    "/'\"(?=\w)/"),
			array('&#8220;&#8216;', '&#8216;&#8220;'), $_);

		# Special case for decade abbreviations (the '80s):
		$_ = preg_replace("/'(?=\\d{2}s)/", '&#8217;', $_);

		$close_class = '[^\ \t\r\n\[\{\(\-]';
		$dec_dashes = '&\#8211;|&\#8212;';

		# Get most opening single quotes:
		$_ = preg_replace("{
			(
				\\s          |   # a whitespace char, or
				&nbsp;      |   # a non-breaking space entity, or
				--          |   # dashes, or
				&[mn]dash;  |   # named dash entities
				$dec_dashes |   # or decimal entities
				&\\#x201[34];    # or hex
			)
			'                   # the quote
			(?=\\w)              # followed by a word character
			}x", '\1&#8216;', $_);

		# Single closing quotes:
		$_ = preg_replace("{
			($close_class)?
			'
			(?(1)|          # If $1 captured, then do nothing;
			  (?=\\s | s\\b)  # otherwise, positive lookahead for a whitespace
			)               # char or an 's' at a word ending position. This
							# is a special case to handle something like:
							# \"<i>Custer</i>'s Last Stand.\"
			}xi", '\1&#8217;', $_);

		# Any remaining single quotes should be opening ones:
		$_ = str_replace("'", '&#8216;', $_);

		# Get most opening double quotes:
		$_ = preg_replace("{
			(
				\\s          |   # a whitespace char, or
				&nbsp;      |   # a non-breaking space entity, or
				--          |   # dashes, or
				&[mn]dash;  |   # named dash entities
				$dec_dashes |   # or decimal entities
				&\\#x201[34];    # or hex
			)
			\"                   # the quote
			(?=\\w)              # followed by a word character
			}x", '\1&#8220;', $_);

		# Double closing quotes:
		$_ = preg_replace("{
			($close_class)?
			\"
			(?(1)|(?=\\s))   # If $1 captured, then do nothing;
							   # if not, then make sure the next char is whitespace.
			}x", '\1&#8221;', $_);

		# Any remaining quotes should be opening ones.
		$_ = str_replace('"', '&#8220;', $_);

		return $_;
	}

	protected function educateBackticks($_) {
	#
	#   Parameter:  String.
	#   Returns:    The string, with ``backticks'' -style double quotes
	#               translated into HTML curly quote entities.
	#
	#   Example input:  ``Isn't this fun?''
	#   Example output: &#8220;Isn't this fun?&#8221;
	#
		$_ = str_replace(array("``",       "''",),
						 array('&#8220;', '&#8221;'), $_);
		return $_;
	}

	protected function educateSingleBackticks($_) {
	#
	#   Parameter:  String.
	#   Returns:    The string, with `backticks' -style single quotes
	#               translated into HTML curly quote entities.
	#
	#   Example input:  `Isn't this fun?'
	#   Example output: &#8216;Isn&#8217;t this fun?&#8217;
	#
		$_ = str_replace(array("`",       "'",),
						 array('&#8216;', '&#8217;'), $_);
		return $_;
	}

	protected function educateDashes($_) {
	#
	#   Parameter:  String.
	#
	#   Returns:    The string, with each instance of "--" translated to
	#               an em-dash HTML entity.
	#
		$_ = str_replace('--', '&#8212;', $_);
		return $_;
	}

	protected function educateDashesOldSchool($_) {
	#
	#   Parameter:  String.
	#
	#   Returns:    The string, with each instance of "--" translated to
	#               an en-dash HTML entity, and each "---" translated to
	#               an em-dash HTML entity.
	#
		# "---" is listed first so it is consumed before "--" is considered.
		#                      em         en
		$_ = str_replace(array("---",     "--",),
						 array('&#8212;', '&#8211;'), $_);
		return $_;
	}

	protected function educateDashesOldSchoolInverted($_) {
	#
	#   Parameter:  String.
	#
	#   Returns:    The string, with each instance of "--" translated to
	#               an em-dash HTML entity, and each "---" translated to
	#               an en-dash HTML entity. Two reasons why: First, unlike the
	#               en- and em-dash syntax supported by
	#               EducateDashesOldSchool(), it's compatible with existing
	#               entries written before SmartyPants 1.1, back when "--" was
	#               only used for em-dashes.  Second, em-dashes are more
	#               common than en-dashes, and so it sort of makes sense that
	#               the shortcut should be shorter to type. (Thanks to Aaron
	#               Swartz for the idea.)
	#
		# "---" is listed first so it is consumed before "--" is considered.
		#                      en         em
		$_ = str_replace(array("---",     "--",),
						 array('&#8211;', '&#8212;'), $_);
		return $_;
	}

	protected function educateEllipses($_) {
	#
	#   Parameter:  String.
	#   Returns:    The string, with each instance of "..." translated to
	#               an ellipsis HTML entity. Also converts the case where
	#               there are spaces between the dots.
	#
	#   Example input:  Huh...?
	#   Example output: Huh&#8230;?
	#
		$_ = str_replace(array("...",     ". . .",), '&#8230;', $_);
		return $_;
	}

	protected function stupefyEntities($_) {
	#
	#   Parameter:  String.
	#   Returns:    The string, with each SmartyPants HTML entity translated to
	#               its ASCII counterpart.
	#
	#   Example input:  &#8220;Hello &#8212; world.&#8221;
	#   Example output: "Hello -- world."
	#
							#  en-dash    em-dash
		$_ = str_replace(array('&#8211;', '&#8212;'),
						 array('-',       '--'), $_);

		# single quote         open       close
		$_ = str_replace(array('&#8216;', '&#8217;'), "'", $_);

		# double quote         open       close
		$_ = str_replace(array('&#8220;', '&#8221;'), '"', $_);

		$_ = str_replace('&#8230;', '...', $_); # ellipsis

		return $_;
	}

	protected function processEscapes($_) {
	#
	#   Parameter:  String.
	#   Returns:    The string, after processing the following backslash
	#               escape sequences. This is useful if you want to force a "dumb"
	#               quote or other character to appear.
	#
	#               Escape  Value
	#               ------  -----
	#               \\      &#92;
	#               \"      &#34;
	#               \'      &#39;
	#               \.      &#46;
	#               \-      &#45;
	#               \`      &#96;
	#
		# The escaped characters are emitted as numeric entities so that the
		# later dash, ellipsis and quote passes no longer see them.
		$_ = str_replace(
			array('\\\\',  '\"',    "\'",    '\.',    '\-',    '\`'),
			array('&#92;', '&#34;', '&#39;', '&#46;', '&#45;', '&#96;'), $_);

		return $_;
	}

	protected function tokenizeHTML($str) {
	#
	#   Parameter:  String containing HTML markup.
	#   Returns:    An array of the tokens comprising the input
	#               string. Each token is either a tag (possibly with nested,
	#               tags contained therein, such as <a href="<MTFoo>">, or a
	#               run of text between tags. Each element of the array is a
	#               two-element array; the first is either 'tag' or 'text';
	#               the second is the actual value.
	#
	#
	#   Regular expression derived from the _tokenize() subroutine in
	#   Brad Choate's MTRegex plugin.
	#   <http://www.bradchoate.com/past/mtregex.php>
	#
		$index = 0;
		$tokens = array();

		$match = '(?s:<!--.*?-->)|'.	# comment
				 '(?s:<\?.*?\?>)|'.				# processing instruction
												# regular tags
				 '(?:<[/!$]?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)';

		# With PREG_SPLIT_DELIM_CAPTURE the result alternates strictly:
		# text, tag, text, tag, ... starting with a (possibly empty) text run.
		$parts = preg_split("{($match)}", $str, -1, PREG_SPLIT_DELIM_CAPTURE);

		foreach ($parts as $part) {
			if (++$index % 2) {
				# Text position: empty runs carry no content, so drop them
				# instead of letting them be labelled as tags.
				if ($part !== '')
					$tokens[] = array('text', $part);
			}
			else {
				$tokens[] = array('tag', $part);
			}
		}
		return $tokens;
	}
}