/system/Services/Markdown.php
PHP | 1348 lines | 972 code | 138 blank | 238 comment | 45 complexity | 4ce15305e83164c3d205b803ce146a03 MD5 | raw file
- <?php
- #
- # Markdown - A text-to-HTML conversion tool for web writers
- #
- # PHP Markdown
- # Copyright (c) 2004-2012 Michel Fortin
- # <http://michelf.com/projects/php-markdown/>
- #
- # Original Markdown
- # Copyright (c) 2004-2006 John Gruber
- # <http://daringfireball.net/projects/markdown/>
- #
# Suffix used when emitting empty elements such as <br /> and <img />.
# Change to ">" for HTML output. Guarded so that a value defined earlier
# by the host application wins, without relying on error suppression.
if (!defined('MARKDOWN_EMPTY_ELEMENT_SUFFIX')) {
    define('MARKDOWN_EMPTY_ELEMENT_SUFFIX', " />");
}
# Define the width of a tab for code blocks.
if (!defined('MARKDOWN_TAB_WIDTH')) {
    define('MARKDOWN_TAB_WIDTH', 4);
}
class Services_Markdown
{
    #
    # Markdown-to-HTML converter (customised variant of PHP Markdown).
    #
    # Usage: create an instance and call transform($text). The instance keeps
    # per-run state (link definitions, hashed HTML fragments) that is reset by
    # setup() and released by teardown() around every transform() call.
    #
    # Requires the constants MARKDOWN_EMPTY_ELEMENT_SUFFIX and
    # MARKDOWN_TAB_WIDTH to be defined before the class is instantiated, and
    # the external FORMAT::parse_links() helper to be available when
    # transform() runs.
    #

    # Regex to match balanced [brackets].
    # Needed to insert a maximum bracket depth while converting to PHP.
    public $nested_brackets_depth = 6;
    public $nested_brackets_re;

    public $nested_url_parenthesis_depth = 4;
    public $nested_url_parenthesis_re;

    # Table of hash values for escaped characters:
    public $escape_chars = '\`*_{}[]()>#+-.!';
    public $escape_chars_re;

    # Change to ">" for HTML output.
    public $empty_element_suffix = MARKDOWN_EMPTY_ELEMENT_SUFFIX;
    public $tab_width = MARKDOWN_TAB_WIDTH;

    # Change to `true` to disallow markup or entities.
    public $no_markup = false;
    public $no_entities = false;

    # Predefined urls and titles for reference links and images.
    public $predef_urls = array();
    public $predef_titles = array();

    public function __construct()
    {
        #
        # Constructor function. Initialize appropriate member variables.
        #
        $this->_initDetab();
        $this->prepareItalicsAndBold();

        # Both patterns are built by nesting the same fragment N times, which
        # gives a bounded-depth approximation of balanced delimiters.
        $this->nested_brackets_re = str_repeat('(?>[^\[\]]+|\[', $this->nested_brackets_depth) . str_repeat('\])*', $this->nested_brackets_depth);

        $this->nested_url_parenthesis_re = str_repeat('(?>[^()\s]+|\(', $this->nested_url_parenthesis_depth) . str_repeat('(?>\)))*', $this->nested_url_parenthesis_depth);

        $this->escape_chars_re = '[' . preg_quote($this->escape_chars) . ']';

        # Sort document, block, and span gamut in ascending priority order.
        asort($this->document_gamut);
        asort($this->block_gamut);
        asort($this->span_gamut);
    }

    # Internal hashes used during transformation.
    public $urls = array();
    public $titles = array();
    public $html_hashes = array();

    # Status flag to avoid invalid nesting.
    public $in_anchor = false;

    public function setup()
    {
        #
        # Called before the transformation process starts to setup parser
        # states.
        #
        # Clear global hashes.
        $this->urls = $this->predef_urls;
        $this->titles = $this->predef_titles;
        $this->html_hashes = array();

        # Reset the instance flag (a bare local variable here would leave the
        # previous run's value in place).
        $this->in_anchor = false;
    }

    public function teardown()
    {
        #
        # Called after the transformation process to clear any variable
        # which may be taking up memory unnecessarily.
        #
        $this->urls = array();
        $this->titles = array();
        $this->html_hashes = array();
    }

    public function transform($text)
    {
        #
        # Main function. Performs some preprocessing on the input text
        # and passes it through the document gamut.
        #
        # Params:
        #   $text - Markdown source.
        # Returns the converted text.
        #
        $this->setup();

        # Remove UTF-8 BOM and marker character in input, if present.
        # (\x1A is reserved for the hash tokens produced by hashPart().)
        $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);

        # Standardize line endings:
        # DOS to Unix and Mac to Unix
        $text = preg_replace('{\r\n?}', "\n", $text);

        # Make sure $text ends with a newline:
        $text .= "\n";

        # Convert all tabs to spaces.
        $text = $this->detab($text);

        # Turn block-level HTML blocks into hash entries
        $text = $this->hashHTMLBlocks($text);

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ ]*\n+/ .
        $text = preg_replace('/^[ ]+$/m', '', $text);

        # Run document gamut methods.
        foreach ($this->document_gamut as $method => $priority) {
            $text = $this->$method($text);
        }

        # External post-processing step; FORMAT is defined outside this file.
        $text = FORMAT::parse_links($text);

        # Restore the characters that md5Hash() protected inside code blocks
        # and block quotes.
        $text = $this->md5Hash_decode($text);

        $this->teardown();

        return $text;
    }

    # Document-level transformations: method name => priority.
    public $document_gamut = array(
        # Strip link definitions, store in hashes.
        "stripLinkDefinitions" => 20,

        "runBasicBlockGamut" => 30
    );

    public function stripLinkDefinitions($text)
    {
        #
        # Strips link definitions from text, stores the URLs and titles in
        # hash references.
        #
        $less_than_tab = $this->tab_width - 1;

        # Link defs are in the form: ^[id]: url "optional title"
        $text = preg_replace_callback('{
            ^[ ]{0,' . $less_than_tab . '}\[(.+)\][ ]?: # id = $1
            [ ]*
            \n? # maybe *one* newline
            [ ]*
            (?:
            <(.+?)> # url = $2
            |
            (\S+?) # url = $3
            )
            [ ]*
            \n? # maybe one newline
            [ ]*
            (?:
            (?<=\s) # lookbehind for whitespace
            ["(]
            (.*?) # title = $4
            [")]
            [ ]*
            )? # title is optional
            (?:\n+|\Z)
            }xm', array(
            $this,
            '_stripLinkDefinitions_callback'
        ), $text);
        return $text;
    }

    public function _stripLinkDefinitions_callback($matches)
    {
        #
        # Records one link definition (lower-cased id => url / title) and
        # removes it from the text.
        #
        $link_id = strtolower($matches[1]);
        $url = $matches[2] == '' ? $matches[3] : $matches[2];
        $this->urls[$link_id] = $url;
        # The title group is optional; store null when it did not match so
        # that isset() on the entry stays false.
        $this->titles[$link_id] = isset($matches[4]) ? $matches[4] : null;
        return ''; # String that will replace the block
    }

    public function hashHTMLBlocks($text)
    {
        #
        # Replaces block-level raw HTML (and standalone comments / processing
        # instructions) with hash tokens so later passes leave it untouched.
        # Does nothing when $this->no_markup is set.
        #
        if ($this->no_markup) {
            return $text;
        }

        $less_than_tab = $this->tab_width - 1;

        # Hashify HTML blocks:
        # We only want to do this for block-level HTML tags, such as headers,
        # lists, and tables. That's because we still want to wrap <p>s around
        # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        # phrase emphasis, and spans. The list of tags we're looking for is
        # hard-coded:
        #
        # * List "a" is made of tags which can be both inline or block-level.
        #   These will be treated block-level when the start tag is alone on
        #   its line, otherwise they're not matched here and will be taken as
        #   inline later.
        # * List "b" is made of tags which are always block-level;
        #
        $block_tags_a_re = 'ins|del';
        $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|' . 'script|noscript|form|fieldset|iframe|math';

        # Regular expression for the content of a block tag.
        $nested_tags_level = 4;
        $attr = '
            (?> # optional tag attributes
            \s # starts with whitespace
            (?>
            [^>"/]+ # text outside quotes
            |
            /+(?!>) # slash not followed by ">"
            |
            "[^"]*" # text inside double quotes (tolerate ">")
            |
            \'[^\']*\' # text inside single quotes (tolerate ">")
            )*
            )?
            ';
        $content = str_repeat('
            (?>
            [^<]+ # content without tag
            |
            <\2 # nested opening tag
            ' . $attr . ' # attributes
            (?>
            />
            |
            >', $nested_tags_level) . # end of opening tag
            '.*?' . # last level nested tag content
            str_repeat('
            </\2\s*> # closing nested tag
            )
            |
            <(?!/\2\s*> # other tags with a different name
            )
            )*', $nested_tags_level);
        # Same pattern, but back-referencing the tag name captured in $3.
        $content2 = str_replace('\2', '\3', $content);

        # First, look for nested blocks, e.g.:
        # <div>
        #     <div>
        #     tags for inner block must be indented.
        #     </div>
        # </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        $text = preg_replace_callback('{(?>
            (?>
            (?<=\n\n) # Starting after a blank line
            | # or
            \A\n? # the beginning of the doc
            )
            ( # save in $1
            # Match from `\n<tag>` to `</tag>\n`, handling nested tags
            # in between.

            [ ]{0,' . $less_than_tab . '}
            <(' . $block_tags_b_re . ')# start tag = $2
            ' . $attr . '> # attributes followed by > and \n
            ' . $content . ' # content, support nesting
            </\2> # the matching end tag
            [ ]* # trailing spaces/tabs
            (?=\n+|\Z) # followed by a newline or end of document
            | # Special version for tags of group a.
            [ ]{0,' . $less_than_tab . '}
            <(' . $block_tags_a_re . ')# start tag = $3
            ' . $attr . '>[ ]*\n # attributes followed by >
            ' . $content2 . ' # content, support nesting
            </\3> # the matching end tag
            [ ]* # trailing spaces/tabs
            (?=\n+|\Z) # followed by a newline or end of document

            | # Special case just for <hr />. It was easier to make a special
            # case than to make the other regex more complicated.

            [ ]{0,' . $less_than_tab . '}
            <(hr) # start tag = $2
            ' . $attr . ' # attributes
            /?> # the matching end tag
            [ ]*
            (?=\n{2,}|\Z) # followed by a blank line or end of document

            | # Special case for standalone HTML comments:

            [ ]{0,' . $less_than_tab . '}
            (?s:
            <!-- .*? -->
            )
            [ ]*
            (?=\n{2,}|\Z) # followed by a blank line or end of document

            | # PHP and ASP-style processor instructions (<? and <%)

            [ ]{0,' . $less_than_tab . '}
            (?s:
            <([?%]) # $2
            .*?
            \2>
            )
            [ ]*
            (?=\n{2,}|\Z) # followed by a blank line or end of document

            )
            )}Sxmi', array(
            $this,
            '_hashHTMLBlocks_callback'
        ), $text);

        return $text;
    }

    public function _hashHTMLBlocks_callback($matches)
    {
        #
        # Swaps one matched HTML block for its block-level hash token,
        # isolated by blank lines.
        #
        $text = $matches[1];
        $key = $this->hashBlock($text);
        return "\n\n$key\n\n";
    }

    public function hashPart($text, $boundary = 'X')
    {
        #
        # Called whenever a tag must be hashed when a function inserts an atomic
        # element in the text stream. Passing $text through this function gives
        # a unique text-token which will be reverted back when calling unhash.
        #
        # The $boundary argument specifies what character should be used to
        # surround the token. By convention, "B" is used for block elements that
        # need not be wrapped into paragraph tags at the end, ":" is used for
        # elements that are word separators and "X" is used in the general case.
        #
        # Swap back any tag hash found in $text so we do not have to `unhash`
        # multiple times at the end.
        $text = $this->unhash($text);

        # Then hash the block. The counter is static, so tokens stay unique
        # across calls.
        static $i = 0;
        $key = "$boundary\x1A" . ++ $i . $boundary;
        $this->html_hashes[$key] = $text;
        return $key; # String that will replace the tag.
    }

    public function hashBlock($text)
    {
        #
        # Shortcut function for hashPart with block-level boundaries.
        #
        return $this->hashPart($text, 'B');
    }

    public $block_gamut = array(
        #
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.
        #
        "doCodeBlocks" => 5,
        "doHeaders" => 10,
        "doLists" => 40,
        "doBlockQuotes" => 60
    );

    public function runBlockGamut($text)
    {
        #
        # Run block gamut transformations.
        #
        # We need to escape raw HTML in Markdown source before doing anything
        # else. This needs to be done for each block, and not only at the
        # beginning in the Markdown function since hashed blocks can be part of
        # list items and could have been indented. Indented blocks would have
        # been seen as a code block in a previous pass of hashHTMLBlocks.
        $text = $this->hashHTMLBlocks($text);

        return $this->runBasicBlockGamut($text);
    }

    public function runBasicBlockGamut($text)
    {
        #
        # Run block gamut transformations, without hashing HTML blocks. This is
        # useful when HTML blocks are known to be already hashed, like in the first
        # whole-document pass.
        #
        foreach ($this->block_gamut as $method => $priority) {
            $text = $this->$method($text);
        }

        # Finally form paragraphs and restore hashed blocks.
        $text = $this->formParagraphs($text);

        return $text;
    }

    public $span_gamut = array(
        #
        # These are all the transformations that occur *within* block-level
        # tags like paragraphs, headers, and list items.
        #
        # Process character escapes, code spans, and inline HTML
        # in one shot.
        "parseSpan" => - 30,

        # Process image tags.
        "doImages" => 10,

        # Encode ampersands and "<" that are still present as plain text.
        "encodeAmpsAndAngles" => 40,

        "doItalicsAndBold" => 50,
        "doHardBreaks" => 60
    );

    public function runSpanGamut($text)
    {
        #
        # Run span gamut transformations.
        #
        foreach ($this->span_gamut as $method => $priority) {
            $text = $this->$method($text);
        }

        return $text;
    }

    public function doHardBreaks($text)
    {
        # Do hard breaks: two or more spaces before a newline become <br>.
        return preg_replace_callback('/ {2,}\n/', array(
            $this,
            '_doHardBreaks_callback'
        ), $text);
    }

    public function _doHardBreaks_callback($matches)
    {
        # Emits the hashed <br> element followed by a newline.
        return $this->hashPart("<br$this->empty_element_suffix\n");
    }

    public function doImages($text)
    {
        #
        # Turn Markdown image shortcuts into <img> tags.
        #
        #
        # First, handle reference-style labeled images: ![alt text][id]
        #
        $text = preg_replace_callback('{
            ( # wrap whole match in $1
            !\[
            (' . $this->nested_brackets_re . ') # alt text = $2
            \]
            [ ]? # one optional space
            (?:\n[ ]*)? # one optional newline followed by spaces
            \[
            (.*?) # id = $3
            \]
            )
            }xs', array(
            $this,
            '_doImages_reference_callback'
        ), $text);

        #
        # Next, handle inline images: ![alt text](url "optional title")
        # Don't forget: encode * and _
        #
        $text = preg_replace_callback('{
            ( # wrap whole match in $1
            !\[
            (' . $this->nested_brackets_re . ') # alt text = $2
            \]
            \s? # One optional whitespace character
            \( # literal paren
            [ \n]*
            (?:
            <(\S*)> # src url = $3
            |
            (' . $this->nested_url_parenthesis_re . ') # src url = $4
            )
            [ \n]*
            ( # $5
            ([\'"]) # quote char = $6
            (.*?) # title = $7
            \6 # matching quote
            [ \n]*
            )? # title is optional
            \)
            )
            }xs', array(
            $this,
            '_doImages_inline_callback'
        ), $text);

        return $text;
    }

    public function _doImages_reference_callback($matches)
    {
        #
        # Builds the <img> tag for a reference-style image, or returns the
        # matched text unchanged when the reference id is unknown.
        #
        $whole_match = $matches[1];
        $alt_text = $matches[2];
        $link_id = strtolower($matches[3]);

        if ($link_id == "") {
            $link_id = strtolower($alt_text); # for shortcut links like ![this][].
        }

        if (!isset($this->urls[$link_id])) {
            # If there's no such link ID, leave intact:
            return $whole_match;
        }

        $alt_text = $this->encodeAttribute($alt_text);
        $url = $this->encodeAttribute($this->urls[$link_id]);
        $result = "<img src=\"$url\" alt=\"$alt_text\"";
        if (isset($this->titles[$link_id])) {
            $title = $this->encodeAttribute($this->titles[$link_id]);
            $result .= " title=\"$title\"";
        }
        $result .= $this->empty_element_suffix;

        return $this->hashPart($result);
    }

    public function _doImages_inline_callback($matches)
    {
        #
        # Builds the hashed <img> tag for an inline image.
        #
        $alt_text = $matches[2];
        $url = $matches[3] == '' ? $matches[4] : $matches[3];
        # The title group is optional and may be absent from $matches.
        $title = isset($matches[7]) ? $matches[7] : null;

        $alt_text = $this->encodeAttribute($alt_text);
        $url = $this->encodeAttribute($url);
        $result = "<img src=\"$url\" alt=\"$alt_text\"";
        if (isset($title)) {
            $title = $this->encodeAttribute($title);
            $result .= " title=\"$title\"";
        }
        $result .= $this->empty_element_suffix;

        return $this->hashPart($result);
    }

    public function doHeaders($text)
    {
        #
        # Converts atx-style headers. Only "##" and "###" prefixes are
        # matched, producing <h2> and <h3>.
        #
        $text = preg_replace_callback('{
            ^(\#{2,3}) # $1 = string of #\'s
            [ ]*
            ([^\n]+?) # $2 = Header text
            [ ]*
            \#* # optional closing #\'s (not counted)
            [\n]
            }xm', array(
            $this,
            '_doHeaders_callback_atx'
        ), $text);

        return $text;
    }

    public function _doHeaders_callback_atx($matches)
    {
        # The header level equals the number of leading "#" characters.
        $level = strlen($matches[1]);
        $block = "<h$level>" . $this->runSpanGamut($matches[2]) . "</h$level>";
        return $this->hashBlock($block);
    }

    public function doLists($text)
    {
        #
        # Form HTML ordered (numbered) and unordered (bulleted) lists.
        #
        $less_than_tab = $this->tab_width - 1;

        # Re-usable patterns to match list item bullets and number markers:
        $marker_ul_re = '[-]';
        $marker_ol_re = '\d+[\.]';

        $markers_relist = array(
            $marker_ul_re => $marker_ol_re,
            $marker_ol_re => $marker_ul_re
        );

        foreach ($markers_relist as $marker_re => $other_marker_re) {
            # Re-usable pattern to match any entire ul or ol list:
            $whole_list_re = '
                ( # $1 = whole list
                ( # $2
                ([ ]{0,' . $less_than_tab . '}) # $3 = number of spaces
                (' . $marker_re . ') # $4 = first list item marker
                [ ]+
                )
                (?s:.+?)
                ( # $5
                \z
                |
                \n{2,}
                (?=\S)
                (?! # Negative lookahead for another list item marker
                [ ]*
                ' . $marker_re . '[ ]+
                )
                |
                (?= # Lookahead for another kind of list
                \n
                \3 # Must have the same indentation
                ' . $other_marker_re . '[ ]+
                )
                )
                )
                '; // mx

            # We use a different prefix before nested lists than top-level lists.
            # See extended comment in processListItems().

            if ($this->list_level) {
                $text = preg_replace_callback('{
                    ^
                    ' . $whole_list_re . '
                    }mx', array(
                    $this,
                    '_doLists_callback'
                ), $text);
            } else {
                $text = preg_replace_callback('{
                    (?:(?<=\n)\n|\A\n?) # Must eat the newline
                    ' . $whole_list_re . '
                    }mx', array(
                    $this,
                    '_doLists_callback'
                ), $text);
            }
        }

        return $text;
    }

    public function _doLists_callback($matches)
    {
        #
        # Converts one matched list into a hashed <ul> or <ol> block.
        #
        # Re-usable patterns to match list item bullets and number markers:
        $marker_ul_re = '[*+-]';
        $marker_ol_re = '\d+[\.]';

        $list = $matches[1];
        $list_type = preg_match("/$marker_ul_re/", $matches[4]) ? "ul" : "ol";

        # Items are split using the marker pattern of the detected list type.
        $marker_any_re = ($list_type == "ul" ? $marker_ul_re : $marker_ol_re);

        $list .= "\n";
        $result = $this->processListItems($list, $marker_any_re);

        return $this->hashBlock("<$list_type>" . $result . "</$list_type>");
    }

    # Current list nesting depth; see processListItems().
    public $list_level = 0;

    public function processListItems($list_str, $marker_any_re)
    {
        #
        # Process the contents of a single ordered or unordered list, splitting it
        # into individual list items.
        #
        # The $this->list_level global keeps track of when we're inside a list.
        # Each time we enter a list, we increment it; when we leave a list,
        # we decrement. If it's zero, we're not in a list anymore.
        #
        # We do this because when we're not inside a list, we want to treat
        # something like this:
        #
        #     I recommend upgrading to version
        #     8. Oops, now this line is treated
        #     as a sub-list.
        #
        # As a single paragraph, despite the fact that the second line starts
        # with a digit-period-space sequence.
        #
        # Whereas when we're inside a list (or sub-list), that line will be
        # treated as the start of a sub-list. What a kludge, huh? This is
        # an aspect of Markdown's syntax that's hard to parse perfectly
        # without resorting to mind-reading. Perhaps the solution is to
        # change the syntax rules such that sub-lists must start with a
        # starting cardinal number; e.g. "1." or "a.".

        $this->list_level ++;

        # trim trailing blank lines:
        $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);

        $list_str = preg_replace_callback('{
            (\n)? # leading line = $1
            (^[ ]*) # leading whitespace = $2
            (' . $marker_any_re . ' # list marker and space = $3
            (?:[ ]+|(?=\n)) # space only required if item is not empty
            )
            ((?s:.*?)) # list item text = $4
            (?:(\n+(?=\n))|\n) # tailing blank line = $5
            (?= \n* (\z | \2 (' . $marker_any_re . ') (?:[ ]+|(?=\n))))
            }xm', array(
            $this,
            '_processListItems_callback'
        ), $list_str);

        $this->list_level --;
        return $list_str;
    }

    public function _processListItems_callback($matches)
    {
        #
        # Renders one list item as <li>...</li>. Items separated by blank
        # lines (or containing one) go through the block gamut; other items
        # only get nested-list and span processing.
        #
        $item = $matches[4];
        $leading_line = isset($matches[1]) ? $matches[1] : null;
        $leading_space = isset($matches[2]) ? $matches[2] : null;
        $marker_space = $matches[3];
        $tailing_blank_line = isset($matches[5]) ? $matches[5] : null;

        if ($leading_line || $tailing_blank_line || preg_match('/\n{2,}/', $item)) {
            # Replace marker with the appropriate whitespace indentation
            $item = $leading_space . str_repeat(' ', strlen($marker_space)) . $item;
            $item = $this->runBlockGamut($this->outdent($item) . "\n");
        } else {
            # Recursion for sub-lists:
            $item = $this->doLists($this->outdent($item));
            $item = preg_replace('/\n+$/', '', $item);
            $item = $this->runSpanGamut($item);
        }

        return "<li>" . $item . "</li>";
    }

    public function doCodeBlocks($text)
    {
        #
        # Process `{{{ ... }}}` code blocks into <code> elements.
        #
        # preg_match() stops at the first hit, so each array below holds zero
        # or one element: the check passes when both markers are present or
        # both are absent, regardless of how many there are.
        # NOTE(review): if balanced *counts* were intended this would need
        # preg_match_all(); left as-is to preserve behaviour.
        preg_match('/\{\{\{/i', $text, $open_marker);
        preg_match('/\}\}\}/i', $text, $close_marker);

        if (count($open_marker) == count($close_marker)) {
            $text = preg_replace_callback('/\{\{\{[ \n]*(.*?)\}\}\}/is', array(
                $this,
                'code_block_callback'
            ), $text);
        }

        return $text;
    }

    public function code_block_callback($matches)
    {
        #
        # Wraps one code block body in <code>, after protecting Markdown
        # special characters with md5Hash() so later passes ignore them.
        #
        # NOTE(review): the space => space replacement is a no-op as written;
        # the replacement strings may originally have been non-breaking-space
        # entities. Confirm against the upstream file before changing.
        $str = str_replace(array(
            "\t",
            " "
        ), array(
            " ",
            " "
        ), $matches[1]);

        $str = $this->md5Hash($str);

        return '<code>' . $str . '</code>';
    }

    public function get_hash_table()
    {
        #
        # Returns array(0 => special characters, 1 => their md5 digests),
        # index-aligned, for use with str_replace() in both directions.
        #
        $md_reg = array(
            '\\',
            '`',
            '*',
            '_',
            '{',
            '}',
            '[',
            ']',
            '(',
            ')',
            '#',
            '.',
            '!',
            ':',
            '<',
            '>'
        );

        $rd_reg = array_map('md5', $md_reg);

        return array(
            $md_reg,
            $rd_reg
        );
    }

    public function md5Hash($text)
    {
        # Replace every special character with its md5 digest.
        $hash = $this->get_hash_table();

        return str_replace($hash[0], $hash[1], $text);
    }

    public function md5Hash_decode($text)
    {
        # Inverse of md5Hash(): replace digests with the original characters.
        $hash = $this->get_hash_table();

        return str_replace($hash[1], $hash[0], $text);
    }

    public function makeCodeSpan($code)
    {
        #
        # Create a code span markup for $code.
        #
        $code = htmlspecialchars(trim($code), ENT_NOQUOTES);
        return $this->hashPart("<code>$code</code>");
    }

    # Emphasis token patterns, keyed by the currently open marker
    # ('' = nothing open).
    public $em_relist = array(
        '' => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?=\S|$)(?![\.,:;]\s)',
        '*' => '(?<=\S|^)(?<!\*)\*(?!\*)',
        '_' => '(?<=\S|^)(?<!_)_(?!_)'
    );
    public $strong_relist = array(
        '' => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?=\S|$)(?![\.,:;]\s)',
        '**' => '(?<=\S|^)(?<!\*)\*\*(?!\*)',
        '__' => '(?<=\S|^)(?<!_)__(?!_)'
    );
    public $em_strong_relist = array(
        '' => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?=\S|$)(?![\.,:;]\s)',
        '***' => '(?<=\S|^)(?<!\*)\*\*\*(?!\*)',
        '___' => '(?<=\S|^)(?<!_)___(?!_)'
    );
    # Filled by prepareItalicsAndBold(): "$em$strong" state => token regex.
    public $em_strong_prepared_relist = array();

    public function prepareItalicsAndBold()
    {
        #
        # Prepare regular expressions for searching emphasis tokens in any
        # context.
        #
        foreach ($this->em_relist as $em => $em_re) {
            foreach ($this->strong_relist as $strong => $strong_re) {
                # Construct list of allowed token expressions.
                $token_relist = array();
                if (isset($this->em_strong_relist["$em$strong"])) {
                    $token_relist[] = $this->em_strong_relist["$em$strong"];
                }
                $token_relist[] = $em_re;
                $token_relist[] = $strong_re;

                # Construct master expression from list.
                $token_re = '{(' . implode('|', $token_relist) . ')}';
                $this->em_strong_prepared_relist["$em$strong"] = $token_re;
            }
        }
    }

    public function doItalicsAndBold($text)
    {
        #
        # Converts emphasis markers to <em> / <strong>. Works as a small
        # stack machine: $token_stack holds the open markers and $text_stack
        # the text collected since each of them (index 0 = innermost).
        # In this variant "__" is always emitted as plain text and a single
        # "_" never closes an emphasis span.
        #
        $token_stack = array(
            ''
        );
        $text_stack = array(
            ''
        );
        $em = '';
        $strong = '';
        $tree_char_em = false;

        while (1) {
            #
            # Get prepared regular expression for searching emphasis tokens
            # in current context.
            #
            $token_re = $this->em_strong_prepared_relist["$em$strong"];

            #
            # Each loop iteration searches for the next emphasis token.
            #
            $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
            $text_stack[0] .= $parts[0];
            $token = isset($parts[1]) ? $parts[1] : null;
            $text = isset($parts[2]) ? $parts[2] : null;

            if (empty($token)) {
                # Reached end of text span: empty stack without emitting
                # any more emphasis.
                while ($token_stack[0]) {
                    $text_stack[1] .= array_shift($token_stack);
                    $text_stack[0] .= array_shift($text_stack);
                }
                break;
            }

            $token_len = strlen($token);
            if ($tree_char_em) {
                # Reached closing marker while inside a three-char emphasis.
                if ($token_len == 3) {
                    # Three-char closing marker, close em and strong.
                    array_shift($token_stack);
                    $span = array_shift($text_stack);
                    $span = $this->runSpanGamut($span);
                    $span = "<strong><em>$span</em></strong>";
                    $text_stack[0] .= $this->hashPart($span);
                    $em = '';
                    $strong = '';
                } else {
                    # Other closing marker: close one em or strong and
                    # change current token state to match the other
                    $token_stack[0] = str_repeat($token[0], 3 - $token_len);
                    $tag = $token_len == 2 ? "strong" : "em";
                    $span = $text_stack[0];
                    $span = $this->runSpanGamut($span);
                    $span = "<$tag>$span</$tag>";
                    $text_stack[0] = $this->hashPart($span);
                    # Clear the state of whichever marker was just closed.
                    if ($tag == "strong") {
                        $strong = '';
                    } else {
                        $em = '';
                    }
                }
                $tree_char_em = false;
            } else if ($token_len == 3) {
                if ($em) {
                    # Reached closing marker for both em and strong.
                    # Closing strong marker:
                    for ($i = 0; $i < 2; ++ $i) {
                        $shifted_token = array_shift($token_stack);
                        $tag = strlen($shifted_token) == 2 ? "strong" : "em";
                        $span = array_shift($text_stack);
                        $span = $this->runSpanGamut($span);
                        $span = "<$tag>$span</$tag>";
                        $text_stack[0] .= $this->hashPart($span);
                        if ($tag == "strong") {
                            $strong = '';
                        } else {
                            $em = '';
                        }
                    }
                } else {
                    # Reached opening three-char emphasis marker. Push on token
                    # stack; will be handled by the special condition above.
                    $em = $token[0];
                    $strong = "$em$em";
                    array_unshift($token_stack, $token);
                    array_unshift($text_stack, '');
                    $tree_char_em = true;
                }
            } else if ($token_len == 2) {
                if ($token == '__') { // remove __ support
                    $text_stack[0] .= $token;
                } else if ($strong) {
                    # Unwind any dangling emphasis marker:
                    if (strlen($token_stack[0]) == 1) {
                        $text_stack[1] .= array_shift($token_stack);
                        $text_stack[0] .= array_shift($text_stack);
                    }
                    # Closing strong marker:
                    array_shift($token_stack);
                    $span = array_shift($text_stack);
                    $span = $this->runSpanGamut($span);
                    $span = "<strong>$span</strong>";
                    $text_stack[0] .= $this->hashPart($span);
                    $strong = '';
                } else {
                    array_unshift($token_stack, $token);
                    array_unshift($text_stack, '');
                    $strong = $token;
                }
            } else {
                # Here $token_len == 1
                if ($em) {
                    if (strlen($token_stack[0]) == 1 && ($token != '_')) { // remove _ support
                        # Closing emphasis marker:
                        array_shift($token_stack);
                        $span = array_shift($text_stack);
                        $span = $this->runSpanGamut($span);
                        $span = "<em>$span</em>";
                        $text_stack[0] .= $this->hashPart($span);
                        $em = '';
                    } else {
                        $text_stack[0] .= $token;
                    }
                } else {
                    array_unshift($token_stack, $token);
                    array_unshift($text_stack, '');
                    $em = $token;
                }
            }
        }
        return $text_stack[0];
    }

    public function doBlockQuotes($text)
    {
        #
        # Converts runs of ">"-prefixed lines into <blockquote> blocks.
        #
        $text = preg_replace_callback('/
            ( # Wrap whole match in $1
            (?>
            ^[ ]*>[ ]? # ">" at the start of a line
            .+\n # rest of the first line
            (.+\n)* # subsequent consecutive lines
            \n* # blanks
            )+
            )
            /xm', array(
            $this,
            '_doBlockQuotes_callback'
        ), $text);

        return $text;
    }

    public function _doBlockQuotes_callback($matches)
    {
        #
        # Renders one block quote: strips a level of quoting, recurses through
        # the block gamut, indents the result and returns it hashed.
        #
        $bq = $matches[1];
        # trim one level of quoting - trim whitespace-only lines
        $bq = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $bq);
        $bq = $this->runBlockGamut($bq); # recurse

        $bq = preg_replace('/^/m', " ", $bq);
        # These leading spaces cause problem with <pre> content,
        # so we need to fix that:
        $bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx', array(
            $this,
            '_doBlockQuotes_callback2'
        ), $bq);

        # Protect special characters; transform() decodes them at the end.
        $bq = $this->md5Hash($bq);

        return $this->hashBlock("<blockquote>$bq</blockquote>\n");
    }

    public function _doBlockQuotes_callback2($matches)
    {
        # Removes the indentation added to lines inside a <pre> element.
        return preg_replace('/^ /m', '', $matches[1]);
    }

    public function formParagraphs($text)
    {
        #
        # Splits the text on blank lines, runs the span gamut on plain
        # paragraphs and restores hashed blocks. Paragraphs are deliberately
        # NOT wrapped in <p> tags in this variant.
        #
        # Params:
        #   $text - string to process
        #
        # Strip leading and trailing lines:
        $text = preg_replace('/\A\n+|\n+\z/', '', $text);

        $grafs = preg_split('/\n{2,}/', $text, - 1, PREG_SPLIT_NO_EMPTY);

        foreach ($grafs as $key => $value) {
            if (preg_match('/^B\x1A[0-9]+B$/', $value)) {
                # Is a block: swap the token for the stored HTML.
                $grafs[$key] = $this->html_hashes[$value];
            } else {
                # Is a paragraph.
                $value = $this->runSpanGamut($value);
                $grafs[$key] = $this->unhash($value);
            }
        }

        return implode("\n\n", $grafs);
    }

    public function encodeAttribute($text)
    {
        #
        # Encode text for a double-quoted HTML attribute. This function
        # is *not* suitable for attributes enclosed in single quotes.
        #
        $text = $this->encodeAmpsAndAngles($text);
        $text = str_replace('"', '&quot;', $text);
        return $text;
    }

    public function encodeAmpsAndAngles($text)
    {
        #
        # Smart processing for ampersands and angle brackets that need to
        # be encoded. Valid character entities are left alone unless the
        # no-entities mode is set.
        #
        if ($this->no_entities) {
            $text = str_replace('&', '&amp;', $text);
        } else {
            # Ampersand-encoding based entirely on Nat Irons's Amputator
            # MT plugin: <http://bumppo.net/projects/amputator/>
            $text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/', '&amp;', $text);
        }
        # Encode remaining <'s
        $text = str_replace('<', '&lt;', $text);

        return $text;
    }

    public function parseSpan($str)
    {
        #
        # Take the string $str and parse it into tokens, hashing embedded HTML,
        # escaped characters and handling code spans.
        #
        $output = '';

        $span_re = '{
            (
            \\\\' . $this->escape_chars_re . '
            |
            (?<![`\\\\])
            `+ # code span marker
            ' . ($this->no_markup ? '' : '
            |
            <!-- .*? --> # comment
            |
            <\?.*?\?> | <%.*?%> # processing instruction
            |
            <[/!$]?[-a-zA-Z0-9:_]+ # regular tags
            (?>
            \s
            (?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
            )?
            >
            ') . '
            )
            }xs';

        while (1) {
            #
            # Each loop iteration searches for either the next tag, the next
            # opening code span marker, or the next escaped character.
            # Each token is then passed to handleSpanToken.
            #
            $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);

            # Create token from text preceding tag.
            if ($parts[0] != "") {
                $output .= $parts[0];
            }

            # Check if we reach the end.
            if (!isset($parts[1])) {
                break;
            }

            $output .= $this->handleSpanToken($parts[1], $parts[2]);
            $str = $parts[2];
        }

        return $output;
    }

    public function handleSpanToken($token, &$str)
    {
        #
        # Handle $token provided by parseSpan by determining its nature and
        # returning the corresponding value that should replace it.
        #
        # In this variant backslash escapes and backtick runs are returned
        # unchanged as text; every other token (inline HTML) is hashed.
        # $str is accepted for interface compatibility and is not modified.
        #
        switch ($token[0]) {
            case "\\":
            case "`":
                return $token;
            default:
                return $this->hashPart($token);
        }
    }

    public function outdent($text)
    {
        #
        # Remove one level of line-leading tabs or spaces
        #
        return preg_replace('/^(\t|[ ]{1,' . $this->tab_width . '})/m', '', $text);
    }

    # String length function for detab. `_initDetab` will create a function to
    # handle UTF-8 if the default function does not exist.
    public $utf8_strlen = 'mb_strlen';

    public function detab($text)
    {
        #
        # Replace tabs with the appropriate amount of space.
        #
        # For each line we separate the line in blocks delimited by
        # tab characters. Then we reconstruct every line by adding the
        # appropriate number of spaces between each block.

        $text = preg_replace_callback('/^.*\t.*$/m', array(
            $this,
            '_detab_callback'
        ), $text);

        return $text;
    }

    public function _detab_callback($matches)
    {
        #
        # Expands the tabs of a single line to the next tab stop.
        #
        $line = $matches[0];
        $strlen = $this->utf8_strlen; # strlen function for UTF-8.

        # Split in blocks.
        $blocks = explode("\t", $line);
        # Add each block to the line.
        $line = $blocks[0];
        unset($blocks[0]); # Do not add first block twice.
        foreach ($blocks as $block) {
            # Calculate amount of space, insert spaces, insert block.
            $amount = $this->tab_width - $strlen($line, 'UTF-8') % $this->tab_width;
            $line .= str_repeat(" ", $amount) . $block;
        }
        return $line;
    }

    public function _initDetab()
    {
        #
        # Check for the availability of the function in the `utf8_strlen` property
        # (initially `mb_strlen`). If the function is not available, install a
        # closure that will loosely count the number of UTF-8 characters with a
        # regular expression.
        #
        if (is_callable($this->utf8_strlen)) {
            return;
        }
        # Counts one match per ASCII/continuation byte or per lead byte plus
        # its continuation bytes. Extra arguments passed by the caller (the
        # encoding name) are ignored.
        $this->utf8_strlen = function ($text) {
            return preg_match_all('/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/', $text, $m);
        };
    }

    public function unhash($text)
    {
        #
        # Swap back in all the tags hashed by hashPart.
        #
        return preg_replace_callback('/(.)\x1A[0-9]+\1/', array(
            $this,
            '_unhash_callback'
        ), $text);
    }

    public function _unhash_callback($matches)
    {
        # Looks up the stored text for one hash token.
        return $this->html_hashes[$matches[0]];
    }
}