Markdown.php - Markdown - A text-to-HTML conversion tool fo…

/system/Services/Markdown.php

https://github.com/sony88/answion · PHP · 1348 lines · 921 code · 155 blank · 272 comment · 51 complexity · 4ce15305e83164c3d205b803ce146a03 MD5 · raw file

<?php
#
# Markdown  -  A text-to-HTML conversion tool for web writers
#
# PHP Markdown
# Copyright (c) 2004-2012 Michel Fortin  
# <http://michelf.com/projects/php-markdown/>
#
# Original Markdown
# Copyright (c) 2004-2006 John Gruber  
# <http://daringfireball.net/projects/markdown/>
#


# Suffix that closes empty elements such as <br /> and <img />.
# Define this constant as ">" before loading this file for HTML output.
if (! defined('MARKDOWN_EMPTY_ELEMENT_SUFFIX'))
{
	define('MARKDOWN_EMPTY_ELEMENT_SUFFIX', " />");
}

# Width of a tab stop, in spaces. A value defined before this file is
# loaded takes precedence; the explicit guard replaces the former "@"
# error suppression so that no warning is raised and hidden.
if (! defined('MARKDOWN_TAB_WIDTH'))
{
	define('MARKDOWN_TAB_WIDTH', 4);
}

class Services_Markdown
{
	
	# Balanced [brackets] are matched with a regex of fixed nesting depth.
	# The depth is configured here; the regex is built by the constructor.
	public $nested_brackets_depth = 6;
	public $nested_brackets_re;
	
	# Same scheme for balanced (parentheses) inside URLs.
	public $nested_url_parenthesis_depth = 4;
	public $nested_url_parenthesis_re;
	
	# Characters that can be backslash-escaped, plus the character class
	# (built by the constructor) matching any one of them.
	public $escape_chars = '\`*_{}[]()>#+-.!';
	public $escape_chars_re;
	
	# Closing suffix for empty elements (">" for HTML output) and the
	# tab width; both default to the constants defined above the class.
	public $empty_element_suffix = MARKDOWN_EMPTY_ELEMENT_SUFFIX;
	public $tab_width = MARKDOWN_TAB_WIDTH;
	
	# Set to `true` to disallow raw markup or entities in the input.
	public $no_markup = false;
	public $no_entities = false;
	
	# URLs and titles made available to every document for reference-style
	# images, keyed by link id.
	public $predef_urls = array();
	public $predef_titles = array();

	function __construct()
	{
		#
		# Constructor. Builds the derived regular expressions and sorts
		# the gamuts by priority.
		#
		$this->_initDetab();
		$this->prepareItalicsAndBold();
		
		# Balanced [brackets]: one opening and one closing fragment per
		# supported nesting level.
		$bracket_levels = $this->nested_brackets_depth;
		$bracket_open = str_repeat('(?>[^\[\]]+|\[', $bracket_levels);
		$bracket_close = str_repeat('\])*', $bracket_levels);
		$this->nested_brackets_re = $bracket_open . $bracket_close;
		
		# Balanced (parentheses) in URLs, built the same way.
		$paren_levels = $this->nested_url_parenthesis_depth;
		$paren_open = str_repeat('(?>[^()\s]+|\(', $paren_levels);
		$paren_close = str_repeat('(?>\)))*', $paren_levels);
		$this->nested_url_parenthesis_re = $paren_open . $paren_close;
		
		# Character class matching any escapable character.
		$quoted_chars = preg_quote($this->escape_chars);
		$this->escape_chars_re = '[' . $quoted_chars . ']';
		
		# Order each gamut by ascending priority value, keeping the
		# method names as keys.
		foreach (array('document_gamut', 'block_gamut', 'span_gamut') as $gamut)
		{
			asort($this->$gamut);
		}
	}
	
	# Per-document state: link URLs and titles keyed by link id, and the
	# table mapping hash tokens back to the text they replaced.
	public $urls = array();
	public $titles = array();
	public $html_hashes = array();
	
	# Flag meant to prevent invalid nesting inside anchors.
	public $in_anchor = false;

	function setup()
	{
		#
		# Called before the transformation process starts to set up the
		# parser state.
		#
		# Start from the predefined reference links and an empty hash table.
		$this->urls = $this->predef_urls;
		$this->titles = $this->predef_titles;
		$this->html_hashes = array();
		
		# Reset the nesting flag on the instance. The previous code assigned
		# a local variable named $in_anchor, which left the property as it
		# was after the preceding transformation.
		$this->in_anchor = false;
	}

	function teardown()
	{
		#
		# Called once a transformation is finished: empties the
		# per-document tables so they do not hold on to memory.
		#
		foreach (array('urls', 'titles', 'html_hashes') as $table)
		{
			$this->$table = array();
		}
	}

	function transform($text)
	{
		#
		# Main function. Performs some preprocessing on the input text
		# and passes it through the document gamut.
		#
		# Params:
		#   $text - the Markdown source to convert.
		# Returns the text after the document gamut, FORMAT::parse_links()
		# and md5Hash_decode() have been applied to it.
		#
		$this->setup();
		
		# Remove UTF-8 BOM and marker character in input, if present.
		# (\x1A is the character hashPart() uses inside its tokens.)
		$text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);
		
		# Standardize line endings:
		#   DOS to Unix and Mac to Unix
		$text = preg_replace('{\r\n?}', "\n", $text);
		
		# Make sure $text ends with a newline:
		$text .= "\n";
		
		# Convert all tabs to spaces.
		$text = $this->detab($text);
		
		# Turn block-level HTML blocks into hash entries
		$text = $this->hashHTMLBlocks($text);
		
		# Strip any lines consisting only of spaces (tabs were already
		# expanded by detab() above).
		# This makes subsequent regexen easier to write, because we can
		# match consecutive blank lines with /\n+/ instead of something
		# contorted like /[ ]*\n+/ .
		$text = preg_replace('/^[ ]+$/m', '', $text);
		
		# Run document gamut methods.
		foreach ($this->document_gamut as $method => $priority)
		{
			$text = $this->$method($text);
		}
		
		# NOTE(review): FORMAT is not defined in this file; presumably a
		# project helper that post-processes links -- confirm.
		$text = FORMAT::parse_links($text);
		
		# Restore the characters that md5Hash() replaced by their MD5 digest
		# (code blocks and block quotes).
		$text = $this->md5Hash_decode($text);
		
		$this->teardown();
		
		return $text;
	}
	
	# Methods applied to the whole document by transform(), mapped to
	# their priority (lower values run first).
	public $document_gamut = array(
		# Collect link definitions into the url/title tables.
		"stripLinkDefinitions" => 20, 
		
		# Then run the block-level transformations.
		"runBasicBlockGamut" => 30
	);

	function stripLinkDefinitions($text)
	{
		#
		# Strips link definitions from text and stores the URLs and titles
		# in $this->urls and $this->titles (done by the callback).
		#
		# Params:
		#   $text - document text.
		# Returns the text with every matched definition removed.
		#
		# A definition may be indented by fewer spaces than one tab width.
		$less_than_tab = $this->tab_width - 1;
		
		# Link defs are in the form: ^[id]: url "optional title"
		$text = preg_replace_callback('{
								^[ ]{0,' . $less_than_tab . '}\[(.+)\][ ]?:	# id = $1
								  [ ]*
								  \n?				# maybe *one* newline
								  [ ]*
								(?:
								  <(.+?)>			# url = $2
								|
								  (\S+?)			# url = $3
								)
								  [ ]*
								  \n?				# maybe one newline
								  [ ]*
								(?:
									(?<=\s)			# lookbehind for whitespace
									["(]
									(.*?)			# title = $4
									[")]
									[ ]*
								)?	# title is optional
								(?:\n+|\Z)
				}xm', array(
			&$this, 
			'_stripLinkDefinitions_callback'
		), $text);
		return $text;
	}

	function _stripLinkDefinitions_callback($matches)
	{
		#
		# Callback for stripLinkDefinitions(): records one definition.
		#
		# $matches[1] is the id, $matches[2] the URL written between angle
		# brackets, $matches[3] the bare URL and $matches[4] the optional
		# title. Returns the empty string, which removes the definition
		# from the text.
		#
		$id = strtolower($matches[1]);
		
		$this->urls[$id] = $matches[2] != '' ? $matches[2] : $matches[3];
		
		# A definition without a title stores null, so that isset() on the
		# title later reports false.
		$this->titles[$id] = isset($matches[4]) ? $matches[4] : null;
		
		return '';
	}

	function hashHTMLBlocks($text)
	{
		#
		# Replaces block-level raw HTML (block tags, <hr>, standalone
		# comments, processing instructions) by hash tokens so that later
		# transformations leave it alone.
		#
		# Params:
		#   $text - text to scan.
		# Returns $text unchanged when $this->no_markup is set; otherwise
		# the text with each matched block replaced by its token surrounded
		# by blank lines (see _hashHTMLBlocks_callback).
		#
		if ($this->no_markup)
			return $text;
		
		$less_than_tab = $this->tab_width - 1;
		
		# Hashify HTML blocks:
		# We only want to do this for block-level HTML tags, such as headers,
		# lists, and tables. That's because we still want to wrap <p>s around
		# "paragraphs" that are wrapped in non-block-level tags, such as anchors,
		# phrase emphasis, and spans. The list of tags we're looking for is
		# hard-coded:
		#
		# *  List "a" is made of tags which can be both inline or block-level.
		#    These will be treated block-level when the start tag is alone on 
		#    its line, otherwise they're not matched here and will be taken as 
		#    inline later.
		# *  List "b" is made of tags which are always block-level;
		#
		$block_tags_a_re = 'ins|del';
		$block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|' . 'script|noscript|form|fieldset|iframe|math';
		
		# Regular expression for the content of a block tag.
		# The content pattern is unrolled $nested_tags_level times, so that
		# many levels of same-name nested tags are recognised.
		$nested_tags_level = 4;
		$attr = '
				(?>				# optional tag attributes
				  \s			# starts with whitespace
				  (?>
					[^>"/]+		# text outside quotes
				  |
					/+(?!>)		# slash not followed by ">"
				  |
					"[^"]*"		# text inside double quotes (tolerate ">")
				  |
					\'[^\']*\'	# text inside single quotes (tolerate ">")
				  )*
				)?	
				';
		$content = str_repeat('
					(?>
					  [^<]+			# content without tag
					|
					  <\2			# nested opening tag
						' . $attr . '	# attributes
						(?>
						  />
						|
						  >', $nested_tags_level) . 		# end of opening tag
'.*?' . 		# last level nested tag content
str_repeat('
						  </\2\s*>	# closing nested tag
						)
					  |				
						<(?!/\2\s*>	# other tags with a different name
					  )
					)*', $nested_tags_level);
		# Same pattern for the "a" tag list, whose tag name is captured in
		# group 3 instead of group 2.
		$content2 = str_replace('\2', '\3', $content);
		
		# First, look for nested blocks, e.g.:
		# 	<div>
		# 		<div>
		# 		tags for inner block must be indented.
		# 		</div>
		# 	</div>
		#
		# The outermost tags must start at the left margin for this to match, and
		# the inner nested divs must be indented.
		# We need to do this before the next, more liberal match, because the next
		# match will start at the first `<div>` and stop at the first `</div>`.
		#
		# NOTE(review): in the assembled pattern below, `(hr)` and `([?%])`
		# are the fourth and fifth capturing groups, although their inline
		# comments label them $2. The processing-instruction alternative
		# closes with the back-reference `\2`, which points at the "b" tag
		# group rather than at `([?%])` -- confirm whether `\5` was intended.
		$text = preg_replace_callback('{(?>
				(?>
					(?<=\n\n)		# Starting after a blank line
					|				# or
					\A\n?			# the beginning of the doc
				)
				(						# save in $1

				  # Match from `\n<tag>` to `</tag>\n`, handling nested tags 
				  # in between.
					
							[ ]{0,' . $less_than_tab . '}
							<(' . $block_tags_b_re . ')# start tag = $2
							' . $attr . '>			# attributes followed by > and \n
							' . $content . '		# content, support nesting
							</\2>				# the matching end tag
							[ ]*				# trailing spaces/tabs
							(?=\n+|\Z)	# followed by a newline or end of document

				| # Special version for tags of group a.

							[ ]{0,' . $less_than_tab . '}
							<(' . $block_tags_a_re . ')# start tag = $3
							' . $attr . '>[ ]*\n	# attributes followed by >
							' . $content2 . '		# content, support nesting
							</\3>				# the matching end tag
							[ ]*				# trailing spaces/tabs
							(?=\n+|\Z)	# followed by a newline or end of document
					
				| # Special case just for <hr />. It was easier to make a special 
				  # case than to make the other regex more complicated.
			
							[ ]{0,' . $less_than_tab . '}
							<(hr)				# start tag = $2
							' . $attr . '			# attributes
							/?>					# the matching end tag
							[ ]*
							(?=\n{2,}|\Z)		# followed by a blank line or end of document
			
				| # Special case for standalone HTML comments:
			
						[ ]{0,' . $less_than_tab . '}
						(?s:
							<!-- .*? -->
						)
						[ ]*
						(?=\n{2,}|\Z)		# followed by a blank line or end of document
			
				| # PHP and ASP-style processor instructions (<? and <%)
			
						[ ]{0,' . $less_than_tab . '}
						(?s:
							<([?%])			# $2
							.*?
							\2>
						)
						[ ]*
						(?=\n{2,}|\Z)		# followed by a blank line or end of document
					
				)
				)}Sxmi', array(
			&$this, 
			'_hashHTMLBlocks_callback'
		), $text);
		
		return $text;
	}

	function _hashHTMLBlocks_callback($matches)
	{
		#
		# Callback for hashHTMLBlocks(): swaps the matched HTML block
		# ($matches[1]) for its block-level hash token, set apart from the
		# surrounding text by blank lines.
		#
		return "\n\n" . $this->hashBlock($matches[1]) . "\n\n";
	}

	function hashPart($text, $boundary = 'X')
	{
		#
		# Stores $text in the hash table and returns the token that stands
		# in for it in the text stream until unhash() puts it back.
		#
		# Params:
		#   $text     - markup to protect from further processing.
		#   $boundary - character placed on both sides of the token. By
		#               convention "B" marks block elements that must not be
		#               wrapped in paragraph tags, ":" marks elements acting
		#               as word separators and "X" is the general case.
		# Returns the token, of the form <boundary>\x1A<number><boundary>.
		#
		# The counter is shared by every call, so tokens never collide.
		static $serial = 0;
		
		# Expand tokens already present in $text first, so that a single
		# unhash() pass is enough at the end.
		$expanded = $this->unhash($text);
		
		$serial += 1;
		$token = $boundary . "\x1A" . $serial . $boundary;
		$this->html_hashes[$token] = $expanded;
		
		return $token;
	}

	function hashBlock($text)
	{
		#
		# Convenience wrapper around hashPart() for block-level elements:
		# hashes $text with the "B" boundary and returns the token.
		#
		$block_boundary = 'B';
		
		return $this->hashPart($text, $block_boundary);
	}
	
	# Block-level transformations run by runBasicBlockGamut(), mapped to
	# their priority (lower values run first).
	public $block_gamut = array(
		"doCodeBlocks" => 5, 
		"doHeaders" => 10, 
		"doLists" => 40, 
		"doBlockQuotes" => 60
	);

	function runBlockGamut($text)
	{
		#
		# Runs the block gamut on a fragment of the document.
		#
		# Raw HTML is hashed again first. This has to happen for each
		# fragment and not only once for the whole document: HTML blocks
		# inside list items may have been indented, in which case the
		# initial pass of hashHTMLBlocks did not match them.
		#
		return $this->runBasicBlockGamut($this->hashHTMLBlocks($text));
	}

	function runBasicBlockGamut($text)
	{
		#
		# Runs the block gamut without hashing HTML blocks first. Used when
		# the HTML blocks are known to be hashed already, as in the first
		# whole-document pass.
		#
		foreach (array_keys($this->block_gamut) as $transformation)
		{
			$text = $this->$transformation($text);
		}
		
		# Last step: split into paragraphs and restore hashed blocks.
		return $this->formParagraphs($text);
	}
	
	# Span-level transformations, i.e. those applied *within* block-level
	# elements such as paragraphs, headers and list items. Each method is
	# mapped to its priority (lower values run first).
	public $span_gamut = array(
		# Tokenize escapes, code span markers and inline HTML in one pass.
		"parseSpan" => -30, 
		
		# Turn image shortcuts into <img> tags.
		"doImages" => 10, 
		
		# Encode ampersands and "<" characters left in the text.
		"encodeAmpsAndAngles" => 40, 
		
		"doItalicsAndBold" => 50, 
		"doHardBreaks" => 60
	);

	function runSpanGamut($text)
	{
		#
		# Applies every span gamut method to $text, in the order stored in
		# $this->span_gamut, and returns the result.
		#
		foreach (array_keys($this->span_gamut) as $transformation)
		{
			$text = $this->$transformation($text);
		}
		
		return $text;
	}

	function doHardBreaks($text)
	{
		#
		# Replaces every run of two or more spaces followed by a newline
		# with the (hashed) hard line break produced by the callback.
		#
		$callback = array($this, '_doHardBreaks_callback');
		
		return preg_replace_callback('/ {2,}\n/', $callback, $text);
	}

	function _doHardBreaks_callback($matches)
	{
		#
		# Callback for doHardBreaks(): returns the token of a hashed <br>
		# element followed by a newline. $matches is not used.
		#
		$break = "<br" . $this->empty_element_suffix . "\n";
		
		return $this->hashPart($break);
	}

	function doImages($text)
	{
		#
		# Turn Markdown image shortcuts into <img> tags.
		#
		# Params:
		#   $text - span text to scan.
		# Returns the text with the images the callbacks could resolve
		# replaced by hash tokens.
		#
		#
		# First, handle reference-style labeled images: ![alt text][id]
		#
		$text = preg_replace_callback('{
				(				# wrap whole match in $1
				  !\[
					(' . $this->nested_brackets_re . ')		# alt text = $2
				  \]

				  [ ]?				# one optional space
				  (?:\n[ ]*)?		# one optional newline followed by spaces

				  \[
					(.*?)		# id = $3
				  \]

				)
				}xs', array(
			&$this, 
			'_doImages_reference_callback'
		), $text);
		
		#
		# Next, handle inline images:  ![alt text](url "optional title")
		# Don't forget: encode * and _
		#
		$text = preg_replace_callback('{
				(				# wrap whole match in $1
				  !\[
					(' . $this->nested_brackets_re . ')		# alt text = $2
				  \]
				  \s?			# One optional whitespace character
				  \(			# literal paren
					[ \n]*
					(?:
						<(\S*)>	# src url = $3
					|
						(' . $this->nested_url_parenthesis_re . ')	# src url = $4
					)
					[ \n]*
					(			# $5
					  ([\'"])	# quote char = $6
					  (.*?)		# title = $7
					  \6		# matching quote
					  [ \n]*
					)?			# title is optional
				  \)
				)
				}xs', array(
			&$this, 
			'_doImages_inline_callback'
		), $text);
		
		return $text;
	}

	function _doImages_reference_callback($matches)
	{
		#
		# Callback for reference-style images: ![alt text][id].
		#
		# $matches[1] is the whole match, $matches[2] the alt text and
		# $matches[3] the link id. Returns the token of the hashed <img>
		# tag, or the whole match untouched when the id is unknown.
		#
		$id = strtolower($matches[3]);
		if ($id == "")
		{
			# Shortcut form ![this][]: the alt text doubles as the id.
			$id = strtolower($matches[2]);
		}
		
		if (! isset($this->urls[$id]))
		{
			return $matches[1];
		}
		
		$alt = $this->encodeAttribute($matches[2]);
		$src = $this->encodeAttribute($this->urls[$id]);
		$tag = "<img src=\"$src\" alt=\"$alt\"";
		
		if (isset($this->titles[$id]))
		{
			$tag .= " title=\"" . $this->encodeAttribute($this->titles[$id]) . "\"";
		}
		
		return $this->hashPart($tag . $this->empty_element_suffix);
	}

	function _doImages_inline_callback($matches)
	{
		#
		# Callback for inline images: ![alt text](url "optional title").
		#
		# $matches[2] is the alt text, $matches[3] the URL written between
		# angle brackets, $matches[4] the bare URL and $matches[7] the
		# optional title. Returns the token of the hashed <img> tag.
		#
		$src = $matches[3] != '' ? $matches[3] : $matches[4];
		
		$alt = $this->encodeAttribute($matches[2]);
		$src = $this->encodeAttribute($src);
		$tag = "<img src=\"$src\" alt=\"$alt\"";
		
		if (isset($matches[7]))
		{
			$tag .= " title=\"" . $this->encodeAttribute($matches[7]) . "\"";
		}
		
		return $this->hashPart($tag . $this->empty_element_suffix);
	}

	function doHeaders($text)
	{
		#
		# Converts header lines that start with two or three "#" characters
		# into hashed header blocks (see _doHeaders_callback_atx).
		#
		# Params:
		#   $text - block text to scan.
		# Returns the text with each matched header line, including its
		# newline, replaced by a hash token.
		#
		$text = preg_replace_callback('{
					^(\#{2,3})	# $1 = string of #\'s
					[ ]*
					([^\n]+?)		# $2 = Header text
					[ ]*
					\#*			# optional closing #\'s (not counted)
					[\n]
				}xm', array(
			&$this, 
			'_doHeaders_callback_atx'
		), $text);
		
		return $text;
	}

	function _doHeaders_callback_atx($matches)
	{
		#
		# Callback for doHeaders(): builds the header element. The level is
		# the number of "#" characters in $matches[1]; the header text in
		# $matches[2] goes through the span gamut. Returns the token of the
		# hashed block.
		#
		$tag = "h" . strlen($matches[1]);
		$content = $this->runSpanGamut($matches[2]);
		
		return $this->hashBlock("<$tag>" . $content . "</$tag>");
	}

	function doLists($text)
	{
		#
		# Form HTML ordered (numbered) and unordered (bulleted) lists.
		#
		# Params:
		#   $text - block text to scan.
		# Returns the text with each matched list replaced by the hash
		# token produced by _doLists_callback.
		#
		$less_than_tab = $this->tab_width - 1;
		
		# Re-usable patterns to match list item bullets and number markers.
		# Only "-" is accepted as a bullet here.
		$marker_ul_re = '[-]';
		$marker_ol_re = '\d+[\.]';
		# (Not referenced again in this method.)
		$marker_any_re = "(?:$marker_ul_re|$marker_ol_re)";
		
		# Each marker kind is paired with the other kind, which ends the
		# list when it shows up at the same indentation.
		$markers_relist = array(
			$marker_ul_re => $marker_ol_re, 
			$marker_ol_re => $marker_ul_re
		);
		
		foreach ($markers_relist as $marker_re => $other_marker_re)
		{
			# Re-usable pattern to match any entire ul or ol list:
			$whole_list_re = '
					(								# $1 = whole list
					  (								# $2
						([ ]{0,' . $less_than_tab . '})	# $3 = number of spaces
						(' . $marker_re . ')			# $4 = first list item marker
						[ ]+
					  )
					  (?s:.+?)
					  (								# $5
						  \z
						|
						  \n{2,}
						  (?=\S)
						  (?!						# Negative lookahead for another list item marker
							[ ]*
							' . $marker_re . '[ ]+
						  )
						|
						  (?=						# Lookahead for another kind of list
						    \n
							\3						# Must have the same indentation
							' . $other_marker_re . '[ ]+
						  )
					  )
					)
				'; // mx
			

			# We use a different prefix before nested lists than top-level lists.
			# See the extended comment in processListItems().
			

			if ($this->list_level)
			{
				# Inside a list: a sub-list may start on any line.
				$text = preg_replace_callback('{
							^
							' . $whole_list_re . '
						}mx', array(
					&$this, 
					'_doLists_callback'
				), $text);
			}
			else
			{
				# Top level: a list must follow a blank line or start the text.
				$text = preg_replace_callback('{
							(?:(?<=\n)\n|\A\n?) # Must eat the newline
							' . $whole_list_re . '
						}mx', array(
					&$this, 
					'_doLists_callback'
				), $text);
			}
		}
		
		return $text;
	}

	function _doLists_callback($matches)
	{
		#
		# Callback for doLists(): converts one matched list into a hashed
		# <ul> or <ol> block.
		#
		# Params:
		#   $matches - $matches[1] is the whole list text and $matches[4]
		#              the marker of its first item.
		# Returns the hash token standing in for the rendered list.
		#
		# Keep these patterns in step with doLists(), which only accepts
		# "-" as a bullet. Splitting items on "*" or "+" here would turn
		# lines that doLists() treats as plain text into list items.
		$marker_ul_re = '[-]';
		$marker_ol_re = '\d+[\.]';
		
		$list_type = preg_match("/$marker_ul_re/", $matches[4]) ? "ul" : "ol";
		
		# Items are split on the marker kind of the list only.
		$marker_re = ($list_type == "ul" ? $marker_ul_re : $marker_ol_re);
		
		$result = $this->processListItems($matches[1] . "\n", $marker_re);
		
		return $this->hashBlock("<$list_type>" . $result . "</$list_type>");
	}
	
	# Nesting depth of the list currently being processed (0 = not in a list).
	public $list_level = 0;

	function processListItems($list_str, $marker_any_re)
	{
		#
		#	Process the contents of a single ordered or unordered list, splitting it
		#	into individual list items.
		#
		#	Params:
		#		$list_str      - text of the whole list.
		#		$marker_any_re - regex fragment matching an item marker.
		#	Returns the list text with each item replaced by the <li> markup
		#	built by _processListItems_callback.
		#
		# The $this->list_level property keeps track of when we're inside a list.
		# Each time we enter a list, we increment it; when we leave a list,
		# we decrement. If it's zero, we're not in a list anymore.
		#
		# We do this because when we're not inside a list, we want to treat
		# something like this:
		#
		#		I recommend upgrading to version
		#		8. Oops, now this line is treated
		#		as a sub-list.
		#
		# As a single paragraph, despite the fact that the second line starts
		# with a digit-period-space sequence.
		#
		# Whereas when we're inside a list (or sub-list), that line will be
		# treated as the start of a sub-list. What a kludge, huh? This is
		# an aspect of Markdown's syntax that's hard to parse perfectly
		# without resorting to mind-reading. Perhaps the solution is to
		# change the syntax rules such that sub-lists must start with a
		# starting cardinal number; e.g. "1." or "a.".
		

		$this->list_level ++;
		
		# trim trailing blank lines:
		$list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);
		
		# The final lookahead requires the next item to carry the same
		# leading whitespace (\2) as the current one.
		$list_str = preg_replace_callback('{
				(\n)?							# leading line = $1
				(^[ ]*)							# leading whitespace = $2
				(' . $marker_any_re . '				# list marker and space = $3
					(?:[ ]+|(?=\n))	# space only required if item is not empty
				)
				((?s:.*?))						# list item text   = $4
				(?:(\n+(?=\n))|\n)				# tailing blank line = $5
				(?= \n* (\z | \2 (' . $marker_any_re . ') (?:[ ]+|(?=\n))))
				}xm', array(
			&$this, 
			'_processListItems_callback'
		), $list_str);
		
		$this->list_level --;
		return $list_str;
	}

	function _processListItems_callback($matches)
	{
		#
		# Callback for processListItems(): renders one list item.
		#
		# $matches[1] is the blank line before the item (if any),
		# $matches[2] its leading whitespace, $matches[3] the marker with
		# its trailing space, $matches[4] the item text and $matches[5] the
		# blank line after the item (if any). Returns the <li> element.
		#
		$body = $matches[4];
		
		$spaced_out = ! empty($matches[1])
			|| ! empty($matches[5])
			|| preg_match('/\n{2,}/', $body);
		
		if (! $spaced_out)
		{
			# Tight item: look for sub-lists, then treat the rest as a span.
			$body = $this->doLists($this->outdent($body));
			$body = preg_replace('/\n+$/', '', $body);
			
			return "<li>" . $this->runSpanGamut($body) . "</li>";
		}
		
		# Loose item: put whitespace of the marker's width in place of the
		# marker, then process the item as block-level content.
		$indent = isset($matches[2]) ? $matches[2] : '';
		$body = $indent . str_repeat(' ', strlen($matches[3])) . $body;
		$body = $this->runBlockGamut($this->outdent($body) . "\n");
		
		return "<li>" . $body . "</li>";
	}

	function doCodeBlocks($text)
	{
		#
		#	Process code blocks written as {{{ ... }}}.
		#
		#	The conversion is skipped when only one of the two delimiters
		#	occurs in the text. This checks presence only; the number of
		#	occurrences is not compared.
		#
		$has_opening = strpos($text, '{{{') !== false;
		$has_closing = strpos($text, '}}}') !== false;
		
		if ($has_opening != $has_closing)
		{
			return $text;
		}
		
		$callback = array($this, 'code_block_callback');
		
		return preg_replace_callback('/\{\{\{[ \n]*(.*?)\}\}\}/is', $callback, $text);
	}

	function code_block_callback($matches)
	{
		#
		# Callback for doCodeBlocks(): renders the content of one code
		# block ($matches[1]) as a <code> element.
		#
		# Tabs become four non-breaking spaces and spaces become one, so
		# the indentation survives in HTML.
		$spaced = strtr($matches[1], array(
			"\t" => "&nbsp;&nbsp;&nbsp;&nbsp;", 
			" " => "&nbsp;"
		));
		
		# Markdown punctuation is swapped for MD5 digests so that later
		# passes leave the code alone; md5Hash_decode() restores it.
		return '<code>' . $this->md5Hash($spaced) . '</code>';
	}

	function get_hash_table()
	{
		#
		# Returns the substitution table used by md5Hash() and
		# md5Hash_decode(): an array holding the list of protected
		# characters and, at the same positions, the list of their MD5
		# digests.
		#
		$characters = array('\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '.', '!', ':', '<', '>');
		
		return array($characters, array_map('md5', $characters));
	}

	function md5Hash($text)
	{
		#
		# Replaces every character listed by get_hash_table() with its MD5
		# digest and returns the result.
		#
		list($characters, $digests) = $this->get_hash_table();
		
		return str_replace($characters, $digests, $text);
	}

	function md5Hash_decode($text)
	{
		#
		# Reverses md5Hash(): replaces every MD5 digest listed by
		# get_hash_table() with the character it stands for.
		#
		list($characters, $digests) = $this->get_hash_table();
		
		return str_replace($digests, $characters, $text);
	}

	function makeCodeSpan($code)
	{
		#
		# Builds the markup of a code span: $code is trimmed, its special
		# characters are encoded (quotes are left alone) and the resulting
		# <code> element is hashed. Returns the hash token.
		#
		$escaped = htmlspecialchars(trim($code), ENT_NOQUOTES);
		
		return $this->hashPart("<code>" . $escaped . "</code>");
	}
	
	# Emphasis token patterns. In each table the '' entry matches an
	# opening marker; the other entries match the closing marker for the
	# opening marker used as key.
	public $em_relist = array(
		'' => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?=\S|$)(?![\.,:;]\s)', 
		'*' => '(?<=\S|^)(?<!\*)\*(?!\*)', 
		'_' => '(?<=\S|^)(?<!_)_(?!_)'
	);
	public $strong_relist = array(
		'' => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?=\S|$)(?![\.,:;]\s)', 
		'**' => '(?<=\S|^)(?<!\*)\*\*(?!\*)', 
		'__' => '(?<=\S|^)(?<!_)__(?!_)'
	);
	public $em_strong_relist = array(
		'' => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?=\S|$)(?![\.,:;]\s)', 
		'***' => '(?<=\S|^)(?<!\*)\*\*\*(?!\*)', 
		'___' => '(?<=\S|^)(?<!_)___(?!_)'
	);
	# Filled by prepareItalicsAndBold(): one combined regex per context.
	public $em_strong_prepared_relist;

	function prepareItalicsAndBold()
	{
		#
		# Builds, for every combination of open emphasis and open strong
		# marker, the regular expression that finds the next emphasis token
		# in that context. The expressions are stored in
		# $this->em_strong_prepared_relist under the key "$em$strong".
		#
		foreach ($this->em_relist as $em_marker => $em_pattern)
		{
			foreach ($this->strong_relist as $strong_marker => $strong_pattern)
			{
				$context = "$em_marker$strong_marker";
				
				# Alternatives in matching order; the three-character
				# token, when one exists for this context, comes first.
				$alternatives = array($em_pattern, $strong_pattern);
				if (isset($this->em_strong_relist[$context]))
				{
					array_unshift($alternatives, $this->em_strong_relist[$context]);
				}
				
				$this->em_strong_prepared_relist[$context] = '{(' . implode('|', $alternatives) . ')}';
			}
		}
	}

	function doItalicsAndBold($text)
	{
		#
		# Converts emphasis markers into <em> and <strong> elements.
		#
		# Params:
		#   $text - span text to process.
		# Returns the text with every completed emphasis replaced by a hash
		# token; markers left open at the end of the text are put back
		# verbatim.
		#
		# The text is consumed token by token. $token_stack holds the
		# markers currently open and $text_stack the text gathered at each
		# level; index 0 is the innermost level in both. $em and $strong
		# hold the open marker of each kind ('' when none) and select the
		# regular expression prepared by prepareItalicsAndBold().
		#
		# String offsets use square brackets: the curly-brace form
		# ($token{0}) is a parse error since PHP 8.0.
		#
		$token_stack = array(
			''
		);
		$text_stack = array(
			''
		);
		$em = '';
		$strong = '';
		$tree_char_em = false;
		
		while (1)
		{
			#
			# Get prepared regular expression for searching emphasis tokens
			# in current context.
			#
			$token_re = $this->em_strong_prepared_relist["$em$strong"];
			
			#
			# Each loop iteration searches for the next emphasis token and
			# handles it according to its length and the current context.
			#
			$parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
			$text_stack[0] .= $parts[0];
			# References are used so that missing elements (no token found)
			# read as null instead of raising a notice.
			$token = & $parts[1];
			$text = & $parts[2];
			
			if (empty($token))
			{
				# Reached end of text span: empty stack without emitting
				# any more emphasis.
				while ($token_stack[0])
				{
					$text_stack[1] .= array_shift($token_stack);
					$text_stack[0] .= array_shift($text_stack);
				}
				break;
			}
			
			$token_len = strlen($token);
			if ($tree_char_em)
			{
				# Reached closing marker while inside a three-char emphasis.
				if ($token_len == 3)
				{
					# Three-char closing marker, close em and strong.
					array_shift($token_stack);
					$span = array_shift($text_stack);
					$span = $this->runSpanGamut($span);
					$span = "<strong><em>$span</em></strong>";
					$text_stack[0] .= $this->hashPart($span);
					$em = '';
					$strong = '';
				}
				else
				{
					# Other closing marker: close one em or strong and
					# change current token state to match the other
					$token_stack[0] = str_repeat($token[0], 3 - $token_len);
					$tag = $token_len == 2 ? "strong" : "em";
					$span = $text_stack[0];
					$span = $this->runSpanGamut($span);
					$span = "<$tag>$span</$tag>";
					$text_stack[0] = $this->hashPart($span);
					$$tag = ''; # $$tag stands for $em or $strong
				}
				$tree_char_em = false;
			}
			else if ($token_len == 3)
			{
				if ($em)
				{
					# Reached closing marker for both em and strong.
					# Closing strong marker:
					for ($i = 0; $i < 2; ++ $i)
					{
						$shifted_token = array_shift($token_stack);
						$tag = strlen($shifted_token) == 2 ? "strong" : "em";
						$span = array_shift($text_stack);
						$span = $this->runSpanGamut($span);
						$span = "<$tag>$span</$tag>";
						$text_stack[0] .= $this->hashPart($span);
						$$tag = ''; # $$tag stands for $em or $strong
					}
				}
				else
				{
					# Reached opening three-char emphasis marker. Push on token 
					# stack; will be handled by the special condition above.
					$em = $token[0];
					$strong = "$em$em";
					array_unshift($token_stack, $token);
					array_unshift($text_stack, '');
					$tree_char_em = true;
				}
			}
			else if ($token_len == 2)
			{
				if ($token == '__') // remove __ support
				{
					# "__" is kept as literal text.
					$text_stack[0] .= $token;
				}
				else if ($strong)
				{
					# Unwind any dangling emphasis marker:
					if (strlen($token_stack[0]) == 1)
					{
						$text_stack[1] .= array_shift($token_stack);
						$text_stack[0] .= array_shift($text_stack);
					}
					# Closing strong marker:
					array_shift($token_stack);
					$span = array_shift($text_stack);
					$span = $this->runSpanGamut($span);
					$span = "<strong>$span</strong>";
					$text_stack[0] .= $this->hashPart($span);
					$strong = '';
				}
				else
				{
					# Opening strong marker.
					array_unshift($token_stack, $token);
					array_unshift($text_stack, '');
					$strong = $token;
				}
			}
			else
			{
				# Here $token_len == 1
				if ($em)
				{
					if (strlen($token_stack[0]) == 1 && ($token != '_'))
					{ // remove _ support
						# Closing emphasis marker:
						array_shift($token_stack);
						$span = array_shift($text_stack);
						$span = $this->runSpanGamut($span);
						$span = "<em>$span</em>";
						$text_stack[0] .= $this->hashPart($span);
						$em = '';
					}
					else
					{
						$text_stack[0] .= $token;
					}
				}
				else
				{
					# Opening emphasis marker.
					array_unshift($token_stack, $token);
					array_unshift($text_stack, '');
					$em = $token;
				}
			}
		}
		return $text_stack[0];
	}

	function doBlockQuotes($text)
	{
		#
		# Converts runs of lines introduced by ">" into hashed <blockquote>
		# blocks (see _doBlockQuotes_callback).
		#
		# Params:
		#   $text - block text to scan.
		# Returns the text with each matched quote replaced by a hash token.
		#
		$text = preg_replace_callback('/
				  (								# Wrap whole match in $1
					(?>
					  ^[ ]*>[ ]?			# ">" at the start of a line
						.+\n					# rest of the first line
					  (.+\n)*					# subsequent consecutive lines
					  \n*						# blanks
					)+
				  )
				/xm', array(
			&$this, 
			'_doBlockQuotes_callback'
		), $text);
		
		return $text;
	}

	function _doBlockQuotes_callback($matches)
	{
		#
		# Callback for doBlockQuotes(): renders one block quote
		# ($matches[1]) and returns the token of the hashed block.
		#
		# Remove one level of ">" quoting and empty the whitespace-only
		# lines, then process the quoted text as block-level content.
		$quoted = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $matches[1]);
		$quoted = $this->runBlockGamut($quoted);
		
		# Indent the result by two spaces. That indentation would show
		# inside <pre> elements, so it is taken out of them again.
		$quoted = preg_replace('/^/m', "  ", $quoted);
		$quoted = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx', array(
			$this, 
			'_doBlockQuotes_callback2'
		), $quoted);
		
		# Protect Markdown punctuation from later passes;
		# md5Hash_decode() restores it.
		$quoted = $this->md5Hash($quoted);
		
		return $this->hashBlock("<blockquote>" . $quoted . "</blockquote>\n");
	}

	function _doBlockQuotes_callback2($matches)
	{
		#
		# Callback for _doBlockQuotes_callback(): removes the two-space
		# indentation from every line of a <pre> element ($matches[1]).
		#
		return preg_replace('/^  /m', '', $matches[1]);
	}

	function formParagraphs($text)
	{
		#
		#	Splits $text into chunks separated by blank lines and renders
		#	each chunk.
		#
		#	Params:
		#		$text - block text to process.
		#	Returns the rendered chunks joined by blank lines. Chunks are
		#	not wrapped in <p> tags here.
		#
		# Leading and trailing newlines are dropped first.
		$text = preg_replace('/\A\n+|\n+\z/', '', $text);
		
		$chunks = preg_split('/\n{2,}/', $text, - 1, PREG_SPLIT_NO_EMPTY);
		
		foreach ($chunks as $index => $chunk)
		{
			if (preg_match('/^B\x1A[0-9]+B$/', $chunk))
			{
				# The chunk is the token of a hashed block: put the
				# block back as it is.
				$chunks[$index] = $this->html_hashes[$chunk];
				continue;
			}
			
			# Ordinary text: run the span gamut, then restore the hashed
			# parts.
			$chunks[$index] = $this->unhash($this->runSpanGamut($chunk));
		}
		
		return implode("\n\n", $chunks);
	}

	function encodeAttribute($text)
	{
		#
		# Encodes $text for use inside a double-quoted HTML attribute:
		# ampersands and "<" are encoded first, then double quotes.
		# The result is *not* safe for attributes in single quotes.
		#
		return str_replace('"', '&quot;', $this->encodeAmpsAndAngles($text));
	}

	function encodeAmpsAndAngles($text)
	{
		#
		# Encodes the ampersands and "<" characters of $text. Ampersands
		# that already start a character entity are left alone, unless
		# the no-entities mode is set, in which case all are encoded.
		#
		if (! $this->no_entities)
		{
			# Ampersand-encoding based entirely on Nat Irons's Amputator
			# MT plugin: <http://bumppo.net/projects/amputator/>
			$text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/', '&amp;', $text);
		}
		else
		{
			$text = str_replace('&', '&amp;', $text);
		}
		
		# Then encode every "<".
		return str_replace('<', '&lt;', $text);
	}

	function parseSpan($str)
	{
		#
		# Take the string $str and parse it into tokens: embedded HTML,
		# backslash-escaped characters and code span markers. Each token is
		# handed to handleSpanToken and replaced by what it returns.
		#
		# Params:
		#   $str - span text to tokenize.
		# Returns the text with every token replaced.
		#
		$output = '';
		
		# The tag alternatives are left out when $this->no_markup is set.
		$span_re = '{
					(
						\\\\' . $this->escape_chars_re . '
					|
						(?<![`\\\\])
						`+						# code span marker
				' . ($this->no_markup ? '' : '
					|
						<!--    .*?     -->		# comment
					|
						<\?.*?\?> | <%.*?%>		# processing instruction
					|
						<[/!$]?[-a-zA-Z0-9:_]+	# regular tags
						(?>
							\s
							(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
						)?
						>
				') . '
					)
					}xs';
		
		while (1)
		{
			#
			# Each loop iteration searches for either the next tag, the next 
			# opening code span marker, or the next escaped character. 
			# Each token is then passed to handleSpanToken.
			#
			$parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);
			
			# Copy the text preceding the token to the output.
			if ($parts[0] != "")
			{
				$output .= $parts[0];
			}
			
			# Check if we reach the end.
			if (isset($parts[1]))
			{
				# $parts[1] is the token, $parts[2] the text that follows it.
				$output .= $this->handleSpanToken($parts[1], $parts[2]);
				$str = $parts[2];
			}
			else
			{
				break;
			}
		}
		
		return $output;
	}

	function handleSpanToken($token, &$str)
	{
		#
		# Handle $token provided by parseSpan by determining its nature and 
		# returning the corresponding value that should replace it.
		#
		# Params:
		#   $token - the token found: a backslash escape, a run of
		#            backticks, or an HTML tag, comment or processing
		#            instruction.
		#   $str   - text following the token (by reference; not used).
		# Returns the replacement for the token.
		#
		# The first character is read with square brackets: the curly-brace
		# offset syntax ($token{0}) is a parse error since PHP 8.0.
		switch ($token[0])
		{
			case "\\" :
				# Backslash escapes are passed through as text. The
				# statement that hashed them as numeric entities could
				# never run, as it followed this return, and was removed.
				return $token;
			case "`" :
				return $token; // return as text since no ending marker found.
			default :
				# Markup: hash it so later passes leave it alone.
				return $this->hashPart($token);
		}
	}

	function outdent($text)
	{
		#
		# Removes one level of indentation from every line of $text: one
		# leading tab, or up to $this->tab_width leading spaces.
		#
		$indent_re = '/^(\t|[ ]{1,' . $this->tab_width . '})/m';
		
		return preg_replace($indent_re, '', $text);
	}
	
	# Callable measuring the length of a string for detab. _initDetab()
	# swaps in a replacement to handle UTF-8 when the function named here
	# does not exist.
	public $utf8_strlen = 'mb_strlen';

	function detab($text)
	{
		#
		# Replaces tabs with the number of spaces needed to reach the next
		# tab stop.
		#
		# Only lines that contain a tab are touched; each of them is
		# rebuilt by _detab_callback.
		#
		$callback = array($this, '_detab_callback');
		
		return preg_replace_callback('/^.*\t.*$/m', $callback, $text);
	}

	function _detab_callback($matches)
	{
		#
		# Callback for detab(): expands the tabs of one line ($matches[0])
		# and returns the line.
		#
		$measure = $this->utf8_strlen; # length function for UTF-8 text
		
		# The line is cut at its tabs; the first piece starts the rebuilt
		# line and every other piece is appended after enough spaces to
		# reach the next tab stop.
		$pieces = explode("\t", $matches[0]);
		$rebuilt = array_shift($pieces);
		
		foreach ($pieces as $piece)
		{
			$padding = $this->tab_width - $measure($rebuilt, 'UTF-8') % $this->tab_width;
			$rebuilt .= str_repeat(" ", $padding) . $piece;
		}
		
		return $rebuilt;
	}

	function _initDetab()
	{
		#
		# Check for the availability of the function in the `utf8_strlen` property
		# (initially `mb_strlen`). If the function is not available, install a 
		# replacement that loosely counts the number of UTF-8 characters with a
		# regular expression.
		#
		if (function_exists($this->utf8_strlen))
			return;
		
		# A closure is used because create_function() was removed in
		# PHP 8.0. The pattern counts one match for each byte below 0xC0
		# and one for each lead byte together with the continuation bytes
		# that follow it. _detab_callback passes a second (encoding)
		# argument, which the closure ignores.
		$this->utf8_strlen = function ($text)
		{
			return preg_match_all('/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/', $text, $m);
		};
	}

	function unhash($text)
	{
		#
		# Puts back the text behind every hash token found in $text. A
		# token is a boundary character, \x1A, digits and the same boundary
		# character again.
		#
		$callback = array($this, '_unhash_callback');
		
		return preg_replace_callback('/(.)\x1A[0-9]+\1/', $callback, $text);
	}

	function _unhash_callback($matches)
	{
		#
		# Callback for unhash(): looks up the text stored for the token
		# in $matches[0].
		#
		$token = $matches[0];
		
		return $this->html_hashes[$token];
	}
}
Alerts (7)

'var' Legacy var keyword detected; use public/private/protected for class properties
26 27 29 30 33 703
Complexity hotspot; line 1047 (total complexity: 4)
1047