PageRenderTime 52ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 1ms

/framework/3rdParty/Markdown/MarkdownParser.php

http://prado3.googlecode.com/
PHP | 1256 lines | 731 code | 148 blank | 377 comment | 27 complexity | 39dd97d6912576dc6d7cff606d31ffd1 MD5 | raw file
Possible License(s): Apache-2.0, IPL-1.0, LGPL-3.0, LGPL-2.1, BSD-3-Clause
  1. <?php
  2. #
  3. # Markdown - A text-to-HTML conversion tool for web writers
  4. #
  5. # Copyright (c) 2004-2005 John Gruber
  6. # <http://daringfireball.net/projects/markdown/>
  7. #
  8. # Copyright (c) 2004-2005 Michel Fortin - PHP Port
  9. # <http://www.michelf.com/projects/php-markdown/>
  10. #
  11. /**
  12. * PHP5 version of the markdown parser.
  13. * Usage:
  14. * <code>
  15. * $markdown = new MarkdownParser;
  16. * echo $markdown->parse($text);
  17. * </code>
  18. */
  19. class MarkdownParser
  20. {
  # --- Shared (static) tables, filled once by initialize() ---
  # Regex fragment for bracket pairs nested up to $md_nested_brackets_depth levels.
  # It stays null until initialize() runs, and the constructor uses that as its
  # "not yet initialized" test.
  21. private static $md_nested_brackets;
  # Maps each Markdown-special character to its md5() hash.
  22. private static $md_escape_table = array();
  # Same hashes, keyed by the backslash-escaped form of each character.
  23. private static $md_backslash_escape_table = array();
  # Repeat count used when building $md_nested_brackets.
  24. private static $md_nested_brackets_depth = 6;
  # --- Output / parsing options ---
  # Suffix appended to generated empty elements such as <hr> and <br>.
  25. protected $md_empty_element_suffix = " />"; # Change to ">" for HTML output
  # Tab width in spaces; also bounds the indentation allowed before block markers.
  26. protected $md_tab_width = 4;
  # --- Per-parse state ---
  # List nesting depth; incremented and decremented by _ProcessListItems().
  27. private $md_list_level = 0;
  # Link-definition URLs keyed by lower-cased link id.
  28. private $md_urls = array();
  # Link-definition titles keyed by lower-cased link id.
  29. private $md_titles = array();
  # Raw HTML blocks keyed by the md5 placeholder that replaced them in the text.
  30. private $md_html_blocks = array();
  /**
   * Creates a parser instance.
   *
   * The static lookup tables are shared by all instances, so they are built
   * only when the nested-bracket pattern has not been populated yet.
   */
  public function __construct()
  {
      if (self::$md_nested_brackets === null) {
          $this->initialize();
      }
  }
  /**
   * Builds the static tables shared by every parser instance:
   *  - the nested-bracket regex fragment,
   *  - the special-character => md5 hash table,
   *  - the backslash-escaped-character => md5 hash table.
   */
  private function initialize()
  {
      # Nested-bracket matcher: N opening fragments followed by N closers.
      $depth = self::$md_nested_brackets_depth;
      $openers = str_repeat('(?>[^\[\]]+|\[', $depth);
      $closers = str_repeat('\])*', $depth);
      self::$md_nested_brackets = $openers . $closers;

      # Hash every character that is special in Markdown. The backslash
      # comes first so escaped backslashes are always processed first.
      $hashes = array();
      foreach (str_split('\\`*_{}[]()>#+-.!') as $special) {
          $hashes[$special] = md5($special);
      }
      self::$md_escape_table = $hashes;

      # Identical hashes, but keyed by the backslash-escaped form.
      foreach ($hashes as $special => $hash) {
          self::$md_backslash_escape_table['\\' . $special] = $hash;
      }
  }
  /**
   * Converts Markdown source text to HTML.
   *
   * The order of the steps is essential: raw HTML blocks are hashed and
   * link definitions stripped before the block-level transformations run,
   * and the hidden special characters are restored only at the very end.
   *
   * @param string $text Markdown source
   * @return string generated HTML, terminated by a newline
   */
  public function parse($text)
  {
      # Reset per-document state so link definitions and hashed blocks from
      # a previously parsed text cannot leak into this one.
      $this->md_urls = $this->md_titles = $this->md_html_blocks = array();

      # Normalize DOS and Mac line endings to Unix, and make sure the text
      # ends with a couple of newlines.
      $source = str_replace(array("\r\n", "\r"), "\n", $text) . "\n\n";

      # Convert tabs to spaces (_Detab() is defined outside this excerpt),
      # then blank out lines that hold only spaces/tabs so that later
      # patterns can match consecutive blank lines as plain newlines.
      $source = preg_replace('/^[ \t]+$/m', '', $this->_Detab($source));

      # Hide block-level HTML, then pull out the link definitions.
      $source = $this->_StripLinkDefinitions($this->_HashHTMLBlocks($source));

      # Block-level transformations, then restore the hidden characters.
      $html = $this->_UnescapeSpecialChars($this->_RunBlockGamut($source));

      return $html . "\n";
  }
  /**
   * Removes link definitions of the form
   *     [id]: url "optional title"
   * from the text. Each definition is recorded through
   * _StripLinkDefinitions_callback() and replaced by an empty string.
   *
   * @param string $text
   * @return string the text without link definitions
   */
  private function _StripLinkDefinitions($text) {
      # A definition may be indented by less than one tab width.
      $max_indent = $this->md_tab_width - 1;

      $definition = '{
          ^[ ]{0,'.$max_indent.'}\[(.+)\]: # id = $1
          [ \t]*
          \n? # maybe *one* newline
          [ \t]*
          <?(\S+?)>? # url = $2
          [ \t]*
          \n? # maybe one newline
          [ \t]*
          (?:
          (?<=\s) # lookbehind for whitespace
          ["(]
          (.+?) # title = $3
          [")]
          [ \t]*
          )? # title is optional
          (?:\n+|\Z)
          }xm';

      return preg_replace_callback(
          $definition,
          array($this, '_StripLinkDefinitions_callback'),
          $text
      );
  }
  /**
   * Records one link definition matched by _StripLinkDefinitions().
   *
   * @param array $matches [1] = link id, [2] = URL, [3] = optional title
   * @return string always empty, so the definition vanishes from the text
   */
  private function _StripLinkDefinitions_callback($matches) {
      # Link ids are case-insensitive: store them lower-cased.
      $id = strtolower($matches[1]);

      $this->md_urls[$id] = $this->_EncodeAmpsAndAngles($matches[2]);

      if (isset($matches[3])) {
          # The title ends up inside a double-quoted attribute.
          $this->md_titles[$id] = str_replace('"', '&quot;', $matches[3]);
      }

      return '';
  }
  /**
   * Replaces block-level HTML found in the text by md5 placeholders.
   *
   * Only block-level tags (headers, lists, tables, ...) are hashed, because
   * "paragraphs" wrapped in inline tags such as anchors or spans must still
   * receive <p> wrappers. Four patterns are applied, in this order:
   *   1. blocks whose end tag starts a line (this also covers nested blocks,
   *      whose inner tags must be indented),
   *   2. a more liberal match from a start tag at the left margin up to the
   *      first matching end tag,
   *   3. a standalone <hr> tag,
   *   4. a standalone HTML comment.
   * Pattern 1 has to run before pattern 2, since the liberal match would
   * stop at the first closing tag of a nested block.
   *
   * @param string $text
   * @return string text with every matched block replaced by its placeholder
   */
  private function _HashHTMLBlocks($text) {
      $less_than_tab = $this->md_tab_width - 1;

      # Hard-coded lists of block-level tags. The first pattern additionally
      # accepts <ins> and <del>.
      $block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'.
          'script|noscript|form|fieldset|iframe|math';
      $block_tags_a = $block_tags_b . '|ins|del';

      $patterns = array();

      # 1. End tag must start a line.
      $patterns[] = "{
          ( # save in $1
          ^ # start of line (with /m)
          <($block_tags_a) # start tag = $2
          \\b # word break
          (.*\\n)*? # any number of lines, minimally matching
          </\\2> # the matching end tag
          [ \\t]* # trailing spaces/tabs
          (?=\\n+|\\Z) # followed by a newline or end of document
          )
          }xm";

      # 2. Liberal match: the end tag may appear anywhere on its line.
      $patterns[] = "{
          ( # save in $1
          ^ # start of line (with /m)
          <($block_tags_b) # start tag = $2
          \\b # word break
          (.*\\n)*? # any number of lines, minimally matching
          .*</\\2> # the matching end tag
          [ \\t]* # trailing spaces/tabs
          (?=\\n+|\\Z) # followed by a newline or end of document
          )
          }xm";

      # 3. Special case for <hr />, kept separate to keep the patterns
      #    above simple.
      $patterns[] = '{
          (?:
          (?<=\n\n) # Starting after a blank line
          | # or
          \A\n? # the beginning of the doc
          )
          ( # save in $1
          [ ]{0,'.$less_than_tab.'}
          <(hr) # start tag = $2
          \b # word break
          ([^<>])*? #
          /?> # the matching end tag
          [ \t]*
          (?=\n{2,}|\Z) # followed by a blank line or end of document
          )
          }x';

      # 4. Special case for standalone HTML comments.
      $patterns[] = '{
          (?:
          (?<=\n\n) # Starting after a blank line
          | # or
          \A\n? # the beginning of the doc
          )
          ( # save in $1
          [ ]{0,'.$less_than_tab.'}
          (?s:
          <!
          (--.*?--\s*)+
          >
          )
          [ \t]*
          (?=\n{2,}|\Z) # followed by a blank line or end of document
          )
          }x';

      $hasher = array($this, '_HashHTMLBlocks_callback');
      foreach ($patterns as $pattern) {
          $text = preg_replace_callback($pattern, $hasher, $text);
      }

      return $text;
  }
  /**
   * Stores one matched HTML block under its md5 hash.
   *
   * @param array $matches [1] = the complete HTML block
   * @return string the hash, surrounded by blank lines, which replaces the block
   */
  private function _HashHTMLBlocks_callback($matches) {
      $block = $matches[1];
      $hash = md5($block);

      $this->md_html_blocks[$hash] = $block;

      # The blank lines make the placeholder a paragraph of its own.
      return "\n\n" . $hash . "\n\n";
  }
  /**
   * Runs every transformation that produces block-level markup:
   * headers, horizontal rules, lists, code blocks, block quotes and,
   * last, paragraphs.
   *
   * @param string $text
   * @return string
   */
  private function _RunBlockGamut($text) {
      $text = $this->_DoHeaders($text);

      # Horizontal rules: three or more *, - or _ alone on a line.
      $rule_patterns = array(
          '{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}mx',
          '{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}mx',
          '{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}mx',
      );
      $rule_tag = "\n<hr{$this->md_empty_element_suffix}\n";
      $text = preg_replace($rule_patterns, $rule_tag, $text);

      $text = $this->_DoBlockQuotes(
          $this->_DoCodeBlocks(
              $this->_DoLists($text)
          )
      );

      # parse() already hashed the raw HTML found in the Markdown source.
      # This second pass hides the markup generated just above, so that
      # _FormParagraphs() does not wrap block-level tags in <p>.
      return $this->_FormParagraphs($this->_HashHTMLBlocks($text));
  }
  /**
   * Runs every transformation that happens *inside* block-level elements
   * such as paragraphs, headers and list items.
   *
   * @param string $text
   * @return string
   */
  private function _RunSpanGamut($text) {
      # The order matters:
      #  - images come before anchors, because ![foo][f] looks like an anchor;
      #  - auto-links come after anchors, because < and > may delimit the
      #    URL of an inline link such as [this](<url>).
      $steps = array(
          '_DoCodeSpans',
          '_EscapeSpecialChars',
          '_DoImages',
          '_DoAnchors',
          '_DoAutoLinks',
          '_EncodeAmpsAndAngles',
          '_DoItalicsAndBold',
      );
      foreach ($steps as $step) {
          $text = $this->$step($text);
      }

      # Hard breaks: two or more spaces at the end of a line.
      return preg_replace('/ {2,}\n/', "<br{$this->md_empty_element_suffix}\n", $text);
  }
  /**
   * Escapes special characters depending on whether they sit inside an
   * HTML tag or in regular text.
   *
   * The text is split by _TokenizeHTML() (not fully visible in this
   * excerpt; from its use here each token is an array whose element 0 is
   * the token type and element 1 the token text).
   *
   * @param string $text
   * @return string the text rebuilt from the processed tokens
   */
  private function _EscapeSpecialChars($text) {
      $rebuilt = '';

      foreach ($this->_TokenizeHTML($text) as $token) {
          if ($token[0] == 'tag') {
              # Inside tags, hide * and _ behind their md5 hashes so they
              # cannot be mistaken for emphasis markers later on. Using a
              # hash is likely overkill, but it rules out accidental
              # collisions with the replacement values.
              $rebuilt .= str_replace(
                  array('*', '_'),
                  array(self::$md_escape_table['*'], self::$md_escape_table['_']),
                  $token[1]
              );
              continue;
          }

          # Regular text: process backslash escapes.
          $rebuilt .= $this->_EncodeBackslashEscapes($token[1]);
      }

      return $rebuilt;
  }
  /**
   * Turns Markdown link shortcuts into <a> tags.
   *
   * Reference-style links ([link text] [id]) are processed first, then
   * inline links ([link text](url "optional title")).
   *
   * @param string $text
   * @return string
   */
  private function _DoAnchors($text) {
      $nested = self::$md_nested_brackets;

      # Reference-style links: [link text] [id]
      $reference = "{
          ( # wrap whole match in $1
          \\[
          ({$nested}) # link text = $2
          \\]
          [ ]? # one optional space
          (?:\\n[ ]*)? # one optional newline followed by spaces
          \\[
          (.*?) # id = $3
          \\]
          )
          }xs";

      # Inline-style links: [link text](url "optional title")
      $inline = "{
          ( # wrap whole match in $1
          \\[
          ({$nested}) # link text = $2
          \\]
          \\( # literal paren
          [ \\t]*
          <?(.*?)>? # href = $3
          [ \\t]*
          ( # $4
          (['\"]) # quote char = $5
          (.*?) # Title = $6
          \\5 # matching quote
          )? # title is optional
          \\)
          )
          }xs";

      $text = preg_replace_callback(
          $reference, array($this, '_DoAnchors_reference_callback'), $text);

      return preg_replace_callback(
          $inline, array($this, '_DoAnchors_inline_callback'), $text);
  }
  /**
   * Builds the <a> tag for one reference-style link.
   *
   * @param array $matches [1] = whole match, [2] = link text, [3] = link id
   * @return string the anchor tag, or the untouched match when the id has
   *                no recorded link definition
   */
  private function _DoAnchors_reference_callback($matches) {
      $link_text = $matches[2];

      $id = strtolower($matches[3]);
      if ($id === '') {
          # Shortcut links like [this][] use the link text as id.
          $id = strtolower($link_text);
      }

      if (!isset($this->md_urls[$id])) {
          return $matches[1];
      }

      # * and _ are hidden behind their hashes so the attribute values do
      # not conflict with italics/bold processing.
      $plain = array('*', '_');
      $hidden = array(self::$md_escape_table['*'], self::$md_escape_table['_']);

      $tag = '<a href="' . str_replace($plain, $hidden, $this->md_urls[$id]) . '"';
      if (isset($this->md_titles[$id])) {
          $tag .= ' title="' . str_replace($plain, $hidden, $this->md_titles[$id]) . '"';
      }

      return $tag . '>' . $link_text . '</a>';
  }
  /**
   * Builds the <a> tag for one inline link.
   *
   * @param array $matches [2] = link text, [3] = URL, [6] = optional title
   * @return string the anchor tag
   */
  private function _DoAnchors_inline_callback($matches) {
      # * and _ are hidden behind their hashes so the attribute values do
      # not conflict with italics/bold processing.
      $plain = array('*', '_');
      $hidden = array(self::$md_escape_table['*'], self::$md_escape_table['_']);

      $tag = '<a href="' . str_replace($plain, $hidden, $matches[3]) . '"';

      if (isset($matches[6])) {
          $title = str_replace('"', '&quot;', $matches[6]);
          $tag .= ' title="' . str_replace($plain, $hidden, $title) . '"';
      }

      return $tag . '>' . $matches[2] . '</a>';
  }
  /**
   * Turns Markdown image shortcuts into <img> tags.
   *
   * Reference-style images (![alt text][id]) are processed first, then
   * inline images (![alt text](url "optional title")).
   *
   * @param string $text
   * @return string
   */
  private function _DoImages($text) {
      $nested = self::$md_nested_brackets;

      # Reference-style labeled images: ![alt text][id]
      $reference = '{
          ( # wrap whole match in $1
          !\[
          ('.$nested.') # alt text = $2
          \]
          [ ]? # one optional space
          (?:\n[ ]*)? # one optional newline followed by spaces
          \[
          (.*?) # id = $3
          \]
          )
          }xs';

      # Inline images: ![alt text](url "optional title")
      $inline = '{
          ( # wrap whole match in $1
          !\[
          ('.$nested.') # alt text = $2
          \]
          \( # literal paren
          [ \t]*
          <?(\S+?)>? # src url = $3
          [ \t]*
          ( # $4
          ([\'"]) # quote char = $5
          (.*?) # title = $6
          \5 # matching quote
          [ \t]*
          )? # title is optional
          \)
          )
          }xs';

      $text = preg_replace_callback(
          $reference, array($this, '_DoImages_reference_callback'), $text);

      return preg_replace_callback(
          $inline, array($this, '_DoImages_inline_callback'), $text);
  }
  /**
   * Builds the <img> tag for one reference-style image.
   *
   * @param array $matches [1] = whole match, [2] = alt text, [3] = link id
   * @return string the image tag, or the untouched match when the id has
   *                no recorded link definition
   */
  private function _DoImages_reference_callback($matches) {
      $id = strtolower($matches[3]);
      if ($id === '') {
          # Shortcut form ![this][] uses the alt text as id.
          $id = strtolower($matches[2]);
      }

      if (!isset($this->md_urls[$id])) {
          # No such link id: leave the text intact.
          return $matches[1];
      }

      $alt = str_replace('"', '&quot;', $matches[2]);

      # * and _ are hidden behind their hashes so the attribute values do
      # not conflict with italics/bold processing.
      $plain = array('*', '_');
      $hidden = array(self::$md_escape_table['*'], self::$md_escape_table['_']);

      $src = str_replace($plain, $hidden, $this->md_urls[$id]);
      $tag = '<img src="' . $src . '" alt="' . $alt . '"';

      if (isset($this->md_titles[$id])) {
          $tag .= ' title="' . str_replace($plain, $hidden, $this->md_titles[$id]) . '"';
      }

      return $tag . $this->md_empty_element_suffix;
  }
  /**
   * Builds the <img> tag for one inline image.
   *
   * The title attribute is always written; it is empty (title="") when the
   * Markdown source gave no title.
   *
   * @param array $matches [2] = alt text, [3] = URL, [6] = optional title
   * @return string the image tag
   */
  private function _DoImages_inline_callback($matches) {
      $title = isset($matches[6]) ? $matches[6] : '';

      # Both values end up inside double-quoted attributes.
      $alt = str_replace('"', '&quot;', $matches[2]);
      $title = str_replace('"', '&quot;', $title);

      # * and _ are hidden behind their hashes so the attribute values do
      # not conflict with italics/bold processing.
      $plain = array('*', '_');
      $hidden = array(self::$md_escape_table['*'], self::$md_escape_table['_']);

      $src = str_replace($plain, $hidden, $matches[3]);
      $title = str_replace($plain, $hidden, $title);

      return '<img src="' . $src . '" alt="' . $alt . '"'
          . ' title="' . $title . '"'
          . $this->md_empty_element_suffix;
  }
  /**
   * Converts Setext-style and atx-style headers to <h1>..<h6> elements.
   *
   * Setext-style:
   *     Header 1
   *     ========
   *
   *     Header 2
   *     --------
   *
   * atx-style:
   *     # Header 1
   *     ## Header 2
   *     ## Header 2 with closing hashes ##
   *     ...
   *     ###### Header 6
   *
   * The replacements go through preg_replace_callback() rather than the
   * /e (eval) pattern modifier. /e was removed in PHP 7.0, where
   * preg_replace() warns and returns NULL for such patterns. Callbacks
   * receive the raw captured text, so no quote unslashing is needed.
   *
   * @param string $text
   * @return string
   */
  private function _DoHeaders($text) {
      # Setext-style headers. As before, all "=" underlined headers are
      # converted in a first pass and "-" underlined ones in a second pass.
      $text = preg_replace_callback(
          '{ ^(.+)[ \t]*\n=+[ \t]*\n+ }mx',
          array($this, '_DoHeaders_callback_setext_h1'), $text);
      $text = preg_replace_callback(
          '{ ^(.+)[ \t]*\n-+[ \t]*\n+ }mx',
          array($this, '_DoHeaders_callback_setext_h2'), $text);

      # atx-style headers.
      $text = preg_replace_callback("{
          ^(\\#{1,6}) # $1 = string of #'s
          [ \\t]*
          (.+?) # $2 = Header text
          [ \\t]*
          \\#* # optional closing #'s (not counted)
          \\n+
          }xm",
          array($this, '_DoHeaders_callback_atx'), $text);

      return $text;
  }

  /**
   * Renders one "=" underlined Setext header as <h1>.
   *
   * @param array $matches [1] = header text
   * @return string
   */
  private function _DoHeaders_callback_setext_h1($matches) {
      return '<h1>' . $this->_RunSpanGamut($matches[1]) . "</h1>\n\n";
  }

  /**
   * Renders one "-" underlined Setext header as <h2>.
   *
   * @param array $matches [1] = header text
   * @return string
   */
  private function _DoHeaders_callback_setext_h2($matches) {
      return '<h2>' . $this->_RunSpanGamut($matches[1]) . "</h2>\n\n";
  }

  /**
   * Renders one atx header; the level is the number of leading #'s.
   *
   * @param array $matches [1] = leading #'s, [2] = header text
   * @return string
   */
  private function _DoHeaders_callback_atx($matches) {
      $level = strlen($matches[1]);
      return '<h' . $level . '>' . $this->_RunSpanGamut($matches[2])
          . '</h' . $level . ">\n\n";
  }
  /**
   * Forms HTML ordered (numbered) and unordered (bulleted) lists.
   *
   * The text is scanned twice: once for bullet markers and once for
   * number markers.
   *
   * @param string $text
   * @return string
   */
  private function _DoLists($text) {
      $less_than_tab = $this->md_tab_width - 1;

      # Patterns matching list item bullets and number markers.
      $marker_patterns = array('[*+-]', '\d+[.]');

      foreach ($marker_patterns as $marker) {
          # Pattern matching an entire ul or ol list:
          $whole_list = '
              ( # $1 = whole list
              ( # $2
              [ ]{0,'.$less_than_tab.'}
              ('.$marker.') # $3 = first list item marker
              [ \t]+
              )
              (?s:.+?)
              ( # $4
              \z
              |
              \n{2,}
              (?=\S)
              (?! # Negative lookahead for another list item marker
              [ \t]*
              '.$marker.'[ \t]+
              )
              )
              )
              '; // mx

          # Nested lists use a different prefix than top-level lists; see
          # the extended comment in _ProcessListItems().
          # NOTE(review): the callback names look swapped relative to the
          # branch that uses them ("top" runs while inside a list). This
          # pairing is kept exactly as in the original code.
          if ($this->md_list_level) {
              $prefix = '^';
              $handler = '_DoLists_callback_top';
          } else {
              $prefix = '(?:(?<=\n\n)|\A\n?)';
              $handler = '_DoLists_callback_nested';
          }

          $text = preg_replace_callback('{
              '.$prefix.'
              '.$whole_list.'
              }mx',
              array($this, $handler), $text);
      }

      return $text;
  }
  /**
   * Renders one matched list; _DoLists() uses this callback while the
   * list level is non-zero.
   *
   * @param array $matches [1] = whole list, [3] = first list item marker
   * @return string the <ul>/<ol> element
   */
  private function _DoLists_callback_top($matches) {
      # The first marker decides the list type and which marker pattern
      # separates the items.
      if (preg_match('/[*+-]/', $matches[3])) {
          $tag = 'ul';
          $marker = '[*+-]';
      } else {
          $tag = 'ol';
          $marker = '\d+[.]';
      }

      # Turn double returns into triple returns, so that a paragraph can
      # be made for the last item in a list, if necessary.
      $list = preg_replace("/\n{2,}/", "\n\n\n", $matches[1]);

      # Trailing whitespace is trimmed so the closing tag lands on the
      # preceding line; this works around the HTML block parser.
      $items = rtrim($this->_ProcessListItems($list, $marker));

      return '<' . $tag . '>' . $items . '</' . $tag . ">\n";
  }
  /**
   * Renders one matched list; _DoLists() uses this callback while the
   * list level is zero.
   *
   * @param array $matches [1] = whole list, [3] = first list item marker
   * @return string the <ul>/<ol> element
   */
  private function _DoLists_callback_nested($matches) {
      # The first marker decides the list type and which marker pattern
      # separates the items.
      if (preg_match('/[*+-]/', $matches[3])) {
          $tag = 'ul';
          $marker = '[*+-]';
      } else {
          $tag = 'ol';
          $marker = '\d+[.]';
      }

      # Turn double returns into triple returns, so that a paragraph can
      # be made for the last item in a list, if necessary.
      $list = preg_replace("/\n{2,}/", "\n\n\n", $matches[1]);

      $items = $this->_ProcessListItems($list, $marker);

      return '<' . $tag . ">\n" . $items . '</' . $tag . ">\n";
  }
  /**
   * Splits the contents of a single ordered or unordered list into
   * individual list items and renders each through
   * _ProcessListItems_callback().
   *
   * $md_list_level tracks whether we are inside a list: it is incremented
   * on entry and decremented on exit, so zero means "not in a list".
   * This matters because outside a list, text such as
   *
   *     I recommend upgrading to version
   *     8. Oops, now this line is treated
   *     as a sub-list.
   *
   * must stay a single paragraph although its second line starts with a
   * digit-period-space sequence, whereas inside a list (or sub-list) such
   * a line starts a sub-list. This aspect of Markdown's syntax is hard to
   * parse perfectly.
   *
   * @param string $list_str   source text of the whole list
   * @param string $marker_any regex fragment matching this list's markers
   * @return string the list source with each item replaced by its <li>
   */
  private function _ProcessListItems($list_str, $marker_any) {
      $this->md_list_level++;

      # Trim trailing blank lines.
      $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);

      $item_pattern = '{
          (\n)? # leading line = $1
          (^[ \t]*) # leading whitespace = $2
          ('.$marker_any.') [ \t]+ # list marker = $3
          ((?s:.+?) # list item text = $4
          (\n{1,2}))
          (?= \n* (\z | \2 ('.$marker_any.') [ \t]+))
          }xm';

      $rendered = preg_replace_callback(
          $item_pattern,
          array($this, '_ProcessListItems_callback'),
          $list_str
      );

      $this->md_list_level--;

      return $rendered;
  }
  /**
   * Renders one list item as an <li> element.
   *
   * An item that is preceded by a blank line, or that contains one, goes
   * through the full block gamut. Any other item is checked for sub-lists
   * and then treated as span-level text. _Outdent() is defined outside
   * this excerpt.
   *
   * @param array $matches [1] = leading blank line (if any), [4] = item text
   * @return string
   */
  private function _ProcessListItems_callback($matches) {
      $body = $matches[4];
      $is_block_item = !empty($matches[1]) || preg_match('/\n{2,}/', $body);

      if ($is_block_item) {
          $body = $this->_RunBlockGamut($this->_Outdent($body));
      } else {
          # Recursion for sub-lists:
          $body = $this->_DoLists($this->_Outdent($body));
          $body = $this->_RunSpanGamut(preg_replace('/\n+$/', '', $body));
      }

      return '<li>' . $body . "</li>\n";
  }
  /**
   * Converts indented Markdown code blocks to <pre><code> blocks.
   *
   * @param string $text
   * @return string
   */
  private function _DoCodeBlocks($text) {
      $tab_width = $this->md_tab_width;

      $code_block = '{
          (?:\n\n|\A)
          ( # $1 = the code block -- one or more lines, starting with a space/tab
          (?:
          (?:[ ]{'.$tab_width.'} | \t) # Lines must start with a tab or a tab-width of spaces
          .*\n+
          )+
          )
          ((?=^[ ]{0,'.$tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
          }xm';

      return preg_replace_callback(
          $code_block,
          array($this, '_DoCodeBlocks_callback'),
          $text
      );
  }
  /**
   * Renders one matched code block.
   *
   * @param array $matches [1] = the indented code block
   * @return string the <pre><code> element surrounded by blank lines
   */
  private function _DoCodeBlocks_callback($matches) {
      # Outdent (_Outdent() is defined outside this excerpt), then encode
      # the characters that are literal inside code.
      $code = $this->_EncodeCode($this->_Outdent($matches[1]));

      # Trim leading newlines and trailing whitespace.
      $code = preg_replace(array('/\A\n+/', '/\s+\z/'), '', $code);

      return "\n\n<pre><code>{$code}\n</code></pre>\n\n";
  }
  /**
   * Converts backtick-quoted runs to <code> spans.
   *
   *  - Multiple backticks may be used as delimiters in order to include
   *    literal backticks in the span, so
   *
   *        Just type ``foo `bar` baz`` at the prompt.
   *
   *    becomes
   *
   *        <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
   *
   *    There is no arbitrary limit on the number of backticks used as
   *    delimiters: for three consecutive backticks in the code, use four
   *    as delimiters, etc.
   *
   *  - Spaces give literal backticks at the edges:
   *
   *        ... type `` `bar` `` ...
   *
   *    becomes
   *
   *        ... type <code>`bar`</code> ...
   *
   * @param string $text
   * @return string
   */
  private function _DoCodeSpans($text) {
      $code_span = '@
          (?<!\\\) # Character before opening ` can\'t be a backslash
          (`+) # $1 = Opening run of `
          (.+?) # $2 = The code block
          (?<!`)
          \1 # Matching closer
          (?!`)
          @xs';

      return preg_replace_callback(
          $code_span,
          array($this, '_DoCodeSpans_callback'),
          $text
      );
  }
  /**
   * Renders one matched code span.
   *
   * @param array $matches [2] = text between the backtick delimiters
   * @return string the <code> element
   */
  private function _DoCodeSpans_callback($matches) {
      # Strip leading, then trailing, spaces and tabs.
      $code = preg_replace(array('/^[ \t]*/', '/[ \t]*$/'), '', $matches[2]);

      return '<code>' . $this->_EncodeCode($code) . '</code>';
  }
  /**
   * Encodes/escapes the characters that must be literals inside Markdown
   * code runs, where they lose their special Markdown meaning.
   *
   * @param string $_ raw code text
   * @return string encoded code text
   */
  private function _EncodeCode($_) {
      # str_replace() works through the pairs in order over the whole
      # string, so one call reproduces this sequence:
      #   1. all ampersands (entities are not entities inside code),
      #   2. the angle brackets,
      #   3. every character that is magic in Markdown (to its hash).
      $search = array_merge(
          array('&', '<', '>'),
          array_keys(self::$md_escape_table)
      );
      $replace = array_merge(
          array('&amp;', '&lt;', '&gt;'),
          array_values(self::$md_escape_table)
      );

      return str_replace($search, $replace, $_);
  }
  /**
   * Converts ** / __ runs to <strong> and * / _ runs to <em>.
   *
   * @param string $text
   * @return string
   */
  private function _DoItalicsAndBold($text) {
      $strong = '{
          ( # $1: Marker
          (?<!\*\*) \*\* | # (not preceded by two chars of
          (?<!__) __ # the same marker)
          )
          (?=\S) # Not followed by whitespace
          (?!\1) # or two others marker chars.
          ( # $2: Content
          (?:
          [^*_]+? # Anthing not em markers.
          |
          # Balence any regular emphasis inside.
          ([*_]) (?=\S) .+? (?<=\S) \3 # $3: em char (* or _)
          |
          (?! \1 ) . # Allow unbalenced * and _.
          )+?
          )
          (?<=\S) \1 # End mark not preceded by whitespace.
          }sx';

      $em = '{ ( (?<!\*)\* | (?<!_)_ ) (?=\S) (?! \1) (.+?) (?<=\S) \1 }sx';

      # <strong> must go first, otherwise its double markers would be
      # consumed as two <em> markers.
      $text = preg_replace($strong, '<strong>\2</strong>', $text);

      return preg_replace($em, '<em>\2</em>', $text);
  }
  /**
   * Converts runs of ">"-prefixed lines to <blockquote> elements.
   *
   * @param string $text
   * @return string
   */
  private function _DoBlockQuotes($text) {
      $quote_block = '/
          ( # Wrap whole match in $1
          (
          ^[ \t]*>[ \t]? # ">" at the start of a line
          .+\n # rest of the first line
          (.+\n)* # subsequent consecutive lines
          \n* # blanks
          )+
          )
          /xm';

      return preg_replace_callback(
          $quote_block,
          array($this, '_DoBlockQuotes_callback'),
          $text
      );
  }
  /**
   * Renders one matched block quote.
   *
   * @param array $matches [1] = the quoted lines, ">" prefixes included
   * @return string the <blockquote> element followed by a blank line
   */
  private function _DoBlockQuotes_callback($matches) {
      # Trim one level of quoting, then whitespace-only lines.
      $quoted = preg_replace(
          array('/^[ \t]*>[ \t]?/m', '/^[ \t]+$/m'),
          '',
          $matches[1]
      );

      # Recurse, so nested quotes and other block elements are handled.
      $inner = $this->_RunBlockGamut($quoted);

      # Indent the generated markup.
      $inner = preg_replace('/^/m', " ", $inner);

      # The added leading space corrupts <pre> content, so it is taken
      # out again inside <pre> blocks.
      $inner = preg_replace_callback(
          '{(\s*<pre>.+?</pre>)}sx',
          array($this, '_DoBlockQuotes_callback2'),
          $inner
      );

      return "<blockquote>\n" . $inner . "\n</blockquote>\n\n";
  }
  /**
   * Removes one leading space from every line of a <pre> block, undoing
   * the indentation applied by _DoBlockQuotes_callback().
   *
   * @param array $matches [1] = the <pre> block
   * @return string
   */
  private function _DoBlockQuotes_callback2($matches) {
      return preg_replace('/^ /m', '', $matches[1]);
  }
  /**
   * Wraps paragraphs in <p> tags and swaps hashed HTML blocks back in.
   *
   * @param string $text string to process with html <p> tags
   * @return string
   */
  private function _FormParagraphs($text) {
      # Strip leading and trailing lines, then split on blank lines.
      $trimmed = preg_replace(array('/\A\n+/', '/\n+\z/'), '', $text);
      $paragraphs = preg_split('/\n{2,}/', $trimmed, -1, PREG_SPLIT_NO_EMPTY);

      foreach ($paragraphs as $index => $paragraph) {
          if (isset($this->md_html_blocks[$paragraph])) {
              # A placeholder for a hashed HTML block: put the block back.
              $paragraphs[$index] = $this->md_html_blocks[$paragraph];
          } else {
              # Regular text: run the span gamut and wrap it in <p>,
              # replacing any leading spaces/tabs by the opening tag.
              $html = $this->_RunSpanGamut($paragraph);
              $paragraphs[$index] = preg_replace('/^([ \t]*)/', '<p>', $html) . '</p>';
          }
      }

      return implode("\n\n", $paragraphs);
  }
  /**
   * Smart processing for ampersands and angle brackets that need to be
   * encoded. The ampersand encoding is based on Nat Irons's Amputator MT
   * plugin: http://bumppo.net/projects/amputator/
   *
   * @param string $text
   * @return string
   */
  private function _EncodeAmpsAndAngles($text) {
      $patterns = array(
          # Ampersands that do not start an entity:
          '/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/',
          # Naked <'s:
          '{<(?![a-z/?\$!])}i',
      );
      $replacements = array('&amp;', '&lt;');

      return preg_replace($patterns, $replacements, $text);
  }
  /**
   * Replaces every backslash escape sequence by the hash of the escaped
   * character.
   *
   * The table built by initialize() lists the escaped backslash first,
   * which is required: escaped backslashes must be processed first.
   *
   * @param string $text
   * @return string the text after processing the escape sequences
   */
  private function _EncodeBackslashEscapes($text) {
      $sequences = array_keys(self::$md_backslash_escape_table);
      $hashes = array_values(self::$md_backslash_escape_table);

      return str_replace($sequences, $hashes, $text);
  }
  /**
   * Makes links out of things like <http://example.com/> and
   * <address@domain.foo>.
   *
   * The e-mail replacement goes through preg_replace_callback() rather
   * than the /e (eval) pattern modifier. /e was removed in PHP 7.0, where
   * preg_replace() warns and returns NULL for such patterns. Callbacks
   * receive the raw captured text, so no quote unslashing is needed.
   *
   * @param string $text
   * @return string
   */
  private function _DoAutoLinks($text) {
      # URLs: <http://...>, <https://...>, <ftp://...>
      $text = preg_replace("!<((https?|ftp):[^'\">\\s]+)>!",
          '<a href="\1">\1</a>', $text);

      # Email addresses: <address@domain.foo>
      $text = preg_replace_callback('{
          <
          (?:mailto:)?
          (
          [-.\w]+
          \@
          [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
          )
          >
          }xi',
          array($this, '_DoAutoLinks_email_callback'), $text);

      return $text;
  }

  /**
   * Renders one matched e-mail address as an obfuscated mailto link.
   *
   * @param array $matches [1] = the e-mail address
   * @return string
   */
  private function _DoAutoLinks_email_callback($matches) {
      # Characters hidden earlier in the span gamut are restored before
      # the address gets entity-encoded.
      return $this->_EncodeEmailAddress(
          $this->_UnescapeSpecialChars($matches[1])
      );
  }
  /**
   * Turns an e-mail address into an obfuscated mailto link.
   *
   * Input: an email address, e.g. "foo@example.com"
   *
   * Output: the address as a mailto link, with each character encoded
   * (by _EncodeEmailAddress_callback()) as a decimal entity, a hex entity
   * or left raw, in the hope of foiling most address harvesting spam
   * bots. E.g.:
   *
   *   <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
   *   x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
   *   &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
   *
   * Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
   * mailing list: <http://tinyurl.com/yu7ue>
   *
   * @param string $addr plain e-mail address
   * @return string the <a> element
   */
  private function _EncodeEmailAddress($addr) {
      # Encode everything except ':' (left alone to spot "mailto:" later).
      $encoded = preg_replace_callback(
          '/([^\:])/',
          array($this, '_EncodeEmailAddress_callback'),
          "mailto:" . $addr
      );

      $link = "<a href=\"$encoded\">$encoded</a>";

      # Strip the mailto: from the visible part.
      return preg_replace('/">.+?:/', '">', $link);
  }
  private function _EncodeEmailAddress_callback($matches) {
      #
      # Encode one captured character for _EncodeEmailAddress.
      #
      # Parameter: regex match array; index 1 holds the character.
      # Returns:   the character itself, its hex entity or its decimal
      #            entity, picked at random: roughly 10% raw, 45% hex,
      #            45% dec.
      #
      $symbol = $matches[1];
      $roll = rand(0, 100);

      # '@' is never left raw, whatever the roll says.
      $keep_raw = ($roll > 90 && $symbol != '@');
      if (!$keep_raw) {
          $code = ord($symbol);
          return ($roll < 45) ? '&#x'.dechex($code).';' : '&#'.$code.';';
      }
      return $symbol;
  }
  private function _UnescapeSpecialChars($text) {
      #
      # Put back the special characters that were hidden earlier.
      #
      # Parameter: String.
      # Returns:   The string with each value of the class-wide escape
      #            table replaced by the table key it stands for.
      #
      $table = self::$md_escape_table;
      $placeholders = array_values($table);
      $characters = array_keys($table);
      return str_replace($placeholders, $characters, $text);
  }
  921. # _TokenizeHTML originates from code shared between PHP Markdown and PHP SmartyPants.
  922. # In this class it is always defined, as a private method.
  private function _TokenizeHTML($str) {
      #
      # Parameter: String containing HTML markup.
      # Returns:   An array of the tokens comprising the input string.
      #            Each token is either a tag (possibly with nested tags
      #            contained therein, such as <a href="<MTFoo>">), or a
      #            non-empty run of text between tags. Each element of
      #            the array is a two-element array; the first is either
      #            'tag' or 'text'; the second is the actual value.
      #
      # Regular expression derived from the _tokenize() subroutine in
      # Brad Choate's MTRegex plugin.
      # <http://www.bradchoate.com/past/mtregex.php>
      #
      $match = '(?s:<!(?:--.*?--\s*)+>)|'.  # comment
               '(?s:<\?.*?\?>)|'.           # processing instruction
               # regular tags
               '(?:<[/!$]?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)';

      # With PREG_SPLIT_DELIM_CAPTURE the result alternates strictly:
      # even positions hold the text between matches, odd positions hold
      # the matched tags.
      $parts = preg_split('{(' . $match . ')}', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

      $tokens = array();
      foreach ($parts as $position => $part) {
          if ($position % 2) {
              $tokens[] = array('tag', $part);
          } elseif ($part != '') {
              # Empty text runs (input starting or ending with a tag,
              # adjacent tags, empty input) are dropped rather than
              # reported as empty 'tag' tokens.
              $tokens[] = array('text', $part);
          }
      }
      return $tokens;
  }
  private function _Outdent($text) {
      #
      # Strip one level of indentation from the start of every line.
      #
      # Parameter: String.
      # Returns:   The string with, on each line, one leading tab or up
      #            to md_tab_width leading spaces removed.
      #
      $max_spaces = $this->md_tab_width;
      $one_indent = "/^(\\t|[ ]{1,".$max_spaces."})/m";
      $outdented = preg_replace($one_indent, "", $text);
      return $outdented;
  }
  private function _Detab($text) {
      #
      # Expand tabs into spaces, honouring tab stops every md_tab_width
      # columns.
      #
      # Parameter: String.
      # Returns:   The string with every tab replaced by one to
      #            md_tab_width spaces; each input line, including the
      #            last, is terminated with "\n" in the result.
      #
      # Column positions are measured with strlen(), i.e. in bytes.
      #
      $width = $this->md_tab_width;
      $expanded = array();
      foreach (explode("\n", $text) as $row) {
          # The pieces of the row that were separated by tabs.
          $cells = explode("\t", $row);
          # The first piece starts the rebuilt row as is.
          $rebuilt = array_shift($cells);
          foreach ($cells as $cell) {
              # Pad up to the next tab stop, then append the piece.
              $padding = $width - strlen($rebuilt) % $width;
              $rebuilt .= str_repeat(" ", $padding) . $cell;
          }
          $expanded[] = $rebuilt;
      }
      return implode("\n", $expanded) . "\n";
  }
  private function _UnslashQuotes($text) {
      #
      # Undo the automatic slashing of double quotes that happens when
      # preg_replace evaluates its replacement as an expression.
      #
      # Parameter: String.
      # Returns:   The string with every backslash-double-quote (\")
      #            sequence reduced to a plain double quote.
      #
      $slashed_quote = '\"';
      $plain_quote = '"';
      return str_replace($slashed_quote, $plain_quote, $text);
  }
  993. }
  994. /*
  995. PHP Markdown
  996. ============
  997. Description
  998. -----------
  999. This is a PHP translation of the original Markdown formatter written in
  1000. Perl by John Gruber.
  1001. Markdown is a text-to-HTML filter; it translates an easy-to-read /
  1002. easy-to-write structured text format into HTML. Markdown's text format
  1003. is most similar to that of plain text email, and supports features such
  1004. as headers, *emphasis*, code blocks, blockquotes, and links.
  1005. Markdown's syntax is designed not as a generic markup language, but
  1006. specifically to serve as a front-end to (X)HTML. You can use span-level
  1007. HTML tags anywhere in a Markdown document, and you can use block level
  1008. HTML tags (like <div> and <table>) as well.
  1009. For more information about Markdown's syntax, see:
  1010. <http://daringfireball.net/projects/markdown/>
  1011. Bugs
  1012. ----
  1013. To file bug reports please send email to:
  1014. <michel.fortin@michelf.com>
  1015. Please include with your report: (1) the example input; (2) the output you
  1016. expected; (3) the output Markdown actually produced.
  1017. Version History
  1018. ---------------
  1019. See the readme file for detailed release notes for this version.
  1020. 1.0.1c - 9 Dec 2005
  1021. 1.0.1b - 6 Jun 2005
  1022. 1.0.1a - 15 Apr 2005
  1023. 1.0.1 - 16 Dec 2004
  1024. 1.0 - 21 Aug 2004
  1025. Author & Contributors
  1026. ---------------------
  1027. Original Perl version by John Gruber
  1028. <http://daringfireball.net/>
  1029. PHP port and other contributions by Michel Fortin
  1030. <http://www.michelf.com/>
  1031. Copyright and License
  1032. ---------------------
  1033. Copyright (c) 2004-2005 Michel Fortin
  1034. <http://www.michelf.com/>
  1035. All rights reserved.
  1036. Copyright (c) 2003-2004 John Gruber
  1037. <http://daringfireball.net/>
  1038. All rights reserved.
  1039. Redistribution and use in source and binary forms, with or without
  1040. modification, are permitted provided that the following conditions are
  1041. met:
  1042. * Redistributions of source code must retain the above copyright notice,
  1043. this list of conditions and the following disclaimer.
  1044. * Redistributions in binary form must reproduce the above copyright
  1045. notice, this list of conditions and the following disclaimer in the
  1046. documentation and/or other materials provided with the distribution.
  1047. * Neither the name "Markdown" nor the names of its contributors may
  1048. be used to endorse or promote products derived from this software
  1049. without specific prior written permission.
  1050. This software is provided by the copyright holders and contributors "as
  1051. is" and any express or implied warranties, including, but not limited
  1052. to, the implied warranties of merchantability and fitness for a
  1053. particular purpose are disclaimed. In no event shall the copyright owner
  1054. or contributors be liable for any direct, indirect, incidental, special,
  1055. exemplary, or consequential damages (including, but not limited to,
  1056. procurement of substitute goods or services; loss of use, data, or
  1057. profits; or business interruption) however caused and on any theory of
  1058. liability, whether in contract, strict liability, or tort (including
  1059. negligence or otherwise) arising in any way out of the use of this
  1060. software, even if advised of the possibility of such damage.
  1061. */