PageRenderTime 54ms CodeModel.GetById 15ms RepoModel.GetById 1ms app.codeStats 0ms

/Markdown.class.php

https://github.com/piero-la-lune/PHP-Markdown-for-comments
PHP | 1221 lines | 814 code | 130 blank | 277 comment | 65 complexity | de47dc224306da991342b0709aeb6442 MD5 | raw file
Possible License(s): GPL-3.0
  1. <?php
  2. # A php class for parsing Markdown markup in comments, without authorizing
  3. # html tags. This is an adaptation of PHP Markdown by Michel Fortin.
  4. #
  5. # PHP Markdown for comments
  6. # Adaptation of PHP Markdown by Pierre Monchalin
  7. # <http://github.com/piero-la-lune/PHP-Markdown-for-comments>
  8. #
  9. # PHP Markdown
  10. # Copyright (c) 2004-2012 Michel Fortin
  11. # <http://michelf.com/projects/php-markdown/>
  12. #
  13. # Original Markdown
  14. # Copyright (c) 2004-2006 John Gruber
  15. # <http://daringfireball.net/projects/markdown/>
  16. #
  17. #
  18. #
  19. # This program is free software: you can redistribute it and/or modify
  20. # it under the terms of the GNU General Public License as published by
  21. # the Free Software Foundation, either version 3 of the License, or
  22. # (at your option) any later version.
  23. #
  24. # This program is distributed in the hope that it will be useful,
  25. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  26. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  27. # GNU General Public License for more details.
  28. #
  29. # You should have received a copy of the GNU General Public License
  30. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  31. class Markdown {
# Regex to match balanced [brackets].
# Needed to insert a maximum bracket depth while converting to PHP.
  34. protected $nested_brackets_depth = 6;
  35. protected $nested_brackets_re;
  36. protected $nested_url_parenthesis_depth = 4;
  37. protected $nested_url_parenthesis_re;
  38. # Table of hash values for escaped characters:
  39. protected $escape_chars = '\`*_{}[]()>#+-.!';
  40. protected $escape_chars_re;
  41. # Predefined urls and titles for reference links and images.
  42. protected $predef_urls = array();
  43. protected $predef_titles = array();
  44. protected $tab_width = 4;
  45. protected $document_gamut = array(
  46. 'stripLinkDefinitions',
  47. 'runBlockGamut'
  48. );
  49. # These are all the transformations that form block-level tags like
  50. # paragraphs, headers, and list items.
  51. protected $block_gamut = array(
  52. 'doHeaders',
  53. 'doHorizontalRules',
  54. 'doLists',
  55. 'doCodeBlocks',
  56. 'doBlockQuotes'
  57. );
  58. # These are all the transformations that occur *within* block-level tags
  59. # like paragraphs, headers, and list items.
  60. protected $span_gamut = array(
  61. # Process character escapes, code spans, and inline HTML in one shot.
  62. 'parseSpan',
  63. # Process anchor and image tags. Images must come first, because
  64. # ![foo][f] looks like an anchor.
  65. 'doImages',
  66. 'doAnchors',
  67. # Make links out of things like `<http://example.com/>`
  68. # Must come after doAnchors, because you can use < and > delimiters in
  69. # inline links like [this](<url>).
  70. 'doAutoLinks',
  71. 'encodeAmpsAndAngles',
  72. 'doItalicsAndBold',
  73. 'doHardBreaks'
  74. );
  75. public function __construct() {
  76. $this->prepareItalicsAndBold();
  77. $this->nested_brackets_re =
  78. str_repeat('(?>[^\[\]]+|\[', $this->nested_brackets_depth).
  79. str_repeat('\])*', $this->nested_brackets_depth);
  80. $this->nested_url_parenthesis_re =
  81. str_repeat('(?>[^()\s]+|\(', $this->nested_url_parenthesis_depth).
  82. str_repeat('(?>\)))*', $this->nested_url_parenthesis_depth);
  83. $this->escape_chars_re = '['.preg_quote($this->escape_chars).']';
  84. }
  85. #
  86. # Main function. Performs some preprocessing on the input text and pass it
  87. # through the document gamut.
  88. #
  89. public function transform($text) {
  90. $this->setup();
  91. # Remove UTF-8 BOM and marker character in input, if present.
  92. $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);
  93. # Standardize line endings: DOS to Unix and Mac to Unix
  94. $text = preg_replace('{\r\n?}', "\n", $text);
  95. # Make sure $text ends with a couple of newlines:
  96. $text .= "\n\n";
  97. # Convert all tabs to spaces.
  98. $text = $this->detab($text);
  99. # Strip any lines consisting only of spaces and tabs.
  100. # This makes subsequent regexen easier to write, because we can match
  101. # consecutive blank lines with /\n+/ instead of something contorted
  102. # like /[ ]*\n+/ .
  103. $text = preg_replace('/^[ ]+$/m', '', $text);
  104. # Run document gamut methods.
  105. foreach ($this->document_gamut as $method) {
  106. $text = $this->$method($text);
  107. }
  108. $this->teardown();
  109. return $text . "\n";
  110. }
  111. #
  112. # Prepare regular expressions for searching emphasis tokens in any context.
  113. #
  114. private $em_relist = array(
  115. '' => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?=\S|$)(?![\.,:;]\s)',
  116. '*' => '(?<=\S|^)(?<!\*)\*(?!\*)',
  117. '_' => '(?<=\S|^)(?<!_)_(?!_)',
  118. );
  119. private $strong_relist = array(
  120. '' => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?=\S|$)(?![\.,:;]\s)',
  121. '**' => '(?<=\S|^)(?<!\*)\*\*(?!\*)',
  122. '__' => '(?<=\S|^)(?<!_)__(?!_)',
  123. );
  124. private $em_strong_relist = array(
  125. '' => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?=\S|$)(?![\.,:;]\s)',
  126. '***' => '(?<=\S|^)(?<!\*)\*\*\*(?!\*)',
  127. '___' => '(?<=\S|^)(?<!_)___(?!_)',
  128. );
  129. private $em_strong_prepared_relist;
  130. protected function prepareItalicsAndBold() {
  131. foreach ($this->em_relist as $em => $em_re) {
  132. foreach ($this->strong_relist as $strong => $strong_re) {
  133. # Construct list of allowed token expressions.
  134. $token_relist = array();
  135. if (isset($this->em_strong_relist["$em$strong"])) {
  136. $token_relist[] = $this->em_strong_relist["$em$strong"];
  137. }
  138. $token_relist[] = $em_re;
  139. $token_relist[] = $strong_re;
  140. # Construct master expression from list.
  141. $token_re = '{('. implode('|', $token_relist) .')}';
  142. $this->em_strong_prepared_relist["$em$strong"] = $token_re;
  143. }
  144. }
  145. }
  146. #
  147. # Called before the transformation process starts to setup parser states.
  148. #
  149. # Internal hashes used during transformation.
  150. private $urls = array();
  151. private $titles = array();
  152. private $html_hashes = array();
  153. # Status flag to avoid invalid nesting.
  154. private $in_anchor = false;
  155. protected function setup() {
  156. # Clear global hashes.
  157. $this->urls = $this->predef_urls;
  158. $this->titles = $this->predef_titles;
  159. $this->html_hashes = array();
  160. $in_anchor = false;
  161. }
  162. #
  163. # Called after the transformation process to clear any variable which may be
  164. # taking up memory unnecessarly.
  165. #
  166. protected function teardown() {
  167. $this->urls = array();
  168. $this->titles = array();
  169. $this->html_hashes = array();
  170. }
  171. #
  172. # Remove one level of line-leading tabs or spaces
  173. #
  174. protected function outdent($text) {
  175. return preg_replace('/^(\t|[ ]{1,'.$this->tab_width.'})/m', '', $text);
  176. }
  177. #
  178. # Replace tabs with the appropriate amount of space.
  179. #
  180. protected function detab($text) {
  181. # For each line we separate the line in blocks delemited by tab
  182. # characters. Then we reconstruct every line by adding the appropriate
  183. # number of space between each blocks.
  184. $text = preg_replace_callback('/^.*\t.*$/m',
  185. array(&$this, '_detab_callback'), $text);
  186. return $text;
  187. }
  188. protected function _detab_callback($matches) {
  189. $line = $matches[0];
  190. # Split in blocks.
  191. $blocks = explode("\t", $line);
  192. # Add each blocks to the line.
  193. $line = $blocks[0];
  194. unset($blocks[0]); # Do not add first block twice.
  195. foreach ($blocks as $block) {
  196. # Calculate amount of space, insert spaces, insert block.
  197. $amount = $this->tab_width -
  198. mb_strlen($line, 'UTF-8') % $this->tab_width;
  199. $line .= str_repeat(" ", $amount) . $block;
  200. }
  201. return $line;
  202. }
  203. #
  204. # Strips link definitions from text, stores the URLs and titles in hash
  205. # references.
  206. #
  207. protected function stripLinkDefinitions($text) {
  208. $less_than_tab = $this->tab_width - 1;
  209. # Link defs are in the form: ^[id]: url "optional title"
  210. $text = preg_replace_callback('{
  211. ^[ ]{0,'.$less_than_tab.'}\[(.+)\][ ]?: # id = $1
  212. [ ]*
  213. \n? # maybe *one* newline
  214. [ ]*
  215. (?:
  216. <(.+?)> # url = $2
  217. |
  218. (\S+?) # url = $3
  219. )
  220. [ ]*
  221. \n? # maybe one newline
  222. [ ]*
  223. (?:
  224. (?<=\s) # lookbehind for whitespace
  225. ["(]
  226. (.*?) # title = $4
  227. [")]
  228. [ ]*
  229. )? # title is optional
  230. (?:\n+|\Z)
  231. }xm',
  232. array(&$this, '_stripLinkDefinitions_callback'),
  233. $text);
  234. return $text;
  235. }
  236. protected function _stripLinkDefinitions_callback($matches) {
  237. $link_id = strtolower($matches[1]);
  238. $url = $matches[2] == '' ? $matches[3] : $matches[2];
  239. $this->urls[$link_id] = $url;
  240. $this->titles[$link_id] =& $matches[4];
  241. return ''; # String that will replace the block
  242. }
  243. #
  244. # Run block gamut tranformations, without hashing HTML blocks.
  245. #
  246. protected function runBlockGamut($text) {
  247. foreach ($this->block_gamut as $method) {
  248. $text = $this->$method($text);
  249. }
  250. # Finally form paragraph and restore hashed blocks.
  251. $text = $this->formParagraphs($text);
  252. return $text;
  253. }
  254. #
  255. # Run span gamut tranformations.
  256. #
  257. protected function runSpanGamut($text) {
  258. foreach ($this->span_gamut as $method) {
  259. $text = $this->$method($text);
  260. }
  261. return $text;
  262. }
  263. #
  264. # Params:
  265. # $text - string to process with html <p> tags
  266. #
  267. protected function formParagraphs($text) {
  268. # Strip leading and trailing lines:
  269. $text = preg_replace('/\A\n+|\n+\z/', '', $text);
  270. $grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);
  271. #
  272. # Wrap <p> tags and unhashify HTML blocks
  273. #
  274. foreach ($grafs as $key => $value) {
  275. if (!preg_match('/^B\x1A[0-9]+B$/', $value)) {
  276. # Is a paragraph.
  277. $value = $this->runSpanGamut($value);
  278. $value = preg_replace('/^([ ]*)/', "<p>", $value);
  279. $value .= "</p>";
  280. $grafs[$key] = $this->unhash($value);
  281. }
  282. else {
  283. # Is a block.
  284. # Modify elements of @grafs in-place...
  285. $graf = $value;
  286. $block = $this->html_hashes[$graf];
  287. $graf = $block;
  288. $grafs[$key] = $graf;
  289. }
  290. }
  291. return implode("\n\n", $grafs);
  292. }
  293. #
  294. # Process headers
  295. # Note : this function was adapted, not allowing levels 1 and 2 for titles.
  296. #
  297. protected function doHeaders($text) {
  298. # Setext-style headers:
  299. # Header 1
  300. # ========
  301. #
  302. # Header 2
  303. # --------
  304. #
  305. $text = preg_replace_callback('{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx',
  306. array(&$this, '_doHeaders_callback_setext'), $text);
  307. # atx-style headers:
  308. # # Header 1
  309. # ## Header 2
  310. # ## Header 2 with closing hashes ##
  311. # ...
  312. # #### Header 4
  313. #
  314. $text = preg_replace_callback('{
  315. ^(\#{1,4}) # $1 = string of #\'s
  316. [ ]*
  317. (.+?) # $2 = Header text
  318. [ ]*
  319. \#* # optional closing #\'s (not counted)
  320. \n+
  321. }xm',
  322. array(&$this, '_doHeaders_callback_atx'), $text);
  323. return $text;
  324. }
  325. protected function _doHeaders_callback_setext($matches) {
  326. # Terrible hack to check we haven't found an empty list item.
  327. if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1]))
  328. return $matches[0];
  329. $level = $matches[2]{0} == '=' ? 3 : 4;
  330. $block = "<h$level>".$this->runSpanGamut($matches[1])."</h$level>";
  331. return "\n" . $this->hashBlock($block) . "\n\n";
  332. }
  333. protected function _doHeaders_callback_atx($matches) {
  334. $level = strlen($matches[1])+2;
  335. $block = "<h$level>".$this->runSpanGamut($matches[2])."</h$level>";
  336. return "\n" . $this->hashBlock($block) . "\n\n";
  337. }
  338. #
  339. # Process horizontal rules
  340. #
  341. protected function doHorizontalRules($text) {
  342. return preg_replace(
  343. '{
  344. ^[ ]{0,3} # Leading space
  345. ([-*_]) # $1: First marker
  346. (?> # Repeated marker group
  347. [ ]{0,2} # Zero, one, or two spaces.
  348. \1 # Marker character
  349. ){2,} # Group repeated at least twice
  350. [ ]* # Tailing spaces
  351. $ # End of line.
  352. }mx',
  353. "\n".$this->hashBlock("<hr />")."\n",
  354. $text);
  355. }
  356. #
  357. # Form HTML ordered (numbered) and unordered (bulleted) lists.
  358. #
  359. protected function doLists($text) {
  360. $less_than_tab = $this->tab_width - 1;
  361. # Re-usable patterns to match list item bullets and number markers:
  362. $marker_ul_re = '[*+-]';
  363. $marker_ol_re = '\d+[\.]';
  364. $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)";
  365. $markers_relist = array(
  366. $marker_ul_re => $marker_ol_re,
  367. $marker_ol_re => $marker_ul_re,
  368. );
  369. foreach ($markers_relist as $marker_re => $other_marker_re) {
  370. # Re-usable pattern to match any entirel ul or ol list:
  371. $whole_list_re = '
  372. ( # $1 = whole list
  373. ( # $2
  374. ([ ]{0,'.$less_than_tab.'}) # $3 = number of spaces
  375. ('.$marker_re.') # $4 = first list item marker
  376. [ ]+
  377. )
  378. (?s:.+?)
  379. ( # $5
  380. \z
  381. |
  382. \n{2,}
  383. (?=\S)
  384. (?! # Negative lookahead for another list item marker
  385. [ ]*
  386. '.$marker_re.'[ ]+
  387. )
  388. |
  389. (?= # Lookahead for another kind of list
  390. \n
  391. \3 # Must have the same indentation
  392. '.$other_marker_re.'[ ]+
  393. )
  394. )
  395. )
  396. '; // mx
  397. # We use a different prefix before nested lists than top-level lists.
  398. # See extended comment in _ProcessListItems().
  399. if ($this->list_level) {
  400. $text = preg_replace_callback('{
  401. ^
  402. '.$whole_list_re.'
  403. }mx',
  404. array(&$this, '_doLists_callback'), $text);
  405. }
  406. else {
  407. $text = preg_replace_callback('{
  408. (?:(?<=\n)\n|\A\n?) # Must eat the newline
  409. '.$whole_list_re.'
  410. }mx',
  411. array(&$this, '_doLists_callback'), $text);
  412. }
  413. }
  414. return $text;
  415. }
  416. protected function _doLists_callback($matches) {
  417. # Re-usable patterns to match list item bullets and number markers:
  418. $marker_ul_re = '[*+-]';
  419. $marker_ol_re = '\d+[\.]';
  420. $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)";
  421. $list = $matches[1];
  422. $list_type = preg_match("/$marker_ul_re/", $matches[4]) ? "ul" : "ol";
  423. $marker_any_re = ( $list_type == "ul" ? $marker_ul_re : $marker_ol_re );
  424. $list .= "\n";
  425. $result = $this->processListItems($list, $marker_any_re);
  426. $result = $this->hashBlock("<$list_type>\n" . $result . "</$list_type>");
  427. return "\n". $result ."\n\n";
  428. }
  429. #
  430. # Process the contents of a single ordered or unordered list, splitting it
  431. # into individual list items.
  432. #
  433. # The $this->list_level global keeps track of when we're inside a list.
  434. # Each time we enter a list, we increment it; when we leave a list,
  435. # we decrement. If it's zero, we're not in a list anymore.
  436. #
  437. # We do this because when we're not inside a list, we want to treat
  438. # something like this:
  439. #
  440. # I recommend upgrading to version
  441. # 8. Oops, now this line is treated
  442. # as a sub-list.
  443. #
  444. # As a single paragraph, despite the fact that the second line starts
  445. # with a digit-period-space sequence.
  446. #
  447. # Whereas when we're inside a list (or sub-list), that line will be
  448. # treated as the start of a sub-list. What a kludge, huh? This is
  449. # an aspect of Markdown's syntax that's hard to parse perfectly
  450. # without resorting to mind-reading. Perhaps the solution is to
  451. # change the syntax rules such that sub-lists must start with a
  452. # starting cardinal number; e.g. "1." or "a.".
  453. private $list_level = 0;
  454. protected function processListItems($list_str, $marker_any_re) {
  455. $this->list_level++;
  456. # trim trailing blank lines:
  457. $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);
  458. $list_str = preg_replace_callback('{
  459. (\n)? # leading line = $1
  460. (^[ ]*) # leading whitespace = $2
  461. ('.$marker_any_re.' # list marker and space = $3
  462. (?:[ ]+|(?=\n)) # space only required if item is not empty
  463. )
  464. ((?s:.*?)) # list item text = $4
  465. (?:(\n+(?=\n))|\n) # tailing blank line = $5
  466. (?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n))))
  467. }xm',
  468. array(&$this, '_processListItems_callback'), $list_str);
  469. $this->list_level--;
  470. return $list_str;
  471. }
  472. protected function _processListItems_callback($matches) {
  473. $item = $matches[4];
  474. $leading_line =& $matches[1];
  475. $leading_space =& $matches[2];
  476. $marker_space = $matches[3];
  477. $tailing_blank_line =& $matches[5];
  478. if ($leading_line || $tailing_blank_line ||
  479. preg_match('/\n{2,}/', $item))
  480. {
  481. # Replace marker with the appropriate whitespace indentation
  482. $item = $leading_space . str_repeat(' ', strlen($marker_space)) . $item;
  483. $item = $this->runBlockGamut($this->outdent($item)."\n");
  484. }
  485. else {
  486. # Recursion for sub-lists:
  487. $item = $this->doLists($this->outdent($item));
  488. $item = preg_replace('/\n+$/', '', $item);
  489. $item = $this->runSpanGamut($item);
  490. }
  491. return "<li>" . $item . "</li>\n";
  492. }
  493. #
  494. # Process Markdown `<pre><code>` blocks.
  495. #
  496. protected function doCodeBlocks($text) {
  497. $text = preg_replace_callback('{
  498. (?:\n\n|\A\n?)
  499. ( # $1 = the code block -- one or more lines, starting with a space/tab
  500. (?>
  501. [ ]{'.$this->tab_width.'} # Lines must start with a tab or a tab-width of spaces
  502. .*\n+
  503. )+
  504. )
  505. ((?=^[ ]{0,'.$this->tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
  506. }xm',
  507. array(&$this, '_doCodeBlocks_callback'), $text);
  508. return $text;
  509. }
  510. protected function _doCodeBlocks_callback($matches) {
  511. $codeblock = $matches[1];
  512. $codeblock = $this->outdent($codeblock);
  513. $codeblock = htmlspecialchars($codeblock, ENT_NOQUOTES);
  514. # trim leading newlines and trailing newlines
  515. $codeblock = preg_replace('/\A\n+|\n+\z/', '', $codeblock);
  516. $codeblock = "<pre><code>$codeblock\n</code></pre>";
  517. return "\n\n".$this->hashBlock($codeblock)."\n\n";
  518. }
  519. #
  520. # Process blockquotes.
  521. #
  522. protected function doBlockQuotes($text) {
  523. $text = preg_replace_callback('/
  524. ( # Wrap whole match in $1
  525. (?>
  526. ^[ ]*>[ ]? # ">" at the start of a line
  527. .+\n # rest of the first line
  528. (.+\n)* # subsequent consecutive lines
  529. \n* # blanks
  530. )+
  531. )
  532. /xm',
  533. array(&$this, '_doBlockQuotes_callback'), $text);
  534. return $text;
  535. }
  536. protected function _doBlockQuotes_callback($matches) {
  537. $bq = $matches[1];
  538. # trim one level of quoting - trim whitespace-only lines
  539. $bq = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $bq);
  540. $bq = $this->runBlockGamut($bq); # recurse
  541. $bq = preg_replace('/^/m', " ", $bq);
  542. # These leading spaces cause problem with <pre> content,
  543. # so we need to fix that:
  544. $bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx',
  545. array(&$this, '_doBlockQuotes_callback2'), $bq);
  546. return "\n". $this->hashBlock("<blockquote>\n$bq\n</blockquote>")."\n\n";
  547. }
  548. protected function _doBlockQuotes_callback2($matches) {
  549. $pre = $matches[1];
  550. $pre = preg_replace('/^ /m', '', $pre);
  551. return $pre;
  552. }
  553. #
  554. # Take the string $str and parse it into tokens, hashing embeded HTML, escaped
  555. # characters and handling code spans.
  556. #
  557. protected function parseSpan($str) {
  558. $output = '';
  559. $span_re = '{
  560. (
  561. \\\\'.$this->escape_chars_re.'
  562. |
  563. (?<![`\\\\])
  564. `+ # code span marker
  565. )
  566. }xs';
  567. while (1) {
  568. #
  569. # Each loop iteration seach for either the next tag, the next
  570. # openning code span marker, or the next escaped character.
  571. # Each token is then passed to handleSpanToken.
  572. #
  573. $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);
  574. # Create token from text preceding tag.
  575. if ($parts[0] != "") {
  576. $output .= $parts[0];
  577. }
  578. # Check if we reach the end.
  579. if (isset($parts[1])) {
  580. $output .= $this->handleSpanToken($parts[1], $parts[2]);
  581. $str = $parts[2];
  582. }
  583. else {
  584. break;
  585. }
  586. }
  587. return $output;
  588. }
  589. #
  590. # Handle $token provided by parseSpan by determining its nature and returning
  591. # the corresponding value that should replace it.
  592. #
  593. protected function handleSpanToken($token, &$str) {
  594. switch ($token{0}) {
  595. case "\\":
  596. return $this->hashPart("&#". ord($token{1}). ";");
  597. case "`":
  598. # Search for end marker in remaining text.
  599. if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm',
  600. $str, $matches))
  601. {
  602. $str = $matches[2];
  603. $codespan = $this->makeCodeSpan($matches[1]);
  604. return $this->hashPart($codespan);
  605. }
  606. return $token; // return as text since no ending marker found.
  607. default:
  608. return $this->hashPart($token);
  609. }
  610. }
  611. #
  612. # Create a code span markup for $code.
  613. #
  614. protected function makeCodeSpan($code) {
  615. $code = htmlspecialchars(trim($code), ENT_NOQUOTES);
  616. return $this->hashPart("<code>$code</code>");
  617. }
  618. #
  619. # Turn Markdown image shortcuts into <img> tags.
  620. #
  621. protected function doImages($text) {
  622. #
  623. # First, handle reference-style labeled images: ![alt text][id]
  624. #
  625. $text = preg_replace_callback('{
  626. ( # wrap whole match in $1
  627. !\[
  628. ('.$this->nested_brackets_re.') # alt text = $2
  629. \]
  630. [ ]? # one optional space
  631. (?:\n[ ]*)? # one optional newline followed by spaces
  632. \[
  633. (.*?) # id = $3
  634. \]
  635. )
  636. }xs',
  637. array(&$this, '_doImages_reference_callback'), $text);
  638. #
  639. # Next, handle inline images: ![alt text](url "optional title")
  640. # Don't forget: encode * and _
  641. #
  642. $text = preg_replace_callback('{
  643. ( # wrap whole match in $1
  644. !\[
  645. ('.$this->nested_brackets_re.') # alt text = $2
  646. \]
  647. \s? # One optional whitespace character
  648. \( # literal paren
  649. [ \n]*
  650. (?:
  651. <(\S*)> # src url = $3
  652. |
  653. ('.$this->nested_url_parenthesis_re.') # src url = $4
  654. )
  655. [ \n]*
  656. ( # $5
  657. ([\'"]) # quote char = $6
  658. (.*?) # title = $7
  659. \6 # matching quote
  660. [ \n]*
  661. )? # title is optional
  662. \)
  663. )
  664. }xs',
  665. array(&$this, '_doImages_inline_callback'), $text);
  666. return $text;
  667. }
  668. protected function _doImages_reference_callback($matches) {
  669. $whole_match = $matches[1];
  670. $alt_text = $matches[2];
  671. $link_id = strtolower($matches[3]);
  672. if ($link_id == "") {
  673. $link_id = strtolower($alt_text); # for shortcut links like ![this][].
  674. }
  675. $alt_text = $this->encodeAttribute($alt_text);
  676. if (isset($this->urls[$link_id])) {
  677. $url = $this->encodeAttribute($this->urls[$link_id]);
  678. $result = "<img src=\"$url\" alt=\"$alt_text\"";
  679. if (isset($this->titles[$link_id])) {
  680. $title = $this->titles[$link_id];
  681. $title = $this->encodeAttribute($title);
  682. $result .= " title=\"$title\"";
  683. }
  684. $result .= ' />';
  685. $result = $this->hashPart($result);
  686. }
  687. else {
  688. # If there's no such link ID, leave intact:
  689. $result = $whole_match;
  690. }
  691. return $result;
  692. }
  693. protected function _doImages_inline_callback($matches) {
  694. $whole_match = $matches[1];
  695. $alt_text = $matches[2];
  696. $url = $matches[3] == '' ? $matches[4] : $matches[3];
  697. $title =& $matches[7];
  698. $alt_text = $this->encodeAttribute($alt_text);
  699. $url = $this->encodeAttribute($url);
  700. $result = "<img src=\"$url\" alt=\"$alt_text\"";
  701. if (isset($title)) {
  702. $title = $this->encodeAttribute($title);
  703. $result .= " title=\"$title\""; # $title already quoted
  704. }
  705. $result .= ' />';
  706. return $this->hashPart($result);
  707. }
  708. #
  709. # Turn Markdown link shortcuts into XHTML <a> tags.
  710. #
  711. protected function doAnchors($text) {
  712. if ($this->in_anchor) return $text;
  713. $this->in_anchor = true;
  714. #
  715. # First, handle reference-style links: [link text] [id]
  716. #
  717. $text = preg_replace_callback('{
  718. ( # wrap whole match in $1
  719. \[
  720. ('.$this->nested_brackets_re.') # link text = $2
  721. \]
  722. [ ]? # one optional space
  723. (?:\n[ ]*)? # one optional newline followed by spaces
  724. \[
  725. (.*?) # id = $3
  726. \]
  727. )
  728. }xs',
  729. array(&$this, '_doAnchors_reference_callback'), $text);
  730. #
  731. # Next, inline-style links: [link text](url "optional title")
  732. #
  733. $text = preg_replace_callback('{
  734. ( # wrap whole match in $1
  735. \[
  736. ('.$this->nested_brackets_re.') # link text = $2
  737. \]
  738. \( # literal paren
  739. [ \n]*
  740. (?:
  741. <(.+?)> # href = $3
  742. |
  743. ('.$this->nested_url_parenthesis_re.') # href = $4
  744. )
  745. [ \n]*
  746. ( # $5
  747. ([\'"]) # quote char = $6
  748. (.*?) # Title = $7
  749. \6 # matching quote
  750. [ \n]* # ignore any spaces/tabs between closing quote and )
  751. )? # title is optional
  752. \)
  753. )
  754. }xs',
  755. array(&$this, '_doAnchors_inline_callback'), $text);
  756. #
  757. # Last, handle reference-style shortcuts: [link text]
  758. # These must come last in case you've also got [link text][1]
  759. # or [link text](/foo)
  760. #
  761. $text = preg_replace_callback('{
  762. ( # wrap whole match in $1
  763. \[
  764. ([^\[\]]+) # link text = $2; can\'t contain [ or ]
  765. \]
  766. )
  767. }xs',
  768. array(&$this, '_doAnchors_reference_callback'), $text);
  769. $this->in_anchor = false;
  770. return $text;
  771. }
  772. protected function _doAnchors_reference_callback($matches) {
  773. $whole_match = $matches[1];
  774. $link_text = $matches[2];
  775. $link_id =& $matches[3];
  776. if ($link_id == "") {
  777. # for shortcut links like [this][] or [this].
  778. $link_id = $link_text;
  779. }
  780. # lower-case and turn embedded newlines into spaces
  781. $link_id = strtolower($link_id);
  782. $link_id = preg_replace('{[ ]?\n}', ' ', $link_id);
  783. if (isset($this->urls[$link_id])) {
  784. $url = $this->urls[$link_id];
  785. $url = $this->encodeAttribute($url);
  786. $result = "<a href=\"$url\"";
  787. if ( isset( $this->titles[$link_id] ) ) {
  788. $title = $this->titles[$link_id];
  789. $title = $this->encodeAttribute($title);
  790. $result .= " title=\"$title\"";
  791. }
  792. $link_text = $this->runSpanGamut($link_text);
  793. $result .= ">$link_text</a>";
  794. $result = $this->hashPart($result);
  795. }
  796. else {
  797. $result = $whole_match;
  798. }
  799. return $result;
  800. }
  801. protected function _doAnchors_inline_callback($matches) {
  802. $whole_match = $matches[1];
  803. $link_text = $this->runSpanGamut($matches[2]);
  804. $url = $matches[3] == '' ? $matches[4] : $matches[3];
  805. $title =& $matches[7];
  806. $url = $this->encodeAttribute($url);
  807. $result = "<a href=\"$url\"";
  808. if (isset($title)) {
  809. $title = $this->encodeAttribute($title);
  810. $result .= " title=\"$title\"";
  811. }
  812. $link_text = $this->runSpanGamut($link_text);
  813. $result .= ">$link_text</a>";
  814. return $this->hashPart($result);
  815. }
  816. #
  817. # Process <email> or <url>
  818. #
  819. protected function doAutoLinks($text) {
  820. $text = preg_replace_callback('{<((https?|ftp|dict):[^\'">\s]+)>}i',
  821. array(&$this, '_doAutoLinks_url_callback'), $text);
  822. # Email addresses: <address@domain.foo>
  823. $text = preg_replace_callback('{
  824. <
  825. (?:mailto:)?
  826. (
  827. (?:
  828. [-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+
  829. |
  830. ".*?"
  831. )
  832. \@
  833. (?:
  834. [-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
  835. |
  836. \[[\d.a-fA-F:]+\] # IPv4 & IPv6
  837. )
  838. )
  839. >
  840. }xi',
  841. array(&$this, '_doAutoLinks_email_callback'), $text);
  842. return $text;
  843. }
  844. protected function _doAutoLinks_url_callback($matches) {
  845. $url = $this->encodeAttribute($matches[1]);
  846. $link = "<a href=\"$url\">$url</a>";
  847. return $this->hashPart($link);
  848. }
  849. protected function _doAutoLinks_email_callback($matches) {
  850. $address = $matches[1];
  851. $link = $this->encodeEmailAddress($address);
  852. return $this->hashPart($link);
  853. }
  854. #
  855. # Input: an email address, e.g. "foo@example.com"
  856. #
  857. # Output: the email address as a mailto link, with each character
  858. # of the address encoded as either a decimal or hex entity, in
  859. # the hopes of foiling most address harvesting spam bots. E.g.:
  860. #
  861. # <p><a href="&#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111;
  862. # &#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;&#101;&#46;&#x63;&#111;
  863. # &#x6d;">&#x66;o&#111;&#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;
  864. # &#101;&#46;&#x63;&#111;&#x6d;</a></p>
  865. #
  866. # Based by a filter by Matthew Wickline, posted to BBEdit-Talk.
  867. # With some optimizations by Milian Wolff.
  868. #
  869. protected function encodeEmailAddress($addr) {
  870. $addr = "mailto:" . $addr;
  871. $chars = preg_split('/(?<!^)(?!$)/', $addr);
  872. $seed = (int)abs(crc32($addr) / strlen($addr)); # Deterministic seed.
  873. foreach ($chars as $key => $char) {
  874. $ord = ord($char);
  875. # Ignore non-ascii chars.
  876. if ($ord < 128) {
  877. $r = ($seed * (1 + $key)) % 100; # Pseudo-random function.
  878. # roughly 10% raw, 45% hex, 45% dec
  879. # '@' *must* be encoded. I insist.
  880. if ($r > 90 && $char != '@') /* do nothing */;
  881. else if ($r < 45) $chars[$key] = '&#x'.dechex($ord).';';
  882. else $chars[$key] = '&#'.$ord.';';
  883. }
  884. }
  885. $addr = implode('', $chars);
  886. $text = implode('', array_slice($chars, 7)); # text without `mailto:`
  887. $addr = "<a href=\"$addr\">$text</a>";
  888. return $addr;
  889. }
  890. #
  891. # Process italic and bold
  892. #
  protected function doItalicsAndBold($text) {
      # Scans $text for emphasis markers and returns the text with each
      # matched span replaced by a hashed <em>, <strong> or
      # <strong><em> element. Span contents go through runSpanGamut()
      # before being wrapped and hashed with hashPart().
      #
      # Two parallel stacks are kept, innermost entry first:
      #   $token_stack  opening markers still waiting for their closing match
      #   $text_stack   text collected since the corresponding marker
      # $em / $strong hold the marker currently open for each kind
      # ('' when none is open).
      $token_stack = array('');
      $text_stack = array('');
      $em = '';
      $strong = '';
      $tree_char_em = false; # true while the innermost open marker is a three-char one

      while (1) {
          #
          # Get prepared regular expression for searching emphasis tokens
          # in current context.
          #
          $token_re = $this->em_strong_prepared_relist["$em$strong"];

          #
          # Each loop iteration searches for the next emphasis token and
          # handles it in the branches below.
          #
          $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
          $text_stack[0] .= $parts[0];
          # Bound by reference: when no token was found the missing
          # indexes are created as null instead of raising a notice.
          $token =& $parts[1];
          $text =& $parts[2];

          if (empty($token)) {
              # Reached end of text span: empty stack without emitting
              # any more emphasis. Unmatched markers are put back as
              # literal text.
              while ($token_stack[0]) {
                  $text_stack[1] .= array_shift($token_stack);
                  $text_stack[0] .= array_shift($text_stack);
              }
              break;
          }

          $token_len = strlen($token);
          if ($tree_char_em) {
              # Reached closing marker while inside a three-char emphasis.
              if ($token_len == 3) {
                  # Three-char closing marker, close em and strong.
                  array_shift($token_stack);
                  $span = array_shift($text_stack);
                  $span = $this->runSpanGamut($span);
                  $span = "<strong><em>$span</em></strong>";
                  $text_stack[0] .= $this->hashPart($span);
                  $em = '';
                  $strong = '';
              } else {
                  # Other closing marker: close one em or strong and
                  # change current token state to match the other.
                  # (Square-bracket offset: the `$token{0}` form is a
                  # parse error since PHP 8.0.)
                  $token_stack[0] = str_repeat($token[0], 3-$token_len);
                  $tag = $token_len == 2 ? "strong" : "em";
                  $span = $text_stack[0];
                  $span = $this->runSpanGamut($span);
                  $span = "<$tag>$span</$tag>";
                  $text_stack[0] = $this->hashPart($span);
                  $$tag = ''; # $$tag stands for $em or $strong
              }
              $tree_char_em = false;
          } else if ($token_len == 3) {
              if ($em) {
                  # Reached closing marker for both em and strong:
                  # close the two innermost spans, one after the other.
                  for ($i = 0; $i < 2; ++$i) {
                      $shifted_token = array_shift($token_stack);
                      $tag = strlen($shifted_token) == 2 ? "strong" : "em";
                      $span = array_shift($text_stack);
                      $span = $this->runSpanGamut($span);
                      $span = "<$tag>$span</$tag>";
                      $text_stack[0] .= $this->hashPart($span);
                      $$tag = ''; # $$tag stands for $em or $strong
                  }
              } else {
                  # Reached opening three-char emphasis marker. Push on token
                  # stack; will be handled by the special condition above.
                  $em = $token[0];
                  $strong = "$em$em";
                  array_unshift($token_stack, $token);
                  array_unshift($text_stack, '');
                  $tree_char_em = true;
              }
          } else if ($token_len == 2) {
              if ($strong) {
                  # Unwind any dangling emphasis marker:
                  if (strlen($token_stack[0]) == 1) {
                      $text_stack[1] .= array_shift($token_stack);
                      $text_stack[0] .= array_shift($text_stack);
                  }
                  # Closing strong marker:
                  array_shift($token_stack);
                  $span = array_shift($text_stack);
                  $span = $this->runSpanGamut($span);
                  $span = "<strong>$span</strong>";
                  $text_stack[0] .= $this->hashPart($span);
                  $strong = '';
              } else {
                  # Opening strong marker: start collecting a new span.
                  array_unshift($token_stack, $token);
                  array_unshift($text_stack, '');
                  $strong = $token;
              }
          } else {
              # Here $token_len == 1
              if ($em) {
                  if (strlen($token_stack[0]) == 1) {
                      # Closing emphasis marker:
                      array_shift($token_stack);
                      $span = array_shift($text_stack);
                      $span = $this->runSpanGamut($span);
                      $span = "<em>$span</em>";
                      $text_stack[0] .= $this->hashPart($span);
                      $em = '';
                  } else {
                      # The innermost open marker is not a single-char
                      # one: keep this token as literal text.
                      $text_stack[0] .= $token;
                  }
              } else {
                  # Opening emphasis marker: start collecting a new span.
                  array_unshift($token_stack, $token);
                  array_unshift($text_stack, '');
                  $em = $token;
              }
          }
      }
      return $text_stack[0];
  }
  1010. #
  1011. # Process hard breaks
  1012. #
  protected function doHardBreaks($text) {
      # Replaces every run of two or more spaces directly followed by a
      # newline with whatever _doHardBreaks_callback() returns, and
      # returns the resulting text.
      $break_re = '/ {2,}\n/';
      $handler = array($this, '_doHardBreaks_callback');

      return preg_replace_callback($break_re, $handler, $text);
  }
  protected function _doHardBreaks_callback($matches) {
      # preg_replace_callback() handler for doHardBreaks(): the matched
      # text in $matches is not used, every match becomes the same
      # hashed line-break element.
      $break_html = "<br />\n";

      return $this->hashPart($break_html);
  }
  1020. #
  1021. # Called whenever a tag must be hashed when a function inserts an atomic
  1022. # element in the text stream. Passing $text through this function gives
  1023. # a unique text-token which will be reverted back when calling unhash.
  1024. #
  1025. # The $boundary argument specifies what character should be used to surround
  1026. # the token. By convention, "B" is used for block elements that need not
  1027. # be wrapped into paragraph tags at the end, ":" is used for elements
  1028. # that are word separators and "X" is used in the general case.
  1029. #
  protected function hashPart($text, $boundary = 'X') {
      # Stores $text in $this->html_hashes under a freshly generated token
      # and returns that token, which stands in for $text in the stream.
      #
      # Token layout: $boundary, the byte 0x1A, a running number, $boundary.
      static $sequence = 0;

      # Expand any token already present in $text first, so that stored
      # entries never contain tokens themselves and a single unhash()
      # pass is enough at the end.
      $expanded = $this->unhash($text);

      $sequence += 1;
      $token = $boundary . "\x1A" . $sequence . $boundary;
      $this->html_hashes[$token] = $expanded;

      return $token; # String that will replace the tag.
  }
  1040. #
  1041. # Shortcut function for hashPart with block-level boundaries.
  1042. #
  protected function hashBlock($text) {
      # Same as hashPart(), with 'B' as the boundary character.
      $block_boundary = 'B';

      return $this->hashPart($text, $block_boundary);
  }
  1046. #
  1047. # Swap back in all the tags hashed by _HashHTMLBlocks.
  1048. # Is this function still useful, considering _HashHTMLBlocks was removed (no
  1049. # HTML tags authorized in comments)?
  1050. #
  protected function unhash($text) {
      # Returns $text with every token of the form
      # <char> 0x1A <digits> <same char> replaced by the result of
      # _unhash_callback().
      $token_re = '/(.)\x1A[0-9]+\1/';
      $restore = array($this, '_unhash_callback');

      return preg_replace_callback($token_re, $restore, $text);
  }
  protected function _unhash_callback($matches) {
      # preg_replace_callback() handler for unhash().
      #
      # $matches[0] is the complete token found in the text.
      # Returns the entry stored for that token in $this->html_hashes.
      # If nothing is stored under it (the text merely looks like a
      # token), the matched text is returned unchanged rather than
      # raising an undefined-index warning and dropping it.
      $token = $matches[0];

      if (isset($this->html_hashes[$token])) {
          return $this->html_hashes[$token];
      }
      return $token;
  }
  1058. #
  1059. # Encode text for a double-quoted HTML attribute. This function is *not*
  1060. # suitable for attributes enclosed in single quotes.
  1061. #
  protected function encodeAttribute($text) {
      # Returns $text made safe for a double-quoted attribute value:
      # first the ampersand / `<` handling of encodeAmpsAndAngles(),
      # then every double quote becomes `&quot;`. Single quotes are
      # left untouched.
      $escaped = $this->encodeAmpsAndAngles($text);

      return str_replace('"', '&quot;', $escaped);
  }
  1067. #
  1068. # Smart processing for ampersands and angle brackets that need to be encoded.
  1069. # Valid character entities are left alone unless the no-entities mode is set.
  1070. #
  protected function encodeAmpsAndAngles($text) {
      # Returns $text with bare ampersands turned into `&amp;` and every
      # `<` turned into `&lt;`. `>` is not modified.
      #
      # Ampersand-encoding based entirely on Nat Irons's Amputator
      # MT plugin: <http://bumppo.net/projects/amputator/>
      #
      # An ampersand counts as bare unless it already starts something
      # shaped like an entity: `&name;`, `&#123;` or `&#x1F;`.
      $bare_amp_re = '/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/';
      $amps_encoded = preg_replace($bare_amp_re, '&amp;', $text);

      # Encode remaining <'s
      return str_replace('<', '&lt;', $amps_encoded);
  }
  1080. }
  1081. ?>