PageRenderTime 63ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 1ms

/anchor/libraries/markdown.php

https://github.com/xplicitdj/anchor-cms
PHP | 1480 lines | 989 code | 165 blank | 326 comment | 67 complexity | d9e8a55cdc4d069c960d435fd20fdae4 MD5 | raw file
Possible License(s): MIT
  1. <?php
  2. #
  3. # Markdown - A text-to-HTML conversion tool for web writers
  4. #
  5. # PHP Markdown
  6. # Copyright (c) 2004-2012 Michel Fortin
  7. # <http://michelf.com/projects/php-markdown/>
  8. #
  9. # Original Markdown
  10. # Copyright (c) 2004-2006 John Gruber
  11. # <http://daringfireball.net/projects/markdown/>
  12. #
  # Provide defaults for the two configuration constants, unless the host
  # application defined them before this file was loaded.
  if (!defined('MARKDOWN_EMPTY_ELEMENT_SUFFIX')) {
      define('MARKDOWN_EMPTY_ELEMENT_SUFFIX', ">");
  }
  if (!defined('MARKDOWN_TAB_WIDTH')) {
      define('MARKDOWN_TAB_WIDTH', 4);
  }
  15. class Markdown {
  ### Configuration Variables ###

  # Text appended to empty elements (it closes the generated <hr and <br
  # tags). Use ">" for HTML output.
  public $empty_element_suffix = MARKDOWN_EMPTY_ELEMENT_SUFFIX;

  # Number of columns a tab stands for; also bounds list/code indentation.
  public $tab_width = MARKDOWN_TAB_WIDTH;

  # Set to `true` to disallow markup or entities.
  public $no_markup = false;
  public $no_entities = false;

  # Predefined urls and titles for reference links and images. setup()
  # copies them into the working tables before each transformation.
  public $predef_urls = array();
  public $predef_titles = array();

  ### Parser Implementation ###

  # Regexes matching balanced [brackets] and balanced (parentheses) in
  # URLs. They are built in the constructor by repeating a pattern up to
  # the maximum nesting depth configured here.
  public $nested_brackets_depth = 6;
  public $nested_brackets_re;

  public $nested_url_parenthesis_depth = 4;
  public $nested_url_parenthesis_re;

  # Characters that can be backslash-escaped, and the character class
  # regex derived from them in the constructor.
  public $escape_chars = '\`*_{}[]()>#+-.!';
  public $escape_chars_re;
  # Constructor. Initializes the derived member variables: tab handling,
  # emphasis token regexes, the bounded-depth nesting regexes and the
  # escape character class. Finally orders every gamut by priority.
  public function __construct() {
      $this->_initDetab();
      $this->prepareItalicsAndBold();

      # Balanced [brackets], nested up to nested_brackets_depth levels.
      $bracket_depth = $this->nested_brackets_depth;
      $bracket_open  = str_repeat('(?>[^\[\]]+|\[', $bracket_depth);
      $bracket_close = str_repeat('\])*', $bracket_depth);
      $this->nested_brackets_re = $bracket_open . $bracket_close;

      # Balanced (parentheses) inside URLs, nested up to
      # nested_url_parenthesis_depth levels.
      $paren_depth = $this->nested_url_parenthesis_depth;
      $paren_open  = str_repeat('(?>[^()\s]+|\(', $paren_depth);
      $paren_close = str_repeat('(?>\)))*', $paren_depth);
      $this->nested_url_parenthesis_re = $paren_open . $paren_close;

      $this->escape_chars_re = '[' . preg_quote($this->escape_chars) . ']';

      # Sort document, block, and span gamut in ascending priority order.
      foreach (array('document_gamut', 'block_gamut', 'span_gamut') as $gamut) {
          asort($this->$gamut);
      }
  }
  # Working tables used during a transformation: reference-link URLs and
  # titles keyed by link id, and hashed HTML fragments keyed by token.
  public $urls = array();
  public $titles = array();
  public $html_hashes = array();

  # Set while doAnchors() is running, so that links are not created
  # inside the text of another link.
  public $in_anchor = false;
  # Called before the transformation process starts to set up the parser
  # state: the link tables are seeded from the predefined urls/titles, the
  # HTML hash table is emptied and the anchor-nesting flag is cleared.
  public function setup() {
      # Clear global hashes.
      $this->urls = $this->predef_urls;
      $this->titles = $this->predef_titles;
      $this->html_hashes = array();

      # Reset the instance flag. The previous code assigned a local
      # variable `$in_anchor`, so the property itself was never reset and
      # a stale `true` would make doAnchors() skip every link.
      $this->in_anchor = false;
  }
  # Called after the transformation process: empties the working tables
  # so they do not keep holding memory between transformations.
  public function teardown() {
      $this->urls = $this->titles = $this->html_hashes = array();
  }
  # Main entry point. Normalizes the input text, hashes block-level HTML,
  # then passes the text through every method of the document gamut.
  # Returns the resulting text followed by a newline.
  public function transform($text) {
      $this->setup();

      # Remove UTF-8 BOM and marker character in input, if present.
      $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);

      # Standardize line endings (DOS and Mac to Unix), then make sure the
      # text ends with a couple of newlines.
      $text = preg_replace('{\r\n?}', "\n", $text) . "\n\n";

      # Convert all tabs to spaces, then turn block-level HTML blocks into
      # hash entries.
      $text = $this->hashHTMLBlocks($this->detab($text));

      # Empty out lines made only of spaces, so later patterns can match
      # consecutive blank lines with a plain /\n+/.
      $text = preg_replace('/^[ ]+$/m', '', $text);

      # Run document gamut methods in their (already sorted) order.
      foreach (array_keys($this->document_gamut) as $step) {
          $text = $this->$step($text);
      }

      $this->teardown();

      return $text . "\n";
  }
  # Document-level transformations as method name => priority. The
  # constructor sorts them so that lower priorities run first.
  public $document_gamut = array(
      # Strip link definitions, store in hashes.
      "stripLinkDefinitions" => 20,

      "runBasicBlockGamut"   => 30,
  );
  # Removes link definitions of the form
  #     ^[id]: url "optional title"
  # from $text. Each definition is handed to
  # _stripLinkDefinitions_callback, which records the URL and title.
  # Returns the text without the definitions.
  public function stripLinkDefinitions($text) {
      $max_indent = $this->tab_width - 1;

      $definition_re = '{
          ^[ ]{0,'.$max_indent.'}\[(.+)\][ ]?: # id = $1
          [ ]*
          \n? # maybe *one* newline
          [ ]*
          (?:
          <(.+?)> # url = $2
          |
          (\S+?) # url = $3
          )
          [ ]*
          \n? # maybe one newline
          [ ]*
          (?:
          (?<=\s) # lookbehind for whitespace
          ["(]
          (.*?) # title = $4
          [")]
          [ ]*
          )? # title is optional
          (?:\n+|\Z)
          }xm';

      return preg_replace_callback(
          $definition_re,
          array($this, '_stripLinkDefinitions_callback'),
          $text
      );
  }
  # preg callback for stripLinkDefinitions: stores the URL and title
  # under the lower-cased link id and removes the definition from the
  # text by returning an empty string.
  public function _stripLinkDefinitions_callback($matches) {
      $id = strtolower($matches[1]);

      # The URL is in $2 when written as <url>, otherwise in $3.
      if ($matches[2] == '') {
          $this->urls[$id] = $matches[3];
      } else {
          $this->urls[$id] = $matches[2];
      }

      # The title group is absent from $matches when no title was given;
      # null is stored in that case so isset() on the entry reports false.
      $this->titles[$id] = isset($matches[4]) ? $matches[4] : null;

      return ''; # String that will replace the block
  }
  # Replaces block-level HTML found in $text with hash tokens (through
  # _hashHTMLBlocks_callback) so that later passes leave it untouched.
  # Returns $text unchanged when $this->no_markup is set.
  #
  # Only block-level tags are hashed, because paragraphs wrapped in
  # non-block-level tags (anchors, phrase emphasis, spans) must still get
  # <p> tags around them. The list of tags is hard-coded:
  #
  # * List "a" holds tags which can be inline or block-level. They are
  #   treated as block-level when the start tag is alone on its line;
  #   otherwise they are not matched here and are taken as inline later.
  # * List "b" holds tags which are always block-level.
  #
  # Capture groups of the main pattern: $1 = whole block, $2 = list "b"
  # tag name, $3 = list "a" tag name, $4 = "hr", $5 = the "?" or "%"
  # delimiter of a processing instruction.
  public function hashHTMLBlocks($text) {
      if ($this->no_markup) return $text;

      $less_than_tab = $this->tab_width - 1;

      $block_tags_a_re = 'ins|del';
      $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
                         'script|noscript|form|fieldset|iframe|math|svg|'.
                         'article|section|nav|aside|hgroup|header|footer|'.
                         'figure';

      # Regular expression for the content of a block tag.
      $nested_tags_level = 4;
      $attr = '
          (?> # optional tag attributes
          \s # starts with whitespace
          (?>
          [^>"/]+ # text outside quotes
          |
          /+(?!>) # slash not followed by ">"
          |
          "[^"]*" # text inside double quotes (tolerate ">")
          |
          \'[^\']*\' # text inside single quotes (tolerate ">")
          )*
          )?
          ';
      $content =
          str_repeat('
          (?>
          [^<]+ # content without tag
          |
          <\2 # nested opening tag
          '.$attr.' # attributes
          (?>
          />
          |
          >', $nested_tags_level). # end of opening tag
          '.*?'. # last level nested tag content
          str_repeat('
          </\2\s*> # closing nested tag
          )
          |
          <(?!/\2\s*> # other tags with a different name
          )
          )*',
          $nested_tags_level);
      # Same content pattern, but following the list "a" tag name in $3.
      $content2 = str_replace('\2', '\3', $content);

      # First, look for nested blocks, e.g.:
      #   <div>
      #       <div>
      #       tags for inner block must be indented.
      #       </div>
      #   </div>
      #
      # The outermost tags must start at the left margin for this to match,
      # and the inner nested divs must be indented.
      # We need to do this before the next, more liberal match, because the
      # next match will start at the first `<div>` and stop at the first
      # `</div>`.
      #
      # Fix: the processing-instruction alternative used to close with
      # `\2>`. Group 2 is the list "b" tag name, which is unset in that
      # alternative, so the backreference could never match. The delimiter
      # is captured by group 5, hence `\5>`.
      $text = preg_replace_callback('{(?>
          (?>
          (?<=\n\n) # Starting after a blank line
          | # or
          \A\n? # the beginning of the doc
          )
          ( # save in $1
          # Match from `\n<tag>` to `</tag>\n`, handling nested tags
          # in between.
          [ ]{0,'.$less_than_tab.'}
          <('.$block_tags_b_re.')# start tag = $2
          '.$attr.'> # attributes followed by > and \n
          '.$content.' # content, support nesting
          </\2> # the matching end tag
          [ ]* # trailing spaces/tabs
          (?=\n+|\Z) # followed by a newline or end of document
          | # Special version for tags of group a.
          [ ]{0,'.$less_than_tab.'}
          <('.$block_tags_a_re.')# start tag = $3
          '.$attr.'>[ ]*\n # attributes followed by >
          '.$content2.' # content, support nesting
          </\3> # the matching end tag
          [ ]* # trailing spaces/tabs
          (?=\n+|\Z) # followed by a newline or end of document
          | # Special case just for <hr />. It was easier to make a special
          # case than to make the other regex more complicated.
          [ ]{0,'.$less_than_tab.'}
          <(hr) # start tag = $2
          '.$attr.' # attributes
          /?> # the matching end tag
          [ ]*
          (?=\n{2,}|\Z) # followed by a blank line or end of document
          | # Special case for standalone HTML comments:
          [ ]{0,'.$less_than_tab.'}
          (?s:
          <!-- .*? -->
          )
          [ ]*
          (?=\n{2,}|\Z) # followed by a blank line or end of document
          | # PHP and ASP-style processor instructions (<? and <%)
          [ ]{0,'.$less_than_tab.'}
          (?s:
          <([?%]) # $5
          .*?
          \5>
          )
          [ ]*
          (?=\n{2,}|\Z) # followed by a blank line or end of document
          )
          )}Sxmi',
          array($this, '_hashHTMLBlocks_callback'),
          $text);

      return $text;
  }
  # preg callback for hashHTMLBlocks: swaps the matched HTML block ($1)
  # for its block-level hash token, set off by blank lines.
  public function _hashHTMLBlocks_callback($matches) {
      $token = $this->hashBlock($matches[1]);
      return "\n\n" . $token . "\n\n";
  }
  # Stores $text in the hash table and returns the unique token that
  # stands for it in the text stream. Called whenever a function inserts
  # an atomic element; the token is reverted back when calling unhash.
  #
  # $boundary is the character placed on both sides of the token. By
  # convention, "B" is used for block elements that need not be wrapped
  # in paragraph tags at the end, ":" for elements that are word
  # separators, and "X" in the general case.
  public function hashPart($text, $boundary = 'X') {
      # Swap back any token already present in $text, so the final unhash
      # does not have to run several times.
      $plain = $this->unhash($text);

      # Counter making every token unique.
      static $serial = 0;
      $serial++;

      $token = $boundary . "\x1A" . $serial . $boundary;
      $this->html_hashes[$token] = $plain;

      return $token; # String that will replace the tag.
  }
  # Shortcut for hashPart() with the block-level boundary "B".
  public function hashBlock($text) {
      $block_boundary = 'B';
      return $this->hashPart($text, $block_boundary);
  }
  # Block-level transformations (method name => priority): these form
  # tags such as headers, rules, lists, code blocks and block quotes.
  # The constructor sorts them so that lower priorities run first.
  public $block_gamut = array(
      "doHeaders"         => 10,
      "doHorizontalRules" => 20,

      "doLists"           => 40,
      "doCodeBlocks"      => 50,
      "doBlockQuotes"     => 60,
  );
  # Runs the block gamut on $text after hashing its raw HTML blocks.
  #
  # The hashing has to be repeated for each block, not only at the
  # beginning of the transformation: HTML blocks can be part of list
  # items and could have been indented, in which case an earlier
  # hashHTMLBlocks pass would have seen them as code blocks.
  public function runBlockGamut($text) {
      $hashed = $this->hashHTMLBlocks($text);
      return $this->runBasicBlockGamut($hashed);
  }
  # Runs the block gamut on $text without hashing HTML blocks first.
  # Useful when the HTML blocks are known to be hashed already, as in
  # the first whole-document pass. Ends by forming paragraphs.
  public function runBasicBlockGamut($text) {
      foreach (array_keys($this->block_gamut) as $step) {
          $text = $this->$step($text);
      }

      # Finally form paragraph and restore hashed blocks.
      return $this->formParagraphs($text);
  }
  # Replaces horizontal-rule lines (three or more -, * or _ markers,
  # optionally separated by up to two spaces) with a hashed <hr element.
  public function doHorizontalRules($text) {
      # The token is created once, before matching, and shared by every
      # rule found in $text.
      $rule_token = $this->hashBlock("<hr" . $this->empty_element_suffix);

      $rule_re = '{
          ^[ ]{0,3} # Leading space
          ([-*_]) # $1: First marker
          (?> # Repeated marker group
          [ ]{0,2} # Zero, one, or two spaces.
          \1 # Marker character
          ){2,} # Group repeated at least twice
          [ ]* # Tailing spaces
          $ # End of line.
          }mx';

      return preg_replace($rule_re, "\n" . $rule_token . "\n", $text);
  }
  # Span-level transformations (method name => priority): these occur
  # *within* block-level tags like paragraphs, headers, and list items.
  # The constructor sorts them so that lower priorities run first.
  public $span_gamut = array(
      # Character escapes, code spans, and inline HTML, all in one shot.
      "parseSpan"           => -30,

      # Anchor and image tags. Images must come first, because ![foo][f]
      # looks like an anchor.
      "doImages"            =>  10,
      "doAnchors"           =>  20,

      # Links out of things like `<http://example.com/>`. Must come after
      # doAnchors, because < and > can be used as delimiters in inline
      # links like [this](<url>).
      "doAutoLinks"         =>  30,
      "encodeAmpsAndAngles" =>  40,

      "doItalicsAndBold"    =>  50,
      "doHardBreaks"        =>  60,
  );
  # Passes $text through every span gamut method, in order, and returns
  # the result.
  public function runSpanGamut($text) {
      foreach (array_keys($this->span_gamut) as $step) {
          $text = $this->$step($text);
      }
      return $text;
  }
  # Turns every run of two or more spaces followed by a newline into a
  # hard line break.
  public function doHardBreaks($text) {
      $on_break = array($this, '_doHardBreaks_callback');
      return preg_replace_callback('/ {2,}\n/', $on_break, $text);
  }
  # preg callback for doHardBreaks: returns the hash token of a <br
  # element followed by a newline.
  public function _doHardBreaks_callback($matches) {
      $break = "<br" . $this->empty_element_suffix . "\n";
      return $this->hashPart($break);
  }
  # Turns Markdown link shortcuts into XHTML <a> tags. Returns $text
  # untouched when called while already building a link; the in_anchor
  # flag is raised for the duration of the call to detect that.
  public function doAnchors($text) {
      if ($this->in_anchor) return $text;
      $this->in_anchor = true;

      $brackets = $this->nested_brackets_re;
      $url_parens = $this->nested_url_parenthesis_re;
      $reference_cb = array($this, '_doAnchors_reference_callback');
      $inline_cb = array($this, '_doAnchors_inline_callback');

      # First, handle reference-style links: [link text] [id]
      $text = preg_replace_callback('{
          ( # wrap whole match in $1
          \[
          ('.$brackets.') # link text = $2
          \]
          [ ]? # one optional space
          (?:\n[ ]*)? # one optional newline followed by spaces
          \[
          (.*?) # id = $3
          \]
          )
          }xs',
          $reference_cb, $text);

      # Next, inline-style links: [link text](url "optional title")
      $text = preg_replace_callback('{
          ( # wrap whole match in $1
          \[
          ('.$brackets.') # link text = $2
          \]
          \( # literal paren
          [ \n]*
          (?:
          <(.+?)> # href = $3
          |
          ('.$url_parens.') # href = $4
          )
          [ \n]*
          ( # $5
          ([\'"]) # quote char = $6
          (.*?) # Title = $7
          \6 # matching quote
          [ \n]* # ignore any spaces/tabs between closing quote and )
          )? # title is optional
          \)
          )
          }xs',
          $inline_cb, $text);

      # Last, handle reference-style shortcuts: [link text]
      # These must come last in case you've also got [link text][1]
      # or [link text](/foo)
      $text = preg_replace_callback('{
          ( # wrap whole match in $1
          \[
          ([^\[\]]+) # link text = $2; can\'t contain [ or ]
          \]
          )
          }xs',
          $reference_cb, $text);

      $this->in_anchor = false;
      return $text;
  }
  # preg callback for reference-style links. Returns the hash token of
  # the <a> tag when the link id is known, or the matched text unchanged
  # when it is not.
  public function _doAnchors_reference_callback($matches) {
      $label = $matches[2];

      # The id group is missing for [this] and empty for [this][]; the
      # link text then doubles as the id.
      $id = isset($matches[3]) ? $matches[3] : '';
      if ($id == "") {
          $id = $label;
      }

      # lower-case and turn embedded newlines into spaces
      $id = preg_replace('{[ ]?\n}', ' ', strtolower($id));

      if (!isset($this->urls[$id])) {
          # Unknown id: leave the text as it was written.
          return $matches[1];
      }

      $href = $this->encodeAttribute($this->urls[$id]);
      $tag = "<a href=\"$href\"";

      if (isset($this->titles[$id])) {
          $title = $this->encodeAttribute($this->titles[$id]);
          $tag .= " title=\"$title\"";
      }

      $tag .= ">" . $this->runSpanGamut($label) . "</a>";

      return $this->hashPart($tag);
  }
  # preg callback for inline-style links: builds the <a> tag from the
  # matched link text, URL and optional title, and returns its hash
  # token.
  public function _doAnchors_inline_callback($matches) {
      $label = $this->runSpanGamut($matches[2]);

      # The URL is in $3 when written as <url>, otherwise in $4.
      $href = ($matches[3] == '') ? $matches[4] : $matches[3];
      $href = $this->encodeAttribute($href);

      $tag = "<a href=\"$href\"";

      # The title group ($7) is absent when no title was given.
      if (isset($matches[7])) {
          $title = $this->encodeAttribute($matches[7]);
          $tag .= " title=\"$title\"";
      }

      # NOTE(review): the link text goes through the span gamut a second
      # time here, exactly as before; kept to preserve behavior.
      $label = $this->runSpanGamut($label);
      $tag .= ">" . $label . "</a>";

      return $this->hashPart($tag);
  }
  # Turns Markdown image shortcuts into <img> tags. Reference-style
  # images are handled first, inline images second.
  public function doImages($text) {
      $brackets = $this->nested_brackets_re;
      $url_parens = $this->nested_url_parenthesis_re;

      # Reference-style labeled images: ![alt text][id]
      $reference_re = '{
          ( # wrap whole match in $1
          !\[
          ('.$brackets.') # alt text = $2
          \]
          [ ]? # one optional space
          (?:\n[ ]*)? # one optional newline followed by spaces
          \[
          (.*?) # id = $3
          \]
          )
          }xs';
      $text = preg_replace_callback(
          $reference_re,
          array($this, '_doImages_reference_callback'),
          $text
      );

      # Inline images: ![alt text](url "optional title")
      # Don't forget: encode * and _
      $inline_re = '{
          ( # wrap whole match in $1
          !\[
          ('.$brackets.') # alt text = $2
          \]
          \s? # One optional whitespace character
          \( # literal paren
          [ \n]*
          (?:
          <(\S*)> # src url = $3
          |
          ('.$url_parens.') # src url = $4
          )
          [ \n]*
          ( # $5
          ([\'"]) # quote char = $6
          (.*?) # title = $7
          \6 # matching quote
          [ \n]*
          )? # title is optional
          \)
          )
          }xs';

      return preg_replace_callback(
          $inline_re,
          array($this, '_doImages_inline_callback'),
          $text
      );
  }
  # preg callback for reference-style images. Returns the hash token of
  # the <img> tag when the link id is known, or the matched text
  # unchanged when it is not.
  public function _doImages_reference_callback($matches) {
      $alt = $matches[2];

      # An empty id, as in ![this][], means the alt text is the id.
      $id = strtolower($matches[3]);
      if ($id == "") {
          $id = strtolower($alt);
      }

      $alt = $this->encodeAttribute($alt);

      if (!isset($this->urls[$id])) {
          # If there's no such link ID, leave intact:
          return $matches[1];
      }

      $src = $this->encodeAttribute($this->urls[$id]);
      $tag = "<img src=\"$src\" alt=\"$alt\"";

      if (isset($this->titles[$id])) {
          $title = $this->encodeAttribute($this->titles[$id]);
          $tag .= " title=\"$title\"";
      }

      $tag .= $this->empty_element_suffix;

      return $this->hashPart($tag);
  }
  # preg callback for inline images: builds the <img> tag from the
  # matched alt text, URL and optional title, and returns its hash token.
  public function _doImages_inline_callback($matches) {
      # The URL is in $3 when written as <url>, otherwise in $4.
      $src = ($matches[3] == '') ? $matches[4] : $matches[3];

      $alt = $this->encodeAttribute($matches[2]);
      $src = $this->encodeAttribute($src);

      $tag = "<img src=\"$src\" alt=\"$alt\"";

      # The title group ($7) is absent when no title was given.
      if (isset($matches[7])) {
          $title = $this->encodeAttribute($matches[7]);
          $tag .= " title=\"$title\"";
      }

      $tag .= $this->empty_element_suffix;

      return $this->hashPart($tag);
  }
  # Converts both header syntaxes, setext first and atx second.
  #
  # Setext-style headers:
  #     Header 1
  #     ========
  #
  #     Header 2
  #     --------
  #
  # atx-style headers:
  #     # Header 1
  #     ## Header 2
  #     ## Header 2 with closing hashes ##
  #     ...
  #     ###### Header 6
  public function doHeaders($text) {
      $setext_re = '{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx';
      $text = preg_replace_callback(
          $setext_re,
          array($this, '_doHeaders_callback_setext'),
          $text
      );

      $atx_re = '{
          ^(\#{1,6}) # $1 = string of #\'s
          [ ]*
          (.+?) # $2 = Header text
          [ ]*
          \#* # optional closing #\'s (not counted)
          \n+
          }xm';

      return preg_replace_callback(
          $atx_re,
          array($this, '_doHeaders_callback_atx'),
          $text
      );
  }
  # preg callback for setext-style headers. $matches[1] is the header
  # text and $matches[2] the underline: "=" gives <h1>, "-" gives <h2>.
  # Returns the hashed header block set off by newlines.
  public function _doHeaders_callback_setext($matches) {
      # Terrible hack to check we haven't found an empty list item: a
      # single "-" under a line that itself starts with "- " (or is just
      # "-") is left untouched.
      if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1]))
          return $matches[0];

      # Square-bracket offset: the curly-brace form `$str{0}` is
      # deprecated in PHP 7.4 and a parse error in PHP 8.
      $level = $matches[2][0] == '=' ? 1 : 2;
      $block = "<h$level>".$this->runSpanGamut($matches[1])."</h$level>";
      return "\n" . $this->hashBlock($block) . "\n\n";
  }
  # preg callback for atx-style headers: the number of leading "#"
  # characters ($1) is the header level, $2 is the header text.
  public function _doHeaders_callback_atx($matches) {
      $depth = strlen($matches[1]);
      $inner = $this->runSpanGamut($matches[2]);
      $token = $this->hashBlock("<h$depth>" . $inner . "</h$depth>");
      return "\n" . $token . "\n\n";
  }
  # Forms HTML ordered (numbered) and unordered (bulleted) lists. The
  # text is scanned once per list kind; every whole list found is handed
  # to _doLists_callback.
  public function doLists($text) {
      $max_indent = $this->tab_width - 1;

      # Re-usable patterns to match list item bullets and number markers:
      $bullet_re = '[*+-]';
      $number_re = '\d+[\.]';

      # kind being matched => the other kind, which ends the list.
      $passes = array(
          $bullet_re => $number_re,
          $number_re => $bullet_re,
      );

      foreach ($passes as $marker_re => $other_marker_re) {
          # Re-usable pattern to match any entire ul or ol list:
          $whole_list_re = '
              ( # $1 = whole list
              ( # $2
              ([ ]{0,'.$max_indent.'}) # $3 = number of spaces
              ('.$marker_re.') # $4 = first list item marker
              [ ]+
              )
              (?s:.+?)
              ( # $5
              \z
              |
              \n{2,}
              (?=\S)
              (?! # Negative lookahead for another list item marker
              [ ]*
              '.$marker_re.'[ ]+
              )
              |
              (?= # Lookahead for another kind of list
              \n
              \3 # Must have the same indentation
              '.$other_marker_re.'[ ]+
              )
              )
              )
              '; // mx

          # We use a different prefix before nested lists than top-level
          # lists: inside a list any line start may open a sub-list,
          # outside a list a blank line (or the start of the text) is
          # required. See the extended comment in processListItems().
          if ($this->list_level) {
              $list_re = '{
                  ^
                  '.$whole_list_re.'
                  }mx';
          } else {
              $list_re = '{
                  (?:(?<=\n)\n|\A\n?) # Must eat the newline
                  '.$whole_list_re.'
                  }mx';
          }

          $text = preg_replace_callback(
              $list_re,
              array($this, '_doLists_callback'),
              $text
          );
      }

      return $text;
  }
  # preg callback for doLists: converts one whole list ($1) into a hashed
  # <ul> or <ol> block, chosen from its first item marker ($4).
  public function _doLists_callback($matches) {
      # Re-usable patterns to match list item bullets and number markers:
      $bullet_re = '[*+-]';
      $number_re = '\d+[\.]';

      if (preg_match("/$bullet_re/", $matches[4])) {
          $tag = "ul";
          $item_marker_re = $bullet_re;
      } else {
          $tag = "ol";
          $item_marker_re = $number_re;
      }

      $items = $this->processListItems($matches[1] . "\n", $item_marker_re);
      $token = $this->hashBlock("<$tag>\n" . $items . "</$tag>");

      return "\n" . $token . "\n\n";
  }
  # Current list nesting depth: processListItems() increments it on
  # entry and decrements it on exit, so zero means "not inside a list".
  public $list_level = 0;
  # Processes the contents of a single ordered or unordered list,
  # splitting it into individual list items. $marker_any_re is the regex
  # matching this list's item markers. Returns the <li> markup.
  #
  # $this->list_level keeps track of when we're inside a list: it is
  # incremented while the items are processed and decremented afterwards.
  # This matters because, outside a list, text such as
  #
  #     I recommend upgrading to version
  #     8. Oops, now this line is treated
  #     as a sub-list.
  #
  # must stay a single paragraph even though its second line starts with
  # a digit-period-space sequence, whereas inside a list (or sub-list)
  # such a line starts a sub-list. What a kludge, huh? This is an aspect
  # of Markdown's syntax that's hard to parse perfectly without resorting
  # to mind-reading.
  public function processListItems($list_str, $marker_any_re) {
      $this->list_level++;

      # trim trailing blank lines:
      $trimmed = preg_replace("/\n{2,}\\z/", "\n", $list_str);

      $item_re = '{
          (\n)? # leading line = $1
          (^[ ]*) # leading whitespace = $2
          ('.$marker_any_re.' # list marker and space = $3
          (?:[ ]+|(?=\n)) # space only required if item is not empty
          )
          ((?s:.*?)) # list item text = $4
          (?:(\n+(?=\n))|\n) # tailing blank line = $5
          (?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n))))
          }xm';

      $items = preg_replace_callback(
          $item_re,
          array($this, '_processListItems_callback'),
          $trimmed
      );

      $this->list_level--;

      return $items;
  }
  # preg callback for processListItems: renders one list item as
  # <li>...</li>. Items with a blank line before, after or inside them
  # get full block-level processing; other items get sub-list and
  # span-level processing only.
  public function _processListItems_callback($matches) {
      $body = $matches[4];

      $has_blank_line = !empty($matches[1])
          || !empty($matches[5])
          || preg_match('/\n{2,}/', $body);

      if ($has_blank_line) {
          # Replace marker with the appropriate whitespace indentation
          $padding = str_repeat(' ', strlen($matches[3]));
          $body = $matches[2] . $padding . $body;
          $body = $this->runBlockGamut($this->outdent($body) . "\n");
      } else {
          # Recursion for sub-lists:
          $body = $this->doLists($this->outdent($body));
          $body = preg_replace('/\n+$/', '', $body);
          $body = $this->runSpanGamut($body);
      }

      return "<li>" . $body . "</li>\n";
  }
  # Processes Markdown `<pre><code>` blocks: runs of lines indented by
  # at least tab_width spaces, following a blank line or the start of
  # the text. Each run is handed to _doCodeBlocks_callback.
  public function doCodeBlocks($text) {
      $width = $this->tab_width;

      $code_block_re = '{
          (?:\n\n|\A\n?)
          ( # $1 = the code block -- one or more lines, starting with a space/tab
          (?>
          [ ]{'.$width.'} # Lines must start with a tab or a tab-width of spaces
          .*\n+
          )+
          )
          ((?=^[ ]{0,'.$width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
          }xm';

      return preg_replace_callback(
          $code_block_re,
          array($this, '_doCodeBlocks_callback'),
          $text
      );
  }
  # preg callback for doCodeBlocks: outdents and HTML-escapes the code
  # ($1), strips its leading and trailing newlines, and returns the
  # hashed <pre><code> block set off by blank lines.
  public function _doCodeBlocks_callback($matches) {
      $code = $this->outdent($matches[1]);
      $code = htmlspecialchars($code, ENT_NOQUOTES);

      # trim leading newlines and trailing newlines
      $code = preg_replace('/\A\n+|\n+\z/', '', $code);

      $token = $this->hashBlock("<pre><code>" . $code . "\n</code></pre>");

      return "\n\n" . $token . "\n\n";
  }
  # Creates the code span markup for $code: the text is trimmed,
  # HTML-escaped (quotes left alone) and wrapped in <code> tags. Returns
  # the hash token of the result. Called from handleSpanToken.
  public function makeCodeSpan($code) {
      $escaped = htmlspecialchars(trim($code), ENT_NOQUOTES);
      return $this->hashPart("<code>" . $escaped . "</code>");
  }
  # Emphasis token regexes, keyed by the marker currently open: the ''
  # entry matches an opening marker, the other entries match the marker
  # that closes the one named by the key.

  # Single-character markers (em).
  public $em_relist = array(
      ''  => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?=\S|$)(?![\.,:;]\s)',
      '*' => '(?<=\S|^)(?<!\*)\*(?!\*)',
      '_' => '(?<=\S|^)(?<!_)_(?!_)',
  );

  # Two-character markers (strong).
  public $strong_relist = array(
      ''   => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?=\S|$)(?![\.,:;]\s)',
      '**' => '(?<=\S|^)(?<!\*)\*\*(?!\*)',
      '__' => '(?<=\S|^)(?<!_)__(?!_)',
  );

  # Three-character markers (em and strong together).
  public $em_strong_relist = array(
      ''    => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?=\S|$)(?![\.,:;]\s)',
      '***' => '(?<=\S|^)(?<!\*)\*\*\*(?!\*)',
      '___' => '(?<=\S|^)(?<!_)___(?!_)',
  );

  # Combined regexes built by prepareItalicsAndBold(), keyed by the
  # concatenation of the open em and strong markers.
  public $em_strong_prepared_relist;
  # Prepares the regular expressions used to search for emphasis tokens
  # in any context. For every combination of an em state and a strong
  # state, the allowed token expressions are joined into one master
  # expression, stored in em_strong_prepared_relist under the
  # concatenated markers.
  public function prepareItalicsAndBold() {
      foreach ($this->em_relist as $em => $em_re) {
          foreach ($this->strong_relist as $strong => $strong_re) {
              $context = "$em$strong";

              # Allowed tokens; the three-character expression, when one
              # exists for this context, has to be tried first.
              $alternatives = array($em_re, $strong_re);
              if (isset($this->em_strong_relist[$context])) {
                  array_unshift($alternatives, $this->em_strong_relist[$context]);
              }

              # Construct master expression from list.
              $this->em_strong_prepared_relist[$context] =
                  '{(' . implode('|', $alternatives) . ')}';
          }
      }
  }
  function doItalicsAndBold($text) {
      #
      # Convert emphasis markers found in $text into <em> and <strong>
      # spans and return the resulting string.
      #
      # The text is consumed token by token using the regular expression
      # stored in em_strong_prepared_relist for the current "$em$strong"
      # context. Two parallel stacks are kept: $token_stack holds the
      # markers currently open and $text_stack the text gathered inside
      # each of them (index 0 is the innermost level). A completed span is
      # run through runSpanGamut(), wrapped in its tag and stored with
      # hashPart() before being appended to the enclosing level.
      #
      $token_stack = array('');
      $text_stack = array('');
      $em = '';
      $strong = '';
      $tree_char_em = false;

      while (1) {
          #
          # Get prepared regular expression for searching emphasis tokens
          # in current context.
          #
          $token_re = $this->em_strong_prepared_relist["$em$strong"];

          #
          # Each loop iteration searches for the next emphasis token; the
          # text before it is appended to the innermost open level.
          #
          $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
          $text_stack[0] .= $parts[0];
          # References: a missing index (no token found) simply reads as null.
          $token =& $parts[1];
          $text =& $parts[2];

          if (empty($token)) {
              # Reached end of text span: empty stack without emitting
              # any more emphasis. Unclosed markers are put back as text.
              while ($token_stack[0]) {
                  $text_stack[1] .= array_shift($token_stack);
                  $text_stack[0] .= array_shift($text_stack);
              }
              break;
          }

          $token_len = strlen($token);
          if ($tree_char_em) {
              # Reached closing marker while inside a three-char emphasis.
              if ($token_len == 3) {
                  # Three-char closing marker, close em and strong.
                  array_shift($token_stack);
                  $span = array_shift($text_stack);
                  $span = $this->runSpanGamut($span);
                  $span = "<strong><em>$span</em></strong>";
                  $text_stack[0] .= $this->hashPart($span);
                  $em = '';
                  $strong = '';
              } else {
                  # Other closing marker: close one em or strong and
                  # change current token state to match the other.
                  # Square-bracket offset: the curly-brace form is a parse
                  # error on PHP 8.
                  $token_stack[0] = str_repeat($token[0], 3-$token_len);
                  $tag = $token_len == 2 ? "strong" : "em";
                  $span = $text_stack[0];
                  $span = $this->runSpanGamut($span);
                  $span = "<$tag>$span</$tag>";
                  $text_stack[0] = $this->hashPart($span);
                  $$tag = ''; # $$tag stands for $em or $strong
              }
              $tree_char_em = false;
          } else if ($token_len == 3) {
              if ($em) {
                  # Reached closing marker for both em and strong:
                  # close the two innermost levels in stack order.
                  for ($i = 0; $i < 2; ++$i) {
                      $shifted_token = array_shift($token_stack);
                      $tag = strlen($shifted_token) == 2 ? "strong" : "em";
                      $span = array_shift($text_stack);
                      $span = $this->runSpanGamut($span);
                      $span = "<$tag>$span</$tag>";
                      $text_stack[0] .= $this->hashPart($span);
                      $$tag = ''; # $$tag stands for $em or $strong
                  }
              } else {
                  # Reached opening three-char emphasis marker. Push on token
                  # stack; will be handled by the special condition above.
                  $em = $token[0];
                  $strong = "$em$em";
                  array_unshift($token_stack, $token);
                  array_unshift($text_stack, '');
                  $tree_char_em = true;
              }
          } else if ($token_len == 2) {
              if ($strong) {
                  # Unwind any dangling emphasis marker:
                  if (strlen($token_stack[0]) == 1) {
                      $text_stack[1] .= array_shift($token_stack);
                      $text_stack[0] .= array_shift($text_stack);
                  }
                  # Closing strong marker:
                  array_shift($token_stack);
                  $span = array_shift($text_stack);
                  $span = $this->runSpanGamut($span);
                  $span = "<strong>$span</strong>";
                  $text_stack[0] .= $this->hashPart($span);
                  $strong = '';
              } else {
                  # Opening strong marker: start a new level.
                  array_unshift($token_stack, $token);
                  array_unshift($text_stack, '');
                  $strong = $token;
              }
          } else {
              # Here $token_len == 1
              if ($em) {
                  if (strlen($token_stack[0]) == 1) {
                      # Closing emphasis marker:
                      array_shift($token_stack);
                      $span = array_shift($text_stack);
                      $span = $this->runSpanGamut($span);
                      $span = "<em>$span</em>";
                      $text_stack[0] .= $this->hashPart($span);
                      $em = '';
                  } else {
                      # Innermost open marker is not an em: keep as text.
                      $text_stack[0] .= $token;
                  }
              } else {
                  # Opening emphasis marker: start a new level.
                  array_unshift($token_stack, $token);
                  array_unshift($text_stack, '');
                  $em = $token;
              }
          }
      }
      return $text_stack[0];
  }
  function doBlockQuotes($text) {
      #
      # Locate every run of lines introduced by ">" in $text and replace
      # it with whatever _doBlockQuotes_callback returns for that run.
      #
      $blockquote_re = '/
          ( # Wrap whole match in $1
          (?>
          ^[ ]*>[ ]? # ">" at the start of a line
          .+\n # rest of the first line
          (.+\n)* # subsequent consecutive lines
          \n* # blanks
          )+
          )
          /xm';

      return preg_replace_callback(
          $blockquote_re,
          array($this, '_doBlockQuotes_callback'),
          $text
      );
  }
  function _doBlockQuotes_callback($matches) {
      #
      # Turn one matched run of quoted lines ($matches[1]) into a
      # <blockquote> element and return it as a hashed block surrounded
      # by newlines.
      #

      # Trim one level of quoting and empty out whitespace-only lines.
      $inner = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $matches[1]);

      # Recurse: the quoted content is itself block-level Markdown.
      $inner = $this->runBlockGamut($inner);

      # Prefix every line of the rendered content.
      $inner = preg_replace('/^/m', " ", $inner);

      # That prefix causes problems with <pre> content, so it is
      # stripped again inside each <pre>...</pre> run.
      $inner = preg_replace_callback(
          '{(\s*<pre>.+?</pre>)}sx',
          array($this, '_doBlockQuotes_callback2'),
          $inner
      );

      $html = "<blockquote>\n$inner\n</blockquote>";
      return "\n" . $this->hashBlock($html) . "\n\n";
  }
  function _doBlockQuotes_callback2($matches) {
      #
      # Remove the single leading space from every line of the matched
      # <pre> run ($matches[1]) and return the result.
      #
      return preg_replace('/^ /m', '', $matches[1]);
  }
  function formParagraphs($text) {
      #
      # Params:
      #   $text - string to process with html <p> tags
      #
      # Returns the chunks of $text joined by blank lines, where each
      # chunk is either wrapped in <p>...</p> or, when it is a hashed
      # block key, replaced by the HTML stored for that key.
      #

      # Strip leading and trailing newlines, then split on blank lines.
      $trimmed = preg_replace('/\A\n+|\n+\z/', '', $text);
      $grafs = preg_split('/\n{2,}/', $trimmed, -1, PREG_SPLIT_NO_EMPTY);

      foreach ($grafs as $index => $chunk) {
          if (preg_match('/^B\x1A[0-9]+B$/', $chunk)) {
              # Is a block: swap the stored HTML back in unchanged.
              # (Special handling of <div markdown="1"> content is not
              # enabled in this implementation.)
              $grafs[$index] = $this->html_hashes[$chunk];
              continue;
          }

          # Is a paragraph: run span-level processing, replace any
          # leading spaces with the opening tag, close it and unhash.
          $html = $this->runSpanGamut($chunk);
          $html = preg_replace('/^([ ]*)/', "<p>", $html) . "</p>";
          $grafs[$index] = $this->unhash($html);
      }

      return implode("\n\n", $grafs);
  }
  function encodeAttribute($text) {
      #
      # Prepare $text for use inside a double-quoted HTML attribute:
      # ampersands and "<" are handled by encodeAmpsAndAngles, then
      # double quotes become &quot;. Single quotes are left alone, so
      # the result is *not* safe inside a single-quoted attribute.
      #
      $encoded = $this->encodeAmpsAndAngles($text);
      return str_replace('"', '&quot;', $encoded);
  }
  function encodeAmpsAndAngles($text) {
      #
      # Encode the ampersands and "<" characters of $text. In
      # no-entities mode every "&" is encoded; otherwise an "&" that
      # already starts something shaped like a character entity is
      # left alone.
      #
      if ($this->no_entities) {
          $amp_encoded = str_replace('&', '&amp;', $text);
      } else {
          # Ampersand-encoding based entirely on Nat Irons's Amputator
          # MT plugin: <http://bumppo.net/projects/amputator/>
          $amp_encoded = preg_replace(
              '/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/',
              '&amp;',
              $text
          );
      }

      # Encode remaining <'s
      return str_replace('<', '&lt;', $amp_encoded);
  }
  function doAutoLinks($text) {
      #
      # Replace <url> and <email address> autolinks found in $text using
      # the two _doAutoLinks_*_callback methods. URLs are processed
      # first, e-mail addresses second.
      #

      # URLs: <scheme:...> for http, https, ftp and dict.
      $url_re = '{<((https?|ftp|dict):[^\'">\s]+)>}i';

      # Email addresses: <address@domain.foo>
      $email_re = '{
          <
          (?:mailto:)?
          (
          (?:
          [-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+
          |
          ".*?"
          )
          \@
          (?:
          [-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
          |
          \[[\d.a-fA-F:]+\] # IPv4 & IPv6
          )
          )
          >
          }xi';

      $with_urls = preg_replace_callback(
          $url_re, array($this, '_doAutoLinks_url_callback'), $text);

      return preg_replace_callback(
          $email_re, array($this, '_doAutoLinks_email_callback'), $with_urls);
  }
  function _doAutoLinks_url_callback($matches) {
      #
      # Build an anchor whose href and visible text are both the
      # attribute-encoded URL from $matches[1], and return it hashed.
      #
      $href = $this->encodeAttribute($matches[1]);
      $anchor = '<a href="' . $href . '">' . $href . '</a>';
      return $this->hashPart($anchor);
  }
  function _doAutoLinks_email_callback($matches) {
      #
      # Turn the address captured in $matches[1] into an obfuscated
      # mailto link and return it hashed.
      #
      $anchor = $this->encodeEmailAddress($matches[1]);
      return $this->hashPart($anchor);
  }
  function encodeEmailAddress($addr) {
      #
      # Input: an email address, e.g. "foo@example.com"
      #
      # Output: the email address as a mailto link, with most characters
      # of the address written as either a decimal or a hex entity, in
      # the hopes of foiling most address harvesting spam bots. E.g.:
      #
      #   <a href="&#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111;
      #   &#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;&#101;&#46;&#x63;&#111;
      #   &#x6d;">&#x66;o&#111;&#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;
      #   &#101;&#46;&#x63;&#111;&#x6d;</a>
      #
      # Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
      # With some optimizations by Milian Wolff.
      #
      $mailto = "mailto:" . $addr;
      $pieces = preg_split('/(?<!^)(?!$)/', $mailto);

      # Deterministic seed: the same address always encodes the same way.
      $seed = (int)abs(crc32($mailto) / strlen($mailto));

      foreach ($pieces as $pos => $piece) {
          $code = ord($piece);

          # Ignore non-ascii chars.
          if ($code >= 128) {
              continue;
          }

          # Pseudo-random value in 0..99 derived from seed and position.
          $roll = ($seed * (1 + $pos)) % 100;

          # A high roll leaves the character raw, except for '@',
          # which *must* always be encoded.
          if ($roll > 90 && $piece != '@') {
              continue;
          }

          # Low rolls give a hex entity, the rest a decimal one.
          $pieces[$pos] = ($roll < 45)
              ? '&#x'.dechex($code).';'
              : '&#'.$code.';';
      }

      $href = implode('', $pieces);
      # Visible text is the same encoding without the `mailto:` prefix.
      $label = implode('', array_slice($pieces, 7));

      return "<a href=\"$href\">$label</a>";
  }
  function parseSpan($str) {
      #
      # Walk through $str token by token. A token is a backslash escape,
      # a run of backticks, or (unless no_markup is set) an HTML comment,
      # processing instruction or tag. Text between tokens is copied to
      # the output unchanged; each token is replaced by the value
      # handleSpanToken returns for it.
      #
      $markup_re = $this->no_markup ? '' : '
          |
          <!-- .*? --> # comment
          |
          <\?.*?\?> | <%.*?%> # processing instruction
          |
          <[/!$]?[-a-zA-Z0-9:_]+ # regular tags
          (?>
          \s
          (?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
          )?
          >
          ';

      $span_re = '{
          (
          \\\\'.$this->escape_chars_re.'
          |
          (?<![`\\\\])
          `+ # code span marker
          '.$markup_re.'
          )
          }xs';

      $output = '';
      while (true) {
          #
          # Split off the text before the next token, the token itself
          # and the remainder of the string.
          #
          $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);

          # Text preceding the token goes to the output as is.
          $output .= $parts[0];

          # No token found: the end of the string has been reached.
          if (!isset($parts[1])) {
              break;
          }

          # The remainder is passed by reference: handleSpanToken may
          # consume part of it (e.g. the body of a code span).
          $output .= $this->handleSpanToken($parts[1], $parts[2]);
          $str = $parts[2];
      }

      return $output;
  }
  function handleSpanToken($token, &$str) {
      #
      # Handle $token provided by parseSpan by determining its nature and
      # returning the corresponding value that should replace it.
      #
      # $str is the text following the token and is taken by reference:
      # when a complete code span is found, the part of $str belonging to
      # it (content and closing marker) is removed.
      #
      # String offsets use square brackets; the curly-brace form
      # ($token{0}) is a parse error on PHP 8.
      #
      switch ($token[0]) {
          case "\\":
              # Backslash escape: emit the escaped character as a
              # hashed numeric entity.
              return $this->hashPart("&#". ord($token[1]). ";");
          case "`":
              # Search for end marker in remaining text: the same run of
              # backticks, not preceded or followed by another backtick.
              if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm',
                  $str, $matches))
              {
                  $str = $matches[2];
                  $codespan = $this->makeCodeSpan($matches[1]);
                  return $this->hashPart($codespan);
              }
              return $token; // return as text since no ending marker found.
          default:
              # Any other token (comment, processing instruction, tag) is
              # hashed as is.
              return $this->hashPart($token);
      }
  }
  function outdent($text) {
      #
      # Strip one level of indentation from the start of every line of
      # $text: either one tab or between one and tab_width spaces.
      #
      $indent_re = '/^(\t|[ ]{1,'.$this->tab_width.'})/m';
      return preg_replace($indent_re, '', $text);
  }
  # String length function for detab. `_initDetab` will create a function to
  # handle UTF-8 if the default function does not exist.
  1232. var $utf8_strlen = 'mb_strlen';
  function detab($text) {
      #
      # Replace tabs with the appropriate amount of space.
      #
      # Only lines that contain at least one tab are touched; each of
      # them is rebuilt by _detab_callback.
      #
      return preg_replace_callback(
          '/^.*\t.*$/m',
          array($this, '_detab_callback'),
          $text
      );
  }
  function _detab_callback($matches) {
      #
      # Expand the tabs of one line ($matches[0]). The line is cut at
      # every tab and reassembled, padding with spaces up to the next
      # multiple of tab_width before each following segment.
      #
      $strlen = $this->utf8_strlen; # strlen function for UTF-8.

      $segments = explode("\t", $matches[0]);

      # The first segment starts the line; the others are appended.
      $expanded = array_shift($segments);
      foreach ($segments as $segment) {
          # Spaces needed to reach the next tab stop from the current
          # length of the rebuilt line.
          $pad = $this->tab_width -
              $strlen($expanded, 'UTF-8') % $this->tab_width;
          $expanded .= str_repeat(" ", $pad) . $segment;
      }

      return $expanded;
  }
  function _initDetab() {
      #
      # Check for the availability of the function named in the
      # `utf8_strlen` property (initially `mb_strlen`). If the function is
      # not available, replace the property with an anonymous function
      # that loosely counts the number of UTF-8 characters with a regular
      # expression.
      #
      # An anonymous function is used instead of create_function(), which
      # was removed in PHP 8.0. The fallback declares a single parameter;
      # the extra 'UTF-8' argument passed by _detab_callback is ignored.
      #

      # Nothing to do when a fallback is already installed (the property
      # no longer holds a function name) or the named function exists.
      if (!is_string($this->utf8_strlen)) return;
      if (function_exists($this->utf8_strlen)) return;

      $this->utf8_strlen = function ($text) {
          # preg_match_all returns the number of matches: one per single
          # byte below 0xC0, or per lead byte with its continuation bytes.
          return preg_match_all(
              '/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/',
              $text, $m);
      };
  }
  function unhash($text) {
      #
      # Replace every hash key found in $text (a character, \x1A, digits,
      # then the same character again) with the value
      # _unhash_callback returns for it.
      #
      $hash_key_re = '/(.)\x1A[0-9]+\1/';
      return preg_replace_callback(
          $hash_key_re,
          array($this, '_unhash_callback'),
          $text
      );
  }
  function _unhash_callback($matches) {
      #
      # Look up the matched hash key ($matches[0]) in html_hashes and
      # return the content stored under it.
      #
      $hash_key = $matches[0];
      return $this->html_hashes[$hash_key];
  }
  1282. }