PageRenderTime 56ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/sources/ext/markdownify/markdownify.php

https://github.com/Arantor/Elkarte
PHP | 1494 lines | 941 code | 82 blank | 471 comment | 157 complexity | e4dfece984a91f2cebf15ed4631b36c1 MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-3.0
  1. <?php
  2. /**
  3. * Markdownify converts HTML Markup to [Markdown][1] (by [John Gruber][2]). It
  4. * also supports [Markdown Extra][3] by [Michel Fortin][4] via Markdownify_Extra.
  5. *
  6. * It all started as `html2text.php` - a port of [Aaron Swartz'][5] [`html2text.py`][6] - but
  7. * got a long way since. This is far more than a mere port now!
  8. * Starting with version 2.0.0 this is a complete rewrite and cannot be
  9. * compared to Aaron Swartz' `html2text.py` any longer. I'm now using an HTML parser
  10. * (see `parsehtml.php` which I also wrote) which makes most of the evil
  11. * RegEx magic go away and additionally it gives a much cleaner class
  12. * structure. Also notably is the fact that I now try to prevent regressions by
  13. * utilizing testcases of Michel Fortin's [MDTest][7].
  14. *
  15. * [1]: http://daringfireball.com/projects/markdown
  16. * [2]: http://daringfireball.com/
  17. * [3]: http://www.michelf.com/projects/php-markdown/extra/
  18. * [4]: http://www.michelf.com/
  19. * [5]: http://www.aaronsw.com/
  20. * [6]: http://www.aaronsw.com/2002/html2text/
  21. * [7]: http://article.gmane.org/gmane.text.markdown.general/2540
  22. *
  23. * @version 2.0.0 alpha
  24. * @author Milian Wolff (<mail@milianw.de>, <http://milianw.de>)
  25. * @license LGPL, see LICENSE_LGPL.txt and the summary below
  26. * @copyright (C) 2007 Milian Wolff
  27. *
  28. * This library is free software; you can redistribute it and/or
  29. * modify it under the terms of the GNU Lesser General Public
  30. * License as published by the Free Software Foundation; either
  31. * version 2.1 of the License, or (at your option) any later version.
  32. *
  33. * This library is distributed in the hope that it will be useful,
  34. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  35. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  36. * Lesser General Public License for more details.
  37. *
  38. * You should have received a copy of the GNU Lesser General Public
  39. * License along with this library; if not, write to the Free Software
  40. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  41. */
  42. /**
  43. * HTML Parser, see http://sf.net/projects/parseHTML
  44. */
  45. require_once dirname(__FILE__) . '/parsehtml.php';
  /**
   * Default configuration.
   *
   * Each constant is only defined when it does not exist yet, so an
   * application may define its own defaults before including this file.
   * The values below are used as the default arguments of the
   * Markdownify constructor.
   */
  if (!defined('MDFY_LINKS_EACH_PARAGRAPH'))
  {
      define('MDFY_LINKS_EACH_PARAGRAPH', false);
  }
  if (!defined('MDFY_BODYWIDTH'))
  {
      define('MDFY_BODYWIDTH', false);
  }
  if (!defined('MDFY_KEEPHTML'))
  {
      define('MDFY_KEEPHTML', true);
  }
  52. /**
  53. * HTML to Markdown converter class
  54. */
  55. class Markdownify
  56. {
  57. /**
  58. * html parser object
  59. *
  60. * @var parseHTML
  61. */
  62. var $parser;
  63. /**
  64. * markdown output
  65. *
  66. * @var string
  67. */
  68. var $output;
  69. /**
  70. * stack of tags which were not converted to Markdown
  71. *
  72. * @var array<string>
  73. */
  74. var $notConverted = array();
  75. /**
  76. * skip conversion to markdown
  77. *
  78. * @var bool
  79. */
  80. var $skipConversion = false;
  81. /* options */
  82. /**
  83. * keep html tags which cannot be converted to markdown
  84. *
  85. * @var bool
  86. */
  87. var $keepHTML = false;
  88. /**
  89. * wrap output, set to 0 to skip wrapping
  90. *
  91. * @var int
  92. */
  93. var $bodyWidth = 0;
  94. /**
  95. * minimum body width
  96. *
  97. * @var int
  98. */
  99. var $minBodyWidth = 25;
  100. /**
  101. * display links after each paragraph
  102. *
  103. * @var bool
  104. */
  105. var $linksAfterEachParagraph = false;
  /**
   * Constructor: store the options, create the HTML parser and compile
   * the text-escaping patterns.
   *
   * @param bool $linksAfterEachParagraph whether or not to flush stacked links after each paragraph
   * defaults to false
   * @param int $bodyWidth width to wrap the output to; any value not greater
   * than $minBodyWidth disables wrapping. defaults to false
   * @param bool $keepHTML whether to keep non markdownable HTML or to discard it
   * defaults to true (HTML will be kept)
   * @return void
   */
  function __construct($linksAfterEachParagraph = MDFY_LINKS_EACH_PARAGRAPH, $bodyWidth = MDFY_BODYWIDTH, $keepHTML = MDFY_KEEPHTML)
  {
      $this->linksAfterEachParagraph = $linksAfterEachParagraph;
      $this->keepHTML = $keepHTML;
      if ($bodyWidth > $this->minBodyWidth)
      {
          $this->bodyWidth = intval($bodyWidth);
      }
      else
      {
          $this->bodyWidth = false;
      }
      $this->parser = new parseHTML;
      $this->parser->noTagsInCode = true;
      # we don't have to do this every time: turn the pattern => replacement
      # map into the search/replace lists preg_replace() expects. The guard
      # keeps a second initialisation from compiling the compiled table again.
      if (!isset($this->escapeInText['search']))
      {
          $search = array();
          $replace = array();
          foreach ($this->escapeInText as $s => $r)
          {
              # only match where the sequence is not already backslash-escaped
              array_push($search, '#(?<!\\\)' . $s . '#U');
              array_push($replace, $r);
          }
          $this->escapeInText = array(
              'search' => $search,
              'replace' => $replace
          );
      }
  }

  /**
   * PHP 4 style constructor name, kept so existing callers (for example
   * subclasses calling parent::Markdownify()) keep working. PHP 8 no longer
   * treats a method named like the class as the constructor, so the real
   * work lives in __construct().
   *
   * @param bool $linksAfterEachParagraph see __construct()
   * @param int $bodyWidth see __construct()
   * @param bool $keepHTML see __construct()
   * @return void
   */
  function Markdownify($linksAfterEachParagraph = MDFY_LINKS_EACH_PARAGRAPH, $bodyWidth = MDFY_BODYWIDTH, $keepHTML = MDFY_KEEPHTML)
  {
      $this->__construct($linksAfterEachParagraph, $bodyWidth, $keepHTML);
  }
  /**
   * Convert an HTML string to Markdown.
   *
   * Hands the markup to the parser, runs parse() and returns whatever
   * parse() left in $this->output.
   *
   * @param string $html
   * @return string markdown formatted
   */
  function parseString($html)
  {
      $parser = $this->parser;
      $parser->html = $html;
      $this->parse();

      $markdown = $this->output;
      return $markdown;
  }
  156. /**
  157. * tags with elements which can be handled by markdown
  158. *
  159. * @var array<string>
  160. */
  161. var $isMarkdownable = array(
  162. 'p' => array(),
  163. 'ul' => array(),
  164. 'ol' => array(),
  165. 'li' => array(),
  166. 'br' => array(),
  167. 'blockquote' => array(),
  168. 'code' => array(),
  169. 'pre' => array(),
  170. 'a' => array(
  171. 'href' => 'required',
  172. 'title' => 'optional',
  173. ),
  174. 'strong' => array(),
  175. 'b' => array(),
  176. 'em' => array(),
  177. 'i' => array(),
  178. 'img' => array(
  179. 'src' => 'required',
  180. 'alt' => 'optional',
  181. 'title' => 'optional',
  182. ),
  183. 'h1' => array(),
  184. 'h2' => array(),
  185. 'h3' => array(),
  186. 'h4' => array(),
  187. 'h5' => array(),
  188. 'h6' => array(),
  189. 'hr' => array(),
  190. );
  191. /**
  192. * html tags to be ignored (contents will be parsed)
  193. *
  194. * @var array<string>
  195. */
  196. var $ignore = array(
  197. 'html',
  198. 'body',
  199. );
  200. /**
  201. * html tags to be dropped (contents will not be parsed!)
  202. *
  203. * @var array<string>
  204. */
  205. var $drop = array(
  206. 'script',
  207. 'head',
  208. 'style',
  209. 'form',
  210. 'area',
  211. 'object',
  212. 'param',
  213. 'iframe',
  214. );
  215. /**
  216. * Markdown indents which could be wrapped
  217. * @note: use strings in regex format
  218. *
  219. * @var array<string>
  220. */
  221. var $wrappableIndents = array(
  222. '\* ', # ul
  223. '\d. ', # ol
  224. '\d\d. ', # ol
  225. '> ', # blockquote
  226. '', # p
  227. );
  228. /**
  229. * list of chars which have to be escaped in normal text
  230. * @note: use strings in regex format
  231. *
  232. * @var array
  233. *
  234. * TODO: what's with block chars / sequences at the beginning of a block?
  235. */
  236. var $escapeInText = array(
  237. '([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|', # hr
  238. '\*\*([^*\s]+)\*\*' => '\*\*$1\*\*', # strong
  239. '\*([^*\s]+)\*' => '\*$1\*', # em
  240. '__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_', # em
  241. '_(?! |_)(.+)(?!<_| )_' => '\_$1\_', # em
  242. '`(.+)`' => '\`$1\`', # code
  243. '\[(.+)\](\s*\()' => '\[$1\]$2', # links: [text] (url) => [text\] (url)
  244. '\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]', # links: [text][id] => [text\][id\]
  245. );
  246. /**
  247. * wether last processed node was a block tag or not
  248. *
  249. * @var bool
  250. */
  251. var $lastWasBlockTag = false;
  252. /**
  253. * name of last closed tag
  254. *
  255. * @var string
  256. */
  257. var $lastClosedTag = '';
  /**
   * Iterate through the parser's nodes and decide what to do with each one.
   *
   * Resets $this->output, strips the tags listed in $this->drop together
   * with their contents, then dispatches every node: doctypes are skipped,
   * processing instructions and comments are kept only when $keepHTML is
   * set, text goes to handleText() and tags go either to their
   * handleTag_<name>() method or to handleTagToText(). Afterwards the
   * entity clean-up is applied and stacked link references are flushed.
   *
   * @param void
   * @return void
   */
  function parse()
  {
      $this->output = '';
      # drop tags
      $this->parser->html = preg_replace('#<(' . implode('|', $this->drop) . ')[^>]*>.*</\\1>#sU', '', $this->parser->html);
      while ($this->parser->nextNode())
      {
          switch ($this->parser->nodeType)
          {
              case 'doctype':
                  break;
              case 'pi':
              case 'comment':
                  if ($this->keepHTML)
                  {
                      $this->flushLinebreaks();
                      $this->out($this->parser->node);
                      $this->setLineBreaks(2);
                  }
                  # else drop
                  break;
              case 'text':
                  $this->handleText();
                  break;
              case 'tag':
                  if (in_array($this->parser->tagName, $this->ignore))
                  {
                      break;
                  }
                  if ($this->parser->isStartTag)
                  {
                      $this->flushLinebreaks();
                  }
                  if ($this->skipConversion)
                  {
                      $this->isMarkdownable(); # update notConverted
                      $this->handleTagToText();
                      # "continue" inside a switch behaves like "break" in PHP
                      # (and warns since 7.3), so say what actually happens: the
                      # lastWasBlockTag update after the switch still runs.
                      break;
                  }
                  if (!$this->parser->keepWhitespace && $this->parser->isBlockElement && $this->parser->isStartTag)
                  {
                      $this->parser->html = ltrim($this->parser->html);
                  }
                  if ($this->isMarkdownable())
                  {
                      if ($this->parser->isBlockElement && $this->parser->isStartTag && !$this->lastWasBlockTag && !empty($this->output))
                      {
                          # make sure the block starts on a fresh (indented) line,
                          # in the innermost buffer if one is active
                          if (!empty($this->buffer))
                          {
                              $str = & $this->buffer[count($this->buffer) - 1];
                          }
                          else
                          {
                              $str = & $this->output;
                          }
                          if (substr($str, -strlen($this->indent) - 1) != "\n" . $this->indent)
                          {
                              $str .= "\n" . $this->indent;
                          }
                      }
                      $func = 'handleTag_' . $this->parser->tagName;
                      $this->$func();
                      if ($this->linksAfterEachParagraph && $this->parser->isBlockElement && !$this->parser->isStartTag && empty($this->parser->openTags))
                      {
                          $this->flushStacked();
                      }
                      if (!$this->parser->isStartTag)
                      {
                          $this->lastClosedTag = $this->parser->tagName;
                      }
                  }
                  else
                  {
                      $this->handleTagToText();
                      $this->lastClosedTag = '';
                  }
                  break;
              default:
                  trigger_error('invalid node type', E_USER_ERROR);
                  break;
          }
          $this->lastWasBlockTag = $this->parser->nodeType == 'tag' && $this->parser->isStartTag && $this->parser->isBlockElement;
      }
      if (!empty($this->buffer))
      {
          trigger_error('buffer was not flushed, this is a bug. please report!', E_USER_WARNING);
          while (!empty($this->buffer))
          {
              $this->out($this->unbuffer());
          }
      }
      ### cleanup
      $this->output = rtrim(str_replace('&amp;', '&', str_replace('&lt;', '<', str_replace('&gt;', '>', $this->output))));
      # end parsing, flush stacked tags
      $this->flushStacked();
      $this->stack = array();
  }
  /**
   * Check whether the parser's current tag can be expressed in Markdown.
   *
   * For a start tag: the tag must be listed in $this->isMarkdownable, must
   * carry every attribute marked 'required' and - when $keepHTML is on -
   * must not carry attributes outside the listed ones. A rejected start tag
   * is remembered in $this->notConverted so that its matching end tag is
   * rejected as well (and removed from that list).
   *
   * @param void
   * @return bool
   */
  function isMarkdownable()
  {
      $name = $this->parser->tagName;
      if (!isset($this->isMarkdownable[$name]))
      {
          # simply not markdownable
          return false;
      }

      # identifies this tag together with its position in the open-tag chain
      $signature = $name . '::' . implode('/', $this->parser->openTags);

      if (!$this->parser->isStartTag)
      {
          $rejectedOnOpen = !empty($this->notConverted) && end($this->notConverted) === $signature;
          if ($rejectedOnOpen)
          {
              array_pop($this->notConverted);
          }
          return !$rejectedOnOpen;
      }

      $allowed = $this->isMarkdownable[$name];
      $convertible = true;
      if ($this->keepHTML)
      {
          $extra = array_diff(array_keys($this->parser->tagAttributes), array_keys($allowed));
          # non markdownable attributes given
          $convertible = empty($extra);
      }
      if ($convertible)
      {
          foreach ($allowed as $attr => $type)
          {
              if ($type == 'required' && !isset($this->parser->tagAttributes[$attr]))
              {
                  # required markdown attribute not given
                  $convertible = false;
                  break;
              }
          }
      }
      if (!$convertible)
      {
          array_push($this->notConverted, $signature);
      }
      return $convertible;
  }
  /**
   * Output all stacked tags.
   *
   * For every tag name with a non-empty stack the method
   * flushStacked_<tag>() is called. Tag names without such a method
   * (for instance a list left open by unbalanced markup) are skipped
   * instead of being handed to call_user_func() as an invalid callback.
   *
   * @param void
   * @return void
   */
  function flushStacked()
  {
      foreach ($this->stack as $tag => $entries)
      {
          if (empty($entries))
          {
              continue;
          }
          $method = 'flushStacked_' . $tag;
          if (method_exists($this, $method))
          {
              $this->$method();
          }
      }
  }
  /**
   * Output the link reference definitions
   * (e.g. [1]: http://example.com "title").
   *
   * Only entries not yet marked 'unstacked' are written; each written entry
   * is marked so it is not emitted a second time. The first reference is
   * preceded by a blank line, the following ones by a single newline.
   *
   * @param void
   * @return void
   */
  function flushStacked_a()
  {
      $separator = "\n\n";
      foreach ($this->stack['a'] as $index => $link)
      {
          if (isset($link['unstacked']))
          {
              continue;
          }
          $this->out($separator, true);
          $separator = "\n";

          $reference = ' [' . $link['linkID'] . ']: ' . $link['href'];
          if (isset($link['title']))
          {
              $reference .= ' "' . $link['title'] . '"';
          }
          $this->out($reference, true);

          $link['unstacked'] = true;
          $this->stack['a'][$index] = $link;
      }
  }
  /**
   * Write the line breaks queued via setLineBreaks() and reset the counter.
   *
   * Nothing is written while the output is still empty; the counter is
   * reset in every case.
   *
   * @param void
   * @return void
   */
  function flushLinebreaks()
  {
      $hasOutput = !empty($this->output);
      if ($hasOutput && $this->lineBreaks)
      {
          $breaks = str_repeat("\n" . $this->indent, $this->lineBreaks);
          $this->out($breaks, true);
      }
      $this->lineBreaks = 0;
  }
  /**
   * Handle tags which are not converted to Markdown.
   *
   * Without $keepHTML the tag is dropped; only closing block elements queue
   * two line breaks. With $keepHTML the tag is written out verbatim:
   * Markdown conversion is switched off until the matching end tag, block
   * elements are put on their own indented lines, and the contents of
   * <code>/<pre> are buffered so their &lt;/&gt; entities survive the final
   * clean-up done in parse().
   *
   * @param void
   * @return void
   */
  function handleTagToText()
  {
      # indentation in effect before a raw <pre>; saved on the start tag and
      # restored on the end tag. Declared once for the whole function - PHP
      # rejects a second "static" declaration of the same variable.
      static $indent;

      if (!$this->keepHTML)
      {
          if (!$this->parser->isStartTag && $this->parser->isBlockElement)
          {
              $this->setLineBreaks(2);
          }
          return;
      }

      # dont convert to markdown inside this tag
      /** TODO: markdown extra * */
      if (!$this->parser->isEmptyTag)
      {
          $signature = $this->parser->tagName . '::' . implode('/', $this->parser->openTags);
          if ($this->parser->isStartTag)
          {
              if (!$this->skipConversion)
              {
                  $this->skipConversion = $signature;
              }
          }
          else
          {
              if ($this->skipConversion == $signature)
              {
                  $this->skipConversion = false;
              }
          }
      }
      if ($this->parser->isBlockElement)
      {
          if ($this->parser->isStartTag)
          {
              if (in_array($this->parent(), array('ins', 'del')))
              {
                  # looks like ins or del are block elements now
                  $this->out("\n", true);
                  $this->indent(' ');
              }
              if ($this->parser->tagName != 'pre')
              {
                  $this->out($this->parser->node . "\n" . $this->indent);
                  if (!$this->parser->isEmptyTag)
                  {
                      $this->indent(' ');
                  }
                  else
                  {
                      $this->setLineBreaks(1);
                  }
                  $this->parser->html = ltrim($this->parser->html);
              }
              else
              {
                  # don't indent inside <pre> tags
                  $this->out($this->parser->node);
                  $indent = $this->indent;
                  $this->indent = '';
              }
          }
          else
          {
              if (!$this->parser->keepWhitespace)
              {
                  $this->output = rtrim($this->output);
              }
              if ($this->parser->tagName != 'pre')
              {
                  $this->indent(' ');
                  $this->out("\n" . $this->indent . $this->parser->node);
              }
              else
              {
                  # reset indentation
                  $this->out($this->parser->node);
                  $this->indent = $indent;
              }
              if (in_array($this->parent(), array('ins', 'del')))
              {
                  # ins or del was block element
                  $this->out("\n");
                  $this->indent(' ');
              }
              if ($this->parser->tagName == 'li')
              {
                  $this->setLineBreaks(1);
              }
              else
              {
                  $this->setLineBreaks(2);
              }
          }
      }
      else
      {
          $this->out($this->parser->node);
      }
      if (in_array($this->parser->tagName, array('code', 'pre')))
      {
          if ($this->parser->isStartTag)
          {
              $this->buffer();
          }
          else
          {
              # add stuff so cleanup just reverses this
              $this->out(str_replace('&lt;', '&amp;lt;', str_replace('&gt;', '&amp;gt;', $this->unbuffer())));
          }
      }
  }
  /**
   * Handle plain text nodes.
   *
   * Inside <pre> every line is prefixed with the current indentation.
   * Outside of <code>/<pre> the text is entity-decoded and - unless
   * conversion is being skipped - Markdown-significant sequences are
   * escaped. Inside <code>/<pre> only the quote entities are decoded.
   *
   * @param void
   * @return void
   */
  function handleText()
  {
      if ($this->hasParent('pre') && strpos($this->parser->node, "\n") !== false)
      {
          $this->parser->node = str_replace("\n", "\n" . $this->indent, $this->parser->node);
      }
      if (!$this->hasParent('code') && !$this->hasParent('pre'))
      {
          # entity decode
          $this->parser->node = $this->decode($this->parser->node);
          if (!$this->skipConversion)
          {
              # escape some chars in normal Text
              $this->parser->node = preg_replace($this->escapeInText['search'], $this->escapeInText['replace'], $this->parser->node);
          }
      }
      else
      {
          # the entity is "&apos;" including the semicolon; matching only
          # "&apos" would leave a stray ";" behind the decoded quote
          $this->parser->node = str_replace(array('&quot;', '&apos;'), array('"', '\''), $this->parser->node);
      }
      $this->out($this->parser->node);
      $this->lastClosedTag = '';
  }
  /**
   * Handle <em> tags (also used for <i>): the same emphasis marker is
   * written for the opening and the closing tag.
   *
   * @param void
   * @return void
   */
  function handleTag_em()
  {
      $marker = '*';
      $this->out($marker, true);
  }
  /**
   * Handle <i> tags exactly like <em>.
   *
   * @param void
   * @return void
   */
  function handleTag_i()
  {
      # <i> shares the emphasis handling
      $this->handleTag_em();
  }
  /**
   * Handle <strong> tags (also used for <b>): the same strong marker is
   * written for the opening and the closing tag.
   *
   * @param void
   * @return void
   */
  function handleTag_strong()
  {
      $marker = '**';
      $this->out($marker, true);
  }
  /**
   * Handle <b> tags exactly like <strong>.
   *
   * @param void
   * @return void
   */
  function handleTag_b()
  {
      # <b> shares the strong handling
      $this->handleTag_strong();
  }
  /**
   * Handle <h1> tags as a level 1 header.
   *
   * @param void
   * @return void
   */
  function handleTag_h1()
  {
      $level = 1;
      $this->handleHeader($level);
  }
  /**
   * Handle <h2> tags as a level 2 header.
   *
   * @param void
   * @return void
   */
  function handleTag_h2()
  {
      $level = 2;
      $this->handleHeader($level);
  }
  /**
   * Handle <h3> tags as a level 3 header.
   *
   * @param void
   * @return void
   */
  function handleTag_h3()
  {
      $level = 3;
      $this->handleHeader($level);
  }
  /**
   * Handle <h4> tags as a level 4 header.
   *
   * @param void
   * @return void
   */
  function handleTag_h4()
  {
      $level = 4;
      $this->handleHeader($level);
  }
  /**
   * Handle <h5> tags as a level 5 header.
   *
   * @param void
   * @return void
   */
  function handleTag_h5()
  {
      $level = 5;
      $this->handleHeader($level);
  }
  /**
   * Handle <h6> tags as a level 6 header.
   *
   * @param void
   * @return void
   */
  function handleTag_h6()
  {
      $level = 6;
      $this->handleHeader($level);
  }
  711. /**
  712. * number of line breaks before next inline output
  713. */
  714. var $lineBreaks = 0;
  /**
   * Handle header tags (<h1> - <h6>).
   *
   * The start tag writes $level hash signs followed by a space; the end
   * tag queues two line breaks.
   *
   * @param int $level 1-6
   * @return void
   */
  function handleHeader($level)
  {
      if (!$this->parser->isStartTag)
      {
          $this->setLineBreaks(2);
          return;
      }
      $hashes = str_repeat('#', $level);
      $this->out($hashes . ' ', true);
  }
  /**
   * Handle <p> tags: nothing is written for the start tag, the end tag
   * queues two line breaks.
   *
   * @param void
   * @return void
   */
  function handleTag_p()
  {
      if ($this->parser->isStartTag)
      {
          return;
      }
      $this->setLineBreaks(2);
  }
  /**
   * Handle <a> tags.
   *
   * Start tag: open a buffer for the link text, decode title/href and put
   * the attributes on the stack. End tag: take text and attributes back
   * and write, depending on what was found, an empty link "[text]()", an
   * automatic link "<url>" / "<mail@example.com>", an inline link for
   * links of class "bbc_link", or a reference link "[text][id]" whose
   * target is kept on the stack until flushStacked_a() writes it.
   *
   * @param void
   * @return void
   */
  function handleTag_a()
  {
      if ($this->parser->isStartTag)
      {
          $this->buffer();
          $this->parser->tagAttributes['title'] = isset($this->parser->tagAttributes['title'])
              ? $this->decode($this->parser->tagAttributes['title'])
              : null;
          $href = trim($this->parser->tagAttributes['href']);
          $this->parser->tagAttributes['href'] = $this->decode($href);
          $this->stack();
          return;
      }

      $link = $this->unstack();
      $text = $this->unbuffer();
      $untitled = empty($link['title']);

      if ($untitled && empty($link['href']))
      {
          # empty links... testcase mania, who would possibly do anything like that?!
          $this->out('[' . $text . ']()', true);
          return;
      }
      if ($untitled && $text == $link['href'])
      {
          # <http://example.com>
          $this->out('<' . $text . '>', true);
          return;
      }

      $decodedText = $this->decode(trim($text));
      $mailto = 'mailto:' . $decodedText;
      if (substr($link['href'], 0, 7) == 'mailto:' && $mailto == $link['href'])
      {
          if (is_null($link['title']))
          {
              # <mail@example.com>
              $this->out('<' . $decodedText . '>', true);
              return;
          }
          # [mail@example.com][1]
          # ...
          # [1]: mailto:mail@example.com Title
          $link['href'] = $mailto;
      }

      # [This link][id] - reuse the id of an identical, already known target
      foreach ($this->stack['a'] as $known)
      {
          if ($known['href'] == $link['href'] && $known['title'] === $link['title'])
          {
              $link['linkID'] = $known['linkID'];
              break;
          }
      }

      // Inline Style for our bbc links
      if (isset($link['class']) && $link['class'] == 'bbc_link')
      {
          $link['linkID'] = $link['href'];
          $this->out('[' . $text . '](' . $link['href'] . ')', true);
          return;
      }

      if (!isset($link['linkID']))
      {
          $link['linkID'] = count($this->stack['a']) + 1;
          array_push($this->stack['a'], $link);
      }
      $this->out('[' . $text . '][' . $link['linkID'] . ']', true);
  }
  /**
   * Handle <img /> tags.
   *
   * Images without a src are written inline as "![alt]( title )"; all
   * others become reference style images "![alt][id]". Their target shares
   * the link reference stack ($this->stack['a']), so an identical
   * href/title pair reuses the existing id.
   *
   * @param void
   * @return void
   */
  function handleTag_img()
  {
      if (!$this->parser->isStartTag)
      {
          return; # just to be sure this is really an empty tag...
      }

      # decode the optional attributes, missing ones become null
      foreach (array('title', 'alt') as $name)
      {
          $this->parser->tagAttributes[$name] = isset($this->parser->tagAttributes[$name])
              ? $this->decode($this->parser->tagAttributes[$name])
              : null;
      }

      if (empty($this->parser->tagAttributes['src']))
      {
          # support for "empty" images... dunno if this is really needed
          # but there are some testcases which do that...
          if (!empty($this->parser->tagAttributes['title']))
          {
              $this->parser->tagAttributes['title'] = ' ' . $this->parser->tagAttributes['title'] . ' ';
          }
          $this->out('![' . $this->parser->tagAttributes['alt'] . '](' . $this->parser->tagAttributes['title'] . ')', true);
          return;
      }
      $this->parser->tagAttributes['src'] = $this->decode($this->parser->tagAttributes['src']);

      # [This link][id]
      $link_id = false;
      if (empty($this->stack['a']))
      {
          $this->stack['a'] = array();
      }
      else
      {
          foreach ($this->stack['a'] as $known)
          {
              $sameTarget = $known['href'] == $this->parser->tagAttributes['src'];
              if ($sameTarget && $known['title'] === $this->parser->tagAttributes['title'])
              {
                  $link_id = $known['linkID'];
                  break;
              }
          }
      }
      if (!$link_id)
      {
          $link_id = count($this->stack['a']) + 1;
          array_push($this->stack['a'], array(
              'href' => $this->parser->tagAttributes['src'],
              'linkID' => $link_id,
              'title' => $this->parser->tagAttributes['title']
          ));
      }
      $this->out('![' . $this->parser->tagAttributes['alt'] . '][' . $link_id . ']', true);
  }
  /**
   * Handle <code> tags.
   *
   * <code> inside <pre> is ignored. Otherwise the contents are buffered on
   * the start tag and, on the end tag, wrapped in a backtick fence that is
   * longer than the shortest backtick run length missing from the contents.
   * Contents starting or ending with a backtick are padded with a space.
   *
   * @param void
   * @return void
   */
  function handleTag_code()
  {
      if ($this->hasParent('pre'))
      {
          # ignore code blocks inside <pre>
          return;
      }
      if ($this->parser->isStartTag)
      {
          $this->buffer();
          return;
      }

      # an empty <code></code> yields an empty (or missing) buffer
      $buffer = (string) $this->unbuffer();

      # use as many backticks as needed: grow the fence until no backtick
      # run of exactly that length occurs in the contents
      $ticks = '`';
      if (preg_match_all('#`+#', $buffer, $matches))
      {
          while (in_array($ticks, $matches[0], true))
          {
              $ticks .= '`';
          }
      }

      # substr() instead of $buffer[0]: safe for an empty buffer
      if (substr($buffer, 0, 1) == '`' || substr($buffer, -1) == '`')
      {
          $buffer = ' ' . $buffer . ' ';
      }
      $this->out($ticks . $buffer . $ticks, true);
  }
  /**
   * Handle <pre> tags.
   *
   * With $keepHTML enabled, a <pre> that is not directly followed by a
   * plain <code> tag is no standard Markdown code block and is passed on
   * to handleTagToText(). Otherwise the contents are indented as a
   * Markdown code block; the end tag removes the indentation again and
   * queues two line breaks.
   *
   * @param void
   * @return void
   */
  function handleTag_pre()
  {
      if ($this->keepHTML && $this->parser->isStartTag)
      {
          # check if a simple <code> follows
          if (!preg_match('#^\s*<code\s*>#Us', $this->parser->html))
          {
              # this is no standard markdown code block
              $this->handleTagToText();
              return;
          }
      }
      # Markdown only recognises a code block when every line is indented
      # by at least four spaces (or a tab)
      $this->indent('    ');
      if (!$this->parser->isStartTag)
      {
          $this->setLineBreaks(2);
      }
      else
      {
          $this->parser->html = ltrim($this->parser->html);
      }
  }
  /**
   * Handle <blockquote> tags by adding (start tag) or removing (end tag)
   * the quote marker to/from the current indentation.
   *
   * @param void
   * @return void
   */
  function handleTag_blockquote()
  {
      $quoteMarker = '> ';
      $this->indent($quoteMarker);
  }
  /**
   * Handle <ul> tags (handleTag_ol() delegates here as well).
   *
   * Start tag: put the list on the stack; without $keepHTML an empty HTML
   * comment is written when the previously closed tag was the same kind of
   * list. End tag: remove the list from the stack and queue two line
   * breaks, unless the list is nested in an <li> that is not followed by a
   * <p> or <blockquote>.
   *
   * @param void
   * @return void
   */
  function handleTag_ul()
  {
      if (!$this->parser->isStartTag)
      {
          $this->unstack();
          $nestedInItem = $this->parent() == 'li';
          if (!$nestedInItem || preg_match('#^\s*(</li\s*>\s*<li\s*>\s*)?<(p|blockquote)\s*>#sU', $this->parser->html))
          {
              # dont make Markdown add unneeded paragraphs
              $this->setLineBreaks(2);
          }
          return;
      }

      $this->stack();
      $followsSameList = $this->lastClosedTag == $this->parser->tagName;
      if (!$this->keepHTML && $followsSameList)
      {
          $this->out("\n" . $this->indent . '<!-- -->' . "\n" . $this->indent . "\n" . $this->indent);
      }
  }
  /**
   * Handle <ol> tags.
   *
   * Adds a 'num' attribute starting at 0 to the tag, which handleTag_li()
   * increments for every item, then proceeds exactly like handleTag_ul().
   *
   * @param void
   * @return void
   */
  function handleTag_ol()
  {
      # item counter, stored with the tag attributes on the stack
      $firstNumber = 0;
      $this->parser->tagAttributes['num'] = $firstNumber;
      # same as for unordered lists
      $this->handleTag_ul();
  }
  /**
   * Handle <li> tags.
   *
   * Start tag: write the item marker - the running number for items of an
   * <ol>, an asterisk otherwise - and indent everything that follows
   * inside the item. End tag: remove that indentation and queue one line
   * break.
   *
   * @param void
   * @return void
   */
  function handleTag_li()
  {
      # Markdown attaches continuation paragraphs and nested blocks to a
      # list item only when they are indented by four spaces (or a tab)
      $itemIndent = '    ';
      if ($this->parent() == 'ol')
      {
          $parent = & $this->getStacked('ol');
          if ($this->parser->isStartTag)
          {
              $parent['num']++;
              # pad the marker to four columns, but always keep at least one
              # space after the dot (and never pass str_repeat() a negative
              # count, which happens from item 1000 on)
              $padding = max(1, 3 - strlen((string) $parent['num']));
              $this->out($parent['num'] . '.' . str_repeat(' ', $padding), true);
          }
          $this->indent($itemIndent, false);
      }
      else
      {
          if ($this->parser->isStartTag)
          {
              $this->out('* ', true);
          }
          $this->indent($itemIndent, false);
      }
      if (!$this->parser->isStartTag)
      {
          $this->setLineBreaks(1);
      }
  }
  /**
   * Handle <hr /> tags: write a horizontal rule and queue two line breaks.
   *
   * @param void
   * @return void
   */
  function handleTag_hr()
  {
      if ($this->parser->isStartTag)
      {
          $rule = '* * *';
          $this->out($rule, true);
          $this->setLineBreaks(2);
      }
      # anything else is ignored, just to be sure this really is an empty tag
  }
  /**
   * Handle <br /> tags.
   *
   * Writes a Markdown hard line break followed by the current indentation
   * and strips the whitespace that follows the tag in the remaining HTML.
   *
   * @param void
   * @return void
   */
  function handleTag_br()
  {
      # a Markdown hard line break is "two or more spaces, then newline";
      # a single trailing space would not produce a <br />
      $this->out("  \n" . $this->indent, true);
      $this->parser->html = ltrim($this->parser->html);
  }
  1073. /**
  1074. * node stack, e.g. for <a> and <abbr> tags
  1075. *
  1076. * @var array<array>
  1077. */
  1078. var $stack = array();
  /**
   * Push the attributes of the current tag onto the stack kept for its
   * tag name, creating that stack on first use.
   *
   * @param void
   * @return void
   */
  function stack()
  {
      $name = $this->parser->tagName;
      if (!isset($this->stack[$name]))
      {
          $this->stack[$name] = array();
      }
      $this->stack[$name][] = $this->parser->tagAttributes;
  }
  /**
   * Pop the most recently stacked attributes for the current tag name.
   *
   * Raises an E_USER_ERROR when no stack exists for that tag name.
   *
   * @param void
   * @return array
   */
  function unstack()
  {
      $name = $this->parser->tagName;
      $hasStack = isset($this->stack[$name]) && is_array($this->stack[$name]);
      if (!$hasStack)
      {
          trigger_error('Trying to unstack from empty stack. This must not happen.', E_USER_ERROR);
      }
      return array_pop($this->stack[$name]);
  }
  /**
   * Get the last stacked element of type $tagName.
   *
   * Returned by reference, so callers can modify the stacked entry in
   * place (which is why end() is not used here).
   *
   * @param string $tagName
   * @return array
   */
  function & getStacked($tagName)
  {
      $lastIndex = count($this->stack[$tagName]) - 1;
      return $this->stack[$tagName][$lastIndex];
  }
  /**
   * Request at least $number line breaks before the next start tag.
   *
   * The pending count is only ever raised here, never lowered.
   *
   * @param int $number
   * @return void
   */
  function setLineBreaks($number)
  {
      $this->lineBreaks = ($this->lineBreaks < $number) ? $number : $this->lineBreaks;
  }
  1132. /**
  1133. * stores current buffers
  1134. *
  1135. * @var array<string>
  1136. */
  1137. var $buffer = array();
  /**
   * Start a new, empty buffer; out() appends to it until unbuffer() is
   * called.
   *
   * @param void
   * @return void
   */
  function buffer()
  {
      $this->buffer[] = '';
  }
  /**
   * End the current (innermost) buffer and return what was collected in it.
   *
   * @param void
   * @return string
   */
  function unbuffer()
  {
      $collected = array_pop($this->buffer);
      return $collected;
  }
  /**
   * Append a string to the correct target: the innermost buffer when one
   * is active, $this->output otherwise.
   *
   * When a body width is configured and the parser is not preserving
   * whitespace, text written directly to the output is wrapped. With
   * $nowrap the string is kept in one piece and merely moved to a new line
   * when it would not fit on the current one.
   *
   * @param string $put text to append; null, false and '' are ignored
   * @param bool $nowrap do not break $put apart when wrapping
   * @return void
   */
  function out($put, $nowrap = false)
  {
      # not empty(): the text "0" is real content and must be written
      if ($put === null || $put === false || $put === '')
      {
          return;
      }
      $put = (string) $put;
      if (!empty($this->buffer))
      {
          $this->buffer[count($this->buffer) - 1] .= $put;
      }
      else
      {
          if ($this->bodyWidth && !$this->parser->keepWhitespace)
          { # wrap lines
              // get last line
              $pos = strrpos($this->output, "\n");
              if ($pos === false)
              {
                  $line = $this->output;
              }
              else
              {
                  $line = substr($this->output, $pos);
              }
              if ($nowrap)
              {
                  if ($put[0] != "\n" && $this->strlen($line) + $this->strlen($put) > $this->bodyWidth)
                  {
                      $this->output .= "\n" . $this->indent . $put;
                  }
                  else
                  {
                      $this->output .= $put;
                  }
                  return;
              }
              else
              {
                  $put .= "\n"; # make sure we get all lines in the while below
                  $lineLen = $this->strlen($line);
                  while ($pos = strpos($put, "\n"))
                  {
                      $putLine = substr($put, 0, $pos + 1);
                      $put = substr($put, $pos + 1);
                      $putLen = $this->strlen($putLine);
                      if ($lineLen + $putLen < $this->bodyWidth)
                      {
                          $this->output .= $putLine;
                          $lineLen = $putLen;
                      }
                      else
                      {
                          # fill up the current line at a word boundary, wrap the rest
                          $split = preg_split('#^(.{0,' . ($this->bodyWidth - $lineLen) . '})\b#', $putLine, 2, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_DELIM_CAPTURE);
                          $this->output .= rtrim($split[1][0]) . "\n" . $this->indent . $this->wordwrap(ltrim($split[2][0]), $this->bodyWidth, "\n" . $this->indent, false);
                          $this->output = rtrim($this->output, $this->indent);
                      }
                  }
                  # remove the newline appended above
                  $this->output = substr($this->output, 0, -1);
                  return;
              }
          }
          else
          {
              $this->output .= $put;
          }
      }
  }
  /**
   * Indentation currently in effect; out() prefixes it to the lines it
   * starts itself when wrapping.
   *
   * @var string
   */
  var $indent = '';
  /**
   * Indent on a start tag, unindent on the matching end tag.
   *
   * On a start tag $str is appended to $this->indent and, if $output is
   * true, also written to the output. On an end tag the last
   * strlen($str) bytes are removed from $this->indent again.
   *
   * @param string $str indentation
   * @param bool $output add indentation to output (start tag only)
   * @return void
   */
  function indent($str, $output = true)
  {
      if ($this->parser->isStartTag)
      {
          $this->indent .= $str;
          if ($output)
          {
              $this->out($str, true);
          }
          return;
      }

      // substr($s, 0, -0) is substr($s, 0, 0) and would wipe the whole
      // indentation, so only cut when there is something to cut
      $cut = strlen($str);
      if ($cut > 0)
      {
          $this->indent = (string) substr($this->indent, 0, -$cut);
      }
  }
  /**
   * Decode HTML entities in a string.
   *
   * On PHP 5 and later html_entity_decode() is asked for UTF-8 directly.
   * On older versions the text is decoded as ISO-8859-1 (or through the
   * flipped translation table when html_entity_decode() is missing) and
   * hexadecimal / decimal numeric entities are then converted to UTF-8
   * through _decode_hex() and _decode_numeric().
   *
   * @param string $text
   * @param int $quote_style quote handling flag passed to the decoding functions
   * @return string
   *
   * @author derernst@gmx.ch <http://www.php.net/manual/en/function.html-entity-decode.php#68536>
   * @author Milian Wolff <http://milianw.de>
   */
  function decode($text, $quote_style = ENT_QUOTES)
  {
      # UTF-8 is only supported in PHP 5.x.x and above
      if (version_compare(PHP_VERSION, '5', '>='))
      {
          return html_entity_decode($text, $quote_style, 'UTF-8');
      }

      // legacy path: named entities first ...
      if (!function_exists('html_entity_decode'))
      {
          static $entityMap;
          if (!isset($entityMap))
          {
              $entityMap = array_flip(get_html_translation_table(HTML_ENTITIES, $quote_style));
          }
          $text = strtr($text, $entityMap);
      }
      else
      {
          $text = html_entity_decode($text, $quote_style, 'ISO-8859-1');
      }

      // ... then numeric entities, hexadecimal before decimal
      $text = preg_replace_callback('~&#x([0-9a-f]+);~i', array(&$this, '_decode_hex'), $text);

      return preg_replace_callback('~&#(\d{2,5});~', array(&$this, '_decode_numeric'), $text);
  }
  /**
   * preg_replace_callback() handler used by decode(): turns one
   * hexadecimal entity into its UTF-8 character.
   *
   * @param array $matches regex match; index 1 holds the hex digits
   * @return string UTF-8 encoded
   */
  function _decode_hex($matches)
  {
      $codePoint = hexdec($matches[1]);

      return $this->unichr($codePoint);
  }
  /**
   * preg_replace_callback() handler used by decode(): turns one decimal
   * entity into its UTF-8 character.
   *
   * @param array $matches regex match; index 1 holds the decimal digits
   * @return string UTF-8 encoded
   */
  function _decode_numeric($matches)
  {
      $codePoint = $matches[1];

      return $this->unichr($codePoint);
  }
  /**
   * UTF-8 chr(): encode a Unicode code point as a UTF-8 byte sequence.
   *
   * Handles the full range up to U+10FFFF (one to four bytes). Values
   * above U+10FFFF cannot be represented in UTF-8 and yield the
   * replacement character U+FFFD.
   *
   * @author grey - greywyvern - com <http://www.php.net/manual/en/function.chr.php#55978>
   * @param int $dec code point (numeric strings are accepted)
   * @return string UTF-8 encoded
   */
  function unichr($dec)
  {
      // checked before the int cast: hexdec() returns a float for huge input
      if ($dec > 0x10FFFF)
      {
          return "\xEF\xBF\xBD";
      }
      $dec = (int) $dec;

      if ($dec < 0x80)
      {
          // 1 byte: 0xxxxxxx
          return chr($dec);
      }
      if ($dec < 0x800)
      {
          // 2 bytes: 110xxxxx 10xxxxxx
          return chr(0xC0 | ($dec >> 6))
              . chr(0x80 | ($dec & 0x3F));
      }
      if ($dec < 0x10000)
      {
          // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
          return chr(0xE0 | ($dec >> 12))
              . chr(0x80 | (($dec >> 6) & 0x3F))
              . chr(0x80 | ($dec & 0x3F));
      }

      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return chr(0xF0 | ($dec >> 18))
          . chr(0x80 | (($dec >> 12) & 0x3F))
          . chr(0x80 | (($dec >> 6) & 0x3F))
          . chr(0x80 | ($dec & 0x3F));
  }
  /**
   * UTF-8 strlen(): length of a string in characters rather than bytes.
   *
   * Uses mb_strlen() when the mbstring extension is available; otherwise
   * counts the bytes matching [\x00-\x7F\xC0-\xFD], i.e. ASCII bytes and
   * multi-byte lead bytes, skipping continuation bytes.
   *
   * @param string $str
   * @return int
   *
   * @author dtorop 932 at hotmail dot com <http://www.php.net/manual/en/function.strlen.php#37975>
   * @author Milian Wolff <http://milianw.de>
   */
  function strlen($str)
  {
      if (!function_exists('mb_strlen'))
      {
          // preg_match_all() returns the number of matches; the matches
          // themselves are not needed
          return preg_match_all('/[\x00-\x7F\xC0-\xFD]/', $str, $ignored);
      }

      return mb_strlen($str, 'UTF-8');
  }
  /**
   * wordwrap() for UTF-8 encoded strings.
   *
   * Repeatedly takes up to $width characters from the front of $str (a
   * multi-byte sequence counts as one character), appends $break after
   * each chunk and strips leading whitespace from what is left. Without
   * $cut a chunk has to end on a word boundary, and one directly
   * following punctuation mark (. ! ; : ? ,) is kept on the same line.
   * Whatever can no longer be matched is appended at the end.
   *
   * @param string $str text to wrap
   * @param integer $width maximum characters per chunk
   * @param string $break string appended after every chunk
   * @param bool $cut take exactly $width characters, ignoring word boundaries
   * @return string
   */
  function wordwrap($str, $width, $break, $cut = false)
  {
      $regexp = $cut
          ? '#^(?:[\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+){' . $width . '}#'
          : '#^(?:[\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+){1,' . $width . '}\b#';

      $wrapped = '';
      while (preg_match($regexp, $str, $matches))
      {
          $chunk = $matches[0];
          // byte-wise strlen()/substr() on purpose: $chunk is a byte prefix of $str
          $str = ltrim(substr($str, strlen($chunk)));

          // do not start the next line with a punctuation mark
          $nextIsPunctuation = isset($str[0]) && strpos('.!;:?,', $str[0]) !== false;
          if (!$cut && $nextIsPunctuation)
          {
              $chunk .= $str[0];
              $str = ltrim(substr($str, 1));
          }

          $wrapped .= $chunk . $break;
      }

      return $wrapped . ltrim($str);
  }
  /**
   * Tell whether $tagName is among the parser's currently open tags, i.e.
   * is an ancestor of the current node at any depth (not only the direct
   * parent). The lookup uses in_array()'s default loose comparison.
   *
   * @param string $tagName
   * @return bool
   */
  function hasParent($tagName)
  {
      $openTags = $this->parser->openTags;

      return in_array($tagName, $openTags);
  }
  /**
   * Get the tag name of the direct parent, i.e. the last entry of the
   * parser's open tag list. As this goes through end(), the list's
   * internal pointer is moved to that entry and false is returned when
   * the list is empty.
   *
   * @param void
   * @return string $tagName
   */
  function parent()
  {
      $innermost = end($this->parser->openTags);

      return $innermost;
  }
  1412. }