PageRenderTime 51ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 1ms

/app/classes/markdownify/markdownify.php

https://github.com/ISVGitHub/bolt
PHP | 1184 lines | 696 code | 18 blank | 470 comment | 193 complexity | 163b8f84bb809c6e6f2f8e1e8c04dd66 MD5 | raw file
Possible License(s): LGPL-2.1, GPL-2.0
  1. <?php
  2. /**
  3. * Markdownify converts HTML Markup to [Markdown][1] (by [John Gruber][2]). It
  4. * also supports [Markdown Extra][3] by [Michel Fortin][4] via Markdownify_Extra.
  5. *
  6. * It all started as `html2text.php` - a port of [Aaron Swartz'][5] [`html2text.py`][6] - but
  7. * got a long way since. This is far more than a mere port now!
  8. * Starting with version 2.0.0 this is a complete rewrite and cannot be
  9. * compared to Aaron Swartz' `html2text.py` any longer. I'm now using an HTML parser
  10. * (see `parsehtml.php` which I also wrote) which makes most of the evil
  11. * RegEx magic go away and additionally it gives a much cleaner class
  12. * structure. Also notable is the fact that I now try to prevent regressions by
  13. * utilizing testcases of Michel Fortin's [MDTest][7].
  14. *
  15. * [1]: http://daringfireball.net/projects/markdown
  16. * [2]: http://daringfireball.net/
  17. * [3]: http://www.michelf.com/projects/php-markdown/extra/
  18. * [4]: http://www.michelf.com/
  19. * [5]: http://www.aaronsw.com/
  20. * [6]: http://www.aaronsw.com/2002/html2text/
  21. * [7]: http://article.gmane.org/gmane.text.markdown.general/2540
  22. *
  23. * @version 2.0.0 alpha
  24. * @author Milian Wolff (<mail@milianw.de>, <http://milianw.de>)
  25. * @license LGPL, see LICENSE_LGPL.txt and the summary below
  26. * @copyright (C) 2007 Milian Wolff
  27. *
  28. * This library is free software; you can redistribute it and/or
  29. * modify it under the terms of the GNU Lesser General Public
  30. * License as published by the Free Software Foundation; either
  31. * version 2.1 of the License, or (at your option) any later version.
  32. *
  33. * This library is distributed in the hope that it will be useful,
  34. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  35. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  36. * Lesser General Public License for more details.
  37. *
  38. * You should have received a copy of the GNU Lesser General Public
  39. * License along with this library; if not, write to the Free Software
  40. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  41. */
  42. /**
  43. * HTML Parser, see http://sf.net/projects/parseHTML
  44. */
  45. require_once dirname(__FILE__).'/parsehtml.php';
  /**
   * Default configuration.
   *
   * These constants are the default argument values of the Markdownify
   * constructor. Each one is only defined when it does not exist yet, so an
   * application may define its own defaults before including this file
   * without triggering "constant already defined" notices.
   */
  if (!defined('MDFY_LINKS_EACH_PARAGRAPH')) {
      define('MDFY_LINKS_EACH_PARAGRAPH', false);
  }
  if (!defined('MDFY_BODYWIDTH')) {
      define('MDFY_BODYWIDTH', false);
  }
  if (!defined('MDFY_KEEPHTML')) {
      define('MDFY_KEEPHTML', true);
  }
  52. /**
  53. * HTML to Markdown converter class
  54. */
  55. class Markdownify {
  /**
   * HTML parser instance; created in the constructor.
   *
   * @var parseHTML
   */
  var $parser;

  /**
   * Markdown produced so far by the current parse run.
   *
   * @var string
   */
  var $output;

  /**
   * Stack of "tagName::open/tag/path" markers for start tags that could not
   * be converted to Markdown (maintained by isMarkdownable()).
   *
   * @var array<string>
   */
  var $notConverted = array();

  /**
   * False while converting normally; otherwise the "tagName::open/tag/path"
   * marker of the kept-HTML tag inside which conversion is suspended.
   *
   * @var bool|string
   */
  var $skipConversion = false;

  /* options */

  /**
   * Keep HTML tags which cannot be converted to Markdown.
   *
   * @var bool
   */
  var $keepHTML = false;

  /**
   * Width to wrap the output to; a falsy value disables wrapping.
   *
   * @var int|bool
   */
  var $bodyWidth = 0;

  /**
   * Requested widths must be greater than this value for wrapping to be
   * enabled.
   *
   * @var int
   */
  var $minBodyWidth = 25;

  /**
   * Emit the collected link references after each top-level block element
   * instead of only once at the end of the document.
   *
   * @var bool
   */
  var $linksAfterEachParagraph = false;
  /**
   * Constructor: stores the options, creates the HTML parser and compiles
   * the text-escaping rules into preg_replace() search/replace lists.
   *
   * @param bool $linksAfterEachParagraph whether or not to flush stacked links after each paragraph
   *             defaults to false
   * @param int|bool $bodyWidth width to wrap the output to; wrapping is only
   *             enabled for values greater than $minBodyWidth, defaults to false
   * @param bool $keepHTML whether to keep non markdownable HTML or to discard it
   *             defaults to true (HTML will be kept)
   * @return void
   */
  function __construct($linksAfterEachParagraph = MDFY_LINKS_EACH_PARAGRAPH, $bodyWidth = MDFY_BODYWIDTH, $keepHTML = MDFY_KEEPHTML) {
      $this->linksAfterEachParagraph = $linksAfterEachParagraph;
      $this->keepHTML = $keepHTML;
      if ($bodyWidth > $this->minBodyWidth) {
          $this->bodyWidth = intval($bodyWidth);
      } else {
          $this->bodyWidth = false;
      }
      $this->parser = new parseHTML;
      $this->parser->noTagsInCode = true;
      # we don't have to do this every time; the guard keeps a second
      # constructor call from re-processing the already compiled table
      if (!isset($this->escapeInText['search'])) {
          $search = array();
          $replace = array();
          foreach ($this->escapeInText as $s => $r) {
              # only match sequences which are not already backslash-escaped
              array_push($search, '#(?<!\\\)'.$s.'#U');
              array_push($replace, $r);
          }
          $this->escapeInText = array(
              'search' => $search,
              'replace' => $replace
          );
      }
  }

  /**
   * Old-style (PHP 4) constructor name.
   *
   * Kept so existing code calling Markdownify() explicitly (for example a
   * subclass constructor) keeps working; since PHP 8.0 a method named like
   * the class is no longer treated as a constructor, hence __construct().
   *
   * @param bool $linksAfterEachParagraph see __construct()
   * @param int|bool $bodyWidth see __construct()
   * @param bool $keepHTML see __construct()
   * @return void
   */
  function Markdownify($linksAfterEachParagraph = MDFY_LINKS_EACH_PARAGRAPH, $bodyWidth = MDFY_BODYWIDTH, $keepHTML = MDFY_KEEPHTML) {
      $this->__construct($linksAfterEachParagraph, $bodyWidth, $keepHTML);
  }
  /**
   * Convert an HTML string to Markdown.
   *
   * Hands the markup to the parser, runs parse() and returns what was
   * collected in $this->output.
   *
   * @param string $html
   * @return string markdown formatted
   */
  function parseString($html) {
      $this->parser->html = $html;
      $this->parse();
      $markdown = $this->output;
      return $markdown;
  }
  /**
   * Tags which can be expressed in Markdown, mapped to the attributes they
   * may carry. An attribute is either 'required' or 'optional'; see
   * isMarkdownable() for how the map is evaluated.
   *
   * @var array<string, array<string, string>>
   */
  var $isMarkdownable = array(
      # block level
      'p' => array(),
      'ul' => array(),
      'ol' => array(),
      'li' => array(),
      'br' => array(),
      'blockquote' => array(),
      'code' => array(),
      'pre' => array(),
      # links
      'a' => array(
          'href' => 'required',
          'title' => 'optional',
      ),
      # emphasis
      'strong' => array(),
      'b' => array(),
      'em' => array(),
      'i' => array(),
      # images
      'img' => array(
          'src' => 'required',
          'alt' => 'optional',
          'title' => 'optional',
      ),
      # headers and rules
      'h1' => array(),
      'h2' => array(),
      'h3' => array(),
      'h4' => array(),
      'h5' => array(),
      'h6' => array(),
      'hr' => array(),
  );
  /**
   * Tags which are skipped by parse() while their contents are still
   * processed.
   *
   * @var array<string>
   */
  var $ignore = array(
      'html',
      'body',
  );

  /**
   * Tags which parse() removes together with everything between the start
   * and end tag before the document is processed.
   *
   * @var array<string>
   */
  var $drop = array(
      'script',
      'head',
      'style',
      'form',
      'area',
      'object',
      'param',
      'iframe',
  );
  /**
   * Markdown indents which could be wrapped.
   * @note: use strings in regex format
   *
   * @var array<string>
   */
  var $wrappableIndents = array(
      '\* ', # ul
      '\d. ', # ol
      '\d\d. ', # ol
      '> ', # blockquote
      '', # p
  );

  /**
   * Character sequences which have to be escaped in normal text, as
   * "regex => replacement" pairs.
   * @note: use strings in regex format
   *
   * The constructor replaces this map by an array with the keys 'search'
   * (complete patterns) and 'replace', which handleText() passes to
   * preg_replace().
   *
   * NOTE(review): "(?!<_| )" in the two underscore rules looks like it was
   * meant to be the lookbehind "(?<!_| )" - confirm before changing.
   *
   * @var array
   *
   * TODO: what's with block chars / sequences at the beginning of a block?
   */
  var $escapeInText = array(
      '([-*_])([ ]{0,2}\1){2,}' => '\\\\$0|', # hr
      '\*\*([^*\s]+)\*\*' => '\*\*$1\*\*', # strong
      '\*([^*\s]+)\*' => '\*$1\*', # em
      '__(?! |_)(.+)(?!<_| )__' => '\_\_$1\_\_', # em
      '_(?! |_)(.+)(?!<_| )_' => '\_$1\_', # em
      '`(.+)`' => '\`$1\`', # code
      '\[(.+)\](\s*\()' => '\[$1\]$2', # links: [text] (url) => [text\] (url)
      '\[(.+)\](\s*)\[(.*)\]' => '\[$1\]$2\[$3\]', # links: [text][id] => [text\][id\]
  );
  /**
   * True when the node processed last was the start tag of a block element
   * (updated by parse() after every node).
   *
   * @var bool
   */
  var $lastWasBlockTag = false;

  /**
   * Name of the converted tag that was closed last; reset to '' by text
   * nodes and by tags which are kept as HTML.
   *
   * @var string
   */
  var $lastClosedTag = '';
  /**
   * Iterate through the parser's nodes and decide what to do with each one.
   *
   * Dropped tags are removed up front, then every node is dispatched by
   * type: doctypes are skipped, processing instructions and comments are
   * kept only with $keepHTML, text goes to handleText() and tags go either to
   * their handleTag_*() method or to handleTagToText(). Afterwards the
   * output is cleaned up and the stacked link references are flushed.
   *
   * @param void
   * @return void
   */
  function parse() {
      $this->output = '';
      # drop tags
      $this->parser->html = preg_replace('#<('.implode('|', $this->drop).')[^>]*>.*</\\1>#sU', '', $this->parser->html);
      while ($this->parser->nextNode()) {
          switch ($this->parser->nodeType) {
              case 'doctype':
                  break;
              case 'pi':
              case 'comment':
                  if ($this->keepHTML) {
                      $this->flushLinebreaks();
                      $this->out($this->parser->node);
                      $this->setLineBreaks(2);
                  }
                  # else drop
                  break;
              case 'text':
                  $this->handleText();
                  break;
              case 'tag':
                  if (in_array($this->parser->tagName, $this->ignore)) {
                      break;
                  }
                  if ($this->parser->isStartTag) {
                      $this->flushLinebreaks();
                  }
                  if ($this->skipConversion) {
                      $this->isMarkdownable(); # update notConverted
                      $this->handleTagToText();
                      # "continue" inside a switch behaves like "break" and
                      # raises a warning since PHP 7.3, so say what is meant;
                      # $lastWasBlockTag below is still updated as before
                      break;
                  }
                  if (!$this->parser->keepWhitespace && $this->parser->isBlockElement && $this->parser->isStartTag) {
                      $this->parser->html = ltrim($this->parser->html);
                  }
                  if ($this->isMarkdownable()) {
                      if ($this->parser->isBlockElement && $this->parser->isStartTag && !$this->lastWasBlockTag && !empty($this->output)) {
                          # make sure a block element starts on its own line
                          if (!empty($this->buffer)) {
                              $str =& $this->buffer[count($this->buffer) -1];
                          } else {
                              $str =& $this->output;
                          }
                          if (substr($str, -strlen($this->indent)-1) != "\n".$this->indent) {
                              $str .= "\n".$this->indent;
                          }
                      }
                      $func = 'handleTag_'.$this->parser->tagName;
                      $this->$func();
                      if ($this->linksAfterEachParagraph && $this->parser->isBlockElement && !$this->parser->isStartTag && empty($this->parser->openTags)) {
                          $this->flushStacked();
                      }
                      if (!$this->parser->isStartTag) {
                          $this->lastClosedTag = $this->parser->tagName;
                      }
                  } else {
                      $this->handleTagToText();
                      $this->lastClosedTag = '';
                  }
                  break;
              default:
                  trigger_error('invalid node type', E_USER_ERROR);
                  break;
          }
          $this->lastWasBlockTag = $this->parser->nodeType == 'tag' && $this->parser->isStartTag && $this->parser->isBlockElement;
      }
      if (!empty($this->buffer)) {
          trigger_error('buffer was not flushed, this is a bug. please report!', E_USER_WARNING);
          while (!empty($this->buffer)) {
              $this->out($this->unbuffer());
          }
      }
      ### cleanup
      $this->output = rtrim(str_replace('&amp;', '&', str_replace('&lt;', '<', str_replace('&gt;', '>', $this->output))));
      # end parsing, flush stacked tags
      $this->flushStacked();
      $this->stack = array();
  }
  /**
   * Check whether the parser's current tag can be converted to Markdown.
   *
   * Start tags are rejected when a required attribute is missing or, with
   * $keepHTML, when they carry attributes Markdown cannot express; rejected
   * start tags are remembered in $notConverted. An end tag is rejected when
   * it matches the marker on top of $notConverted, which is then removed.
   *
   * @param void
   * @return bool
   */
  function isMarkdownable() {
      $tagName = $this->parser->tagName;
      if (!isset($this->isMarkdownable[$tagName])) {
          # simply not markdownable
          return false;
      }
      if (!$this->parser->isStartTag) {
          if (!empty($this->notConverted) && end($this->notConverted) === $tagName.'::'.implode('/', $this->parser->openTags)) {
              # the matching start tag was not converted either
              array_pop($this->notConverted);
              return false;
          }
          return true;
      }
      $allowed = $this->isMarkdownable[$tagName];
      $convertible = true;
      if ($this->keepHTML) {
          $unsupported = array_diff(array_keys($this->parser->tagAttributes), array_keys($allowed));
          if (!empty($unsupported)) {
              # non markdownable attributes given
              $convertible = false;
          }
      }
      if ($convertible) {
          foreach ($allowed as $attr => $type) {
              if ($type == 'required' && !isset($this->parser->tagAttributes[$attr])) {
                  # required markdown attribute not given
                  $convertible = false;
                  break;
              }
          }
      }
      if (!$convertible) {
          array_push($this->notConverted, $tagName.'::'.implode('/', $this->parser->openTags));
      }
      return $convertible;
  }
  /**
   * Output all stacked tags.
   *
   * For every tag name with a non-empty stack the matching
   * flushStacked_<tag>() method is called. stack() is also used for tags
   * without such a method (ul/ol), so names without a flush method are
   * skipped instead of attempting to call an undefined method.
   *
   * @param void
   * @return void
   */
  function flushStacked() {
      foreach ($this->stack as $tag => $entries) {
          if (empty($entries)) {
              continue;
          }
          $method = 'flushStacked_'.$tag;
          if (method_exists($this, $method)) {
              $this->$method();
          }
      }
  }
  /**
   * Output link references (e.g. [1]: http://example.com "title").
   *
   * Every entry of the 'a' stack which has not been written yet is emitted
   * on its own line and then flagged as 'unstacked'. The first reference of
   * a run is preceded by a blank line.
   *
   * @param void
   * @return void
   */
  function flushStacked_a() {
      $first = true;
      foreach ($this->stack['a'] as $index => $link) {
          if (isset($link['unstacked'])) {
              # already written by an earlier flush
              continue;
          }
          if ($first) {
              $first = false;
              $this->out("\n\n", true);
          } else {
              $this->out("\n", true);
          }
          $title = isset($link['title']) ? ' "'.$link['title'].'"' : '';
          $this->out(' ['.$link['linkID'].']: '.$link['href'].$title, true);
          $link['unstacked'] = true;
          $this->stack['a'][$index] = $link;
      }
  }
  /**
   * Write the line breaks enqueued via setLineBreaks() and reset the
   * counter. Nothing is written while the output is still empty.
   *
   * @param void
   * @return void
   */
  function flushLinebreaks() {
      $pending = $this->lineBreaks;
      if ($pending && !empty($this->output)) {
          $this->out(str_repeat("\n".$this->indent, $pending), true);
      }
      $this->lineBreaks = 0;
  }
  /**
   * Handle tags which are not converted to Markdown.
   *
   * Without $keepHTML the tag is dropped; closing block elements still
   * request a paragraph break. With $keepHTML the tag is written out as
   * HTML: conversion is suspended until the matching end tag, block elements
   * are put on their own (indented) lines, and the contents of <code>/<pre>
   * are buffered so their &lt;/&gt; entities can be protected from the
   * cleanup step in parse().
   *
   * @param void
   * @return void
   */
  function handleTagToText() {
      # indentation which was active when a raw <pre> was opened; restored at
      # its end tag. Declared once: declaring the same static variable twice
      # in one function is a compile error as of PHP 8.3.
      static $indent;
      if (!$this->keepHTML) {
          if (!$this->parser->isStartTag && $this->parser->isBlockElement) {
              $this->setLineBreaks(2);
          }
      } else {
          # dont convert to markdown inside this tag
          /** TODO: markdown extra **/
          if (!$this->parser->isEmptyTag) {
              if ($this->parser->isStartTag) {
                  if (!$this->skipConversion) {
                      $this->skipConversion = $this->parser->tagName.'::'.implode('/', $this->parser->openTags);
                  }
              } else {
                  if ($this->skipConversion == $this->parser->tagName.'::'.implode('/', $this->parser->openTags)) {
                      $this->skipConversion = false;
                  }
              }
          }
          if ($this->parser->isBlockElement) {
              if ($this->parser->isStartTag) {
                  if (in_array($this->parent(), array('ins', 'del'))) {
                      # looks like ins or del are block elements now
                      $this->out("\n", true);
                      $this->indent(' ');
                  }
                  if ($this->parser->tagName != 'pre') {
                      $this->out($this->parser->node."\n".$this->indent);
                      if (!$this->parser->isEmptyTag) {
                          $this->indent(' ');
                      } else {
                          $this->setLineBreaks(1);
                      }
                      $this->parser->html = ltrim($this->parser->html);
                  } else {
                      # don't indent inside <pre> tags
                      $this->out($this->parser->node);
                      $indent = $this->indent;
                      $this->indent = '';
                  }
              } else {
                  if (!$this->parser->keepWhitespace) {
                      $this->output = rtrim($this->output);
                  }
                  if ($this->parser->tagName != 'pre') {
                      $this->indent(' ');
                      $this->out("\n".$this->indent.$this->parser->node);
                  } else {
                      # reset indentation
                      $this->out($this->parser->node);
                      $this->indent = $indent;
                  }
                  if (in_array($this->parent(), array('ins', 'del'))) {
                      # ins or del was block element
                      $this->out("\n");
                      $this->indent(' ');
                  }
                  if ($this->parser->tagName == 'li') {
                      $this->setLineBreaks(1);
                  } else {
                      $this->setLineBreaks(2);
                  }
              }
          } else {
              $this->out($this->parser->node);
          }
          if (in_array($this->parser->tagName, array('code', 'pre'))) {
              if ($this->parser->isStartTag) {
                  $this->buffer();
              } else {
                  # add stuff so cleanup just reverses this
                  $this->out(str_replace('&lt;', '&amp;lt;', str_replace('&gt;', '&amp;gt;', $this->unbuffer())));
              }
          }
      }
  }
  /**
   * Handle plain text nodes.
   *
   * Inside <pre> every new line gets the current indentation. Outside of
   * <code>/<pre> entities are decoded and, unless conversion is suspended,
   * Markdown-significant sequences are escaped. Inside <code>/<pre> only
   * the quote entities are replaced.
   *
   * @param void
   * @return void
   */
  function handleText() {
      if ($this->hasParent('pre') && strpos($this->parser->node, "\n") !== false) {
          $this->parser->node = str_replace("\n", "\n".$this->indent, $this->parser->node);
      }
      if (!$this->hasParent('code') && !$this->hasParent('pre')) {
          # entity decode
          $this->parser->node = $this->decode($this->parser->node);
          if (!$this->skipConversion) {
              # escape some chars in normal Text
              $this->parser->node = preg_replace($this->escapeInText['search'], $this->escapeInText['replace'], $this->parser->node);
          }
      } else {
          # the entity is "&apos;" - without the semicolon in the search
          # string a stray ";" was left behind after every apostrophe
          $this->parser->node = str_replace(array('&quot;', '&apos;'), array('"', '\''), $this->parser->node);
      }
      $this->out($this->parser->node);
      $this->lastClosedTag = '';
  }
  /**
   * Handle <em> tags: start and end tag both emit a single asterisk.
   *
   * @param void
   * @return void
   */
  function handleTag_em() {
      $marker = '*';
      $this->out($marker, true);
  }

  /**
   * Handle <i> tags exactly like <em>.
   *
   * @param void
   * @return void
   */
  function handleTag_i() {
      $this->handleTag_em();
  }
  /**
   * Handle <strong> tags: start and end tag both emit a double asterisk.
   *
   * @param void
   * @return void
   */
  function handleTag_strong() {
      $marker = '**';
      $this->out($marker, true);
  }

  /**
   * Handle <b> tags exactly like <strong>.
   *
   * @param void
   * @return void
   */
  function handleTag_b() {
      $this->handleTag_strong();
  }
  /**
   * Handle <h1> tags (delegates to handleHeader() with level 1).
   *
   * @param void
   * @return void
   */
  function handleTag_h1() {
      $level = 1;
      $this->handleHeader($level);
  }

  /**
   * Handle <h2> tags (delegates to handleHeader() with level 2).
   *
   * @param void
   * @return void
   */
  function handleTag_h2() {
      $level = 2;
      $this->handleHeader($level);
  }

  /**
   * Handle <h3> tags (delegates to handleHeader() with level 3).
   *
   * @param void
   * @return void
   */
  function handleTag_h3() {
      $level = 3;
      $this->handleHeader($level);
  }

  /**
   * Handle <h4> tags (delegates to handleHeader() with level 4).
   *
   * @param void
   * @return void
   */
  function handleTag_h4() {
      $level = 4;
      $this->handleHeader($level);
  }

  /**
   * Handle <h5> tags (delegates to handleHeader() with level 5).
   *
   * @param void
   * @return void
   */
  function handleTag_h5() {
      $level = 5;
      $this->handleHeader($level);
  }

  /**
   * Handle <h6> tags (delegates to handleHeader() with level 6).
   *
   * @param void
   * @return void
   */
  function handleTag_h6() {
      $level = 6;
      $this->handleHeader($level);
  }
  /**
   * Number of line breaks enqueued by setLineBreaks(); written and reset
   * by flushLinebreaks().
   *
   * @var int
   */
  var $lineBreaks = 0;
  /**
   * Handle header tags (<h1> - <h6>) in atx style.
   *
   * The start tag writes $level hash characters followed by a space, the
   * end tag requests a paragraph break.
   *
   * @param int $level 1-6
   * @return void
   */
  function handleHeader($level) {
      if (!$this->parser->isStartTag) {
          $this->setLineBreaks(2);
          return;
      }
      $prefix = str_repeat('#', $level).' ';
      $this->out($prefix, true);
  }
  /**
   * Handle <p> tags: nothing is written for the start tag, the end tag
   * requests a paragraph break.
   *
   * @param void
   * @return void
   */
  function handleTag_p() {
      if ($this->parser->isStartTag) {
          return;
      }
      $this->setLineBreaks(2);
  }
  /**
   * Handle <a> tags.
   *
   * The start tag opens a buffer for the link text and stacks the decoded
   * attributes. The end tag writes, depending on the collected data, an
   * empty link "[text]()", an automatic link "<url>" / "<mail@address>" or
   * a reference link "[text][id]" whose definition is kept on the 'a' stack
   * until flushStacked_a() emits it.
   *
   * @param void
   * @return void
   */
  function handleTag_a() {
      if ($this->parser->isStartTag) {
          $this->buffer();
          if (isset($this->parser->tagAttributes['title'])) {
              $this->parser->tagAttributes['title'] = $this->decode($this->parser->tagAttributes['title']);
          } else {
              $this->parser->tagAttributes['title'] = null;
          }
          $this->parser->tagAttributes['href'] = $this->decode(trim($this->parser->tagAttributes['href']));
          $this->stack();
          return;
      }
      $tag = $this->unstack();
      $buffer = $this->unbuffer();
      if (empty($tag['href']) && empty($tag['title'])) {
          # empty links... testcase mania, who would possibly do anything like that?!
          $this->out('['.$buffer.']()', true);
          return;
      }
      # strict comparison: with "==" two different numeric-looking strings
      # (e.g. "0123" and "123") would be treated as the same URL
      if ($buffer === $tag['href'] && empty($tag['title'])) {
          # <http://example.com>
          $this->out('<'.$buffer.'>', true);
          return;
      }
      $bufferDecoded = $this->decode(trim($buffer));
      if (substr($tag['href'], 0, 7) == 'mailto:' && 'mailto:'.$bufferDecoded === $tag['href']) {
          if (is_null($tag['title'])) {
              # <mail@example.com>
              $this->out('<'.$bufferDecoded.'>', true);
              return;
          }
          # [mail@example.com][1]
          # ...
          # [1]: mailto:mail@example.com Title
          $tag['href'] = 'mailto:'.$bufferDecoded;
      }
      # [This link][id] - reuse the id of an identical, already known reference
      foreach ($this->stack['a'] as $tag2) {
          if ($tag2['href'] === $tag['href'] && $tag2['title'] === $tag['title']) {
              $tag['linkID'] = $tag2['linkID'];
              break;
          }
      }
      if (!isset($tag['linkID'])) {
          $tag['linkID'] = count($this->stack['a']) + 1;
          array_push($this->stack['a'], $tag);
      }
      $this->out('['.$buffer.']['.$tag['linkID'].']', true);
  }
  /**
   * Handle <img /> tags.
   *
   * Images are written as reference-style "![alt][id]"; the reference
   * (src and title) shares the 'a' stack with the link references so
   * flushStacked_a() emits it. Images without src are written inline as
   * "![alt](title)".
   *
   * NOTE(review): a reference pushed here while an <a> is still open ends
   * up on top of that <a>'s stacked attributes, so the following </a> pops
   * the wrong entry. Fixing this needs a coordinated change in
   * handleTag_a() - verify with a linked-image test case.
   *
   * @param void
   * @return void
   */
  function handleTag_img() {
      if (!$this->parser->isStartTag) {
          return; # just to be sure this is really an empty tag...
      }
      if (isset($this->parser->tagAttributes['title'])) {
          $this->parser->tagAttributes['title'] = $this->decode($this->parser->tagAttributes['title']);
      } else {
          $this->parser->tagAttributes['title'] = null;
      }
      if (isset($this->parser->tagAttributes['alt'])) {
          $this->parser->tagAttributes['alt'] = $this->decode($this->parser->tagAttributes['alt']);
      } else {
          $this->parser->tagAttributes['alt'] = null;
      }
      if (empty($this->parser->tagAttributes['src'])) {
          # support for "empty" images... dunno if this is really needed
          # but there are some testcases which do that...
          if (!empty($this->parser->tagAttributes['title'])) {
              $this->parser->tagAttributes['title'] = ' '.$this->parser->tagAttributes['title'].' ';
          }
          $this->out('!['.$this->parser->tagAttributes['alt'].']('.$this->parser->tagAttributes['title'].')', true);
          return;
      } else {
          $this->parser->tagAttributes['src'] = $this->decode($this->parser->tagAttributes['src']);
      }
      # [This link][id]
      $link_id = false;
      if (!empty($this->stack['a'])) {
          foreach ($this->stack['a'] as $tag) {
              # entries of a still open <a> carry no linkID yet; skipping
              # them avoids an undefined index and yields the same result
              if (!isset($tag['linkID'])) {
                  continue;
              }
              # strict comparison so different numeric-looking strings are
              # not mistaken for the same URL
              if ($tag['href'] === $this->parser->tagAttributes['src']
                  && $tag['title'] === $this->parser->tagAttributes['title']) {
                  $link_id = $tag['linkID'];
                  break;
              }
          }
      } else {
          $this->stack['a'] = array();
      }
      if (!$link_id) {
          $link_id = count($this->stack['a']) + 1;
          $tag = array(
              'href' => $this->parser->tagAttributes['src'],
              'linkID' => $link_id,
              'title' => $this->parser->tagAttributes['title']
          );
          array_push($this->stack['a'], $tag);
      }
      $this->out('!['.$this->parser->tagAttributes['alt'].']['.$link_id.']', true);
  }
  /**
   * Handle <code> tags (inline code spans).
   *
   * The contents are buffered and wrapped in the shortest run of backticks
   * which does not occur inside them. Contents starting or ending with a
   * backtick are padded with a space. <code> inside <pre> is ignored here.
   *
   * @param void
   * @return void
   */
  function handleTag_code() {
      if ($this->hasParent('pre')) {
          # ignore code blocks inside <pre>
          return;
      }
      if ($this->parser->isStartTag) {
          $this->buffer();
          return;
      }
      $buffer = (string) $this->unbuffer();
      # use as many backticks as needed
      preg_match_all('#`+#', $buffer, $matches);
      $ticks = '`';
      while (in_array($ticks, $matches[0])) {
          $ticks .= '`';
      }
      # an empty <code></code> has no first character to look at
      if ($buffer !== '' && ($buffer[0] == '`' || substr($buffer, -1) == '`')) {
          $buffer = ' '.$buffer.' ';
      }
      $this->out($ticks.$buffer.$ticks, true);
  }
  /**
   * Handle <pre> tags (Markdown code blocks).
   *
   * With $keepHTML a <pre> which is not directly followed by a plain <code>
   * is kept as HTML. Otherwise the block is indented by four spaces, which
   * is the minimum indentation Markdown requires for a code block, and the
   * end tag requests a paragraph break.
   *
   * @param void
   * @return void
   */
  function handleTag_pre() {
      if ($this->keepHTML && $this->parser->isStartTag) {
          # check if a simple <code> follows
          if (!preg_match('#^\s*<code\s*>#Us', $this->parser->html)) {
              # this is no standard markdown code block
              $this->handleTagToText();
              return;
          }
      }
      # the same string is used for start and end tag so indent() removes
      # exactly what it added
      $this->indent('    ');
      if (!$this->parser->isStartTag) {
          $this->setLineBreaks(2);
      } else {
          $this->parser->html = ltrim($this->parser->html);
      }
  }
  /**
   * Handle <blockquote> tags: the start tag adds "> " to the indentation
   * (and writes it), the end tag removes it again.
   *
   * @param void
   * @return void
   */
  function handleTag_blockquote() {
      $quotePrefix = '> ';
      $this->indent($quotePrefix);
  }
  /**
   * Handle <ul> tags.
   *
   * The start tag stacks the list; when HTML is not kept and the tag closed
   * last was a list of the same kind, an empty HTML comment is written to
   * keep the two lists apart. The end tag unstacks the list and requests a
   * paragraph break unless the list is nested in an <li> which is not
   * followed by a <p> or <blockquote>.
   *
   * @param void
   * @return void
   */
  function handleTag_ul() {
      if ($this->parser->isStartTag) {
          $this->stack();
          $followsSameList = ($this->lastClosedTag == $this->parser->tagName);
          if (!$this->keepHTML && $followsSameList) {
              $this->out("\n".$this->indent.'<!-- -->'."\n".$this->indent."\n".$this->indent);
          }
          return;
      }
      $this->unstack();
      $nestedInItem = ($this->parent() == 'li');
      if (!$nestedInItem || preg_match('#^\s*(</li\s*>\s*<li\s*>\s*)?<(p|blockquote)\s*>#sU', $this->parser->html)) {
          # dont make Markdown add unneeded paragraphs
          $this->setLineBreaks(2);
      }
  }
  /**
   * Handle <ol> tags.
   *
   * Works like handleTag_ul(); additionally a 'num' counter starting at 0
   * is added to the tag attributes, which handleTag_li() increments for
   * every item of the stacked list.
   *
   * @param void
   * @return void
   */
  function handleTag_ol() {
      # item counter, stacked together with the other attributes
      $this->parser->tagAttributes['num'] = 0;
      $this->handleTag_ul();
  }
  /**
   * Handle <li> tags.
   *
   * Items of an <ol> are numbered using the counter of the stacked list and
   * padded to a four column wide marker; all other items get "* ". The
   * contents of an item are indented by four spaces, which is what Markdown
   * requires for nested blocks and nested lists inside a list item. The end
   * tag requests a single line break.
   *
   * @param void
   * @return void
   */
  function handleTag_li() {
      if ($this->parent() == 'ol') {
          $parent =& $this->getStacked('ol');
          if ($this->parser->isStartTag) {
              $parent['num']++;
              $this->out($parent['num'].'.'.str_repeat(' ', 3 - strlen($parent['num'])), true);
          }
          $this->indent('    ', false);
      } else {
          if ($this->parser->isStartTag) {
              $this->out('* ', true);
          }
          $this->indent('    ', false);
      }
      if (!$this->parser->isStartTag) {
          $this->setLineBreaks(1);
      }
  }
  /**
   * Handle <hr /> tags: writes "* * *" followed by a paragraph break.
   *
   * @param void
   * @return void
   */
  function handleTag_hr() {
      if ($this->parser->isStartTag) {
          $this->out('* * *', true);
          $this->setLineBreaks(2);
      }
      # end tags are ignored, just to be sure this really is an empty tag
  }
  /**
   * Handle <br /> tags.
   *
   * Markdown only produces a line break when the line ends with two or more
   * spaces, so two spaces are written before the new line. Whitespace
   * following the tag is removed from the remaining HTML.
   *
   * @param void
   * @return void
   */
  function handleTag_br() {
      $this->out("  \n".$this->indent, true);
      $this->parser->html = ltrim($this->parser->html);
  }
  /**
   * Node stacks, keyed by tag name; each entry holds the attributes of a
   * stacked tag (see stack() and unstack()).
   *
   * @var array<array>
   */
  var $stack = array();
  /**
   * Push the attributes of the parser's current tag onto the stack kept
   * for its tag name, creating that stack when it does not exist yet.
   *
   * @param void
   * @return void
   */
  function stack() {
      $name = $this->parser->tagName;
      if (!isset($this->stack[$name])) {
          $this->stack[$name] = array();
      }
      array_push($this->stack[$name], $this->parser->tagAttributes);
  }
  /**
   * Pop the entry stacked last for the parser's current tag name.
   *
   * Raises an E_USER_ERROR when no stack exists for that tag name.
   *
   * @param void
   * @return array
   */
  function unstack() {
      $name = $this->parser->tagName;
      $usable = isset($this->stack[$name]) && is_array($this->stack[$name]);
      if (!$usable) {
          trigger_error('Trying to unstack from empty stack. This must not happen.', E_USER_ERROR);
      }
      return array_pop($this->stack[$name]);
  }
  /**
   * Get the entry stacked last for $tagName, by reference, so the caller
   * can modify the stacked data in place.
   *
   * @param string $tagName
   * @return array
   */
  function & getStacked($tagName) {
      // index the last element directly: end() would hand back a copy
      $last = count($this->stack[$tagName]) - 1;
      return $this->stack[$tagName][$last];
  }
  /**
   * Request at least $number line breaks before the next start tag; a
   * larger number requested earlier is kept.
   *
   * @param int $number
   * @return void
   */
  function setLineBreaks($number) {
      if ($number > $this->lineBreaks) {
          $this->lineBreaks = $number;
      }
  }
  /**
   * Stack of open output buffers; out() appends to the last one while any
   * buffer is open.
   *
   * @var array<string>
   */
  var $buffer = array();
  /**
   * Open a new, empty buffer which collects all output until unbuffer()
   * is called.
   *
   * @param void
   * @return void
   */
  function buffer() {
      $this->buffer[] = '';
  }
  /**
   * Close the buffer opened last and return what it collected.
   *
   * @param void
   * @return string
   */
  function unbuffer() {
      $collected = array_pop($this->buffer);
      return $collected;
  }
  /**
   * Append a string to the correct target: the buffer opened last or, when
   * no buffer is open, $this->output. Direct output is wrapped to
   * $bodyWidth unless wrapping is disabled or the parser keeps whitespace.
   *
   * @param string $put text to append
   * @param bool $nowrap true: never split $put, only move it to a new line
   *             as a whole when it does not fit on the current one
   * @return void
   */
  function out($put, $nowrap = false) {
      # only skip really empty strings; empty() would also drop the text "0"
      if ((string) $put === '') {
          return;
      }
      if (!empty($this->buffer)) {
          $this->buffer[count($this->buffer) - 1] .= $put;
          return;
      }
      if (!$this->bodyWidth || $this->parser->keepWhitespace) {
          $this->output .= $put;
          return;
      }
      # wrap lines
      // get last line
      $pos = strrpos($this->output, "\n");
      if ($pos === false) {
          $line = $this->output;
      } else {
          $line = substr($this->output, $pos);
      }
      if ($nowrap) {
          if ($put[0] != "\n" && $this->strlen($line) + $this->strlen($put) > $this->bodyWidth) {
              $this->output .= "\n".$this->indent.$put;
          } else {
              $this->output .= $put;
          }
          return;
      }
      $put .= "\n"; # make sure we get all lines in the while below
      $lineLen = $this->strlen($line);
      # compare with false: a line break at offset 0 (leading or blank line)
      # is a valid position and must not end the loop, otherwise the rest of
      # $put is lost
      while (($pos = strpos($put, "\n")) !== false) {
          $putLine = substr($put, 0, $pos+1);
          $put = substr($put, $pos+1);
          $putLen = $this->strlen($putLine);
          if ($lineLen + $putLen < $this->bodyWidth) {
              $this->output .= $putLine;
              $lineLen = $putLen;
              continue;
          }
          # never build a negative quantifier when the current line is
          # already longer than the body width
          $room = max(0, $this->bodyWidth - $lineLen);
          $split = preg_split('#^(.{0,'.$room.'})\b#', $putLine, 2, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_DELIM_CAPTURE);
          if (!is_array($split) || count($split) < 3) {
              # no word boundary to break at: keep the line as it is
              $this->output .= $putLine;
              $lineLen = $putLen;
              continue;
          }
          $this->output .= rtrim($split[1][0])."\n".$this->indent.$this->wordwrap(ltrim($split[2][0]), $this->bodyWidth, "\n".$this->indent, false);
      }
      # remove the line break appended above
      # NOTE(review): wordwrap() ltrim()s its remainder and may already have
      # consumed that line break - confirm the last character is not lost then
      $this->output = substr($this->output, 0, -1);
  }
  /**
   * Current indentation: the string written after every line break inside
   * the blocks which are currently open.
   *
   * @var string
   */
  var $indent = '';
  /**
   * Indent the following output (start tag) or unindent (end tag).
   *
   * For a start tag $str is appended to the current indentation and, if
   * requested, written out; for an end tag as many characters as $str is
   * long are removed from the end of the indentation.
   *
   * @param string $str indentation
   * @param bool $output add indentation to output
   * @return void
   */
  function indent($str, $output = true) {
      if (!$this->parser->isStartTag) {
          $this->indent = substr($this->indent, 0, -strlen($str));
          return;
      }
      $this->indent .= $str;
      if ($output) {
          $this->out($str, true);
      }
  }
  /**
   * Decode HTML entities.
   *
   * On PHP 5 and above html_entity_decode() is used with UTF-8. Older
   * versions decode to ISO-8859-1 (or via the translation table when the
   * function is missing) and convert numeric entities with the
   * _decode_hex() / _decode_numeric() callbacks.
   *
   * @author derernst@gmx.ch <http://www.php.net/manual/en/function.html-entity-decode.php#68536>
   * @author Milian Wolff <http://milianw.de>
   *
   * @param string $text
   * @param int $quote_style one of the ENT_* quote flags
   * @return string
   */
  function decode($text, $quote_style = ENT_QUOTES) {
      if (version_compare(PHP_VERSION, '5', '>=')) {
          # UTF-8 is only supported in PHP 5.x.x and above
          return html_entity_decode($text, $quote_style, 'UTF-8');
      }
      if (function_exists('html_entity_decode')) {
          $text = html_entity_decode($text, $quote_style, 'ISO-8859-1');
      } else {
          # build the reverse translation table only once
          static $trans_tbl;
          if (!isset($trans_tbl)) {
              $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, $quote_style));
          }
          $text = strtr($text, $trans_tbl);
      }
      $text = preg_replace_callback('~&#x([0-9a-f]+);~i', array(&$this, '_decode_hex'), $text);
      $text = preg_replace_callback('~&#(\d{2,5});~', array(&$this, '_decode_numeric'), $text);
      return $text;
  }
  /**
   * Callback for decode() which converts a hexadecimal entity to UTF-8.
   *
   * @param array $matches regex match; index 1 holds the hex digits
   * @return string UTF-8 encoded
   */
  function _decode_hex($matches) {
      $codePoint = hexdec($matches[1]);
      return $this->unichr($codePoint);
  }

  /**
   * Callback for decode() which converts a numerical entity to UTF-8.
   *
   * @param array $matches regex match; index 1 holds the decimal digits
   * @return string UTF-8 encoded
   */
  function _decode_numeric($matches) {
      $codePoint = $matches[1];
      return $this->unichr($codePoint);
  }
  /**
   * UTF-8 chr(): encode a Unicode code point as a UTF-8 byte sequence.
   *
   * Code points from U+10000 upwards are encoded with four bytes; they were
   * previously squeezed into an invalid three byte sequence.
   *
   * @author grey - greywyvern - com <http://www.php.net/manual/en/function.chr.php#55978>
   * @param int $dec code point (a numeric string is accepted as well)
   * @return string UTF-8 encoded
   */
  function unichr($dec) {
      $dec = (int) $dec;
      if ($dec < 0x80) {
          # 1 byte: 0xxxxxxx
          return chr($dec);
      }
      if ($dec < 0x800) {
          # 2 bytes: 110xxxxx 10xxxxxx
          return chr(0xC0 | ($dec >> 6))
              . chr(0x80 | ($dec & 0x3F));
      }
      if ($dec < 0x10000) {
          # 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
          return chr(0xE0 | ($dec >> 12))
              . chr(0x80 | (($dec >> 6) & 0x3F))
              . chr(0x80 | ($dec & 0x3F));
      }
      # 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return chr(0xF0 | ($dec >> 18))
          . chr(0x80 | (($dec >> 12) & 0x3F))
          . chr(0x80 | (($dec >> 6) & 0x3F))
          . chr(0x80 | ($dec & 0x3F));
  }
  /**
   * UTF-8 strlen(): number of characters (not bytes) in $str.
   *
   * Uses mb_strlen() when available; otherwise counts every byte which is
   * not a UTF-8 continuation byte.
   *
   * @param string $str
   * @return int
   *
   * @author dtorop 932 at hotmail dot com <http://www.php.net/manual/en/function.strlen.php#37975>
   * @author Milian Wolff <http://milianw.de>
   */
  function strlen($str) {
      if (!function_exists('mb_strlen')) {
          return preg_match_all('/[\x00-\x7F\xC0-\xFD]/', $str, $var_empty);
      }
      return mb_strlen($str, 'UTF-8');
  }
  /**
   * wordwrap() for UTF-8 encoded strings.
   *
   * Repeatedly cuts a chunk of up to $width characters from the start of
   * $str and appends $break to it. Without $cut a chunk ends at a word
   * boundary, and a directly following punctuation character is kept on
   * the same line.
   *
   * @param string $str text to wrap
   * @param integer $width maximum number of characters per line
   * @param string $break string appended to every wrapped line
   * @param bool $cut true: cut after exactly $width characters
   * @return string
   */
  function wordwrap($str, $width, $break, $cut = false){
      # one UTF-8 character: an ASCII byte or a lead byte plus continuation bytes
      $char = '(?:[\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+)';
      if ($cut) {
          $regexp = '#^'.$char.'{'.$width.'}#';
      } else {
          $regexp = '#^'.$char.'{1,'.$width.'}\b#';
      }
      $punctuation = array('.', '!', ';', ':', '?', ',');
      $wrapped = '';
      while (preg_match($regexp, $str, $matches)) {
          $chunk = $matches[0];
          $str = ltrim(substr($str, strlen($chunk)));
          if (!$cut && isset($str[0]) && in_array($str[0], $punctuation)) {
              # don't start the next line with a punctuation character
              $chunk .= $str[0];
              $str = ltrim(substr($str, 1));
          }
          $wrapped .= $chunk.$break;
      }
      return $wrapped.ltrim($str);
  }
  /**
   * Check whether $tagName is among the parser's open tags, i.e. whether
   * the current node is nested in such a tag at any depth.
   *
   * @param string $tagName
   * @return bool
   */
  function hasParent($tagName) {
      $isOpen = in_array($tagName, $this->parser->openTags);
      return $isOpen;
  }
  /**
   * Get the tag name of the direct parent, i.e. the last entry of the
   * parser's open tags (false when no tag is open).
   *
   * @param void
   * @return string $tagName
   */
  function parent() {
      $innermost = end($this->parser->openTags);
      return $innermost;
  }
  1166. }