PageRenderTime 44ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/app/vendors/markdownify/parsehtml/parsehtml.php

http://github.com/Datawalke/Coordino
PHP | 618 lines | 374 code | 16 blank | 228 comment | 80 complexity | b3ba9f18883e2b02d37522e9dfcd150d MD5 | raw file
  1. <?php
  2. /**
  3. * parseHTML is a HTML parser which works with PHP 4 and above.
  4. * It tries to handle invalid HTML to some degree.
  5. *
  6. * @version 1.0 beta
  7. * @author Milian Wolff (mail@milianw.de, http://milianw.de)
  8. * @license LGPL, see LICENSE_LGPL.txt and the summary below
  9. * @copyright (C) 2007 Milian Wolff
  10. *
  11. * This library is free software; you can redistribute it and/or
  12. * modify it under the terms of the GNU Lesser General Public
  13. * License as published by the Free Software Foundation; either
  14. * version 2.1 of the License, or (at your option) any later version.
  15. *
  16. * This library is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. * Lesser General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU Lesser General Public
  22. * License along with this library; if not, write to the Free Software
  23. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. */
  25. class parseHTML {
  26. /**
  27. * tags which are always empty (<br /> etc.)
  28. *
  29. * @var array<string>
  30. */
  31. var $emptyTags = array(
  32. 'br',
  33. 'hr',
  34. 'input',
  35. 'img',
  36. 'area',
  37. 'link',
  38. 'meta',
  39. 'param',
  40. );
  41. /**
  42. * tags with preformatted text
  43. * whitespaces wont be touched in them
  44. *
  45. * @var array<string>
  46. */
  47. var $preformattedTags = array(
  48. 'script',
  49. 'style',
  50. 'pre',
  51. 'code',
  52. );
  53. /**
  54. * supress HTML tags inside preformatted tags (see above)
  55. *
  56. * @var bool
  57. */
  58. var $noTagsInCode = false;
  59. /**
  60. * html to be parsed
  61. *
  62. * @var string
  63. */
  64. var $html = '';
  65. /**
  66. * node type:
  67. *
  68. * - tag (see isStartTag)
  69. * - text (includes cdata)
  70. * - comment
  71. * - doctype
  72. * - pi (processing instruction)
  73. *
  74. * @var string
  75. */
  76. var $nodeType = '';
  77. /**
  78. * current node content, i.e. either a
  79. * simple string (text node), or something like
  80. * <tag attrib="value"...>
  81. *
  82. * @var string
  83. */
  84. var $node = '';
  85. /**
  86. * wether current node is an opening tag (<a>) or not (</a>)
  87. * set to NULL if current node is not a tag
  88. * NOTE: empty tags (<br />) set this to true as well!
  89. *
  90. * @var bool | null
  91. */
  92. var $isStartTag = null;
  93. /**
  94. * wether current node is an empty tag (<br />) or not (<a></a>)
  95. *
  96. * @var bool | null
  97. */
  98. var $isEmptyTag = null;
  99. /**
  100. * tag name
  101. *
  102. * @var string | null
  103. */
  104. var $tagName = '';
  105. /**
  106. * attributes of current tag
  107. *
  108. * @var array (attribName=>value) | null
  109. */
  110. var $tagAttributes = null;
  111. /**
  112. * wether the current tag is a block element
  113. *
  114. * @var bool | null
  115. */
  116. var $isBlockElement = null;
  117. /**
  118. * keep whitespace
  119. *
  120. * @var int
  121. */
  122. var $keepWhitespace = 0;
  123. /**
  124. * list of open tags
  125. * count this to get current depth
  126. *
  127. * @var array
  128. */
  129. var $openTags = array();
  130. /**
  131. * list of block elements
  132. *
  133. * @var array
  134. * TODO: what shall we do with <del> and <ins> ?!
  135. */
  136. var $blockElements = array (
  137. # tag name => <bool> is block
  138. # block elements
  139. 'address' => true,
  140. 'blockquote' => true,
  141. 'center' => true,
  142. 'del' => true,
  143. 'dir' => true,
  144. 'div' => true,
  145. 'dl' => true,
  146. 'fieldset' => true,
  147. 'form' => true,
  148. 'h1' => true,
  149. 'h2' => true,
  150. 'h3' => true,
  151. 'h4' => true,
  152. 'h5' => true,
  153. 'h6' => true,
  154. 'hr' => true,
  155. 'ins' => true,
  156. 'isindex' => true,
  157. 'menu' => true,
  158. 'noframes' => true,
  159. 'noscript' => true,
  160. 'ol' => true,
  161. 'p' => true,
  162. 'pre' => true,
  163. 'table' => true,
  164. 'ul' => true,
  165. # set table elements and list items to block as well
  166. 'thead' => true,
  167. 'tbody' => true,
  168. 'tfoot' => true,
  169. 'td' => true,
  170. 'tr' => true,
  171. 'th' => true,
  172. 'li' => true,
  173. 'dd' => true,
  174. 'dt' => true,
  175. # header items and html / body as well
  176. 'html' => true,
  177. 'body' => true,
  178. 'head' => true,
  179. 'meta' => true,
  180. 'link' => true,
  181. 'style' => true,
  182. 'title' => true,
  183. # unfancy media tags, when indented should be rendered as block
  184. 'map' => true,
  185. 'object' => true,
  186. 'param' => true,
  187. 'embed' => true,
  188. 'area' => true,
  189. # inline elements
  190. 'a' => false,
  191. 'abbr' => false,
  192. 'acronym' => false,
  193. 'applet' => false,
  194. 'b' => false,
  195. 'basefont' => false,
  196. 'bdo' => false,
  197. 'big' => false,
  198. 'br' => false,
  199. 'button' => false,
  200. 'cite' => false,
  201. 'code' => false,
  202. 'del' => false,
  203. 'dfn' => false,
  204. 'em' => false,
  205. 'font' => false,
  206. 'i' => false,
  207. 'img' => false,
  208. 'ins' => false,
  209. 'input' => false,
  210. 'iframe' => false,
  211. 'kbd' => false,
  212. 'label' => false,
  213. 'q' => false,
  214. 'samp' => false,
  215. 'script' => false,
  216. 'select' => false,
  217. 'small' => false,
  218. 'span' => false,
  219. 'strong' => false,
  220. 'sub' => false,
  221. 'sup' => false,
  222. 'textarea' => false,
  223. 'tt' => false,
  224. 'var' => false,
  225. );
  226. /**
  227. * get next node, set $this->html prior!
  228. *
  229. * @param void
  230. * @return bool
  231. */
  232. function nextNode() {
  233. if (empty($this->html)) {
  234. # we are done with parsing the html string
  235. return false;
  236. }
  237. static $skipWhitespace = true;
  238. if ($this->isStartTag && !$this->isEmptyTag) {
  239. array_push($this->openTags, $this->tagName);
  240. if (in_array($this->tagName, $this->preformattedTags)) {
  241. # dont truncate whitespaces for <code> or <pre> contents
  242. $this->keepWhitespace++;
  243. }
  244. }
  245. if ($this->html[0] == '<') {
  246. $token = substr($this->html, 0, 9);
  247. if (substr($token, 0, 2) == '<?') {
  248. # xml prolog or other pi's
  249. /** TODO **/
  250. #trigger_error('this might need some work', E_USER_NOTICE);
  251. $pos = strpos($this->html, '>');
  252. $this->setNode('pi', $pos + 1);
  253. return true;
  254. }
  255. if (substr($token, 0, 4) == '<!--') {
  256. # comment
  257. $pos = strpos($this->html, '-->');
  258. if ($pos === false) {
  259. # could not find a closing -->, use next gt instead
  260. # this is firefox' behaviour
  261. $pos = strpos($this->html, '>') + 1;
  262. } else {
  263. $pos += 3;
  264. }
  265. $this->setNode('comment', $pos);
  266. $skipWhitespace = true;
  267. return true;
  268. }
  269. if ($token == '<!DOCTYPE') {
  270. # doctype
  271. $this->setNode('doctype', strpos($this->html, '>')+1);
  272. $skipWhitespace = true;
  273. return true;
  274. }
  275. if ($token == '<![CDATA[') {
  276. # cdata, use text node
  277. # remove leading <![CDATA[
  278. $this->html = substr($this->html, 9);
  279. $this->setNode('text', strpos($this->html, ']]>')+3);
  280. # remove trailing ]]> and trim
  281. $this->node = substr($this->node, 0, -3);
  282. $this->handleWhitespaces();
  283. $skipWhitespace = true;
  284. return true;
  285. }
  286. if ($this->parseTag()) {
  287. # seems to be a tag
  288. # handle whitespaces
  289. if ($this->isBlockElement) {
  290. $skipWhitespace = true;
  291. } else {
  292. $skipWhitespace = false;
  293. }
  294. return true;
  295. }
  296. }
  297. if ($this->keepWhitespace) {
  298. $skipWhitespace = false;
  299. }
  300. # when we get here it seems to be a text node
  301. $pos = strpos($this->html, '<');
  302. if ($pos === false) {
  303. $pos = strlen($this->html);
  304. }
  305. $this->setNode('text', $pos);
  306. $this->handleWhitespaces();
  307. if ($skipWhitespace && $this->node == ' ') {
  308. return $this->nextNode();
  309. }
  310. $skipWhitespace = false;
  311. return true;
  312. }
  313. /**
  314. * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
  315. *
  316. * @param void
  317. * @return bool
  318. */
  319. function parseTag() {
  320. static $a_ord, $z_ord, $special_ords;
  321. if (!isset($a_ord)) {
  322. $a_ord = ord('a');
  323. $z_ord = ord('z');
  324. $special_ords = array(
  325. ord(':'), // for xml:lang
  326. ord('-'), // for http-equiv
  327. );
  328. }
  329. $tagName = '';
  330. $pos = 1;
  331. $isStartTag = $this->html[$pos] != '/';
  332. if (!$isStartTag) {
  333. $pos++;
  334. }
  335. # get tagName
  336. while (isset($this->html[$pos])) {
  337. $pos_ord = ord(strtolower($this->html[$pos]));
  338. if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
  339. $tagName .= $this->html[$pos];
  340. $pos++;
  341. } else {
  342. $pos--;
  343. break;
  344. }
  345. }
  346. $tagName = strtolower($tagName);
  347. if (empty($tagName) || !isset($this->blockElements[$tagName])) {
  348. # something went wrong => invalid tag
  349. $this->invalidTag();
  350. return false;
  351. }
  352. if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
  353. # we supress all HTML tags inside code tags
  354. $this->invalidTag();
  355. return false;
  356. }
  357. # get tag attributes
  358. /** TODO: in html 4 attributes do not need to be quoted **/
  359. $isEmptyTag = false;
  360. $attributes = array();
  361. $currAttrib = '';
  362. while (isset($this->html[$pos+1])) {
  363. $pos++;
  364. # close tag
  365. if ($this->html[$pos] == '>' || $this->html[$pos].$this->html[$pos+1] == '/>') {
  366. if ($this->html[$pos] == '/') {
  367. $isEmptyTag = true;
  368. $pos++;
  369. }
  370. break;
  371. }
  372. $pos_ord = ord(strtolower($this->html[$pos]));
  373. if ( ($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords)) {
  374. # attribute name
  375. $currAttrib .= $this->html[$pos];
  376. } elseif (in_array($this->html[$pos], array(' ', "\t", "\n"))) {
  377. # drop whitespace
  378. } elseif (in_array($this->html[$pos].$this->html[$pos+1], array('="', "='"))) {
  379. # get attribute value
  380. $pos++;
  381. $await = $this->html[$pos]; # single or double quote
  382. $pos++;
  383. $value = '';
  384. while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
  385. $value .= $this->html[$pos];
  386. $pos++;
  387. }
  388. $attributes[$currAttrib] = $value;
  389. $currAttrib = '';
  390. } else {
  391. $this->invalidTag();
  392. return false;
  393. }
  394. }
  395. if ($this->html[$pos] != '>') {
  396. $this->invalidTag();
  397. return false;
  398. }
  399. if (!empty($currAttrib)) {
  400. # html 4 allows something like <option selected> instead of <option selected="selected">
  401. $attributes[$currAttrib] = $currAttrib;
  402. }
  403. if (!$isStartTag) {
  404. if (!empty($attributes) || $tagName != end($this->openTags)) {
  405. # end tags must not contain any attributes
  406. # or maybe we did not expect a different tag to be closed
  407. $this->invalidTag();
  408. return false;
  409. }
  410. array_pop($this->openTags);
  411. if (in_array($tagName, $this->preformattedTags)) {
  412. $this->keepWhitespace--;
  413. }
  414. }
  415. $pos++;
  416. $this->node = substr($this->html, 0, $pos);
  417. $this->html = substr($this->html, $pos);
  418. $this->tagName = $tagName;
  419. $this->tagAttributes = $attributes;
  420. $this->isStartTag = $isStartTag;
  421. $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags);
  422. if ($this->isEmptyTag) {
  423. # might be not well formed
  424. $this->node = preg_replace('# */? *>$#', ' />', $this->node);
  425. }
  426. $this->nodeType = 'tag';
  427. $this->isBlockElement = $this->blockElements[$tagName];
  428. return true;
  429. }
  430. /**
  431. * handle invalid tags
  432. *
  433. * @param void
  434. * @return void
  435. */
  436. function invalidTag() {
  437. $this->html = substr_replace($this->html, '&lt;', 0, 1);
  438. }
  439. /**
  440. * update all vars and make $this->html shorter
  441. *
  442. * @param string $type see description for $this->nodeType
  443. * @param int $pos to which position shall we cut?
  444. * @return void
  445. */
  446. function setNode($type, $pos) {
  447. if ($this->nodeType == 'tag') {
  448. # set tag specific vars to null
  449. # $type == tag should not be called here
  450. # see this::parseTag() for more
  451. $this->tagName = null;
  452. $this->tagAttributes = null;
  453. $this->isStartTag = null;
  454. $this->isEmptyTag = null;
  455. $this->isBlockElement = null;
  456. }
  457. $this->nodeType = $type;
  458. $this->node = substr($this->html, 0, $pos);
  459. $this->html = substr($this->html, $pos);
  460. }
  461. /**
  462. * check if $this->html begins with $str
  463. *
  464. * @param string $str
  465. * @return bool
  466. */
  467. function match($str) {
  468. return substr($this->html, 0, strlen($str)) == $str;
  469. }
  470. /**
  471. * truncate whitespaces
  472. *
  473. * @param void
  474. * @return void
  475. */
  476. function handleWhitespaces() {
  477. if ($this->keepWhitespace) {
  478. # <pre> or <code> before...
  479. return;
  480. }
  481. # truncate multiple whitespaces to a single one
  482. $this->node = preg_replace('#\s+#s', ' ', $this->node);
  483. }
  484. /**
  485. * normalize self::node
  486. *
  487. * @param void
  488. * @return void
  489. */
  490. function normalizeNode() {
  491. $this->node = '<';
  492. if (!$this->isStartTag) {
  493. $this->node .= '/'.$this->tagName.'>';
  494. return;
  495. }
  496. $this->node .= $this->tagName;
  497. foreach ($this->tagAttributes as $name => $value) {
  498. $this->node .= ' '.$name.'="'.str_replace('"', '&quot;', $value).'"';
  499. }
  500. if ($this->isEmptyTag) {
  501. $this->node .= ' /';
  502. }
  503. $this->node .= '>';
  504. }
  505. }
  506. /**
  507. * indent a HTML string properly
  508. *
  509. * @param string $html
  510. * @param string $indent optional
  511. * @return string
  512. */
  513. function indentHTML($html, $indent = " ", $noTagsInCode = false) {
  514. $parser = new parseHTML;
  515. $parser->noTagsInCode = $noTagsInCode;
  516. $parser->html = $html;
  517. $html = '';
  518. $last = true; # last tag was block elem
  519. $indent_a = array();
  520. while($parser->nextNode()) {
  521. if ($parser->nodeType == 'tag') {
  522. $parser->normalizeNode();
  523. }
  524. if ($parser->nodeType == 'tag' && $parser->isBlockElement) {
  525. $isPreOrCode = in_array($parser->tagName, array('code', 'pre'));
  526. if (!$parser->keepWhitespace && !$last && !$isPreOrCode) {
  527. $html = rtrim($html)."\n";
  528. }
  529. if ($parser->isStartTag) {
  530. $html .= implode($indent_a);
  531. if (!$parser->isEmptyTag) {
  532. array_push($indent_a, $indent);
  533. }
  534. } else {
  535. array_pop($indent_a);
  536. if (!$isPreOrCode) {
  537. $html .= implode($indent_a);
  538. }
  539. }
  540. $html .= $parser->node;
  541. if (!$parser->keepWhitespace && !($isPreOrCode && $parser->isStartTag)) {
  542. $html .= "\n";
  543. }
  544. $last = true;
  545. } else {
  546. if ($parser->nodeType == 'tag' && $parser->tagName == 'br') {
  547. $html .= $parser->node."\n";
  548. $last = true;
  549. continue;
  550. } elseif ($last && !$parser->keepWhitespace) {
  551. $html .= implode($indent_a);
  552. $parser->node = ltrim($parser->node);
  553. }
  554. $html .= $parser->node;
  555. if (in_array($parser->nodeType, array('comment', 'pi', 'doctype'))) {
  556. $html .= "\n";
  557. } else {
  558. $last = false;
  559. }
  560. }
  561. }
  562. return $html;
  563. }
  564. /*
  565. # testcase / example
  566. error_reporting(E_ALL);
  567. $html = '<p>Simple block on one line:</p>
  568. <div>foo</div>
  569. <p>And nested without indentation:</p>
  570. <div>
  571. <div>
  572. <div>
  573. foo
  574. </div>
  575. <div style=">"/>
  576. </div>
  577. <div>bar</div>
  578. </div>
  579. <p>And with attributes:</p>
  580. <div>
  581. <div id="foo">
  582. </div>
  583. </div>
  584. <p>This was broken in 1.0.2b7:</p>
  585. <div class="inlinepage">
  586. <div class="toggleableend">
  587. foo
  588. </div>
  589. </div>';
  590. #$html = '<a href="asdfasdf" title=\'asdf\' foo="bar">asdf</a>';
  591. echo indentHTML($html);
  592. die();
  593. */