PageRenderTime 66ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/HTML/SemiParser.php

https://github.com/texhapb/quickfw
PHP | 570 lines | 370 code | 17 blank | 183 comment | 33 complexity | 0b21beba6f075517828406e863260933 MD5 | raw file
  1. <?php
  2. /**
  3. * HTML_SemiParser: selective fast-and-dirty tags processing via callbacks.
  4. * (C) 2005 Dmitry Koterov, http://forum.dklab.ru/users/DmitryKoterov/
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. * See http://www.gnu.org/copyleft/lesser.html
  11. *
  12. * The main idea is to assign callbacks for specified tags and containers
  13. * (e.g. <a>, <input>, <img> etc.) and run them for various HTML text
  14. * to get result of substitutions. It could be used, for example, to
  15. * "on the fly" replacement in the following tasks:
  16. * - insert 'value="..."' attributes to <input> tags based on $_REQUEST.
  17. * See HTML_FormPersister class.
  18. * - replace 'href="..."' in links to make "transparent" human-readable
  19. * URLs for ready scripts.
  20. * - automatically insert "width=... height=..." into <img> tags.
  21. *
  22. * You may use this class in three different modes.
  23. *
  24. * 1. Create separate instance and then call addTag(), addContainer() etc.
  25. * for it adding callback functions for each needed element:
  26. *
  27. * $parser = new HTML_SemiParser();
  28. * $parser->addTag('a', 'handleLinksCallback');
  29. * echo $parser->process($text);
  30. * ...
  31. * function handleLinksCallback($parsedTagAttributes) { ... }
  32. *
  33. * 2. Create subclass of HTML_SemiParser and define methods with names
  34. * tag_*(), container_*() and re_*() in it.
  35. *
  36. * class HTML_ImageWidthHeightAutosetter extends HTML_SemiParser {
  37. * function tag_img($parsedTagAttributes) { ... }
  38. * }
  39. * $parser = new HTML_ImageWidthHeightAutosetter();
  40. * echo $parser->process($text);
  41. *
  42. * 3. Add all handlers from any HTML_SemiParser object to another object:
  43. *
  44. * $parserA = new HTML_LinkParser();
  45. * $parserB = new HTML_ImageWidthHeightAutosetter();
  46. * $parserA->addObject($parserB);
  47. *
  48. * If two callback handlers want to use the same tag (for example, we may
  49. * have two callbacks for <img> tag: first - for automatically setting width
  50. * and height attributes, and second - to replace images by their icons),
  51. * handlers are called one by one, like in conveyer.
  52. *
  53. * Order of substitution:
  54. * - direct regular expressions;
  55. * - tags;
  56. * - containers.
  57. *
  58. * @author Dmitry Koterov
  59. * @version 1.108
  60. * @package HTML
  61. */
  62. class HTML_SemiParser
  63. {
  64. /**
  65. * Handled tags, containers and regiular expressions.
  66. */
  67. var $sp_tags = array(); // array(tagName => list( h1, h2, ...), ...)
  68. var $sp_cons = array();
  69. var $sp_res = array();
  70. var $sp_precachers = array();
  71. var $sp_postprocs = array();
  72. var $sp_preprocs = array();
  73. /**
  74. * Functions for quoting/dequoting.
  75. */
  76. var $sp_quoteHandler = null;
  77. var $sp_dequoteHandler = null;
  78. /**
  79. * Object-callback name prefixes.
  80. */
  81. var $sp_preTag = "tag_"; // for tag gandlers
  82. var $sp_preCon = "container_"; // for container handlers
  83. var $sp_preRe = "re_"; // for REs
  84. /**
  85. * Characters inside tag RE (between < and >).
  86. */
  87. var $sp_reTagIn = '(?>(?xs) (?> [^>"\']+ | " [^"]* " | \' [^\']* \' )* )';
  88. /**
  89. * Containers, whose bodies are not parsed by the library.
  90. */
  91. var $sp_IGNORED = array('script', 'iframe', 'textarea', 'select', 'title');
  92. var $sp_SKIP_IGNORED = true;
  93. /**
  94. * Local temp variables.
  95. */
  96. var $sp_replaceHash; // unique hash to replace all the tags
  97. /**
  98. * HTML_SemiParser constructor.
  99. */
  100. function HTML_SemiParser()
  101. {
  102. // Add child handlers.
  103. $this->sp_selfAdd = true;
  104. $this->addObject($this);
  105. unset($this->sp_selfAdd);
  106. // Initialize quoters.
  107. $this->sp_quoteHandler = 'htmlspecialchars';
  108. $this->sp_dequoteHandler = array(get_class($this), '_unhtmlspecialchars');
  109. // Generate unique hash.
  110. static $num = 0;
  111. $uniq = md5(microtime() . ' ' . ++$num . ' ' . getmypid());
  112. $this->sp_replaceHash = $uniq;
  113. }
  114. /**
  115. * Add new tag handler for future processing.
  116. *
  117. * Handler is a callback which is will be for each tag found in the
  118. * parsed document. This callback could be used to replace tag. Here is
  119. * the prototype:
  120. *
  121. * mixed handler(array $attributes)
  122. *
  123. * Callback get 1 parameter - parset tag attribute array.
  124. * The following types instead of "mixed" is supported:
  125. *
  126. * - bool or NULL If handler returns FALSE or NULL, source tag is
  127. * not modified.
  128. * - string Returning value is used t replace original tag.
  129. * - array Returning value is treated as associative array of
  130. * tag attributes. Array also contains two special
  131. * elements:
  132. * - "_tagName": name of tag;
  133. * - "_text": string representation of tag body
  134. * (for containers only, see below).
  135. * String representation of tag will be
  136. * reconstructed automatically by that array.
  137. *
  138. * @param string $tagName Name of tag to handle. E.g., "a", "img" etc.
  139. * @param callback $handler Callback which will be called on for found tag.
  140. * @return void
  141. */
  142. function addTag($tagName, $handler, $atFront=false)
  143. {
  144. $tagName = strtolower($tagName);
  145. if (!isSet($this->sp_tags[$tagName])) $this->sp_tags[$tagName] = array();
  146. if (!$atFront) array_push($this->sp_tags[$tagName], $handler);
  147. else array_unshift($this->sp_tags[$tagName], $handler);
  148. // echo "Tag added: $tagName<br>\n";
  149. }
  150. /**
  151. * Add the container handler.
  152. *
  153. * Containers are processed just like simple tags (see addTag()), but they also have
  154. * bodies saved in "_text" attribute.
  155. *
  156. * @param string $contName Name of container to search.
  157. * @param callback $handler Call this function to replace.
  158. * @return void
  159. */
  160. function addContainer($tagName, $handler, $atFront=false)
  161. {
  162. $tagName = strtolower($tagName);
  163. if (!isSet($this->sp_cons[$tagName])) $this->sp_cons[$tagName] = array();
  164. if (!$atFront) array_push($this->sp_cons[$tagName], $handler);
  165. else array_unshift($this->sp_cons[$tagName], $handler);
  166. // echo "Container added: $tagName\n";
  167. }
  168. /**
  169. * Add regular expression replacer.
  170. *
  171. * Use callback with one parameter: RE matched pockets.
  172. *
  173. * @param string $re Regular Expression to search for.
  174. * @param callback $handler Call this function to replace.
  175. * @return void
  176. */
  177. function addReplace($re, $handler, $atFront=false)
  178. {
  179. if (!isSet($this->sp_res[$re])) $this->sp_res[$re] = array();
  180. if (!$atFront) array_push($this->sp_res[$re], $handler);
  181. else array_unshift($this->sp_res[$re], $handler);
  182. }
  183. /**
  184. * Add all the callback methods from $obj.
  185. *
  186. * Types of handlers (tag, container or RE) depend on method name prefix:
  187. * see $sp_pre* properties above.
  188. *
  189. * @param object $obj Use this object methods as callbacks.
  190. * @return void
  191. */
  192. function addObject(&$obj, $noPrecache=false, $atFront=false)
  193. {
  194. // Search for all the derieved handlers.
  195. foreach (get_class_methods($obj) as $m) {
  196. if (strpos($m, $this->sp_preTag) === 0) {
  197. $this->addTag(substr($m, strlen($this->sp_preTag)), array(&$obj, $m), $atFront);
  198. }
  199. if (strpos($m, $this->sp_preCon) === 0) {
  200. $this->addContainer(substr($m, strlen($this->sp_preCon)), array(&$obj, $m), $atFront);
  201. }
  202. if (strpos($m, $this->sp_preRe) === 0) {
  203. $meth = substr($m, strlen($this->sp_preRe));
  204. $re = call_user_func(array(&$obj, $m));
  205. if ($re !== false && $re !== null) {
  206. $this->addReplace($re, array(&$obj, $meth), $atFront);
  207. }
  208. }
  209. }
  210. // Add object precacher & post-processors if present.
  211. if (!isset($this->sp_selfAdd)) {
  212. foreach (array('precacheTags'=>'sp_precachers', 'postprocText'=>'sp_postprocs', 'preprocText'=>'sp_preprocs') as $pname=>$var) {
  213. if (method_exists($obj, $pname)) {
  214. if (!$atFront) array_push($this->$var, array(&$obj, $pname));
  215. else array_unshift($this->$var, array(&$obj, $pname));
  216. }
  217. }
  218. }
  219. }
  220. /**
  221. * Quote HTML entities.
  222. * You may override this method or set $this->sp_quoteHandler property.
  223. *
  224. * @param string $str String to quote.
  225. * @return string Quoted string.
  226. */
  227. function quoteHandler($value)
  228. {
  229. return call_user_func($this->sp_quoteHandler, $value);
  230. }
  231. /**
  232. * Dequote HTML entities.
  233. * You may override this method or set $this->sp_dequoteHandler property.
  234. *
  235. * @param string $str String to dequote.
  236. * @return string Dequoted string.
  237. */
  238. function dequoteHandler($value)
  239. {
  240. return call_user_func($this->sp_dequoteHandler, $value);
  241. }
  242. /**
  243. * Reverse function for htmlspecialchars().
  244. */
  245. function _unhtmlspecialchars($value)
  246. {
  247. // Generate entity translation table (only once!).
  248. static $sp_trans = null;
  249. if (!$sp_trans) {
  250. $sp_trans = array_flip(get_html_translation_table(HTML_SPECIALCHARS));
  251. $sp_trans['&#039;'] = "'"; // manually translate apostroph for FireFox
  252. }
  253. return strtr($value, $sp_trans);
  254. }
  255. /**
  256. * Process HTML string and call all the callbacks for it.
  257. *
  258. * @param string $buf HTML text.
  259. * @return Text after all the replaces.
  260. */
  261. function process($buf)
  262. {
  263. $reTagIn = $this->sp_reTagIn;
  264. // Preprocess the text.
  265. $new = $this->preprocText($buf);
  266. if ($new !== null) $buf = $new;
  267. // Remove ignored container bodies from the string.
  268. $this->sp_ignored = array();
  269. if ($this->sp_SKIP_IGNORED) {
  270. $reIgnoredNames = join("|", $this->sp_IGNORED);
  271. $reIgnored = "{(<($reIgnoredNames) (?> \s+ $reTagIn)? >) (.*?) (</\\2>)}six";
  272. // Note that we MUST increase backtrack_limit, else error
  273. // PREG_BACKTRACK_LIMIT_ERROR will be generated on large SELECTs
  274. // (see preg_last_error() in PHP5).
  275. $oldLimit = ini_get('pcre.backtrack_limit');
  276. ini_set('pcre.backtrack_limit', 1024 * 1024 * 10);
  277. $buf = preg_replace_callback(
  278. $reIgnored,
  279. array(&$this, "_callbackIgnored2Hash"),
  280. $buf
  281. );
  282. ini_set('pcre.backtrack_limit', $oldLimit);
  283. }
  284. $sp_ignored = array($this->sp_ignored, array_keys($this->sp_ignored), array_values($this->sp_ignored));
  285. unset($this->sp_ignored);
  286. // Replace custom REs.
  287. if ($this->sp_res) {
  288. foreach ($this->sp_res as $re => $handlers) {
  289. foreach ($handlers as $h) {
  290. $buf = preg_replace_callback($re, $h, $buf);
  291. }
  292. }
  293. }
  294. // Replace tags and containers.
  295. $hashlen = strlen($this->sp_replaceHash) + 10;
  296. $reTagNames = join("|", array_keys($this->sp_tags));
  297. $reConNames = join("|", array_keys($this->sp_cons));
  298. $infos = array();
  299. // (? >...) [without space] is much faster than (?:...) in this case.
  300. if ($this->sp_tags)
  301. $infos["sp_tags"] = "/( <($reTagNames) (?> (\s+ $reTagIn) )? > () )/isx";
  302. if ($this->sp_cons)
  303. $infos["sp_cons"] = "/( <($reConNames) (?> (\s+ $reTagIn) )? > (.*?) (?: <\\/ \\2 \\s* > | \$ ) )/isx";
  304. foreach ($infos as $src => $re) {
  305. // Split buffer into tags.
  306. $chunks = preg_split($re, $buf, 0, PREG_SPLIT_DELIM_CAPTURE);
  307. $textParts = array($chunks[0]); // unparsed text parts
  308. $foundTags = array(); // found tags
  309. for ($i=1, $n=count($chunks); $i<$n; $i+=5) {
  310. // $i points to sequential tag (or container) subchain.
  311. $tOrig = $chunks[$i]; // - original tag text
  312. $tName = $chunks[$i+1]; // - tag name
  313. $tAttr = $chunks[$i+2]; // - tag attributes
  314. $tBody = $chunks[$i+3]; // - container body
  315. $tFollow = $chunks[$i+4]; // - following unparsed text block
  316. // Add tag to array for precaching.
  317. $tag = array();
  318. $this->parseAttrib($tAttr, $tag);
  319. $tag['_orig'] = $tOrig;
  320. $tag['_tagName'] = $tName;
  321. if ($src == "sp_cons") {
  322. if (strlen($tBody) < $hashlen && isset($sp_ignored[0][$tBody])) {
  323. // Maybe it is temporarily removed content - place back!
  324. // Fast solution working in most cases (key-based hash lookup
  325. // is much faster than str_replace() below).
  326. $tBody = $sp_ignored[0][$tBody];
  327. } else {
  328. // We must pass unmangled content to container processors!
  329. $tBody = str_replace($sp_ignored[1], $sp_ignored[2], $tBody);
  330. }
  331. $tag['_text'] = $tBody;
  332. } else if (substr($tAttr, -1) == '/') {
  333. $tag['_text'] = null;
  334. }
  335. $foundTags[] = $tag;
  336. $textParts[] = $tFollow;
  337. }
  338. // Save original tags.
  339. $origTags = $foundTags;
  340. // Precache (possibly modifying) all the found tags (if needed).
  341. $this->precacheTags($foundTags);
  342. // Process all found tags and join the buffer.
  343. $buf = $textParts[0];
  344. for ($i=0, $n=count($foundTags); $i<$n; $i++) {
  345. $tag = $this->_runHandlersForTag($foundTags[$i]);
  346. if (!is_array($tag)) {
  347. // String representation.
  348. $buf .= $tag;
  349. } else {
  350. $left = isset($tag['_left'])? $tag['_left'] : ""; unset($tag['_left']);
  351. $right = isset($tag['_right'])? $tag['_right'] : ""; unset($tag['_right']);
  352. if (!isset($tag['_orig']) || $tag !== $origTags[$i]) {
  353. // Build the tag back if it is changed.
  354. $text = $this->makeTag($tag);
  355. } else {
  356. // Else - use original tag string.
  357. // We use this algorythm because of non-unicode tag parsing mode:
  358. // e.g. entity &nbsp; in tag attributes is replaced by &amp;nbsp;
  359. // in makeTag(), but if the tag is not modified at all, we do
  360. // not care and do not call makeTag() at all saving original &nbsp;.
  361. $text = $tag['_orig'];
  362. }
  363. $buf .= $left . $text . $right;
  364. }
  365. $buf .= $textParts[$i+1];
  366. }
  367. }
  368. // Return temporarily removed containers back.
  369. $buf = str_replace($sp_ignored[1], $sp_ignored[2], $buf);
  370. $new = $this->postprocText($buf);
  371. if ($new !== null) $buf = $new;
  372. return $buf;
  373. }
  374. /**
  375. * Recreate the tag or container by its parsed attributes.
  376. *
  377. * If $attr[_text] is present, make container.
  378. *
  379. * @param array $attr Attributes of tag. These attributes could
  380. * include two special attributes:
  381. * '_text': tag is a container with body.
  382. * If null - <tag ... />.
  383. * If not present - <tag ...>.
  384. * '_tagName': name of this tag.
  385. * '_orig': ignored (internal usage).
  386. *
  387. * @return HTML-strict representation of tag or container.
  388. */
  389. function makeTag($attr)
  390. {
  391. // Join & return tag.
  392. $s = "";
  393. foreach($attr as $k => $v) {
  394. if ($k == "_text" || $k == "_tagName" || $k == "_orig") continue;
  395. $s .= " " . $k;
  396. if ($v !== null) $s .= '="' . $this->quoteHandler($v) . '"';
  397. }
  398. if (!isset($attr['_tagName']) || !$attr['_tagName'])
  399. $attr['_tagName'] = "???";
  400. if (!array_key_exists('_text', $attr)) { // do not use isset()!
  401. $tag = "<{$attr['_tagName']}{$s}>";
  402. } else if ($attr['_text'] === null) { // null
  403. $tag = "<{$attr['_tagName']}{$s} />";
  404. } else {
  405. $tag = "<{$attr['_tagName']}{$s}>{$attr['_text']}</{$attr['_tagName']}>";
  406. }
  407. return $tag;
  408. }
  409. /**
  410. * Virtual user-defined client precache functions.
  411. *
  412. * This function is called after all tags and containers are
  413. * found in HTML text, but BEFORE any replaces. It could work with
  414. * $foundTags to process all found data at once (for
  415. * faster replacing later). E.g., if callbacks use MySQL, it is
  416. * much more faster to perform one SQL-query with big IN() clause
  417. * than a lot of simple SQL querise with their own get_result()
  418. * calls.
  419. *
  420. * @return void
  421. */
  422. function precacheTags(&$foundTags)
  423. {
  424. foreach ($this->sp_precachers as $pk) {
  425. // call_user_func() does not support &-parameters
  426. // while allow_call_time_pass_reference=false
  427. call_user_func_array($pk, array(&$foundTags));
  428. }
  429. }
  430. /**
  431. * Called after all the tags ane containers are processed,
  432. * but before HTML is sent to caller context.
  433. */
  434. function preprocText($buf)
  435. {
  436. foreach ($this->sp_preprocs as $pk) {
  437. // call_user_func() does not support &-parameters
  438. // while allow_call_time_pass_reference=false
  439. $new = call_user_func($pk, $buf);
  440. if ($new !== null) $buf = $new;
  441. }
  442. return $buf;
  443. }
  444. /**
  445. * Called after all the tags ane containers are processed,
  446. * but before HTML is sent to caller context.
  447. */
  448. function postprocText($buf)
  449. {
  450. foreach ($this->sp_postprocs as $pk) {
  451. // call_user_func() does not support &-parameters
  452. // while allow_call_time_pass_reference=false
  453. $new = call_user_func($pk, $buf);
  454. if ($new !== null) $buf = $new;
  455. }
  456. return $buf;
  457. }
  458. /**
  459. * Replace found ignored container body by hash value.
  460. *
  461. * Container's open and close tags are NOT modified!
  462. * Later hash value will be replaced back to original text.
  463. */
  464. function _callbackIgnored2Hash($m)
  465. {
  466. static $counter = 0;
  467. $hash = $this->sp_replaceHash . ++$counter . "|";
  468. // DO NOT use chr(0) here!!!
  469. $this->sp_ignored[$hash] = $m[3];
  470. return $m[1] . $hash . $m[4];
  471. }
  472. /**
  473. * Process the tag.
  474. *
  475. * @param array $attr Parsed tag.
  476. * @return Attributes of processed tag.
  477. */
  478. function _runHandlersForTag($tag)
  479. {
  480. // Processing tag or container?..
  481. $tagName = strtolower($tag['_tagName']);
  482. if (isset($tag['_text'])) {
  483. // If $tag['_text'] === null, it is NOT a container but self-closed tag!
  484. // And isset(null) returns false, as we need, and we do not get here.
  485. $handlers = $this->sp_cons[$tagName];
  486. } else {
  487. $handlers = $this->sp_tags[$tagName];
  488. }
  489. // Use all handlers right-to-left.
  490. for ($i = count($handlers)-1; $i >= 0; $i--) {
  491. $h = $handlers[$i];
  492. $result = call_user_func($h, $tag, $tagName);
  493. // If returned false, tag is not changed.
  494. if ($result !== false && $result !== null) {
  495. // If the string is returned, stop processing now.
  496. if (!is_array($result)) return $result;
  497. // Else continue.
  498. $tag = $result;
  499. }
  500. }
  501. return $tag;
  502. }
  503. /**
  504. * Parse the attribute string: "a1=v1 a2=v2 ..." of the tag.
  505. *
  506. * @param $body Tag body between < and >.
  507. * @param &$attr Resulting Array of tag attributes
  508. * @return void.
  509. */
  510. function parseAttrib($body, &$attr)
  511. {
  512. $preg = '/([-\w:]+) \s* ( = \s* (?> ("[^"]*" | \'[^\']*\' | \S*) ) )?/sx';
  513. $regs = null;
  514. preg_match_all($preg, $body, $regs);
  515. $names = $regs[1];
  516. $checks = $regs[2];
  517. $values = $regs[3];
  518. $attr = array();
  519. for ($i = 0, $c = count($names); $i < $c; $i++) {
  520. $name = strtolower($names[$i]);
  521. if (!isset($checks[$i]) || !$checks[$i]) {
  522. $value = $name;
  523. } else {
  524. $value = $values[$i];
  525. if ($value[0] == '"' || $value[0] == "'") {
  526. $value = substr($value, 1, -1);
  527. }
  528. }
  529. if (strpos($value, '&') !== false)
  530. $value = $this->dequoteHandler($value);
  531. $attr[$name] = $value;
  532. }
  533. }
  534. }
  535. ?>