PageRenderTime 47ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/gforge/plugins/wiki/www/lib/HtmlParser.php

https://github.com/neymanna/fusionforge
PHP | 432 lines | 259 code | 38 blank | 135 comment | 54 complexity | e11c3b434b6b91764f1c58d7b31814a2 MD5 | raw file
Possible License(s): GPL-2.0, MPL-2.0-no-copyleft-exception
  1. <?php // -*-php-*-
  2. rcs_id('$Id: HtmlParser.php,v 1.3 2004/12/26 17:10:44 rurban Exp $');
  3. /**
  4. * HtmlParser Class: Conversion HTML => wikimarkup
  5. * Requires XmlParser, XmlElement and the expat (or now the libxml) library. This is all in core.
  6. */
  7. /*
  8. Copyright (C) 2004 Reini Urban
  9. This file is part of PhpWiki.
  10. PhpWiki is free software; you can redistribute it and/or modify
  11. it under the terms of the GNU General Public License as published by
  12. the Free Software Foundation; either version 2 of the License, or
  13. (at your option) any later version.
  14. PhpWiki is distributed in the hope that it will be useful,
  15. but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. GNU General Public License for more details.
  18. You should have received a copy of the GNU General Public License
  19. along with PhpWiki; if not, write to the Free Software
  20. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  21. */
  22. /**
  23. * Base class to implement html => wikitext converters,
  24. * extendable for various wiki syntax versions.
  25. * This is needed to be able to use htmlarea-alike editors,
  26. * and to import HTML documents.
  27. *
  28. * See also php-html.sf.net for a php-only version, if
  29. * you don't have the expat/libxml extension included.
  30. * See also http://search.cpan.org/~diberri/HTML-WikiConverter/
  31. *
  32. */
  // XmlParser contains the XML (expat) and url-grabber methods
  34. require_once('lib/XmlParser.php');
  35. class HtmlParser
  36. extends XmlParser
  37. {
  38. var $dialect, $_handlers, $root;
  /**
   * Constructor: selects the dialect converter and prepares the XML parser.
   *
   * A class named "HtmlParser_<dialect>" is looked up; if it exists an
   * instance of it is stored in $this->dialect, otherwise an E_USER_ERROR
   * is triggered.
   *
   * dialect: "PhpWiki2", "PhpWiki"
   * possibly more dialects later: MediaWiki, kwiki, c2
   *
   * @param string $dialect  Suffix of the HtmlParser_* class to instantiate.
   * @param string $encoding Handed unchanged to the XmlParser constructor.
   */
  function HtmlParser($dialect = "PhpWiki2", $encoding = '') {
      $dialect_class = "HtmlParser_" . $dialect;
      if (!class_exists($dialect_class)) {
          trigger_error(sprintf("unknown HtmlParser dialect %s",$dialect),E_USER_ERROR);
      } else {
          $this->dialect = new $dialect_class;
      }
      // Bind by reference so the parser and the dialect share one handler table.
      $this->_handlers =& $this->dialect->_handlers;
      $this->XmlParser($encoding);
      // Case folding off: tag names stay as written (the handler keys are lowercase).
      xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, 0);
      xml_parser_set_option($this->_parser, XML_OPTION_SKIP_WHITE, 1);
  }
  55. // The three callbacks, called on walking through the HTML tree.
  56. // No extensions needed from XmlParser.
  57. /*
  58. function tag_open($parser, $name, $attrs='') {
  59. }
  60. function tag_close($parser, $name, $attrs='') {
  61. }
  62. function cdata($parser, $data) {
  63. }
  64. function parse_url($file, $debug=false)
  65. */
  /**
   * Convert the parsed document to wiki markup.
   *
   * If no root node has been set on this object yet, the one stored in
   * $GLOBALS['xml_parser_root'] is adopted first.
   *
   * @return string Result of wikify() applied to the root node.
   */
  function output () {
      if ($this->root === null) {
          $this->root = $GLOBALS['xml_parser_root'];
      }
      return $this->wikify( $this->root );
  }
  /**
   * Convert one node (element or text) into wiki markup.
   *
   * For an XmlElement the dialect's handler table is consulted by tag name:
   *  - a string naming an existing dialect method: that method renders the node;
   *  - an array: the converted children are wrapped between its first and
   *    last entries (a one-entry array uses the same string on both sides);
   *  - any other non-empty value: emitted literally, followed by the children;
   *  - empty or missing: only the converted children are emitted.
   *
   * For a text node, runs of two or more spaces are collapsed to one unless
   * the parent is a PRE element, and whitespace-only text becomes ''.
   *
   * @param mixed  $node   XmlElement or text.
   * @param object $parent Element that directly contains $node, or null.
   * @return string
   */
  function wikify ($node, $parent = null) {
      if (!isa($node, 'XmlElement')) {
          $output = $node;
          // preg_replace() returns the new string; the result must be kept.
          if ($parent and $parent->_tag != 'pre')
              $output = preg_replace("/ {2,}/", " ", $output);
          if (trim($output) == '')
              $output = '';
          return $output;
      }

      // A dialect object is itself an HtmlParser without a ->dialect of its
      // own; when its handlers call elem_contents() the lookup must fall back
      // to the object itself, or every nested tag would lose its handler.
      $dialect = isset($this->dialect) ? $this->dialect : $this;
      $conv = isset($dialect->_handlers[$node->_tag])
          ? $dialect->_handlers[$node->_tag]
          : null;

      if (is_string($conv) and method_exists($dialect, $conv)) {
          return $dialect->$conv($node);
      }

      $inner = '';
      foreach ($node->getContent() as $child) {
          $inner .= $this->wikify($child, $node);
      }

      if (is_array($conv) and count($conv) > 0) {
          return $conv[0] . $inner . $conv[count($conv)-1];
      }
      if (!empty($conv)) {
          return $conv . $inner;
      }
      return $inner;
  }
  /** elem_contents()
   * $output = $parser->elem_contents( $elem );
   * Returns a wikified version of the contents of the specified
   * HTML element. Each entry of the element's content list is passed
   * through wikify() with $elem as its parent, and the concatenated
   * result is returned.
   * If $elem is not an XmlElement it is wikified directly.
   */
  function elem_contents($node) {
      if (!isa($node,'XmlElement')) {
          // Was wikify($content), an undefined variable.
          return $this->wikify($node);
      }
      $output = '';
      foreach ($node->getContent() as $child) {
          // The parent of a child is $node itself, not $node->parent;
          // wikify() uses it to detect text inside PRE.
          $output .= $this->wikify($child, $node);
      }
      return $output;
  }
  //
  // Private function: _elem_attr_str( $elem, $attrs )
  //
  // Returns a string containing the attribute names and values of the
  // specified element, restricted to the names listed in $attrs
  // (names are compared in lower case). Each pair is rendered as
  // ' name="value"'. Values are HTML-escaped so that a double quote,
  // '<', '>' or '&' inside a value cannot break the generated markup.
  // Returns '' when the element has no attributes.
  //
  function _elem_attr_str($node, $attrs) {
      $s = '';
      if (empty($node->_attr))
          return $s;
      foreach ($node->_attr as $attr => $val) {
          $attr = strtolower($attr);
          if (in_array($attr,$attrs))
              $s .= " $attr=\"" . htmlspecialchars($val, ENT_COMPAT) . "\"";
      }
      return $s;
  }
  //
  // Private function: _elem_has_ancestor( $elem, $tagname )
  //
  // Walks up the ->parent chain of the given element and returns true
  // as soon as an ancestor with tag $tag is met; returns false when
  // the chain ends without a match. The element itself is not tested.
  //
  function _elem_has_ancestor($node, $tag) {
      for ($cursor = $node; isset($cursor->parent); $cursor = $cursor->parent) {
          if ($cursor->parent->_tag == $tag)
              return true;
      }
      return false;
  }
  //
  // Private function: _elem_is_image_div( $elem )
  //
  // Returns true if $elem is a container element (P or DIV) meant only to
  // lay out an IMG.
  //
  // More specifically, returns true if the given element is a DIV or P
  // element and the only child it contains is an IMG tag or an IMG tag
  // contained within a sole A tag. Whitespace-only text children are
  // ignored when counting; any other text makes the result false.
  //
  function _elem_is_image_div( $node ) {
      // Return false if node is undefined or isn't a DIV/P at all
      if (!is_object($node) or !in_array($node->_tag,array("div","p")))
          return false;

      // Level 0 inspects the container, level 1 a sole A child.
      $candidate = $node;
      for ($depth = 0; $depth < 2; $depth++) {
          $sole = null;
          $count = 0;
          foreach ($candidate->getContent() as $child) {
              if (!is_object($child)) {
                  // Text child: skip pure whitespace, reject real text.
                  if (trim($child) == '')
                      continue;
                  return false;
              }
              $sole = $child;
              $count++;
          }
          if ($count != 1)
              return false;
          if ($sole->_tag == 'img')
              return true;
          if ($depth > 0 or $sole->_tag != 'a')
              return false;
          $candidate = $sole;
      }
      return false;
  }
  /**
   * Default handler: keeps the tag and its content unchanged by
   * delegating to wikify_preserve().
   */
  function wikify_default($node) {
      $preserved = $this->wikify_preserve($node);
      return $preserved;
  }
  /**
   * Preserves tag and content: returns whatever the node's asXML()
   * method produces.
   */
  function wikify_preserve($node) {
      $xml = $node->asXML();
      return $xml;
  }
  /**
   * Logging hook used by the handlers; this implementation discards
   * the message and returns nothing.
   */
  function log($dummy) {
      return;
  }
  193. }
  194. class HtmlParser_PhpWiki2
  195. extends HtmlParser
  196. {
  /**
   * Constructor: fills the tag => handler table for this dialect.
   *
   * Value forms (interpreted by wikify()):
   *  - ''                : emit only the converted children
   *  - array(a) / (a, b) : wrap the converted children in a ... a / a ... b
   *  - "wikify_xxx"      : name of a method of this class that renders the node
   *  - other string      : literal text emitted before the children
   */
  function HtmlParser_PhpWiki2() {
      $this->_handlers =
          array('html'   => '',
                'head'   => '',
                'title'  => '',
                'meta'   => '',
                'link'   => '',
                'script' => '',
                'body'   => '',
                'br'     => "<br>",
                'b'      => array( "*" ),
                'strong' => array( "*" ),
                'i'      => array( "_" ),
                'em'     => array( "_" ),
                'hr'     => "----\n\n",
                // PRE blocks are handled specially (see the text-node
                // branch of wikify())
                'pre'    => array( "<pre>", "</pre>" ),
                'dl'     => array( '', "\n\n" ),
                'dt'     => array( ';', '' ),
                'dd'     => array( ':', '' ),
                'p'      => array( "\n\n", "\n\n" ),
                'ul'     => array( '', "\n" ),
                'ol'     => array( '', "\n" ),
                'li'     => "wikify_list_item",
                'table'  => "wikify_table",
                'tr'     => "wikify_tr",
                // The only cell handler defined in this class is wikify_th.
                // Pointing at an undefined "wikify_td" made wikify() emit
                // the literal text "wikify_td" in front of every cell.
                'td'     => "wikify_th",
                'th'     => "wikify_th",
                'div'    => array( '', "\n\n" ),
                'img'    => "wikify_img",
                'a'      => "wikify_link",
                'span'   => array( '', '' ),
                'h1'     => "wikify_h",
                'h2'     => "wikify_h",
                'h3'     => "wikify_h",
                'h4'     => "wikify_h",
                'h5'     => "wikify_h",
                'h6'     => "wikify_h",
                'font'   => array( '', '' ),
                'sup'    => "wikify_default",
                'sub'    => "wikify_default",
                'nowiki' => "wikify_verbatim",
                'verbatim' => "wikify_default",
                );
  }
  /**
   * Render a TABLE: resets the cell indent kept in $this->ident, then
   * wraps the converted contents between "| \n" and "|\n\n".
   */
  function wikify_table( $node ) {
      $this->ident = '';
      $rows = $this->elem_contents($node);
      return "| \n" . $rows . "|\n\n";
  }
  /**
   * Render a TR: a newline and "| " followed by the converted contents.
   */
  function wikify_tr( $node ) {
      $cells = $this->elem_contents($node);
      return "\n| " . $cells;
  }
  /**
   * Render a table cell as "<indent>| <content> |\n".
   *
   * The indent is read from $this->ident and grows by one space after
   * every cell. Leading whitespace of the cell content is removed: the
   * original pattern "s/^\s+/" was not a valid PCRE pattern and its
   * result was thrown away, so nothing was ever stripped.
   */
  function wikify_th( $node ) {
      $ident = empty($this->ident) ? '' : $this->ident;
      $content = preg_replace('/^\s+/', '', $this->elem_contents($node));
      $this->ident = $ident . ' ';
      return $ident . "| " . $content . " |\n";
  }
  /**
   * Render an LI as one list line: marker, space, trimmed content, newline.
   *
   * Items with an OL ancestor get the numbered-list marker '#', all
   * others the bullet marker '*' (the two markers were swapped before).
   */
  function wikify_list_item( $node ) {
      $marker = $this->_elem_has_ancestor($node, 'ol') ? '#' : '*';
      return $marker . " " . trim($this->elem_contents($node)) . "\n";
  }
  /**
   * Render an A element.
   *
   * Returns, in this order of precedence:
   *  - the trimmed link text when there is no usable href (e.g. a named
   *    anchor), instead of a bracket link with an empty target;
   *  - the link text alone inside a heading (h1..h6);
   *  - the link text alone when the parent is an image-only DIV/P;
   *  - the bare URL when it equals the link text;
   *  - otherwise a bracket link with the label first and the URL second.
   */
  function wikify_link( $node ) {
      $url = $this->absolute_url( $node->getAttr('href') );
      $title = $this->elem_contents($node);
      if (empty($url))
          return trim($title);

      // Just return the link title if this tag is contained
      // within a header tag
      if (isset($node->parent) and preg_match('/^h\d$/',$node->parent->_tag))
          return $title;

      // Return the title only if this link wraps an image inside an
      // image-layout container
      if (isset($node->parent) and $this->_elem_is_image_div($node->parent))
          return $title;

      // If HREF is the same as the link title, then
      // just return the URL (it'll be converted into
      // a clickable link by the wiki engine)
      if ($url == $title) return $url;

      // NOTE(review): PhpWiki's named-link form is "[label | url]"; the
      // previous code emitted the two parts in the opposite order.
      return "[ $title | $url ]";
  }
  /**
   * Render a heading h1..h6.
   *
   * The level is the digit after the 'h' of the tag name. Levels below 4
   * get (4 - level) exclamation marks, every other level gets one; the
   * marks are followed by a space, the trimmed contents and a blank line.
   */
  function wikify_h( $node ) {
      $level = substr($node->_tag,1);
      $bangs = ($level < 4) ? str_repeat('!',4 - $level) : '!';
      $heading = trim($this->elem_contents($node));
      return $bangs . ' ' . $heading . "\n\n";
  }
  /**
   * Wrap the converted contents of the node in a <verbatim> block,
   * preceded by a newline.
   */
  function wikify_verbatim( $node ) {
      return "\n<verbatim>\n" . $this->elem_contents( $node ) . "\n</verbatim>";
  }
  /**
   * Render an IMG element as "[ <file> <attributes> ]".
   *
   * <file> is the basename of the absolute image URL. Attributes added:
   *  - align:  from the IMG's own align attribute, else from a
   *            "float: left|right" style or a "floatleft|floatright" class
   *            on an image-only DIV/P that is the parent or grandparent;
   *  - width / height / size: only when the numeric WIDTH/HEIGHT attribute
   *            differs from the size reported by getimagesize(); when both
   *            differ they are merged into "size=<w>x<h>";
   *  - alt:    the ALT text, if any.
   *
   * Fixes over the previous version: $attrs and $image_div are initialised,
   * a stray ';' no longer overwrites the alignment found in the style,
   * "align" is added once, the height is compared with the actual height
   * ($height_h was undefined), the size entry replaces the width entry
   * (count($attr) was undefined), and a failed getimagesize() is tolerated.
   */
  function wikify_img( $node ) {
      $image_url = $this->absolute_url( $node->getAttr('src') );
      $file = basename( $image_url );
      $alignment = $node->getAttr('align');
      $attrs = array();
      $this->log( "Processing IMG tag for SRC: ".$image_url."..." );

      //
      // Grab attributes to be added to the [ Image ] markup (since 1.3.10)
      //
      $image_div = null;
      if (!$alignment) {
          $parent = isset($node->parent) ? $node->parent : null;
          if ($this->_elem_is_image_div( $parent ))
              $image_div = $parent;
          elseif ($parent and isset($parent->parent)
                  and $this->_elem_is_image_div( $parent->parent ))
              $image_div = $parent->parent;
      }

      if ( !$alignment and $image_div ) {
          $css_style = (string) $image_div->getAttr('style');
          $css_class = (string) $image_div->getAttr('class');

          // float => align: Check for float attribute; if it's there,
          // then we'll add it to the [Image] syntax
          if (preg_match("/float\:\s*(right|left)/i",$css_style,$m))
              $alignment = $m[1];
          elseif (preg_match("/float(right|left)/i",$css_class,$m))
              $alignment = $m[1];

          if( $alignment ) {
              $this->log( " Image is contained within a DIV that specifies $alignment alignment" );
              $this->log( " Adding '$alignment' to [Image] markup attributes" );
          } else {
              $this->log( " Image is not contained within a DIV for alignment" );
          }
      } else {
          $this->log( " Image is not contained within a DIV" );
      }

      // Single place where the alignment is recorded.
      if ($alignment)
          $attrs[] = "align=$alignment";

      //
      // Check if we need to request a thumbnail of this
      // image; it's needed if the specified width attribute
      // differs from the default size of the image
      //
      $width = $node->getAttr('width');
      if( $width ) {
          $this->log( " Image has WIDTH attribute of $width" );
          $this->log( " Checking whether resulting [Image] markup should specify a thumbnail..." );

          // Ask for the real dimensions of the image
          $this->log( " Fetching image '$image_url' from the network" );
          $size = getimagesize( $image_url );
          // getimagesize() yields false on failure; the actual size is then
          // unknown and any numeric WIDTH/HEIGHT counts as different.
          $actual_w = is_array($size) ? $size[0] : null;
          $actual_h = is_array($size) ? $size[1] : null;

          // If the WIDTH attribute of the IMG tag is not equal
          // to the actual width of the image, then we need to
          // create a thumbnail
          $width_added = false;
          if( preg_match("/^\d+$/",$width) and $width != $actual_w ) {
              $this->log( " IMG tag's WIDTH attribute ($width) differs from actual width of image ($actual_w)" );
              $this->log( " -- that means we're going to need a thumbnail" );
              $this->log( " Adding 'width' to list of attributes for [Image] markup" );
              $attrs[] = "width=$width";
              $width_added = true;
          }

          $height = (string) $node->getAttr('height');
          if( preg_match("/^\d+$/",$height) and $height != $actual_h ) {
              $this->log( " IMG tag's HEIGHT attribute ($height) differs from actual height of image ($actual_h)" );
              $this->log( " -- that means we're going to need a thumbnail" );
              $this->log( " Adding 'height' to list of attributes for [Image] markup" );
              if ($width_added)
                  // The width entry is the last one added; replace it.
                  $attrs[count($attrs)-1] = "size=".$width."x".$height;
              else
                  $attrs[] = "height=$height";
          }
      }

      if ($alt = $node->getAttr('alt')) {
          $this->log( " Adding alternate text '$alt' to [Image] markup" );
          $attrs[] = "alt=$alt";
      }

      $attr_str = join(' ', $attrs);
      $this->log( "...done processing IMG tag\n" );
      return "[ $file $attr_str ]";
  }
  369. }
  370. // $Log: HtmlParser.php,v $
  371. // Revision 1.3 2004/12/26 17:10:44 rurban
  372. // just docs or whitespace
  373. //
  374. // Revision 1.2 2004/10/19 13:23:06 rurban
  375. // fixed: Unknown modifier "g"
  376. //
  377. // Revision 1.1 2004/05/24 17:31:31 rurban
  378. // new XmlParser and HtmlParser, RssParser based on that.
  379. //
  380. //
  381. // For emacs users
  382. // Local Variables:
  383. // mode: php
  384. // tab-width: 8
  385. // c-basic-offset: 4
  386. // c-hanging-comment-ender-p: nil
  387. // indent-tabs-mode: nil
  388. // End:
  389. ?>