PageRenderTime 30ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/parsers/graveyard/wiki2xml/php/xml2docbook_xml.php

https://github.com/ChuguluGames/mediawiki-svn
PHP | 456 lines | 383 code | 45 blank | 28 comment | 253 complexity | 9252b0fa2527f66fa1fe1db4da065efd MD5 | raw file
  1. <?php
  2. /**
  3. * This file contains the /element/ class needed by xml2tree.php
  4. * to create a tree which is then converted into DocBook XML
  5. */
  6. class element {
  7. var $name = '';
  8. var $attrs = array ();
  9. var $children = array ();
  10. # Temporary variables for link tags
  11. var $link_target = "" ;
  12. var $link_trail = "" ;
  13. var $link_parts = array () ;
  14. # Variables only used by $tree root
  15. var $list = array () ;
  16. var $opentags = array () ;
  17. var $sections = array () ;
  18. /**
  19. * Parse the children ... why won't anybody think of the children?
  20. */
  21. function sub_parse(& $tree) {
  22. $ret = '' ;
  23. $temp = "" ;
  24. foreach ($this->children as $key => $child) {
  25. if (is_string($child)) {
  26. $temp .= $child ;
  27. } elseif ($child->name != 'ATTRS') {
  28. $ret .= $this->add_temp_text ( $temp ) ;
  29. $sub = $child->parse ( $tree , "" , $this ) ;
  30. if ( $this->name == 'LINK' ) {
  31. if ( $child->name == 'TARGET' ) $this->link_target = $sub ;
  32. else if ( $child->name == 'PART' ) $this->link_parts[] = $sub ;
  33. else if ( $child->name == 'TRAIL' ) $this->link_trail = $sub ;
  34. }
  35. $ret .= $sub ;
  36. }
  37. }
  38. return $ret . $this->add_temp_text ( $temp ) ;
  39. }
  40. function fix_text ( $s ) {
  41. $s = html_entity_decode ( $s , ENT_COMPAT, 'UTF-8') ; // dbu 2007-08-20
  42. filter_named_entities ( $s ) ;
  43. $s = str_replace ( "&" , "&amp;" , $s ) ;
  44. $s = str_replace ( "<" , "&lt;" , $s ) ;
  45. $s = str_replace ( ">" , "&gt;" , $s ) ;
  46. return $s; // utf8_decode ( $s ) ; // dbu 2007-08-20
  47. }
  48. function add_temp_text ( &$temp ) {
  49. $s = $temp ;
  50. $temp = "" ;
  51. return $this->fix_text ( $s ) ;
  52. }
  53. function add_new ( $tag , &$tree ) {
  54. return $this->ensure_new ( $tag , $tree , "<{$tag}>\n" ) ;
  55. }
  56. function ensure_new ( $tag , &$tree , $opttag = "" ) {
  57. if ( $opttag == "" ) { # Catching special case (currently, <section>)
  58. foreach ( $tree->opentags AS $o ) {
  59. if ( $o == $tag ) return "" ; # Already open
  60. }
  61. }
  62. array_push ( $tree->opentags , $tag ) ;
  63. if ( $opttag == "" ) return "<{$tag}>\n" ;
  64. else return $opttag ;
  65. }
  66. function close_last ( $tag , &$tree , $all = false ) {
  67. $found = false ;
  68. foreach ( $tree->opentags AS $o ) {
  69. if ( $o == $tag ) $found = true ;
  70. }
  71. if ( !$found ) return "" ; # Already closed
  72. $ret = "\n" ;
  73. while ( count ( $tree->opentags ) > 0 ) {
  74. $o = array_pop ( $tree->opentags ) ;
  75. $ret .= "</{$o}>\n" ;
  76. if ( $o == $tag ) {
  77. if ( $all ) return $ret . $this->close_last ( $tag , $tree , true ) ;
  78. else return $ret ;
  79. }
  80. }
  81. }
  82. function handle_extensions ( &$tree ) {
  83. global $content_provider ;
  84. $sub = "" ;
  85. $name = strtolower ( $this->attrs['EXTENSION_NAME'] ) ;
  86. $ot = $tree->opentags ;
  87. $tree->opentags = array () ;
  88. if ( $name == 'ref' )
  89. $sub .= $this->ensure_new ( 'para' , $tree ) ;
  90. $sub .= $this->sub_parse ( $tree ) ;
  91. while ( count ( $tree->opentags ) > 0 )
  92. $sub .= "</" . array_pop ( $tree->opentags ) . ">\n" ;
  93. $tree->opentags = $ot ;
  94. if ( $name == 'ref' ) {
  95. $ret = '<footnote>' . $sub . '</footnote>' ;
  96. } else {
  97. $ret = $sub ;
  98. }
  99. return $ret ;
  100. }
  101. function internal_id ( $title ) {
  102. #return urlencode ( $title ) ;
  103. $ret = "" ;
  104. for ( $a = 0 ; $a < strlen ( $title ) ; $a++ ) {
  105. if ( ( $title[$a] >= 'A' && $title[$a] <= 'Z' ) ||
  106. ( $title[$a] >= 'a' && $title[$a] <= 'z' ) ||
  107. ( $title[$a] >= '0' && $title[$a] <= '9' ) )
  108. $ret .= $title[$a] ;
  109. else $ret .= "_" ;
  110. }
  111. return $ret ;
  112. }
  113. function handle_link ( &$tree ) {
  114. global $content_provider ;
  115. $ot = $tree->opentags ;
  116. $sub = $this->sub_parse ( $tree ) ;
  117. $tree->opentags = $ot ;
  118. $link = "" ;
  119. if ( isset ( $this->attrs['TYPE'] ) AND strtolower ( $this->attrs['TYPE'] ) == 'external' ) { # External link
  120. $href = htmlentities ( $this->attrs['HREF'] ) ;
  121. if ( trim ( $sub ) == "" ) {
  122. $sub = $href ;
  123. $sub = explode ( '://' , $sub , 2 ) ;
  124. $sub = explode ( '/' , array_pop ( $sub ) , 2 ) ;
  125. $sub = array_shift ( $sub ) ;
  126. }
  127. $sub = $this->fix_text ( $sub ) ;
  128. $link = "<ulink url=\"{$href}\"><citetitle>{$sub}</citetitle></ulink>" ;
  129. } else { # Internal link
  130. if ( count ( $this->link_parts ) > 0 ) {
  131. $link = array_pop ( $this->link_parts ) ;
  132. array_push ( $this->link_parts , $link ) ; # Compensating array_pop
  133. }
  134. $link_text = $link ;
  135. if ( $link == "" ) $link = $this->link_target ;
  136. $link .= $this->link_trail ;
  137. $ns = $content_provider->get_namespace_id ( $this->link_target ) ;
  138. if ( $ns == 6 ) { # Image
  139. $nstext = explode ( ":" , $this->link_target , 2 ) ;
  140. $target = array_pop ( $nstext ) ;
  141. $nstext = array_shift ( $nstext ) ;
  142. $text = array_pop ( $this->link_parts ) ;
  143. $is_thumb = false ;
  144. $align = '' ;
  145. $width = '' ;
  146. foreach ( $this->link_parts AS $s ) {
  147. $s = trim ( $s ) ;
  148. if ( $s == 'thumb' ) {
  149. $is_thumb = true ;
  150. if ( $align == '' ) $align = 'right' ;
  151. if ( $width == '' ) $width = '200px' ;
  152. }
  153. }
  154. $href = $content_provider->get_image_url ( $target ) ;
  155. $link = "<mediaobject>\n<imageobject>\n<imagedata" ;
  156. $link .= " fileref=\"{$href}\"" ;
  157. # if ( $align != '' ) $link .= " align='{$align}'" ; # Deactivated until DocBook supports floating images; meanwhile:
  158. if ( $align == 'center' ) $link .= " align='{$align}'" ;
  159. if ( $width != '' ) $link .= " width='$width' scalefit='1'" ; # depth='$width'
  160. $link .= "/>\n</imageobject>\n" ;
  161. $link .= "<textobject>\n" ;
  162. $link .= "<phrase>{$text}</phrase>\n" ;
  163. $link .= "</textobject>\n" ;
  164. if ( $is_thumb ) {
  165. $link .= "<caption>\n" ;
  166. if ( substr ( $text , 0 , 5 ) == '<para' ) $link .= $text ; # Para-noia!
  167. else $link .= "<para>{$text}</para>\n" ;
  168. $link .= "</caption>\n" ;
  169. }
  170. $link .= "</mediaobject>\n" ;
  171. } else if ( $ns == -9 ) { # Interlanguage link
  172. $sub = $this->link_target ;
  173. $nstext = explode ( ":" , $sub , 2 ) ;
  174. $name = array_pop ( $nstext ) ;
  175. $nstext = array_shift ( $nstext ) ;
  176. $href = "http://{$nstext}.wikipedia.org/wiki/" . htmlentities ( $name ) ;
  177. $link = "<ulink url=\"{$href}\"><citetitle>{$sub}</citetitle></ulink>" ;
  178. } else if ( $ns == -8 ) { # Category link
  179. if ( $link_text == "!" || $link_text == '*' ) $link = "" ;
  180. else $link = " ({$link})" ;
  181. $link = "" . $this->link_target . $link . "" ;
  182. } else {
  183. if ( $content_provider->is_an_article ( $this->link_target ) ) {
  184. $lt = $this->internal_id ( trim ( $this->link_target ) ) ;
  185. $lt = str_replace ( "+" , "_" , $lt ) ;
  186. $link = "<link linkend='{$lt}'>{$link}</link>" ;
  187. } else {
  188. #$link = "<link linkend='{$lt}'>{$link}</link>" ;
  189. }
  190. }
  191. }
  192. return $link ;
  193. }
  194. function make_tgroup ( &$tree ) {
  195. $num_rows = 0 ;
  196. $max_num_cols = 0 ;
  197. $caption = "" ;
  198. foreach ($this->children AS $key1 => $row) {
  199. if (is_string($row)) continue ;
  200. elseif ($row->name == 'TABLECAPTION') {
  201. $caption .= $row->parse ( $tree , "DOCAPTION" , $this ) ;
  202. continue ;
  203. } elseif ($row->name != 'TABLEROW') continue ;
  204. $num_rows++ ;
  205. $num_cols = 0 ;
  206. foreach ( $row->children AS $key2 => $col ) {
  207. if (is_string($col)) continue ;
  208. if ($col->name != 'TABLECELL' && $col->name != 'TABLEHEAD') continue ;
  209. if ( isset ( $col->attrs['COLSPAN'] ) ) $num_cols += $col->attrs['COLSPAN'] ;
  210. else $num_cols++ ;
  211. }
  212. if ( $num_cols > $max_num_cols )
  213. $max_num_cols = $num_cols ;
  214. }
  215. return "<title>{$caption}</title><tgroup cols='{$max_num_cols}'>" ;
  216. }
  217. function top_tag ( &$tree ) {
  218. if ( count ( $tree->opentags ) == 0 ) return "" ;
  219. $x = array_pop ( $tree->opentags ) ;
  220. array_push ( $tree->opentags , $x ) ;
  221. return $x ;
  222. }
  223. function convert_xhtml_tags ( &$oldtag , &$tree , &$ret ) {
  224. if ( substr ( $oldtag , 0 , 6 ) != 'XHTML:' )
  225. return false ;
  226. $tag = substr ( $oldtag , 6 ) ;
  227. if ( $tag == 'UL' || $tag == 'OL' ) {
  228. $ot = $tree->opentags ;
  229. $r = "" ;
  230. $found = false ;
  231. while ( count ( $ot ) > 0 ) {
  232. $x = array_pop ( $ot ) ;
  233. $r .= "</{$x}>\n" ;
  234. $found = true ;
  235. if ( $x == 'para' ) break ;
  236. # if ( $x == 'listitem' ) break ;
  237. $found = false ;
  238. }
  239. if ( !$found ) return false ;
  240. $tree->opentags = $ot ;
  241. if ( $tag == 'UL' ) $this->attrs['TYPE'] = "bullet" ;
  242. if ( $tag == 'OL' ) $this->attrs['TYPE'] = "numbered" ;
  243. $oldtag = 'LIST' ;
  244. $ret .= $r ;
  245. return true ;
  246. } else if ( $tag == 'LI' ) {
  247. # $tt = $this->top_tag ( $tree ) ;
  248. # print $tt . "<br/>" ;
  249. # if ( $tt != 'itemizedlist' && $tt != 'orderedlist' ) return false ;
  250. $oldtag = 'LISTITEM' ;
  251. }
  252. return false ; # No match
  253. }
  254. /*
  255. * Parse the tag
  256. */
  257. function parse ( &$tree , $param = "" , $root = "" ) {
  258. global $content_provider ;
  259. $ret = '';
  260. $tag = $this->name ;
  261. $close_tag = "" ;
  262. # Pre-fixing XHTML to wiki tags
  263. $xhtml_conversion = $this->convert_xhtml_tags ( $tag , $tree , $ret ) ;
  264. if ( $tag == 'SPACE' ) {
  265. return ' ' ; # Speedup
  266. } else if ( $tag == 'ARTICLES' ) {
  267. # dummy, to prevent default action to be called
  268. } else if ( $tag == 'AUTHORS' ) {
  269. # dummy, to prevent default action to be called
  270. } else if ( $tag == 'AUTHOR' ) {
  271. add_author ( $this->sub_parse ( $tree ) ) ;
  272. return "" ;
  273. } else if ( $tag == 'ARTICLE' ) {
  274. $title = isset ( $this->attrs["TITLE"] ) ? $this->attrs["TITLE"] : "Untiteled" ;
  275. $id = $this->internal_id ( $title ) ;
  276. $ret .= "<article id='{$id}'>\n";
  277. $ret .= "<title>" . urldecode ( $title ) . "</title>\n" ;
  278. } else if ( $tag == 'LINK' ) {
  279. return $this->handle_link ( $tree ) ; # Shortcut
  280. } else if ( $tag == 'EXTENSION' ) {
  281. return $this->handle_extensions ( $tree ) ; # Shortcut
  282. } else if ( $tag == 'HEADING' ) {
  283. $level = count ( $tree->sections ) ;
  284. $wanted = $this->attrs["LEVEL"] ;
  285. $ret .= $this->close_last ( "para" , $tree ) ;
  286. while ( $level >= $wanted ) {
  287. $x = array_pop ( $tree->sections ) ;
  288. if ( $x == 1 ) {
  289. $ret .= $this->close_last ( "section" , $tree ) ;
  290. }
  291. $level-- ;
  292. }
  293. while ( $level < $wanted ) {
  294. $level++ ;
  295. if ( $level < $wanted ) {
  296. array_push ( $tree->sections , 0 ) ;
  297. } else {
  298. $ret .= $this->ensure_new ( "section" , $tree , "<section>" ) ;
  299. array_push ( $tree->sections , 1 ) ;
  300. }
  301. }
  302. $ret .= "<title>" ;
  303. } else if ( $tag == 'PARAGRAPH' || $tag == 'XHTML:P' ) { # Paragraph
  304. $ret .= $this->close_last ( "para" , $tree ) ;
  305. $ret .= $this->ensure_new ( "para" , $tree ) ;
  306. } else if ( $tag == 'LIST' ) { # List
  307. $ret .= $this->close_last ( "para" , $tree ) ;
  308. $list_type = strtolower ( $this->attrs['TYPE'] ) ;
  309. if ( $list_type == 'bullet' || $list_type == 'ident' || $list_type == 'def' ) $ret .= '<itemizedlist mark="opencircle">' ;
  310. else if ( $list_type == 'numbered' ) $ret .= '<orderedlist numeration="arabic">' ;
  311. } else if ( $tag == 'LISTITEM' ) { # List item
  312. $ret .= $this->close_last ( "para" , $tree ) ;
  313. $ret .= "<listitem>\n" ;
  314. $ret .= $this->ensure_new ( "para" , $tree ) ;
  315. } else if ( $tag == 'TABLE' ) { # Table
  316. $ret .= $this->add_new ( "table" , $tree ) ;
  317. # $ret .= "<title></title>" ;
  318. $ret .= $this->make_tgroup ( $tree ) ;
  319. $ret .= "<tbody>" ;
  320. } else if ( $tag == 'TABLEROW' ) { # Tablerow
  321. $retl_before = strlen ( $ret ) ;
  322. $ret .= $this->add_new ( "row" , $tree ) ;
  323. $retl_after = strlen ( trim ( $ret ) ) ;
  324. } else if ( $tag == 'TABLEHEAD' ) { # Tablehead
  325. $ret .= $this->add_new ( "entry" , $tree ) ;
  326. } else if ( $tag == 'TABLECELL' ) { # Tablecell
  327. $old_ret = $ret ;
  328. $ret .= $this->add_new ( "entry" , $tree ) ;
  329. } else if ( $tag == 'TABLECAPTION' ) { # Tablecaption
  330. if ( $param != "DOCAPTION" ) return "" ;
  331. # $ret .= $this->add_new ( "title" , $tree ) ;
  332. } else if ( $tag == 'BOLD' || $tag == 'XHTML:STRONG' || $tag == 'XHTML:B' ) { # <b> or '''
  333. $ret .= $this->ensure_new ( "para" , $tree ) ;
  334. $ret .= '<emphasis role="bold">' ;
  335. $close_tag = "emphasis" ;
  336. } else if ( $tag == 'ITALICS' || $tag == 'XHTML:EM' || $tag == 'XHTML:I' ) { # <i> or ''
  337. $ret .= $this->ensure_new ( "para" , $tree ) ;
  338. $ret .= '<emphasis>' ;
  339. $close_tag = "emphasis" ;
  340. } else if ( $tag == 'XHTML:TT' ) { # <tt>
  341. $ret .= $this->ensure_new ( "para" , $tree ) ;
  342. $ret .= '<literal>' ;
  343. $close_tag = "literal" ;
  344. } else if ( $tag == 'XHTML:SUB' ) { # <sub>
  345. $ret .= $this->ensure_new ( "para" , $tree ) ;
  346. $ret .= '<subscript>' ;
  347. $close_tag = "subscript" ;
  348. } else if ( $tag == 'XHTML:SUP' ) { # <sup>
  349. $ret .= $this->ensure_new ( "para" , $tree ) ;
  350. $ret .= '<superscript>' ;
  351. $close_tag = "superscript" ;
  352. } else if ( $tag == 'XHTML:SUP' ) { # <sup>
  353. $ret .= $this->ensure_new ( "para" , $tree ) ;
  354. $ret .= '<superscript>' ;
  355. $close_tag = "superscript" ;
  356. } else if ( $tag == 'PRELINE' OR $tag == 'XHTML:PRE' ) { # <pre>
  357. $ret .= $this->ensure_new ( "para" , $tree ) ;
  358. $ret .= '<programlisting>' ;
  359. $close_tag = "programlisting" ;
  360. } else if ( $tag == 'DEFVAL' ) {
  361. $ret .= $this->ensure_new ( "para" , $tree ) ;
  362. $ret .= " : " ;
  363. } else { # Default : normal text
  364. $ret .= $this->ensure_new ( "para" , $tree ) ;
  365. }
  366. # Get the sub-items
  367. $length_between = strlen ( $ret ) ;
  368. if ( $tag != 'MAGIC_VARIABLE' && $tag != 'TEMPLATE' ) {
  369. $ret .= $this->sub_parse ( $tree ) ;
  370. }
  371. $length_between = strlen ( $ret ) - $length_between ;
  372. # Close tags
  373. if ( $tag == 'LIST' ) {
  374. $ret .= $this->close_last ( "para" , $tree ) ;
  375. if ( $list_type == 'bullet' || $list_type == 'ident' || $list_type == 'def' ) $ret .= "</itemizedlist>\n" ;
  376. else if ( $list_type == 'numbered' ) $ret .= "</orderedlist>\n" ;
  377. if ( $xhtml_conversion )
  378. $ret .= $this->ensure_new ( "para" , $tree ) ;
  379. } else if ( $tag == 'LISTITEM' ) {
  380. $ret .= $this->close_last ( "para" , $tree ) ;
  381. $ret .= "</listitem>\n" ;
  382. } else if ( $close_tag != "" ) {
  383. $ret .= "</{$close_tag}>" ;
  384. } else if ( $tag == 'HEADING' ) {
  385. $ret .= "</title>\n" ;
  386. } else if ( $tag == 'TABLE' ) { # Table
  387. $ret .= "</tbody>" ;
  388. $ret .= "</tgroup>" ;
  389. $ret .= $this->close_last ( "table" , $tree ) ;
  390. } else if ( $tag == 'TABLEROW' ) { # Tablerow
  391. if ( strlen ( trim ( $ret ) ) == $retl_after ) {
  392. $ret = substr ( $ret , 0 , $retl_before ) ;
  393. $this->close_last ( "row" , $tree ) ;
  394. } else $ret .= $this->close_last ( "row" , $tree ) ;
  395. } else if ( $tag == 'TABLEHEAD' ) { # Tablehead !!!!
  396. $ret .= $this->close_last ( "entry" , $tree ) ;
  397. } else if ( $tag == 'TABLECELL' ) { # Tablecell
  398. $ret .= $this->close_last ( "entry" , $tree ) ;
  399. # if ( $length_between == 0 ) $ret = $old_ret ;
  400. } else if ( $tag == 'TABLECAPTION' ) { # Tablecaption
  401. # $ret .= $this->close_last ( "title" , $tree ) ;
  402. } else if ( $tag == 'ARTICLE' ) {
  403. $ret .= $this->close_last ( "section" , $tree , true ) ;
  404. $ret .= $this->close_last ( "para" , $tree ) ;
  405. $ret .= "</article>";
  406. }
  407. return $ret;
  408. }
  409. }
  410. require_once ( "xml2tree.php" ) ; # Uses the "element" class defined above
  411. ?>