/parsers/graveyard/wiki2xml/php/xml2txt.php

https://github.com/ChuguluGames/mediawiki-svn · PHP · 187 lines · 141 code · 20 blank · 26 comment · 133 complexity · 2b722d5a025f984fd086ad2114be78ed MD5 · raw file

  1. <?php
  2. /**
  3. * This file contains the /element/ class needed by xml2tree.php
  4. * to create a tree which is then converted into plain text
  5. */
  class element {
    # One node of the XML tree. parse() turns the node and its children into
    # plain text. While parsing, the two-character sequence "\m" (a literal
    # backslash followed by "m"; PHP has no such escape sequence) is used as a
    # "soft newline" marker that the root call collapses into real newlines.

    # Tag name, the tag's attributes, and the children of this node
    # (children are either plain strings or nested element objects)
    var $name = '';
    var $attrs = array ();
    var $children = array ();

    # Temporary variables for link tags, filled in by sub_parse()
    var $link_target = "" ;
    var $link_trail = "" ;
    var $link_parts = array () ;

    # Variables only used by $tree root
    var $list = array () ; # Stack of open list markers: '*', '>' or a counter
    var $iter = 1 ; # Nesting depth of parse(); 1 means no parse() is running
    var $bold = "" ; # Put before and after bold text
    var $italics = "" ; # Put before and after italic text
    var $underline = "" ; # Put before and after underlined text
    var $pre_link = "" ; # Put in front of ordinary (non-special) links

    /**
     * Parse the children ... why won't anybody think of the children?
     *
     * String children are copied verbatim, ATTRS children are skipped and all
     * other children are converted through their own parse(). When this node
     * is a LINK, the text of TARGET / PART / TRAIL children is additionally
     * remembered in $link_target / $link_parts / $link_trail.
     *
     * @param element $tree Root element carrying the shared parser state
     * @return string Concatenated text of all children
     */
    function sub_parse(& $tree) {
      $ret = '' ;
      foreach ($this->children as $child) {
        if (is_string($child)) {
          $ret .= $child ;
          continue ;
        }
        if ($child->name == 'ATTRS') continue ;

        $sub = $child->parse ( $tree ) ;
        if ( $this->name == 'LINK' ) {
          if ( $child->name == 'TARGET' ) $this->link_target = $sub ;
          else if ( $child->name == 'PART' ) $this->link_parts[] = $sub ;
          else if ( $child->name == 'TRAIL' ) $this->link_trail = $sub ;
        }
        $ret .= $sub ;
      }
      return $ret ;
    }

    /**
     * Parse the tag
     *
     * Entry point: converts this node (and, recursively, its children) into
     * plain text. The call that finds $tree->iter at 1 is the root call and
     * is the one that turns the "\m" markers into newlines.
     *
     * The depth counter is restored on every path, including tags that
     * produce no output, so the same tree can be parsed again afterwards.
     *
     * @param element $tree Root element carrying the shared parser state
     * @return string Plain text for this node
     */
    function parse ( &$tree ) {
      $is_root = ( $tree->iter == 1 ) ;
      $tree->iter++ ;
      $ret = $this->parse_tag ( $tree ) ;
      $tree->iter-- ;

      if ( $is_root ) {
        # Collapse doubled markers, drop a marker that directly follows a
        # real newline, then turn the remaining markers into newlines
        $ret = str_replace ( "\m\m" , "\m" , $ret ) ;
        $ret = str_replace ( "\n\m" , "\n" , $ret ) ;
        $ret = str_replace ( "\m" , "\n" , $ret ) ;
      }
      return $ret ;
    }

    /**
     * Helper of parse(): produce the text for this tag, without any depth
     * bookkeeping or marker cleanup.
     *
     * @param element $tree Root element carrying the shared parser state
     * @return string Text for this node, possibly containing "\m" markers
     */
    function parse_tag ( &$tree ) {
      global $wiki2xml_authors , $xmlg ;
      $tag = $this->name ;

      if ( $tag == 'TEMPLATE' ) return '' ; # Ignore unresolved template

      if ( $tag == 'AUTHOR' ) { # Catch author for display later
        $author = $this->sub_parse ( $tree ) ;
        # The global may not have been initialised by the caller
        if ( !is_array ( $wiki2xml_authors ) ) $wiki2xml_authors = array () ;
        if ( !in_array ( $author , $wiki2xml_authors , true ) ) $wiki2xml_authors[] = $author ;
        return '' ;
      }

      if ( $tag == 'EXTENSION' ) {
        $sub = trim ( $this->sub_parse ( $tree ) ) ;
        if ( $sub == '' ) return '' ;
        return " [$sub] " ;
      }

      if ( $tag == 'LINK' ) return $this->parse_link ( $tree ) ;

      if ( $tag == 'LIST' ) {
        $type = isset ( $this->attrs['TYPE'] ) ? strtolower ( $this->attrs['TYPE'] ) : '' ;
        $k = '*' ; # Unknown or missing types are shown as bullets
        if ( $type == 'numbered' ) $k = "1" ;
        else if ( $type == 'ident' ) $k = ">" ;
        array_push ( $tree->list , $k ) ;
        $ret = $this->sub_parse ( $tree ) ;
        array_pop ( $tree->list ) ;
        return $ret ;
      }

      if ( $tag == 'LISTITEM' ) return $this->parse_list_item ( $tree ) ;

      # Everything else: optional leading whitespace, then the children
      $ret = '' ;
      if ( $tag == 'SPACE' ) $ret .= ' ' ;
      else if ( $tag == 'HEADING' ) $ret .= "\m\n" ;
      else if ( $tag == 'PARAGRAPH' || $tag == 'TABLECELL' || $tag == 'TABLECAPTION' ) $ret .= "\n" ;

      if ( $tag == "ARTICLE" && isset ( $this->attrs["TITLE"] ) ) {
        $ret .= strtoupper ( urldecode ( $this->attrs["TITLE"] ) ) . "\n" ;
      }

      $xhtml_table_tags = array ( 'XHTML:TABLE' , 'XHTML:TH' , 'XHTML:CAPTION' , 'XHTML:TD' , 'XHTML:TR' ) ;
      $is_table_tag = ( substr ( $tag , 0 , 5 ) == 'TABLE' || in_array ( $tag , $xhtml_table_tags , true ) ) ;
      if ( !empty ( $xmlg['text_hide_tables'] ) && $is_table_tag ) return '' ;

      $ret .= $this->sub_parse ( $tree ) ;
      if ( $tag == "TABLEHEAD" || $tag == "XHTML:B" || $tag == "XHTML:STRONG" || $tag == "BOLD" ) {
        $ret = $tree->bold . $ret . $tree->bold ;
      } else if ( $tag == "XHTML:I" || $tag == "XHTML:EM" || $tag == "ITALICS" ) {
        $ret = $tree->italics . $ret . $tree->italics ;
      } else if ( $tag == "XHTML:U" ) {
        $ret = $tree->underline . $ret . $tree->underline ;
      }
      if ( $tag == "TABLEHEAD" ) $ret = "\n" . $ret ;
      return $ret ;
    }

    /**
     * Helper of parse_tag(): produce the text for a LINK tag.
     *
     * @param element $tree Root element carrying the shared parser state
     * @return string Link text, or '' when the link is configured to be dropped
     */
    function parse_link ( &$tree ) {
      global $content_provider , $xmlg ;

      # Must run first: it also fills $link_target, $link_parts and $link_trail
      $sub = $this->sub_parse ( $tree ) ;

      if ( isset ( $this->attrs['TYPE'] ) AND strtolower ( $this->attrs['TYPE'] ) == 'external' ) {
        $href = isset ( $this->attrs['HREF'] ) ? $this->attrs['HREF'] : '' ;
        $link = ( $sub != "" ) ? $sub . " " : "" ;
        return $link . '[' . $href . ']' ;
      }

      # Internal link: the last part is the visible text, else the target
      $link = "" ;
      if ( count ( $this->link_parts ) > 0 ) $link = array_pop ( $this->link_parts ) ;
      $link_text = $link ;
      if ( $link == "" ) $link = $this->link_target ;
      $link .= $this->link_trail ;

      $ns = $content_provider->get_namespace_id ( $this->link_target ) ;
      if ( $ns == 6 ) { # Surround image text with newlines
        if ( !empty ( $xmlg['text_hide_images'] ) ) return '' ;
        # NOTE(review): the namespace prefix was always blanked out before
        # use, so the text starts with "(:" - kept as is, confirm the intent
        return "\m(:" . $link . ")\n" ;
      }
      if ( $ns == -9 ) { # Adding newline to interlanguage link
        if ( empty ( $xmlg['keep_interlanguage'] ) ) return '' ;
        return "\m" . $link ;
      }
      if ( $ns == -8 ) { # Adding newline to category link
        if ( empty ( $xmlg['keep_categories'] ) ) return '' ;
        if ( $link_text == "!" || $link_text == '*' ) $link = "" ;
        else $link = " ({$link})" ;
        return "\m" . $this->link_target . $link . "\n" ;
      }
      return $tree->pre_link . $link ;
    }

    /**
     * Helper of parse_tag(): produce the text for a LISTITEM tag and advance
     * the counter of the innermost list if that list is numbered.
     *
     * @param element $tree Root element carrying the shared parser state
     * @return string A "\m" marker, the list prefix, a space and the item text
     */
    function parse_list_item ( &$tree ) {
      $r = "" ;
      foreach ( $tree->list AS $l ) {
        if ( $l === '*' ) $r .= '-' ;
        else if ( $l === '>' ) $r .= '<dd/>' ;
        else $r .= $l . "." ;
      }
      $ret = "\m" . $r . " " ;
      $ret .= $this->sub_parse ( $tree ) ;

      # An item outside of any list must not create a list level
      if ( count ( $tree->list ) > 0 ) {
        $x = array_pop ( $tree->list ) ;
        if ( $x === "*" || $x === ">" ) array_push ( $tree->list , $x ) ; # Keep bullet
        else array_push ( $tree->list , $x + 1 ) ; # Increase last counter
      }
      return $ret ;
    }
  }
  153. require_once ( "xml2tree.php" ) ;
  154. //_______________________________________________________________
  155. /*
  156. $infile = "Biology.xml" ;
  157. $xml = @file_get_contents ( $infile ) ;
  158. print htmlentities ( $xml ) . "<hr>" ;
  159. $x2t = new xml2php ;
  160. $tree = $x2t->scanString ( $xml ) ;
  161. $odt = new xml2odt ;
  162. $odt->parse ( $tree ) ;
  163. */
  164. ?>