PageRenderTime 27ms CodeModel.GetById 2ms RepoModel.GetById 0ms app.codeStats 1ms

/parsers/graveyard/wiki2xml/php/content_provider.php

https://github.com/ChuguluGames/mediawiki-svn
PHP | 379 lines | 288 code | 59 blank | 32 comment | 102 complexity | ceeac0fcb0c81646c862ee9b933b05e9 MD5 | raw file
  1. <?php
  2. # Abstract base class
  3. class ContentProvider {
  4. var $load_time = 0 ; # Time to load text and templates, to judge actual parsing speed
  5. var $article_list = array () ;
  6. var $authors = array () ;
  7. var $block_file_download = false ;
  8. function get_wiki_text ( $title , $do_cache = false ) { return "" ; } # dummy
  9. function get_template_text ( $title ) { return "" ; } # dummy
  10. function add_article ( $title ) {
  11. $this->article_list[] = urlencode ( trim ( $title ) ) ;
  12. }
  13. function is_an_article ( $title ) {
  14. $title = urlencode ( trim ( $title ) ) ;
  15. return in_array ( $title , $this->article_list ) ;
  16. }
  17. /**
  18. * XXX TODO: why are some negative?
  19. * Gets the numeric namespace
  20. * "6" = images
  21. * "-8" = category link
  22. * "-9" = interlanguage link
  23. * "11" = templates
  24. */ function get_namespace_id ( $text ) {
  25. $text = strtoupper ( $text ) ;
  26. $text = explode ( ":" , $text , 2 ) ;
  27. if ( count ( $text ) != 2 ) return 0 ;
  28. $text = trim ( array_shift ( $text ) ) ;
  29. if ( $text == "" ) return 0 ;
  30. $ns = 0 ;
  31. if ( $text == "CATEGORY" || $text == "KATEGORIE" ) return -8 ; # Hackish, for category link
  32. if ( strlen ( $text ) < 4 ) return -9 ; # Hackish, for interlanguage link
  33. if ( $text == "SIMPLE" ) return -9 ;
  34. # Horrible manual hack, for now
  35. if ( $text == "IMAGE" || $text == "BILD" ) $ns = 6 ;
  36. if ( $text == "TEMPLATE" || $text == "VORLAGE" ) $ns = 11 ;
  37. return $ns ;
  38. }
  39. function copyimagefromwiki ( $name , $url = "" ) {
  40. global $xmlg ;
  41. $dir = $xmlg['image_destination'] ;
  42. if ( $url == "" )
  43. $url = $this->get_image_url ( name ) ;
  44. $fname = urlencode ( $name ) ;
  45. $target = $dir . "/" . $fname ;
  46. if ( !file_exists ( $target ) && !$this->block_file_download ) {
  47. @mkdir ( $dir ) ;
  48. # dub sez... use cURL
  49. $ch = curl_init();
  50. curl_setopt($ch, CURLOPT_URL, $url);
  51. $fh = @fopen($target, 'w');
  52. curl_setopt($ch, CURLOPT_FILE, $fh);
  53. curl_exec($ch);
  54. curl_close($ch);
  55. @fclose($fh);
  56. }
  57. return $fname ;
  58. }
  59. function myurlencode ( $t ) {
  60. $t = str_replace ( " " , "_" , $t ) ;
  61. $t = urlencode ( $t ) ;
  62. return $t ;
  63. }
  64. function get_image_url ( $name ) {
  65. global $xmlg ;
  66. $site = $xmlg['site_base_url'] ;
  67. $parts = explode ( ".wikipedia.org/" , $site ) ;
  68. $parts2 = explode ( ".wikibooks.org/" , $site ) ;
  69. $image = utf8_encode ( $name ) ;
  70. $image2 = ucfirst ( str_replace ( " " , "_" , $name ) ) ;
  71. $m = md5( $image2 ) ;
  72. $m1 = substr ( $m , 0 , 1 ) ;
  73. $m2 = substr ( $m , 0 , 2 ) ;
  74. $i = "{$m1}/{$m2}/" . $this->myurlencode ( ucfirst ( $name ) ) ;
  75. if ( count ($parts ) > 1 ) {
  76. $lang = array_shift ( $parts ) ;
  77. $url = "http://upload.wikimedia.org/wikipedia/{$lang}/{$i}" ;
  78. $url2 = "http://upload.wikimedia.org/wikipedia/commons/{$i}" ;
  79. $h = @fopen ( $url , "r" ) ;
  80. if ( $h === false ) $url = $url2 ;
  81. else fclose ( $h ) ;
  82. } else if ( count ($parts2 ) > 1 ) {
  83. $lang = array_shift ( $parts2 ) ;
  84. $url = "http://upload.wikimedia.org/wikibooks/{$lang}/{$i}" ;
  85. $url2 = "http://upload.wikimedia.org/wikipedia/commons/{$i}" ;
  86. $h = @fopen ( $url , "r" ) ;
  87. if ( $h === false ) $url = $url2 ;
  88. else fclose ( $h ) ;
  89. } else {
  90. $url = "http://{$site}/images/{$i}" ;
  91. }
  92. # print "<a href='{$url}'>{$url}</a><br/>" ;
  93. return $url ;
  94. }
  95. function do_show_images () {
  96. return true ;
  97. }
  98. }
  99. # Access through HTTP protocol
  100. class ContentProviderHTTP extends ContentProvider {
  101. var $article_cache = array () ;
  102. var $first_title = "" ;
  103. var $load_error ;
  104. function between_tag ( $tag , &$text ) {
  105. $a = explode ( "<{$tag}" , $text , 2 ) ;
  106. if ( count ( $a ) == 1 ) return "" ;
  107. $a = explode ( ">" , " " . array_pop ( $a ) , 2 ) ;
  108. if ( count ( $a ) == 1 ) return "" ;
  109. $a = explode ( "</{$tag}>" , array_pop ( $a ) , 2 ) ;
  110. if ( count ( $a ) == 1 ) return "" ;
  111. return array_shift ( $a ) ;
  112. }
  113. function do_get_contents ( $title ) {
  114. global $xmlg ;
  115. $use_se = false ;
  116. if ( isset ( $xmlg["use_special_export"] ) && $xmlg["use_special_export"] == 1 ) $use_se = true ;
  117. if ( $xmlg["useapi"] ) {
  118. $url = "http://" . $xmlg["site_base_url"] . "/api.php?format=php&action=query&prop=revisions&rvexpandtemplates=1&rvprop=timestamp|user|comment|content&titles=" . urlencode ( $title ) ;
  119. $data = @file_get_contents ( $url ) ;
  120. $data = unserialize ( $data ) ;
  121. $data = $data['query'] ; if ( !isset ( $data ) ) return "" ;
  122. $data = $data['pages'] ; if ( !isset ( $data ) ) return "" ;
  123. $data = array_shift ( $data ) ;
  124. $data = $data['revisions'] ; if ( !isset ( $data ) ) return "" ;
  125. $data = $data['0'] ; if ( !isset ( $data ) ) return "" ;
  126. $data = $data['*'] ; if ( !isset ( $data ) ) return "" ;
  127. return $data ;
  128. # $data = $data['page'] ; if ( !isset ( $data ) ) return "" ;
  129. # $data = $data['revision'] ; if ( !isset ( $data ) ) return "" ;
  130. # $data = $data['ref'] ; if ( !isset ( $data ) ) return "" ;
  131. #print urldecode ( $url ) . "\n" ;
  132. print "<pre>" ; print_r ( $data ) ; print "</pre>" ;
  133. exit ;
  134. $s = "Still here..." ;
  135. return $s ;
  136. } else if ( $use_se ) {
  137. $url = "http://" . $xmlg["site_base_url"] . "/index.php?listauthors=1&title=Special:Export/" . urlencode ( $title ) ;
  138. } else {
  139. if ( $xmlg["use_toolserver_url"] ) {
  140. # $url = "http://" . $xmlg["site_base_url"] . "/index.php?action=raw&title=" . urlencode ( $title ) ;
  141. $u = urlencode ( $title ) ;
  142. $site = array_shift ( explode ( "/" , $xmlg["site_base_url"] ) ) ;
  143. $url = "http://tools.wikimedia.de/~daniel/WikiSense/WikiProxy.php?wiki={$site}&title={$u}&rev=0&go=Fetch" ;
  144. } else {
  145. $url = "http://" . $xmlg["site_base_url"] . "/index.php?action=raw&title=" . urlencode ( $title ) ;
  146. }
  147. }
  148. $s = @file_get_contents ( $url ) ;
  149. if ( $use_se ) {
  150. $text = html_entity_decode ( $this->between_tag ( "text" , $s ) ) ;
  151. $this->authors = array () ;
  152. $authors = $this->between_tag ( "contributors" , $s ) ;
  153. $authors = explode ( "</contributor><contributor>" , $authors ) ;
  154. foreach ( $authors AS $author ) {
  155. $id = $this->between_tag ( "id" , $author ) ;
  156. if ( $id == '0' || $id == '' ) continue ; # Skipping IPs and (possibly) broken entries
  157. $name = $this->between_tag ( "username" , $author ) ;
  158. $this->authors[] = $name ;
  159. }
  160. $s = $text ;
  161. }
  162. return $s ;
  163. }
  164. function get_wiki_text ( $title , $do_cache = false ) {
  165. global $xmlg ;
  166. $load_error = false ;
  167. $title = trim ( $title ) ;
  168. if ( $title == "" ) return "" ; # Just in case...
  169. if ( isset ( $this->article_cache[$title] ) ) # Already in the cache
  170. return $this->article_cache[$title] ;
  171. if ( $this->first_title == "" ) $this->first_title = $title ;
  172. # Retrieve it
  173. $t1 = microtime_float() ;
  174. $s = $this->do_get_contents ( $title ) ;
  175. if ( strtoupper ( substr ( $s , 0 , 9 ) ) == "#REDIRECT" ) {
  176. $t2 = explode ( "[[" , $s , 2 ) ;
  177. $t2 = array_pop ( $t2 ) ;
  178. $t2 = explode ( "]]" , $t2 , 2 ) ;
  179. $t2 = array_shift ( $t2 ) ;
  180. $s = $this->do_get_contents ( $t2 ) ;
  181. }
  182. $this->load_time += microtime_float() - $t1 ;
  183. $comp = '<!DOCTYPE html PUBLIC "-//W3C//DTD' ;
  184. if ( substr ( $s , 0 , strlen ( $comp ) ) == $comp ) $s = "" ; # Catching wrong title error
  185. if ( $do_cache ) $this->article_cache[$title] = $s ;
  186. return $s ;
  187. }
  188. function get_local_url ( $title ) {
  189. return "/" . array_pop ( explode ( "/" , $this->get_var ( 'site_base_url' ) , 2 ) ) . "/index.php?title=" . urlencode ( $title ) ;
  190. }
  191. function get_server_url () {
  192. return "http://" . array_shift ( explode ( "/" , $this->get_var ( 'site_base_url' ) , 2 ) ) ;
  193. }
  194. function get_full_url ( $title ) {
  195. return $this->get_server_url () . $this->get_local_url ( $title ) ;
  196. }
  197. function get_namespace_template () {
  198. return $this->get_var ( 'namespace_template' ) ;
  199. }
  200. function get_var ( $var ) {
  201. global $xmlg ;
  202. if ( !isset ( $xmlg[$var] ) ) return false ;
  203. return $xmlg[$var] ;
  204. }
  205. function get_template_text ( $title ) {
  206. # Check for fix variables
  207. if ( $title == "PAGENAME" ) return $this->first_title ;
  208. if ( $title == "PAGENAMEE" ) return urlencode ( $this->first_title ) ;
  209. if ( $title == "SERVER" ) return $this->get_server_url () ;
  210. if ( $title == "CURRENTDAYNAME" ) return date ( "l" ) ;
  211. if ( strtolower ( substr ( $title , 0 , 9 ) ) == "localurl:" )
  212. return $this->get_local_url ( substr ( $title , 9 ) ) ;
  213. $title = trim ( $title ) ;
  214. if ( count ( explode ( ":" , $title , 2 ) ) == 1 ) # Does the template title contain a ":"?
  215. $title = $this->get_namespace_template() . ":" . $title ;
  216. else if ( substr ( $title , 0 , 1 ) == ":" ) # Main namespace
  217. $title = substr ( $title , 1 ) ;
  218. return $this->get_wiki_text ( $title , true ) ; # Cache template texts
  219. }
  220. function get_internal_link ( $target , $text ) {
  221. return $text ; # Dummy
  222. }
  223. }
  224. # Access through text file structure
  225. class ContentProviderTextFile extends ContentProviderHTTP {
  226. var $file_ending = ".txt" ;
  227. function do_get_contents ( $title ) {
  228. return $this->get_page_text ( $title ) ;
  229. }
  230. /**
  231. Called from outside
  232. Could probably remained unchanged from HTTP class, but this is shorter, and caching is irrelevant for text files (disk cache)
  233. */
  234. function get_wiki_text ( $title , $do_cache = false ) {
  235. $title = trim ( $title ) ;
  236. if ( $title == "" ) return "" ; # Just in case...
  237. if ( $this->first_title == "" ) {
  238. $this->first_title = $title ;
  239. }
  240. $text = $this->get_page_text ( $title ) ;
  241. return $text ;
  242. }
  243. function get_file_location ( $ns , $title ) {
  244. return get_file_location_global ( $this->basedir , $ns , $title , false ) ;
  245. }
  246. function get_page_text ( $page , $allow_redirect = true ) {
  247. $filename = $this->get_file_location ( 0 , $page ) ;
  248. $filename = $filename->fullname . $this->file_ending ;
  249. if ( !file_exists ( $filename ) ) return "" ;
  250. $text = trim ( file_get_contents ( $filename ) ) ;
  251. # REDIRECT?
  252. if ( $allow_redirect && strtoupper ( substr ( $text , 0 , 9 ) ) == "#REDIRECT" ) {
  253. $text = substr ( $text , 9 ) ;
  254. $text = array_shift ( explode ( "\n" , $text , 2 ) ) ;
  255. $text = str_replace ( "[[" , "" , $text ) ;
  256. $text = str_replace ( "]]" , "" , $text ) ;
  257. $text = ucfirst ( trim ( $text ) ) ;
  258. $text = $this->get_page_text ( $text , false ) ;
  259. }
  260. return $text ;
  261. }
  262. function get_internal_link ( $target , $text ) {
  263. $file = $this->get_file_location ( 0 , $target ) ;
  264. if ( !file_exists ( $file->fullname.$this->file_ending ) ) return $text ;
  265. else return "<a href='browse_texts.php?title=" . urlencode ( $target ) . "'>{$text}</a>" ;
  266. }
  267. function do_show_images () {
  268. return false ;
  269. }
  270. }
  271. # Access through MySQL interface
  272. # (Used via the extension via Special::wiki2XML)
  273. class ContentProviderMySQL extends ContentProviderHTTP {
  274. function do_get_contents ( $title ) {
  275. return $this->get_page_text ( $title ) ;
  276. }
  277. /**
  278. Called from outside
  279. */
  280. function get_wiki_text ( $title , $do_cache = false ) {
  281. $title = trim ( $title ) ;
  282. if ( $title == "" ) return "" ; # Just in case...
  283. if ( $this->first_title == "" ) {
  284. $this->first_title = $title ;
  285. }
  286. $text = $this->get_page_text ( $title ) ;
  287. return $text ;
  288. }
  289. function get_file_location ( $ns , $title ) {
  290. return get_file_location_global ( $this->basedir , $ns , $title , false ) ;
  291. }
  292. function get_page_text ( $page , $allow_redirect = true ) {
  293. $title = Title::newFromText ( $page ) ;
  294. $article = new Article ( $title ) ;
  295. # article does not exist?
  296. if (!$article->exists()) {
  297. return "";
  298. }
  299. $text = $article->getContent () ;
  300. # REDIRECT?
  301. if ( $allow_redirect && strtoupper ( substr ( $text , 0 , 9 ) ) == "#REDIRECT" ) {
  302. $text = substr ( $text , 9 ) ;
  303. $text = array_shift ( explode ( "\n" , $text , 2 ) ) ;
  304. $text = str_replace ( "[[" , "" , $text ) ;
  305. $text = str_replace ( "]]" , "" , $text ) ;
  306. $text = ucfirst ( trim ( $text ) ) ;
  307. $text = $this->get_page_text ( $text , false ) ;
  308. }
  309. return $text ;
  310. }
  311. function get_internal_link ( $target , $text ) {
  312. $file = $this->get_file_location ( 0 , $target ) ;
  313. if ( !file_exists ( $file->fullname.$this->file_ending ) ) return $text ;
  314. else return "<a href='browse_texts.php?title=" . urlencode ( $target ) . "'>{$text}</a>" ;
  315. }
  316. function do_show_images () {
  317. return false ;
  318. }
  319. }
  320. ?>