/DumpReader.php

https://github.com/adamwight/wikipedia-offline-patch · PHP · 135 lines · 103 code · 12 blank · 20 comment · 15 complexity · 39169a89e6564362d8491daa0898a1dc MD5 · raw file

  1. <?php
  2. class DumpReader
  3. {
  4. #
  5. # open the index files and lookup the chunk. Load article source.
  6. #
  7. static function load_article($title)
  8. {
  9. $article_wml = null;
  10. $results = self::index_search($title);
  11. if (count($results) > 0) {
  12. $archive_file = $results[0][0];
  13. $title = $results[0][1];
  14. $article_wml = self::load_all_data($title, $archive_file);
  15. } else {
  16. $article_wml = ""; //TODO or null?
  17. }
  18. $article_wml = htmlspecialchars_decode($article_wml);
  19. return $article_wml;
  20. }
  21. static function load_all_data($title, $file_name)
  22. {
  23. wfDebug("loading chunk [$file_name] to find article [$title]");
  24. $article_wml = "";
  25. $matches = array();
  26. $all_chunk_data = self::load_bz($file_name);
  27. if (preg_match("/<title>".preg_quote($title, '/')."<\/title>.*?<text[^>]*>(.*)/s",
  28. $all_chunk_data, $matches))
  29. {
  30. $all_chunk_data = $matches[1];
  31. while (isset($all_chunk_data)) {
  32. $end_pos = strpos($all_chunk_data, '</text>');
  33. if ($end_pos !== FALSE) {
  34. $article_wml .= substr($all_chunk_data, 0, $end_pos);
  35. break;
  36. }
  37. $article_wml .= $all_chunk_data;
  38. wfDebug('continuing into next bz2 chunk');
  39. $file_name = self::increment_file($file_name);
  40. $all_chunk_data = self::load_bz($file_name);
  41. }
  42. }
  43. return $article_wml;
  44. }
  45. #
  46. # open chosen bz2 split, decompress and return
  47. # TODO begin bzcat at chunk, let pipe load in the bg process.
  48. #
  49. static function load_bz($file_name)
  50. {
  51. global $wgOfflineWikiPath;
  52. $path = "$wgOfflineWikiPath/$file_name";
  53. if (strlen($file_name) < 1) return null; #strange that bzopen doesn't choke on dir.
  54. $bz = bzopen($path, "r");
  55. if (!$bz) return null;
  56. $out = "";
  57. while ($bz && !feof($bz)) {
  58. $out .= bzread($bz, 8192);
  59. }
  60. bzclose($bz);
  61. return $out;
  62. }
  63. #
  64. # use the index files
  65. #
  66. static function index_search($title)
  67. {
  68. $title = strtr($title, '_', ' ');
  69. $title = strtolower(trim($title));
  70. #wfDebug("looking up word [$title]");
  71. try {
  72. require_once("xapian.php");
  73. global $wgOfflineWikiPath;
  74. $db = new XapianDatabase("$wgOfflineWikiPath/db");
  75. #$qp = new XapianQueryParser();
  76. #$qp->set_database($db);
  77. #$stemmer = new XapianStem("english");
  78. #$qp->set_stemmer($stemmer);
  79. #$query = $qp->parse_query($title);
  80. $query = new XapianQuery($title);
  81. $enquire = new XapianEnquire($db);
  82. $enquire->set_query($query);
  83. $matches = $enquire->get_mset(0, 25);
  84. if (0 /*SCORING*/) {
  85. $scores = array();
  86. for ($i = $matches->begin(); !$i->equals($matches->end()); $i->next())
  87. {
  88. $row = $i->get_document();
  89. $str = $i->get_percent()."% [".$row->get_data()."]";
  90. $scores[] = $str;
  91. if (1/*DEBUG*/) wfDebug("$str\n");
  92. }
  93. }
  94. $result = array();
  95. for ($i = $matches->begin(); !$i->equals($matches->end()); $i->next())
  96. {
  97. $entry = $i->get_document()->get_data();
  98. $fsep = strpos($entry, ':');
  99. $row = array(substr($entry, 0, $fsep), substr($entry, $fsep + 1));
  100. $result[] = $row;
  101. }
  102. # not in Xapian 1.0.X
  103. #$db->close();
  104. return $result;
  105. } catch (Exception $e) {
  106. wfDebug(__METHOD__.':'.$e->getMessage());
  107. return null;
  108. }
  109. }
  110. static function increment_file($fname)
  111. {
  112. // XXX assuming a lot
  113. $matches = array();
  114. // TODO fails on 99 ...
  115. if (preg_match('/(.*?)([1-9][0-9]*)(.*?)$/', $fname, $matches)) {
  116. $i = $matches[2];
  117. return preg_replace("/$i/", $i + 1, $fname);
  118. } else {
  119. wfDebug('Failed to grok your wiki-splits filename pattern');
  120. return false;
  121. }
  122. }
  123. }