PageRenderTime 49ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/webui/include/libindri.php

https://github.com/oroszgy/sefh
PHP | 380 lines | 231 code | 85 blank | 64 comment | 43 complexity | d822f084c5966f3fb47f24072cafa27c MD5 | raw file
  1. <?php
  2. include( "config.php" );
  3. include( "libindri_php.php" );
  4. ?>
  5. <?php
  6. /*if (!dl( $indri_param['library'] )) {
  7. // failed to load shared library, bail the page
  8. ?>
  9. <html>
  10. <head>
  11. <title>Indri PHP test not ready</title>
  12. </head>
  13. <body>
  14. Unable to load library: <?= $indri_param['library'] ?>
  15. </body>
  16. </html>
  17. <?php
  18. return;
  19. }*/
  // Raise the PHP memory limit for this script to the configured value.
  // $indri_param is not defined in this file; presumably it is provided by
  // the included config.php.
  ini_set( 'memory_limit', $indri_param['memory_limit'] );
  //
  // indri_getRawNodes
  //
  // Walks a node tree depth-first and collects the names of every node
  // whose type is "RawScorerNode". Such a node is treated as a leaf: its
  // own name is returned and its children are not visited.
  //
  // $node   - object exposing ->type, ->name and (for other node types)
  //           an iterable ->children
  // returns - flat array of node names in traversal order
  //
  function indri_getRawNodes( $node ) {
      // leaf case: a raw scorer contributes just its own name
      if ( $node->type == "RawScorerNode" ) {
          return array( $node->name );
      }

      // otherwise gather the names found beneath each child, in order
      $names = array();
      foreach ( $node->children as $childNode ) {
          $names = array_merge( $names, indri_getRawNodes( $childNode ) );
      }
      return $names;
  }
  //
  // indri_matchCompare
  //
  // Ordering callback for usort(): compares two extents by their ->begin
  // value. Returns -1 when $one starts first, 1 when $two starts first,
  // and 0 when both start at the same place.
  //
  function indri_matchCompare( $one, $two ) {
      $left = $one->begin;
      $right = $two->begin;
      return ( $left < $right ) ? -1 : ( ( $left > $right ) ? 1 : 0 );
  }
  //
  // position class
  //
  // Plain value holder for an extent described by a begin and an end offset.
  //
  class position {
      var $begin;
      var $end;

      // Constructor. A method that merely shares the class name is no longer
      // treated as a constructor by PHP 8, so __construct must exist for
      // "new position( $b, $e )" to populate the fields.
      function __construct( $b, $e ) {
          $this->begin = $b;
          $this->end = $e;
      }

      // Legacy PHP 4-style constructor name, kept as an ordinary method so
      // that any code calling ->position( $b, $e ) explicitly still works.
      function position( $b, $e ) {
          $this->__construct( $b, $e );
      }
  }
  //
  // indri_documentMatches
  //
  // Collects the annotated extents that belong to $document and lie inside
  // $range, then merges overlapping extents into a sorted list.
  //
  // $document    - document identifier compared against each extent's ->document
  // $annotations - array mapping a node name to an array of extents, each
  //                exposing ->document, ->begin and ->end
  // $nodeNames   - node names to look up in $annotations
  // $range       - object with ->begin and ->end; only extents fully inside
  //                it are kept
  // returns      - array of position objects ordered by begin, with
  //                overlapping extents coalesced
  //
  function indri_documentMatches( $document, $annotations, $nodeNames, $range ) {
      $rawmatches = array();

      foreach ( $nodeNames as $node ) {
          if ( !isset( $annotations[$node] ) ) {
              // nothing was annotated for this node; skip it rather than
              // trying to iterate over a non-array placeholder
              continue;
          }

          foreach ( $annotations[$node] as $position ) {
              if ( $document == $position->document &&
                   $range->begin <= $position->begin &&
                   $range->end >= $position->end ) {
                  $rawmatches[] = $position;
              }
          }
      }

      $total = count( $rawmatches );
      if ( $total == 0 ) {
          return array();
      }

      // sort the array by begin offset
      usort( $rawmatches, "indri_matchCompare" );

      // remove and coalesce duplicates: extend the current extent while the
      // next one starts at or before its end, otherwise emit it and start anew
      $matches = array();
      $begin = $rawmatches[0]->begin;
      $end = $rawmatches[0]->end;

      for ( $i = 1; $i < $total; $i++ ) {
          $current = $rawmatches[$i];

          if ( $current->begin > $end ) {
              // gap before this extent: the accumulated match is complete
              $matches[] = new position( $begin, $end );
              $begin = $current->begin;
          }

          if ( $end < $current->end ) {
              $end = $current->end;
          }
      }

      // add the final accumulated match
      $matches[] = new position( $begin, $end );

      return $matches;
  }
  //
  // indri_sanitizetext
  //
  // Strips markup from $rawsnippet. <style> and <script> elements are
  // removed together with their contents first (strip_tags alone would
  // leave the CSS/JavaScript text behind); all remaining tags are then
  // dropped by strip_tags.
  //
  function indri_sanitizetext( $rawsnippet ) {
      $blockPatterns = array(
          "'<style[^>]*>.*</style>'siU",
          "~<script[^>]*>.+</script[^>]*>~isU",
      );
      $withoutBlocks = preg_replace( $blockPatterns, '', $rawsnippet );
      return strip_tags( $withoutBlocks );
  }
  //
  // indri_buildsnippet
  //
  // $text is the text of the document
  // $matches is an array of extents of term matches in the document
  // $positions is an array of byte offsets of words in the document text
  // $windowSize is the target snippet length, in words
  // $range is the extent (->begin, ->end) that the snippet must stay within
  //
  // This method tries to put as many matches as possible into $windowSize
  // words. However, we give at least 7 words of context for every match;
  // that takes precedence. Therefore, some matches may get left out of
  // the snippet.
  //
  // If two matches are close enough, they will be contiguous in the
  // snippet. Otherwise the snippet is built from separate segments, with
  // a bold ellipsis ('...') marking where text was cut.
  //
  // The first part of this method figures out which matches will be
  // included in the snippet, while the second part builds the snippet text.
  //
  // Returns the snippet as an HTML string with matches wrapped in <strong>,
  // or an empty string when $matches is empty.
  //
  function indri_buildsnippet( $text, $matches, $positions, $windowSize, $range ) {
      $matchCount = count( $matches );
      if ( $matchCount == 0 ) {
          // nothing to show; this also avoids dividing by zero below
          return "";
      }

      // words of context granted to each match, clamped to [15, 30]
      $matchWidth = (int) ( $windowSize / $matchCount );
      if ( $matchWidth < 15 ) {
          // want at least 7 words around each match
          $matchWidth = 15;
      } else if ( $matchWidth > 30 ) {
          $matchWidth = 30;
      }
      $leadWords = (int) ceil( $matchWidth / 2 );
      $trailWords = (int) floor( $matchWidth / 2 );

      //
      // part one: decide which matches go into which segment
      //
      $match = array( "begin" => $matches[0]->begin, "end" => $matches[0]->end );

      $begin = $match["begin"] - $leadWords;
      $end = $match["end"] + $trailWords;

      if ( $begin < $range->begin )
          $begin = $range->begin;
      if ( $end >= $range->end )
          $end = $range->end;

      if ( $range->end - $range->begin <= $windowSize ) {
          // the whole range fits in the window, so show all of it
          $begin = $range->begin;
          $end = $range->end;
      }

      $segment = array( "begin" => $begin,
                        "end" => $end,
                        "matches" => array( $match ) );
      $segments = array();
      $words = 0;

      // figure out what matches to coalesce
      for ( $i = 1; $i < $matchCount; $i++ ) {
          $match = array( "begin" => $matches[$i]->begin,
                          "end" => $matches[$i]->end );

          $begin = $match["begin"] - $leadWords;
          $end = $match["end"] + $trailWords;

          if ( $begin < $range->begin )
              $begin = $range->begin;
          if ( $end >= $range->end )
              $end = $range->end;

          if ( $segment["end"] >= $begin ) {
              // context windows touch: grow the current segment
              $segment["end"] = $end;
              $segment["matches"][] = $match;
              $words += ( $segment["end"] - $segment["begin"] );
          } else {
              // too far apart: close the current segment and open a new one
              $segments[] = $segment;
              $words += ( $segment["end"] - $segment["begin"] );
              $segment = array( "begin" => $begin,
                                "end" => $end,
                                "matches" => array( $match ) );
          }

          if ( $words > $windowSize ) {
              // word budget used up; remaining matches are left out
              break;
          }
      }

      $segments[] = $segment;

      //
      // part two: build snippet text from the list of segments
      //
      $output = "";
      $segmentCount = count( $segments );

      for ( $i = 0; $i < $segmentCount; $i++ ) {
          $segment = $segments[$i];
          $begin = $segment["begin"];
          $end = $segment["end"];
          $segmentMatches = $segment["matches"];

          if ( $begin > $range->begin && $i == 0 ) {
              // the snippet does not start at the beginning of the range
              $output .= "<strong>...</strong>";
          }

          // word indexes are translated to byte offsets through $positions;
          // $end is exclusive, hence the -1
          $endByte = $positions[$end - 1]->end;
          $current = $positions[$begin]->begin;

          $segmentMatchCount = count( $segmentMatches );
          for ( $j = 0; $j < $segmentMatchCount; $j++ ) {
              $beginMatch = $segmentMatches[$j]["begin"];
              $endMatch = $segmentMatches[$j]["end"];
              $matchStartByte = $positions[$beginMatch]->begin;
              $matchEndByte = $positions[$endMatch - 1]->end;

              // plain text leading up to the match
              $output .= indri_sanitizetext( substr( $text, $current, $matchStartByte - $current ) );

              // the match itself, highlighted
              $output .= "<strong>";
              $output .= indri_sanitizetext( substr( $text, $matchStartByte, $matchEndByte - $matchStartByte ) );
              $output .= "</strong>";

              $current = $matchEndByte;
          }

          // plain text after the last match of this segment
          $output .= indri_sanitizetext( substr( $text, $current, $endByte - $current ) );

          if ( $end < $range->end - 1 ) {
              // more text follows this segment
              $output .= "<strong>...</strong>";
          }
      }

      return $output;
  }
  //
  // indri_setupenvironment
  //
  // Points the query environment at the configured server and works out
  // which result the current page should start from.
  //
  // $param   - configuration array; its 'server' entry is handed to
  //            $env->addServer()
  // $env     - query environment object exposing addServer()
  // $request - request parameter array; 'startdoc' is optional
  // returns  - the url-decoded 'startdoc' request value, or 0 when absent
  //
  function indri_setupenvironment( $param, $env, $request ) {
      // read the server from the $param argument: no $indri_param variable
      // exists inside this function's scope
      $env->addServer( $param['server'] );
      //$env->addIndex( $param[ 'index' ] );
      //$rules = array( "method:dirichlet,mu:250,field:mainbody,operator:term", "method:dirichlet,mu:1000,field:mainbody,operator:window", "method:dirichlet,mu:100,field:inlink,operator:term", "method:dirichlet,mu:100,field:inlink,operator:window", "method:dirichlet,mu:10,field:title,operator:term", "method:dirichlet,mu:5,field:title,operator:window", "method:dirichlet,mu:40,field:heading,operator:term", "method:dirichlet,mu:80,field:heading,operator:window" );
      //$env->setScoringRules( $rules );

      if ( isset( $request['startdoc'] ) ) {
          return urldecode( $request['startdoc'] );
      }

      return 0;
  }
  //
  // indri_cleanquery
  //
  // Returns $query with every question mark removed.
  //
  function indri_cleanquery( $query ) {
      $questionMark = "/\?/";
      $cleaned = preg_replace( $questionMark, "", $query );
      return $cleaned;
  }
  //
  // indri_timer
  //
  // Returns the current Unix time as a float with sub-second precision.
  // microtime() yields the string "<fraction> <seconds>"; both parts are
  // converted to floats and added together.
  //
  function indri_timer() {
      $parts = explode( " ", microtime() );
      $fraction = (float) $parts[0];
      $seconds = (float) $parts[1];
      return ( $fraction + $seconds );
  }
  //
  // indri_buildlink
  //
  // Builds the relative URL of the page that displays a single document.
  // $request is accepted for interface compatibility but is not used.
  //
  function indri_buildlink( $request, $document ) {
      $target = "showdoc.php?documentID=";
      return $target . $document;
  }
  //
  // indri_nextlink
  //
  // Builds the URL of the result page that starts at $startdoc, carrying
  // the current query along when there is one.
  //
  // $request  - request parameter array; a 'query' entry that is missing
  //             or equal to "None" adds no query parameter
  // $startdoc - index of the first result on the target page
  // returns   - relative URL, passed through indri_escapeurl()
  //
  function indri_nextlink( $request, $startdoc ) {
      // start empty so the return below is well-defined without a query
      $query = "";

      if ( isset( $request['query'] ) && $request['query'] != "None" ) {
          // percent-encode the user-supplied value so that characters such
          // as & % " < > cannot corrupt the URL or escape the attribute it
          // is later embedded in
          $query = "&query=" . rawurlencode( $request['query'] );
      }

      return indri_escapeurl( "query.php?startdoc=" . $startdoc . $query );
  }
  //
  // indri_escapeurl
  //
  // Percent-encodes the space, '#', '[' and ']' characters in $text.
  // All other characters are left untouched.
  //
  function indri_escapeurl( $text ) {
      $escapes = array(
          ' ' => '%20',
          '#' => '%23',
          '[' => '%5B',
          ']' => '%5D',
      );
      return str_replace( array_keys( $escapes ), array_values( $escapes ), $text );
  }
  //
  // indri_escapetags
  //
  // Replaces '<' and '>' in $text with their HTML entities so that markup
  // is displayed literally. Ampersands and quotes are not touched.
  //
  function indri_escapetags( $text ) {
      $withoutOpeners = str_replace( '<', '&lt;', $text );
      return str_replace( '>', '&gt;', $withoutOpeners );
  }
  //
  // indri_printlinks
  //
  // Renders the previous/next paging links as an <h4> element.
  //
  // $request     - request parameter array, forwarded to indri_nextlink()
  // $startdoc    - index of the first result on the current page
  // $resultCount - number of results on the current page
  // $pagedocs    - number of results per page
  //
  // The "previous" link appears once $startdoc has reached at least one
  // page; the "next" link appears when the current page is full. When
  // both are present they are separated by '|'.
  //
  function indri_printlinks( $request, $startdoc, $resultCount, $pagedocs ) {
      $forwardUrl = indri_nextlink( $request, $startdoc + $pagedocs );
      $backUrl = indri_nextlink( $request, $startdoc - $pagedocs );

      $anchors = array();
      if ( $startdoc >= $pagedocs ) {
          $anchors[] = "<a href=\"" . $backUrl . "\">Előző " . $pagedocs . "</a>";
      }
      if ( $resultCount >= $pagedocs ) {
          $anchors[] = "<a href=\"" . $forwardUrl . "\">Következő " . $pagedocs . "</a>";
      }

      return "<h4>" . implode( '|', $anchors ) . "</h4>";
  }
  //
  // indri_insert_base_tag
  //
  // Makes the relative image and anchor links of a stored document usable
  // when the document is displayed from this site. Despite the name, no
  // <base> element is inserted: the document's base URL is instead
  // prefixed to every <img src="..."> and <a href="..."> value that does
  // not start with "http".
  //
  // $document - object exposing ->text (HTML) and ->metadata (array that
  //             may contain a "url" entry)
  // returns   - the document text; unchanged when no url is known or when
  //             the text already contains a <base tag
  //
  // Limitations kept from the original rewrite rules: only double-quoted
  // attributes written exactly as <img src=" / <a href=" are handled, and
  // every non-"http" value (including root-relative paths, "#fragment"
  // and "mailto:" links) receives the prefix.
  //
  function indri_insert_base_tag( $document ) {
      if ( !isset( $document->metadata["url"] ) ) {
          return $document->text;
      }

      $text = $document->text;
      $url = $document->metadata["url"];

      // strpos/stripos return an offset or false, never true, so the
      // result must be compared against false
      if ( stripos( $text, "<base" ) !== false ) {
          // don't need to rewrite anything if a base tag is already there
          return $text;
      }

      // the base is everything up to and including the last "/" of the
      // path; any query string or fragment is ignored when looking for it
      $location = substr( $url, 0, strcspn( $url, "?#" ) );
      $scheme_end = strpos( $location, "://" );
      $path_search_start = ( $scheme_end === false ) ? 0 : $scheme_end + 3;
      $last_slash = strrpos( $location, "/" );

      if ( $last_slash === false || $last_slash < $path_search_start ) {
          // no path component at all, e.g. "http://example.com"
          $base_url = $location . "/";
      } else {
          $base_url = substr( $location, 0, $last_slash + 1 );
      }

      // "\" and "$" are special in a preg_replace replacement string;
      // escape them so the url is inserted literally
      $replacement_base = str_replace( array( '\\', '$' ), array( '\\\\', '\\$' ), $base_url );

      // clean up image links
      $text = preg_replace( '@<img src="(?!http)([^"]*)@siU', '<img src="' . $replacement_base . '\1', $text );
      // clean up a links
      $text = preg_replace( '@<a href="(?!http)([^"]*)@siU', '<a href="' . $replacement_base . '\1', $text );

      return $text;
  }