PageRenderTime 48ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/cms/modules/search/include/searchfuncs.php

https://github.com/akash6190/pragyan
PHP | 636 lines | 514 code | 109 blank | 13 comment | 175 complexity | e505cf6ebdae940136d991afddcb7ea8 MD5 | raw file
  1. <?php
  2. /*******************************************
  3. * Sphider Version 1.3.x
  4. * This program is licensed under the GNU GPL.
  5. * By Ando Saabas ando(a t)cs.ioc.ee
  6. ********************************************/
  7. error_reporting(E_ALL ^ E_NOTICE);
  /**
   * Moves the preferred entry of $arr[$start..] into slot $start.
   *
   * Scanning begins at $start. The first entry whose 'domain' loosely equals
   * $domain is chosen immediately; when no such entry exists, the entry with
   * the largest 'weight' is chosen (the earliest one on ties). The chosen
   * entry and the entry at $start then trade places. $arr is modified in place.
   *
   * @param array $arr    List of rows, each carrying 'weight' and 'domain'.
   * @param int   $start  Index of the slot to fill.
   * @param mixed $domain Domain value that takes precedence over weight.
   */
  function swap_max (&$arr, $start, $domain) {
      $chosen = $start;
      $heaviest = $arr[$chosen]['weight'];
      $total = count($arr);
      $cursor = $start;
      while ($cursor < $total) {
          // A matching domain ends the scan regardless of weight.
          if ($arr[$cursor]['domain'] == $domain) {
              $chosen = $cursor;
              break;
          }
          if ($arr[$cursor]['weight'] > $heaviest) {
              $heaviest = $arr[$cursor]['weight'];
              $chosen = $cursor;
          }
          $cursor++;
      }
      $displaced = $arr[$start];
      $arr[$start] = $arr[$chosen];
      $arr[$chosen] = $displaced;
  }
  /**
   * Reorders $arr in place by repeatedly calling swap_max().
   *
   * For every slot except the last, swap_max() fills the slot, favouring the
   * domain of the entry placed in the previous slot (-1 for the first slot).
   *
   * @param array $arr List of rows, each carrying 'weight' and 'domain'.
   */
  function sort_with_domains (&$arr) {
      $previous_domain = -1;
      $last = count($arr) - 1;
      $slot = 0;
      while ($slot < $last) {
          swap_max($arr, $slot, $previous_domain);
          $previous_domain = $arr[$slot]['domain'];
          $slot++;
      }
  }
  /**
   * Comparison callback that orders rows by descending 'weight'.
   *
   * @param array $a Row with a 'weight' entry.
   * @param array $b Row with a 'weight' entry.
   * @return int 0 when the weights are loosely equal, -1 when $a is heavier,
   *             1 otherwise.
   */
  function cmp($a, $b) {
      $left = $a['weight'];
      $right = $b['weight'];
      if ($left == $right) {
          return 0;
      }
      if ($left > $right) {
          return -1;
      }
      return 1;
  }
  /**
   * Joins the words of a string with '+' signs.
   *
   * Runs of spaces are collapsed to one space, a space directly before a '+'
   * is dropped, and every remaining space becomes '+'.
   *
   * @param string $a Text to convert.
   * @return string The converted text.
   */
  function addmarks($a) {
      $collapsed = preg_replace("/[ ]+/", " ", $a);
      // str_replace applies the searches in order: " +" first, then " ".
      return str_replace(array(" +", " "), "+", $collapsed);
  }
  /**
   * Splits a raw query string into the term lists used by search().
   *
   * The returned array may contain these keys (each only when non-empty):
   *   '+s'      quoted phrases that must appear,
   *   '-s'      quoted phrases prefixed with '-',
   *   '+'       single words to include (plain, '+'-prefixed, and the words
   *             of every '+s' phrase) that ignoreWord() does not reject,
   *   '-'       words prefixed with '-',
   *   'ignore'  include words rejected by ignoreWord(),
   *   'hilight' phrases and accepted words (plus stem() of each word when
   *             $stem_words is 1).
   *
   * Reads the globals $entities (pattern => replacement map applied to the
   * query first) and $stem_words.
   *
   * @param string $a Raw query string.
   * @return array Term lists as described above.
   */
  function makeboollist($a) {
      global $entities, $stem_words;

      // foreach always walks the whole map. The previous each() loop depended
      // on the global array's internal pointer, which was never reset, so the
      // replacements were skipped on every call after the first.
      if (is_array($entities)) {
          foreach ($entities as $entity => $replacement) {
              $a = preg_replace("/".$entity."/i", $replacement, $a);
          }
      }
      $a = trim($a);
      $a = preg_replace("/&quot;/i", "\"", $a);
      $returnWords = array();

      // Pull every quoted phrase out of the query, remembering whether it
      // was prefixed with '-'.
      $regs = array();
      while (preg_match("/([-]?)\"([^\"]+)\"/", $a, $regs)) {
          if ($regs[1] == '') {
              $returnWords['+s'][] = $regs[2];
              $returnWords['hilight'][] = $regs[2];
          } else {
              $returnWords['-s'][] = $regs[2];
          }
          $a = str_replace($regs[0], "", $a);
      }

      $a = strtolower(preg_replace("/[ ]+/", " ", $a));
      $a = trim($a);
      $words = ($a == "") ? array() : explode(' ', $a);

      // Classify the remaining single words (both include and exclude).
      $includeWords = array();
      foreach ($words as $token) {
          $prefix = substr($token, 0, 1);
          if ($prefix == '+') {
              $term = (string) substr($token, 1);
          } else if ($prefix == '-') {
              $excluded = (string) substr($token, 1);
              // A bare "-" carries no word; do not register an empty exclusion.
              if ($excluded !== '') {
                  $returnWords['-'][] = $excluded;
              }
              continue;
          } else {
              $term = $token;
          }
          $includeWords[] = $term;
          if (!ignoreWord($term)) {
              $returnWords['hilight'][] = $term;
              if ($stem_words == 1) {
                  $returnWords['hilight'][] = stem($term);
              }
          }
      }

      // Add the individual words of each required phrase to the includes.
      if (isset($returnWords['+s'])) {
          foreach ($returnWords['+s'] as $phrase) {
              $phrase = strtolower(preg_replace("/[ ]+/", " ", $phrase));
              $phrase = trim($phrase);
              foreach (explode(' ', $phrase) as $w) {
                  $includeWords[] = $w;
              }
          }
      }

      foreach ($includeWords as $word) {
          if ($word == '') {
              continue;
          }
          if (ignoreWord($word)) {
              $returnWords['ignore'][] = $word;
          } else {
              $returnWords['+'][] = $word;
          }
      }
      return $returnWords;
  }
  /**
   * Tells whether a word should be left out of a search.
   *
   * A word is ignored when it is shorter than the global $min_word_length,
   * when remove_accents($word) contains no letter (or no letter/digit when
   * the global $index_numbers is 1), or when the global $common map marks
   * it with 1.
   *
   * @param string $word Word to test.
   * @return int 1 when the word is to be ignored, 0 otherwise.
   */
  function ignoreword($word) {
      global $common;
      global $min_word_length;
      global $index_numbers;

      $pattern = ($index_numbers == 1) ? "[a-z0-9]+" : "[a-z]+";

      if (strlen($word) < $min_word_length) {
          return 1;
      }
      if (!preg_match("/".$pattern."/i", remove_accents($word))) {
          return 1;
      }
      // isset() avoids an undefined-index notice for words that are not in
      // the common-word map.
      if (isset($common[$word]) && $common[$word] == 1) {
          return 1;
      }
      return 0;
  }
  /**
   * Runs a parsed query against the index tables and returns one page of hits.
   *
   * @param array  $searchstr Term lists as produced by makeboollist()
   *                          ('+', '-', '+s' are read here).
   * @param int    $category  Category id; values above 0 restrict the hits to
   *                          the ids returned by get_cats($category).
   * @param int    $start     1-based page number.
   * @param int    $per_page  Number of hits per page.
   * @param string $type      "or" merges the hits of all words; any other
   *                          value requires every word to match.
   * @param string $domain    Domain name; when found in the domains table the
   *                          hits are restricted to it.
   * @return array|null null when there are no include words or no hits.
   *                    array('did_you_mean' => ...) when nothing was found
   *                    and $did_you_mean_enabled is 1. Otherwise an array
   *                    holding 'results' (total hit count) and, when the
   *                    requested page is not empty, 'maxweight' plus one
   *                    numerically indexed row per hit with the keys 'title',
   *                    'url', 'fulltxt', 'size', 'weight' and 'domain'.
   */
  function search($searchstr, $category, $start, $per_page, $type, $domain) {
      global $length_of_link_desc, $mysql_table_prefix, $show_meta_description, $merge_site_results, $stem_words, $did_you_mean_enabled;

      $possible_to_find = 1;

      // $domain arrives from the caller, so escape it before it is placed in
      // the SQL text (same escaping the rest of this function uses).
      $domain_qry = "";
      $result = mysql_query("select domain_id from ".$mysql_table_prefix."domains where domain = '".addslashes($domain)."'");
      if ($result && mysql_num_rows($result) > 0) {
          $thisrow = mysql_fetch_array($result);
          $domain_qry = "and domain = ".$thisrow[0];
      }

      if (!isset($searchstr['+']) || count($searchstr['+']) == 0) {
          return null;
      }

      // Collect, per excluded word, the links that contain it.
      $notlist = array();
      $not_words = 0;
      $excluded_words = isset($searchstr['-']) ? $searchstr['-'] : array();
      foreach ($excluded_words as $excluded) {
          if ($stem_words == 1) {
              $searchword = addslashes(stem($excluded));
          } else {
              $searchword = addslashes($excluded);
          }
          $wordmd5 = substr(md5($searchword), 0, 1);
          $query1 = "SELECT link_id from ".$mysql_table_prefix."link_keyword$wordmd5, ".$mysql_table_prefix."keywords where ".$mysql_table_prefix."link_keyword$wordmd5.keyword_id= ".$mysql_table_prefix."keywords.keyword_id and keyword='$searchword'";
          $result = mysql_query($query1);
          while ($result && ($row = mysql_fetch_row($result))) {
              $notlist[$not_words]['id'][$row[0]] = 1;
          }
          $not_words++;
      }

      // Collect, per required phrase, the links whose text contains it.
      $phraselist = array();
      $phrase_words = 0;
      $phrases = isset($searchstr['+s']) ? $searchstr['+s'] : array();
      foreach ($phrases as $phrase) {
          $searchword = addslashes($phrase);
          $query1 = "SELECT link_id from ".$mysql_table_prefix."links where fulltxt like '% $searchword%'";
          $result = mysql_query($query1);
          // Report the error of the query that just ran, not the previous one.
          echo mysql_error();
          $num_rows = $result ? mysql_num_rows($result) : 0;
          if ($num_rows == 0) {
              $possible_to_find = 0;
              break;
          }
          while ($row = mysql_fetch_row($result)) {
              $phraselist[$phrase_words]['id'][$row[0]] = 1;
          }
          $phrase_words++;
      }

      // Collect the links that belong to the requested category.
      $category_list = array();
      if (($category > 0) && $possible_to_find == 1) {
          $allcats = get_cats($category);
          $catlist = implode(",", $allcats);
          $query1 = "select link_id from ".$mysql_table_prefix."links, ".$mysql_table_prefix."sites, ".$mysql_table_prefix."categories, ".$mysql_table_prefix."site_category where ".$mysql_table_prefix."links.site_id = ".$mysql_table_prefix."sites.site_id and ".$mysql_table_prefix."sites.site_id = ".$mysql_table_prefix."site_category.site_id and ".$mysql_table_prefix."site_category.category_id in ($catlist)";
          $result = mysql_query($query1);
          echo mysql_error();
          $num_rows = $result ? mysql_num_rows($result) : 0;
          if ($num_rows == 0) {
              $possible_to_find = 0;
          }
          while ($result && ($row = mysql_fetch_row($result))) {
              $category_list[$row[0]] = 1;
          }
      }

      // Collect, per include word, the links that contain it and their weight.
      // In "or" mode every word feeds list 0; otherwise each word gets its own.
      $linklist = array();
      $domains = array();
      $wordarray = $searchstr['+'];
      $words = 0;
      while (($words < count($wordarray)) && $possible_to_find == 1) {
          if ($stem_words == 1) {
              $searchword = addslashes(stem($wordarray[$words]));
          } else {
              $searchword = addslashes($wordarray[$words]);
          }
          $wordmd5 = substr(md5($searchword), 0, 1);
          $query1 = "SELECT distinct link_id, weight, domain from ".$mysql_table_prefix."link_keyword$wordmd5, ".$mysql_table_prefix."keywords where ".$mysql_table_prefix."link_keyword$wordmd5.keyword_id= ".$mysql_table_prefix."keywords.keyword_id and keyword='$searchword' $domain_qry order by weight desc";
          $result = mysql_query($query1);
          echo mysql_error();
          $num_rows = $result ? mysql_num_rows($result) : 0;
          if ($num_rows == 0 && $type != "or") {
              $possible_to_find = 0;
              break;
          }
          $indx = ($type == "or") ? 0 : $words;
          while ($result && ($row = mysql_fetch_row($result))) {
              $linklist[$indx]['id'][] = $row[0];
              $domains[$row[0]] = $row[2];
              $linklist[$indx]['weight'][$row[0]] = $row[1];
          }
          $words++;
      }
      if ($type == "or") {
          $words = 1;
      }

      $result_array_full = array();
      if ($possible_to_find != 0) {
          if ($words == 1 && $not_words == 0 && $category < 1) {
              // Only one word list and nothing to filter: it is the result.
              // NOTE(review): this shortcut does not consult $phraselist --
              // confirm that is intended for single-word phrases.
              if (isset($linklist[0]['weight'])) {
                  $result_array_full = $linklist[0]['weight'];
              }
          } else {
              // Otherwise intersect the lists, walking the shortest one.
              $min = 0;
              for ($j = 1; $j < $words; $j++) {
                  if (count($linklist[$min]['id']) > count($linklist[$j]['id'])) {
                      $min = $j;
                  }
              }
              $candidates = isset($linklist[$min]['id']) ? $linklist[$min]['id'] : array();
              foreach ($candidates as $link_id) {
                  $weight = 1;
                  $keep = true;
                  // Every include word must list the link with a positive weight.
                  for ($k = 0; $k < $words && $keep; $k++) {
                      if (isset($linklist[$k]['weight'][$link_id]) && $linklist[$k]['weight'][$link_id] > 0) {
                          $weight = $weight + $linklist[$k]['weight'][$link_id];
                      } else {
                          $keep = false;
                      }
                  }
                  // No excluded word may list the link.
                  for ($n = 0; $n < $not_words && $keep; $n++) {
                      if (isset($notlist[$n]['id'][$link_id])) {
                          $keep = false;
                      }
                  }
                  // Every phrase must list the link. The index is the phrase
                  // counter $o; the earlier code used the not-word counter here.
                  for ($o = 0; $o < $phrase_words && $keep; $o++) {
                      if (!isset($phraselist[$o]['id'][$link_id])) {
                          $keep = false;
                      }
                  }
                  if ($keep && $category > 0 && !isset($category_list[$link_id])) {
                      $keep = false;
                  }
                  if ($keep) {
                      $result_array_full[$link_id] = $weight;
                  }
              }
          }
      }

      // Nothing found: offer similar-sounding keywords within edit distance 3.
      if ((count($result_array_full) == 0 || $possible_to_find == 0) && $did_you_mean_enabled == 1) {
          // Stays null when no suggestion is found, so callers testing
          // isset($res['did_you_mean']) see "no suggestion".
          $near_words = null;
          foreach ($searchstr['+'] as $word) {
              $word = addslashes($word);
              $result = mysql_query("select keyword from ".$mysql_table_prefix."keywords where soundex(keyword) = soundex('$word')");
              $max_distance = 100;
              $near_word = "";
              while ($result && ($row = mysql_fetch_row($result))) {
                  $distance = levenshtein($row[0], $word);
                  if ($distance < $max_distance && $distance < 4) {
                      $max_distance = $distance;
                      $near_word = $row[0];
                  }
              }
              if ($near_word != "" && $word != $near_word) {
                  $near_words[$word] = $near_word;
              }
          }
          $res = array();
          $res['did_you_mean'] = $near_words;
          return $res;
      }
      if (count($result_array_full) == 0) {
          return null;
      }

      arsort($result_array_full);

      // When merging, keep the best hit of every domain and remember the
      // second-best one so it can be listed right after it.
      $domains_to_show = array();
      if ($merge_site_results == 1 && $domain_qry == "") {
          $result_array_temp = array();
          foreach ($result_array_full as $key => $value) {
              $dom = $domains[$key];
              if (!isset($domains_to_show[$dom])) {
                  $result_array_temp[$key] = $value;
                  $domains_to_show[$dom] = 1;
              } else if ($domains_to_show[$dom] === 1) {
                  $domains_to_show[$dom] = array($key => $value);
              }
          }
      } else {
          $result_array_temp = $result_array_full;
      }

      $result_array = array();
      foreach ($result_array_temp as $key => $value) {
          $result_array[$key] = $value;
          $dom = $domains[$key];
          if (isset($domains_to_show[$dom]) && is_array($domains_to_show[$dom])) {
              foreach ($domains_to_show[$dom] as $k => $v) {
                  $result_array[$k] = $v;
              }
          }
      }

      // Cut out the requested page.
      $results = count($result_array);
      $keys = array_keys($result_array);
      $maxweight = $result_array[$keys[0]];
      $first = max(0, ($start - 1) * $per_page);
      $last = min($results, $first + $per_page);
      $in = array();
      for ($i = $first; $i < $last; $i++) {
          $in[] = $keys[$i];
      }
      $res = array();
      if (count($in) == 0) {
          $res['results'] = $results;
          return $res;
      }

      $inlist = implode(",", $in);
      if ($length_of_link_desc == 0) {
          $fulltxt = "fulltxt";
      } else {
          $fulltxt = "substring(fulltxt, 1, $length_of_link_desc)";
      }
      $query1 = "SELECT distinct link_id, url, title, description, $fulltxt, size FROM ".$mysql_table_prefix."links WHERE link_id in ($inlist)";
      $result = mysql_query($query1);
      echo mysql_error();

      $i = 0;
      while ($result && ($row = mysql_fetch_row($result))) {
          $res[$i]['title'] = $row[2];
          $res[$i]['url'] = $row[1];
          if ($row[3] != null && $show_meta_description == 1) {
              $res[$i]['fulltxt'] = $row[3];
          } else {
              $res[$i]['fulltxt'] = $row[4];
          }
          $res[$i]['size'] = $row[5];
          $res[$i]['weight'] = $result_array[$row[0]];
          $dom_result = mysql_query("select domain from ".$mysql_table_prefix."domains where domain_id='".$domains[$row[0]]."'");
          $dom_row = $dom_result ? mysql_fetch_row($dom_result) : false;
          $res[$i]['domain'] = $dom_row ? $dom_row[0] : null;
          $i++;
      }

      // The rows are sorted before the summary keys are added, so the sort
      // helpers only ever see numerically indexed rows.
      if ($merge_site_results && $domain_qry == "") {
          sort_with_domains($res);
      } else {
          usort($res, "cmp");
      }
      echo mysql_error();
      $res['maxweight'] = $maxweight;
      $res['results'] = $results;
      return $res;
  }
  /**
   * Wraps every occurrence of the given terms in <b>...</b>.
   *
   * All terms are matched case-insensitively in a single pass, longest term
   * first, so markup inserted for one term is never re-scanned for another.
   * Terms are quoted with preg_quote(), so they are matched literally.
   *
   * @param string $text  Text to mark up.
   * @param array  $terms Terms to emphasise; empty terms are skipped.
   * @return string The marked-up text ($text unchanged when nothing applies).
   */
  function sph_highlight_terms($text, $terms) {
      $alternatives = array();
      foreach ($terms as $term) {
          $term = (string) $term;
          if ($term !== '') {
              $alternatives[] = preg_quote($term, '/');
          }
      }
      if (count($alternatives) == 0) {
          return $text;
      }
      $alternatives = array_values(array_unique($alternatives));
      // Longest first: PCRE takes the first alternative that matches.
      $lengths = array_map('strlen', $alternatives);
      array_multisort($lengths, SORT_DESC, $alternatives);
      $marked = preg_replace('/('.implode('|', $alternatives).')/i', '<b>$1</b>', $text);
      return ($marked === null) ? $text : $marked;
  }

  /**
   * Cuts an excerpt of at most $desc_length bytes around the search hits.
   *
   * The window starts up to 30 bytes before the hit that has the most other
   * hits within $desc_length bytes after it; partial words at either edge are
   * trimmed at the nearest space.
   *
   * @param string $fulltxt     Full text of the page.
   * @param array  $terms       Terms whose positions steer the window.
   * @param int    $desc_length Maximum excerpt length in bytes.
   * @return string The excerpt.
   */
  function sph_build_excerpt($fulltxt, $terms, $desc_length) {
      $haystack = strtolower($fulltxt);
      $places = array();
      foreach ($terms as $term) {
          $needle = strtolower((string) $term);
          $needle_len = strlen($needle);
          if ($needle_len == 0) {
              continue;
          }
          $offset = 0;
          // strpos() yields false on a miss; 0 is a valid hit at the start.
          while (($hit = strpos($haystack, $needle, $offset)) !== false) {
              $places[] = $hit;
              $offset = $hit + $needle_len;
          }
      }
      sort($places);

      $hits = count($places);
      $text_len = strlen($fulltxt);
      $x = 0;
      $begin = 0;
      foreach ($places as $id => $place) {
          // Bounds are checked before $places[$id + $x] is read.
          while ($id + $x < $hits && $places[$id + $x] - $place < $desc_length && $place < $text_len - $desc_length) {
              $x++;
              $begin = $id;
          }
      }

      $anchor = ($hits > 0) ? $places[$begin] : 0;
      $window_start = max(0, $anchor - 30);
      $excerpt = (string) substr($fulltxt, $window_start, $desc_length);

      // Drop a leading partial word only when the window starts mid-text.
      if ($window_start > 0) {
          $first_space = strpos($excerpt, " ");
          if ($first_space !== false) {
              $excerpt = (string) substr($excerpt, $first_space);
          }
      }
      // Drop a trailing partial word, but never blank a text without spaces.
      $last_space = strrpos($excerpt, " ");
      if ($last_space !== false && $last_space > 0) {
          $excerpt = substr($excerpt, 0, $last_space);
      }
      return $excerpt;
  }

  /**
   * Runs a search and assembles everything needed to render a result page.
   *
   * Parses $query with makeboollist(), runs search(), logs first-page
   * queries through saveToLog() and builds per-hit display data (excerpt,
   * highlighted title/text/url, percentage weight) plus paging information.
   *
   * Note: when $results is not "", it overwrites the global $results_per_page.
   *
   * @param string $query      Raw query string.
   * @param int    $start      1-based page number (0 is treated as 1).
   * @param int    $category   Category id passed on to search().
   * @param string $searchtype "phrase" wraps the whole query in quotes; the
   *                           value is also passed on to search().
   * @param mixed  $results    Hits per page, or "" to keep the configured one.
   * @param string $domain     Domain restriction passed on to search().
   * @return array Keys: 'ignore_words', 'ent_query', 'time', 'did_you_mean',
   *               'did_you_mean_b', 'num_of_results', 'from', 'to',
   *               'total_results', 'qry_results' (when there are hits),
   *               'pages', 'prev', 'next', 'start', 'query' and
   *               'other_pages' (when the page range is not empty).
   */
  function get_search_results($query, $start, $category, $searchtype, $results, $domain) {
      global $sph_messages, $results_per_page,
          $links_to_next,
          $show_query_scores,
          $mysql_table_prefix,
          $desc_length;

      if ($results != "") {
          $results_per_page = $results;
      }
      if ($searchtype == "phrase") {
          $query = str_replace('"', '', $query);
          $query = "\"".$query."\"";
      }
      $starttime = getmicrotime();
      // A single unbalanced quote cannot delimit a phrase, so drop it.
      if (substr_count($query, '"') == 1) {
          $query = str_replace('"', '', $query);
      }

      $words = makeboollist($query);
      $hilight = (isset($words['hilight']) && is_array($words['hilight'])) ? $words['hilight'] : array();

      $full_result = array();
      $full_result['ignore_words'] = isset($words['ignore']) ? $words['ignore'] : null;
      if ($start == 0) {
          $start = 1;
      }
      $result = search($words, $category, $start, $results_per_page, $searchtype, $domain);
      $query = stripslashes($query);
      $entitiesQuery = htmlspecialchars($query);
      $full_result['ent_query'] = $entitiesQuery;
      $endtime = getmicrotime() - $starttime;
      // search() returns null or an array without 'results' when nothing
      // was found; treat both as zero hits.
      $rows = (is_array($result) && isset($result['results'])) ? $result['results'] : 0;
      $time = round($endtime * 100) / 100;
      $full_result['time'] = $time;

      // Both suggestion strings default to "" so they are always defined.
      $did_you_mean = "";
      $did_you_mean_b = "";
      if (is_array($result) && isset($result['did_you_mean'])) {
          $did_you_mean_b = $entitiesQuery;
          $did_you_mean = $entitiesQuery;
          foreach ($result['did_you_mean'] as $key => $val) {
              if ($key != $val) {
                  $did_you_mean_b = str_replace($key, "<b>$val</b>", $did_you_mean_b);
                  $did_you_mean = str_replace($key, "$val", $did_you_mean);
              }
          }
      }
      $full_result['did_you_mean'] = $did_you_mean;
      $full_result['did_you_mean_b'] = $did_you_mean_b;

      // A full search() result holds the hit rows plus the 'maxweight' and
      // 'results' entries, hence the -2; never report a negative count.
      $num_of_results = is_array($result) ? max(0, count($result) - 2) : 0;
      $full_result['num_of_results'] = $num_of_results;

      if ($start < 2) {
          saveToLog(addslashes($query), $time, $rows);
      }
      $from = ($start - 1) * $results_per_page + 1;
      $to = min(($start) * $results_per_page, $rows);
      $full_result['from'] = $from;
      $full_result['to'] = $to;
      $full_result['total_results'] = $rows;

      if ($rows > 0) {
          $maxweight = isset($result['maxweight']) ? $result['maxweight'] : 0;
          $i = 0;
          while ($i < $num_of_results && $i < $results_per_page) {
              $title = $result[$i]['title'];
              $url = $result[$i]['url'];
              $fulltxt = $result[$i]['fulltxt'];
              $page_size = $result[$i]['size'];
              $domain_name = $result[$i]['domain'];
              if ($page_size != "") {
                  $page_size = number_format($page_size, 1)."kb";
              }
              if (strlen($fulltxt) > $desc_length) {
                  $fulltxt = sph_build_excerpt($fulltxt, $hilight, $desc_length);
              }
              // Express the weight as a percentage of the best hit.
              $weight = ($maxweight > 0)
                  ? number_format($result[$i]['weight'] / $maxweight * 100, 2)
                  : number_format(0, 2);
              if ($title == '') {
                  $title = $sph_messages["Untitled"];
              }
              if (strlen($title) > 80) {
                  $title = substr($title, 0, 76)."...";
              }
              $title = sph_highlight_terms($title, $hilight);
              $fulltxt = sph_highlight_terms($fulltxt, $hilight);
              // url2 is always derived from this hit's own url.
              $url2 = sph_highlight_terms($url, $hilight);

              $num = $from + $i;
              $full_result['qry_results'][$i]['num'] = $num;
              $full_result['qry_results'][$i]['weight'] = $weight;
              $full_result['qry_results'][$i]['url'] = $url;
              $full_result['qry_results'][$i]['title'] = $title;
              $full_result['qry_results'][$i]['fulltxt'] = $fulltxt;
              $full_result['qry_results'][$i]['url2'] = $url2;
              $full_result['qry_results'][$i]['page_size'] = $page_size;
              $full_result['qry_results'][$i]['domain_name'] = $domain_name;
              $i++;
          }
      }

      $pages = ($results_per_page > 0) ? ceil($rows / $results_per_page) : 0;
      $full_result['pages'] = $pages;
      $prev = $start - 1;
      $full_result['prev'] = $prev;
      $next = $start + 1;
      $full_result['next'] = $next;
      $full_result['start'] = $start;
      $full_result['query'] = $entitiesQuery;

      // Page links: up to $links_to_next pages either side of the current one.
      if ($from <= $to) {
          $firstpage = $start - $links_to_next;
          if ($firstpage < 1) {
              $firstpage = 1;
          }
          $lastpage = $start + $links_to_next;
          if ($lastpage > $pages) {
              $lastpage = $pages;
          }
          for ($x = $firstpage; $x <= $lastpage; $x++) {
              $full_result['other_pages'][] = $x;
          }
      }
      return $full_result;
  }
  527. ?>