PageRenderTime 60ms CodeModel.GetById 36ms RepoModel.GetById 1ms app.codeStats 0ms

/cms/modules/search/admin/spiderfuncs.php

https://github.com/swat/pragyan
PHP | 831 lines | 634 code | 45 blank | 152 comment | 105 complexity | 24925ab152dcdb25e07cead02386aac6 MD5 | raw file
  1. <?php
  2. function getFileContents($url) {
  3. global $user_agent;
  4. $urlparts = parse_url($url);
  5. $path = $urlparts['path'];
  6. $host = $urlparts['host'];
  7. if ($urlparts['query'] != "")
  8. $path .= "?".$urlparts['query'];
  9. if (isset ($urlparts['port'])) {
  10. $port = (int) $urlparts['port'];
  11. } else
  12. if ($urlparts['scheme'] == "http") {
  13. $port = 80;
  14. } else
  15. if ($urlparts['scheme'] == "https") {
  16. $port = 443;
  17. }
  18. if ($port == 80) {
  19. $portq = "";
  20. } else {
  21. $portq = ":$port";
  22. }
  23. $all = "*/*";
  24. $request = "GET $path HTTP/1.0\r\nHost: $host$portq\r\nAccept: $all\r\nUser-Agent: $user_agent\r\n\r\n";
  25. $fsocket_timeout = 30;
  26. if (substr($url, 0, 5) == "https") {
  27. $target = "ssl://".$host;
  28. } else {
  29. $target = $host;
  30. }
  31. $errno = 0;
  32. $errstr = "";
  33. $fp = @ fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
  34. print $errstr;
  35. if (!$fp) {
  36. $contents['state'] = "NOHOST";
  37. printConnectErrorReport($errstr);
  38. return $contents;
  39. } else {
  40. if (!fputs($fp, $request)) {
  41. $contents['state'] = "Cannot send request";
  42. return $contents;
  43. }
  44. $data = null;
  45. socket_set_timeout($fp, $fsocket_timeout);
  46. do{
  47. $status = socket_get_status($fp);
  48. $data .= fgets($fp, 8192);
  49. } while (!feof($fp) && !$status['timed_out']) ;
  50. fclose($fp);
  51. if ($status['timed_out'] == 1) {
  52. $contents['state'] = "timeout";
  53. } else
  54. $contents['state'] = "ok";
  55. $contents['file'] = substr($data, strpos($data, "\r\n\r\n") + 4);
  56. }
  57. return $contents;
  58. }
  59. /*
  60. check if file is available and in readable form
  61. */
  62. function url_status($url) {
  63. global $user_agent, $index_pdf, $index_doc, $index_xls, $index_ppt;
  64. $urlparts = parse_url($url);
  65. $path = $urlparts['path'];
  66. $host = $urlparts['host'];
  67. if (isset($urlparts['query']))
  68. $path .= "?".$urlparts['query'];
  69. if (isset ($urlparts['port'])) {
  70. $port = (int) $urlparts['port'];
  71. } else
  72. if ($urlparts['scheme'] == "http") {
  73. $port = 80;
  74. } else
  75. if ($urlparts['scheme'] == "https") {
  76. $port = 443;
  77. }
  78. if ($port == 80) {
  79. $portq = "";
  80. } else {
  81. $portq = ":$port";
  82. }
  83. $all = "*/*"; //just to prevent "comment effect" in get accept
  84. $request = "HEAD $path HTTP/1.1\r\nHost: $host$portq\r\nAccept: $all\r\nUser-Agent: $user_agent\r\n\r\n";
  85. if (substr($url, 0, 5) == "https") {
  86. $target = "ssl://".$host;
  87. } else {
  88. $target = $host;
  89. }
  90. $fsocket_timeout = 30;
  91. $errno = 0;
  92. $errstr = "";
  93. $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
  94. $linkstate = "ok";
  95. if (!$fp) {
  96. $status['state'] = "NOHOST";
  97. } else {
  98. socket_set_timeout($fp, 30);
  99. fputs($fp, $request);
  100. $answer = fgets($fp, 4096);
  101. $regs = Array ();
  102. if (preg_match("/HTTP/[0-9.]+ (([0-9])[0-9]{2})/", $answer, $regs)) {
  103. $httpcode = $regs[2];
  104. $full_httpcode = $regs[1];
  105. if ($httpcode <> 2 && $httpcode <> 3) {
  106. $status['state'] = "Unreachable: http $full_httpcode";
  107. $linkstate = "Unreachable";
  108. }
  109. }
  110. if ($linkstate <> "Unreachable") {
  111. while ($answer) {
  112. $answer = fgets($fp, 4096);
  113. if (preg_match("/Location: *([^\n\r ]+)/", $answer, $regs) && $httpcode == 3 && $full_httpcode != 302) {
  114. $status['path'] = $regs[1];
  115. $status['state'] = "Relocation: http $full_httpcode";
  116. fclose($fp);
  117. return $status;
  118. }
  119. if (preg_match("/Last-Modified: *([a-z0-9,: ]+)/i", $answer, $regs)) {
  120. $status['date'] = $regs[1];
  121. }
  122. if (preg_match("/Content-Type:/i", $answer)) {
  123. $content = $answer;
  124. $answer = '';
  125. break;
  126. }
  127. }
  128. $socket_status = socket_get_status($fp);
  129. if (preg_match("/Content-Type: *([a-z\/.-]*)/i", $content, $regs)) {
  130. if ($regs[1] == 'text/html' || $regs[1] == 'text/' || $regs[1] == 'text/plain') {
  131. $status['content'] = 'text';
  132. $status['state'] = 'ok';
  133. } else if ($regs[1] == 'application/pdf' && $index_pdf == 1) {
  134. $status['content'] = 'pdf';
  135. $status['state'] = 'ok';
  136. } else if (($regs[1] == 'application/msword' || $regs[1] == 'application/vnd.ms-word') && $index_doc == 1) {
  137. $status['content'] = 'doc';
  138. $status['state'] = 'ok';
  139. } else if (($regs[1] == 'application/excel' || $regs[1] == 'application/vnd.ms-excel') && $index_xls == 1) {
  140. $status['content'] = 'xls';
  141. $status['state'] = 'ok';
  142. } else if (($regs[1] == 'application/mspowerpoint' || $regs[1] == 'application/vnd.ms-powerpoint') && $index_ppt == 1) {
  143. $status['content'] = 'ppt';
  144. $status['state'] = 'ok';
  145. } else {
  146. $status['state'] = "Not text or html";
  147. }
  148. } else
  149. if ($socket_status['timed_out'] == 1) {
  150. $status['state'] = "Timed out (no reply from server)";
  151. } else
  152. $status['state'] = "Not text or html";
  153. }
  154. }
  155. fclose($fp);
  156. return $status;
  157. }
  158. /*
  159. Read robots.txt file in the server, to find any disallowed files/folders
  160. */
  161. function check_robot_txt($url) {
  162. global $user_agent;
  163. $urlparts = parse_url($url);
  164. $url = 'http://'.$urlparts['host']."/robots.txt";
  165. $url_status = url_status($url);
  166. $omit = array ();
  167. if ($url_status['state'] == "ok") {
  168. $robot = file($url);
  169. if (!$robot) {
  170. $contents = getFileContents($url);
  171. $file = $contents['file'];
  172. $robot = explode("\n", $file);
  173. }
  174. $regs = Array ();
  175. $this_agent= "";
  176. while (list ($id, $line) = each($robot)) {
  177. if (preg_match("/^user-agent: *([^#]+) */", $line, $regs)) {
  178. $this_agent = trim($regs[1]);
  179. if ($this_agent == '*' || $this_agent == $user_agent)
  180. $check = 1;
  181. else
  182. $check = 0;
  183. }
  184. if (preg_match("/disallow: *([^#]+)/", $line, $regs) && $check == 1) {
  185. $disallow_str = preg_replace("/[\n ]+/i", "", $regs[1]);
  186. if (trim($disallow_str) != "") {
  187. $omit[] = $disallow_str;
  188. } else {
  189. if ($this_agent == '*' || $this_agent == $user_agent) {
  190. return null;
  191. }
  192. }
  193. }
  194. }
  195. }
  196. return $omit;
  197. }
  198. /*
  199. Remove the file part from an url (to build an url from an url and given relative path)
  200. */
  201. function remove_file_from_url($url) {
  202. $url_parts = parse_url($url);
  203. $path = $url_parts['path'];
  204. $regs = Array ();
  205. if (preg_match('/([^\/]+)$/i', $path, $regs)) {
  206. $file = $regs[1];
  207. $check = $file.'$';
  208. $path = preg_replace("/$check"."/i", "", $path);
  209. }
  210. if ($url_parts['port'] == 80 || $url_parts['port'] == "") {
  211. $portq = "";
  212. } else {
  213. $portq = ":".$url_parts['port'];
  214. }
  215. $url = $url_parts['scheme']."://".$url_parts['host'].$portq.$path;
  216. return $url;
  217. }
  218. /*
  219. Extract links from html
  220. */
  221. function get_links($file, $url, $can_leave_domain, $base) {
  222. $chunklist = array ();
  223. // The base URL comes from either the meta tag or the current URL.
  224. if (!empty($base)) {
  225. $url = $base;
  226. }
  227. $links = array ();
  228. $regs = Array ();
  229. $checked_urls = Array();
  230. preg_match_all("/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i", $file, $regs, PREG_SET_ORDER);
  231. foreach ($regs as $val) {
  232. if ($checked_urls[$val[1]]!=1 && !isset ($val[4])) { //if nofollow is not set
  233. if (($a = url_purify($val[1], $url, $can_leave_domain)) != '') {
  234. $links[] = $a;
  235. }
  236. $checked_urls[$val[1]] = 1;
  237. }
  238. }
  239. preg_match_all("/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
  240. foreach ($regs as $val) {
  241. if ($checked_urls[$val[1]]!=1 && !isset ($val[4])) { //if nofollow is not set
  242. if (($a = url_purify($val[1], $url, $can_leave_domain)) != '') {
  243. $links[] = $a;
  244. }
  245. $checked_urls[$val[1]] = 1;
  246. }
  247. }
  248. preg_match_all("/(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
  249. foreach ($regs as $val) {
  250. if ($checked_urls[$val[1]]!=1 && !isset ($val[4])) { //if nofollow is not set
  251. if (($a = url_purify($val[1], $url, $can_leave_domain)) != '') {
  252. $links[] = $a;
  253. }
  254. $checked_urls[$val[1]] = 1;
  255. }
  256. }
  257. preg_match_all("/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
  258. foreach ($regs as $val) {
  259. if ($checked_urls[$val[1]]!=1 && !isset ($val[4])) { //if nofollow is not set
  260. if (($a = url_purify($val[1], $url, $can_leave_domain)) != '') {
  261. $links[] = $a;
  262. }
  263. $checked_urls[$val[1]] = 1;
  264. }
  265. }
  266. preg_match_all("/(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
  267. foreach ($regs as $val) {
  268. if ($checked_urls[$val[1]]!=1 && !isset ($val[4])) { //if nofollow is not set
  269. if (($a = url_purify($val[1], $url, $can_leave_domain)) != '') {
  270. $links[] = $a;
  271. }
  272. $checked_urls[$val[1]] = 1;
  273. }
  274. }
  275. return $links;
  276. }
  277. /*
  278. Function to build a unique word array from the text of a webpage, together with the count of each word
  279. */
  280. function unique_array($arr) {
  281. global $min_word_length;
  282. global $common;
  283. global $word_upper_bound;
  284. global $index_numbers, $stem_words;
  285. if ($stem_words == 1) {
  286. $newarr = Array();
  287. foreach ($arr as $val) {
  288. $newarr[] = stem($val);
  289. }
  290. $arr = $newarr;
  291. }
  292. sort($arr);
  293. reset($arr);
  294. $newarr = array ();
  295. $i = 0;
  296. $counter = 1;
  297. $element = current($arr);
  298. if ($index_numbers == 1) {
  299. $pattern = "/[a-z0-9]+/";
  300. } else {
  301. $pattern = "/[a-z]+/";
  302. }
  303. $regs = Array ();
  304. for ($n = 0; $n < sizeof($arr); $n ++) {
  305. //check if word is long enough, contains alphabetic characters and is not a common word
  306. //to eliminate/count multiple instance of words
  307. $next_in_arr = next($arr);
  308. if ($next_in_arr != $element) {
  309. if (strlen($element) >= $min_word_length && preg_match($pattern, remove_accents($element)) && (@ $common[$element] <> 1)) {
  310. if (preg_match("/^(-|\\\')(.*)/", $element, $regs))
  311. $element = $regs[2];
  312. if (preg_match("/(.*)(\\\'|-)$/", $element, $regs))
  313. $element = $regs[1];
  314. $newarr[$i][1] = $element;
  315. $newarr[$i][2] = $counter;
  316. $element = current($arr);
  317. $i ++;
  318. $counter = 1;
  319. } else {
  320. $element = $next_in_arr;
  321. }
  322. } else {
  323. if ($counter < $word_upper_bound)
  324. $counter ++;
  325. }
  326. }
  327. return $newarr;
  328. }
  329. /*
  330. Checks if url is legal, relative to the main url.
  331. */
  332. function url_purify($url, $parent_url, $can_leave_domain) {
  333. global $ext, $mainurl, $apache_indexes, $strip_sessids;
  334. $urlparts = parse_url($url);
  335. $main_url_parts = parse_url($mainurl);
  336. if ($urlparts['host'] != "" && $urlparts['host'] != $main_url_parts['host'] && $can_leave_domain != 1) {
  337. return '';
  338. }
  339. reset($ext);
  340. while (list ($id, $excl) = each($ext))
  341. if (preg_match("/\.$excl$/i", $url))
  342. return '';
  343. if (substr($url, -1) == '\\') {
  344. return '';
  345. }
  346. if (isset($urlparts['query'])) {
  347. if ($apache_indexes[$urlparts['query']]) {
  348. return '';
  349. }
  350. }
  351. if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
  352. return '';
  353. }
  354. if (isset($urlparts['scheme'])) {
  355. $scheme = $urlparts['scheme'];
  356. } else {
  357. $scheme ="";
  358. }
  359. //only http and https links are followed
  360. if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
  361. return '';
  362. }
  363. //parent url might be used to build an url from relative path
  364. $parent_url = remove_file_from_url($parent_url);
  365. $parent_url_parts = parse_url($parent_url);
  366. if (substr($url, 0, 1) == '/') {
  367. $url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$url;
  368. } else
  369. if (!isset($urlparts['scheme'])) {
  370. $url = $parent_url.$url;
  371. }
  372. $url_parts = parse_url($url);
  373. $urlpath = $url_parts['path'];
  374. $regs = Array ();
  375. while (preg_match("/[^\/]*\/[.]{2}\//", $urlpath, $regs)) {
  376. $urlpath = str_replace($regs[0], "", $urlpath);
  377. }
  378. //remove relative path instructions like ../ etc
  379. $urlpath = preg_replace("/\/+/", "/", $urlpath);
  380. $urlpath = preg_replace("/[^\/]*\/[.]{2}/", "", $urlpath);
  381. $urlpath = str_replace("./", "", $urlpath);
  382. $query = "";
  383. if (isset($url_parts['query'])) {
  384. $query = "?".$url_parts['query'];
  385. }
  386. if ($main_url_parts['port'] == 80 || $url_parts['port'] == "") {
  387. $portq = "";
  388. } else {
  389. $portq = ":".$main_url_parts['port'];
  390. }
  391. $url = $url_parts['scheme']."://".$url_parts['host'].$portq.$urlpath.$query;
  392. //if we index sub-domains
  393. if ($can_leave_domain == 1) {
  394. return $url;
  395. }
  396. $mainurl = remove_file_from_url($mainurl);
  397. if ($strip_sessids == 1) {
  398. $url = remove_sessid($url);
  399. }
  400. //only urls in staying in the starting domain/directory are followed
  401. $url = convert_url($url);
  402. if (strstr($url, $mainurl) == false) {
  403. return '';
  404. } else
  405. return $url;
  406. }
  407. function save_keywords($wordarray, $link_id, $domain) {
  408. global $mysql_table_prefix, $all_keywords;
  409. reset($wordarray);
  410. while ($thisword = each($wordarray)) {
  411. $word = $thisword[1][1];
  412. $wordmd5 = substr(md5($word), 0, 1);
  413. $weight = $thisword[1][2];
  414. if (strlen($word)<= 30) {
  415. $keyword_id = $all_keywords[$word];
  416. if ($keyword_id == "") {
  417. mysql_query("insert into ".$mysql_table_prefix."keywords (keyword) values ('$word')");
  418. if (mysql_errno() == 1062) {
  419. $result = mysql_query("select keyword_ID from ".$mysql_table_prefix."keywords where keyword='$word'");
  420. echo mysql_error();
  421. $row = mysql_fetch_row($result);
  422. $keyword_id = $row[0];
  423. } else{
  424. $keyword_id = mysql_insert_id();
  425. $all_keywords[$word] = $keyword_id;
  426. echo mysql_error();
  427. }
  428. }
  429. $inserts[$wordmd5] .= ",($link_id, $keyword_id, $weight, $domain)";
  430. }
  431. }
  432. for ($i=0;$i<=15; $i++) {
  433. $char = dechex($i);
  434. $values= substr($inserts[$char], 1);
  435. if ($values!="") {
  436. $query = "insert into ".$mysql_table_prefix."link_keyword$char (link_id, keyword_id, weight, domain) values $values";
  437. mysql_query($query);
  438. echo mysql_error();
  439. }
  440. }
  441. }
  442. function get_head_data($file) {
  443. $headdata = "";
  444. preg_match("@<head[^>]*>(.*?)<\/head>@si",$file, $regs);
  445. $headdata = $regs[1];
  446. $description = "";
  447. $robots = "";
  448. $keywords = "";
  449. $base = "";
  450. $res = Array ();
  451. if ($headdata != "") {
  452. preg_match("/<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res);
  453. if (isset ($res)) {
  454. $robots = $res[1];
  455. }
  456. preg_match("/<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res);
  457. if (isset ($res)) {
  458. $description = $res[1];
  459. }
  460. preg_match("/<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res);
  461. if (isset ($res)) {
  462. $keywords = $res[1];
  463. }
  464. // e.g. <base href="http://www.consil.co.uk/index.php" />
  465. preg_match("/<base +href *= *[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res);
  466. if (isset ($res)) {
  467. $base = $res[1];
  468. }
  469. $keywords = preg_replace("/[, ]+/", " ", $keywords);
  470. $robots = explode(",", strtolower($robots));
  471. $nofollow = 0;
  472. $noindex = 0;
  473. foreach ($robots as $x) {
  474. if (trim($x) == "noindex") {
  475. $noindex = 1;
  476. }
  477. if (trim($x) == "nofollow") {
  478. $nofollow = 1;
  479. }
  480. }
  481. $data['description'] = addslashes($description);
  482. $data['keywords'] = addslashes($keywords);
  483. $data['nofollow'] = $nofollow;
  484. $data['noindex'] = $noindex;
  485. $data['base'] = $base;
  486. }
  487. return $data;
  488. }
  489. function clean_file($file, $url, $type) {
  490. global $entities, $index_host, $index_meta_keywords;
  491. $urlparts = parse_url($url);
  492. $host = $urlparts['host'];
  493. //remove filename from path
  494. $path = preg_replace('/([^\/]+)$/i', "", $urlparts['path']);
  495. $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
  496. $file = preg_replace("@<!--sphider_noindex-->.*?<!--\/sphider_noindex-->@si", " ",$file);
  497. $file = preg_replace("@<!--.*?-->@si", " ",$file);
  498. $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ",$file);
  499. $headdata = get_head_data($file);
  500. $regs = Array ();
  501. if (preg_match("@<title *>(.*?)<\/title*>@si", $file, $regs)) {
  502. $title = trim($regs[1]);
  503. $file = str_replace($regs[0], "", $file);
  504. } else if ($type == 'pdf' || $type == 'doc') { //the title of a non-html file is its first few words
  505. $title = substr($file, 0, strrpos(substr($file, 0, 40), " "));
  506. }
  507. $file = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $file);
  508. //create spaces between tags, so that removing tags doesnt concatenate strings
  509. $file = preg_replace("/<[\w ]+>/", "\\0 ", $file);
  510. $file = preg_replace("/<\/[\w ]+>/", "\\0 ", $file);
  511. $file = strip_tags($file);
  512. $file = preg_replace("/&nbsp;/", " ", $file);
  513. $fulltext = $file;
  514. $file .= " ".$title;
  515. if ($index_host == 1) {
  516. $file = $file." ".$host." ".$path;
  517. }
  518. if ($index_meta_keywords == 1) {
  519. $file = $file." ".$headdata['keywords'];
  520. }
  521. //replace codes with ascii chars
  522. $file = preg_replace('~&#x([0-9a-f]+);~ei', 'chr(hexdec("\\1"))', $file);
  523. $file = preg_replace('~&#([0-9]+);~e', 'chr("\\1")', $file);
  524. $file = strtolower($file);
  525. reset($entities);
  526. while ($char = each($entities)) {
  527. $file = preg_replace("/".$char[0]."/i", $char[1], $file);
  528. }
  529. $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
  530. $file = preg_replace("/[\*\^\+\?\\\.\[\]\^\$\|\{\)\(\}~!\"\/@#£$%&=`´;><:,]+/", " ", $file);
  531. $file = preg_replace("/\s+/", " ", $file);
  532. $data['fulltext'] = addslashes($fulltext);
  533. $data['content'] = addslashes($file);
  534. $data['title'] = addslashes($title);
  535. $data['description'] = $headdata['description'];
  536. $data['keywords'] = $headdata['keywords'];
  537. $data['host'] = $host;
  538. $data['path'] = $path;
  539. $data['nofollow'] = $headdata['nofollow'];
  540. $data['noindex'] = $headdata['noindex'];
  541. $data['base'] = $headdata['base'];
  542. return $data;
  543. }
  544. function calc_weights($wordarray, $title, $host, $path, $keywords) {
  545. global $index_host, $index_meta_keywords;
  546. $hostarray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($host))));
  547. $patharray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($path))));
  548. $titlearray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($title))));
  549. $keywordsarray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($keywords))));
  550. $path_depth = countSubstrs($path, "/");
  551. while (list ($wid, $word) = each($wordarray)) {
  552. $word_in_path = 0;
  553. $word_in_domain = 0;
  554. $word_in_title = 0;
  555. $meta_keyword = 0;
  556. if ($index_host == 1) {
  557. while (list ($id, $path) = each($patharray)) {
  558. if ($path[1] == $word[1]) {
  559. $word_in_path = 1;
  560. break;
  561. }
  562. }
  563. reset($patharray);
  564. while (list ($id, $host) = each($hostarray)) {
  565. if ($host[1] == $word[1]) {
  566. $word_in_domain = 1;
  567. break;
  568. }
  569. }
  570. reset($hostarray);
  571. }
  572. if ($index_meta_keywords == 1) {
  573. while (list ($id, $keyword) = each($keywordsarray)) {
  574. if ($keyword[1] == $word[1]) {
  575. $meta_keyword = 1;
  576. break;
  577. }
  578. }
  579. reset($keywordsarray);
  580. }
  581. while (list ($id, $tit) = each($titlearray)) {
  582. if ($tit[1] == $word[1]) {
  583. $word_in_title = 1;
  584. break;
  585. }
  586. }
  587. reset($titlearray);
  588. $wordarray[$wid][2] = (int) (calc_weight($wordarray[$wid][2], $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword));
  589. }
  590. reset($wordarray);
  591. return $wordarray;
  592. }
  593. function isDuplicateMD5($md5sum) {
  594. global $mysql_table_prefix;
  595. $result = mysql_query("select link_id from ".$mysql_table_prefix."links where md5sum='$md5sum'");
  596. echo mysql_error();
  597. if (mysql_num_rows($result) > 0) {
  598. return true;
  599. }
  600. return false;
  601. }
  602. function check_include($link, $inc, $not_inc) {
  603. $url_inc = Array ();
  604. $url_not_inc = Array ();
  605. if ($inc != "") {
  606. $url_inc = explode("\n", $inc);
  607. }
  608. if ($not_inc != "") {
  609. $url_not_inc = explode("\n", $not_inc);
  610. }
  611. $oklinks = Array ();
  612. $include = true;
  613. foreach ($url_not_inc as $str) {
  614. $str = trim($str);
  615. if ($str != "") {
  616. if (substr($str, 0, 1) == '*') {
  617. if (preg_match(substr($str, 1), $link)) {
  618. $include = false;
  619. break;
  620. }
  621. } else {
  622. if (!(strpos($link, $str) === false)) {
  623. $include = false;
  624. break;
  625. }
  626. }
  627. }
  628. }
  629. if ($include && $inc != "") {
  630. $include = false;
  631. foreach ($url_inc as $str) {
  632. $str = trim($str);
  633. if ($str != "") {
  634. if (substr($str, 0, 1) == '*') {
  635. if (preg_match(substr($str, 1), $link)) {
  636. $include = true;
  637. break 2;
  638. }
  639. } else {
  640. if (strpos($link, $str) !== false) {
  641. $include = true;
  642. break;
  643. }
  644. }
  645. }
  646. }
  647. }
  648. return $include;
  649. }
  650. function check_for_removal($url) {
  651. global $mysql_table_prefix;
  652. global $command_line;
  653. $result = mysql_query("select link_id, visible from ".$mysql_table_prefix."links"." where url='$url'");
  654. echo mysql_error();
  655. if (mysql_num_rows($result) > 0) {
  656. $row = mysql_fetch_row($result);
  657. $link_id = $row[0];
  658. $visible = $row[1];
  659. if ($visible > 0) {
  660. $visible --;
  661. mysql_query("update ".$mysql_table_prefix."links set visible=$visible where link_id=$link_id");
  662. echo mysql_error();
  663. } else {
  664. mysql_query("delete from ".$mysql_table_prefix."links where link_id=$link_id");
  665. echo mysql_error();
  666. for ($i=0;$i<=15; $i++) {
  667. $char = dechex($i);
  668. mysql_query("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id");
  669. echo mysql_error();
  670. }
  671. printStandardReport('pageRemoved',$command_line);
  672. }
  673. }
  674. }
  675. function convert_url($url) {
  676. $url = str_replace("&amp;", "&", $url);
  677. $url = str_replace(" ", "%20", $url);
  678. return $url;
  679. }
  680. function extract_text($contents, $source_type) {
  681. global $tmp_dir, $pdftotext_path, $catdoc_path, $xls2csv_path, $catppt_path;
  682. $temp_file = "tmp_file";
  683. $filename = $tmp_dir."/".$temp_file ;
  684. if (!$handle = fopen($filename, 'w')) {
  685. die ("Cannot open file $filename");
  686. }
  687. if (fwrite($handle, $contents) === FALSE) {
  688. die ("Cannot write to file $filename");
  689. }
  690. fclose($handle);
  691. if ($source_type == 'pdf') {
  692. $command = $pdftotext_path." $filename -";
  693. $a = exec($command,$result, $retval);
  694. } else if ($source_type == 'doc') {
  695. $command = $catdoc_path." $filename";
  696. $a = exec($command,$result, $retval);
  697. } else if ($source_type == 'xls') {
  698. $command = $xls2csv_path." $filename";
  699. $a = exec($command,$result, $retval);
  700. } else if ($source_type == 'ppt') {
  701. $command = $catppt_path." $filename";
  702. $a = exec($command,$result, $retval);
  703. }
  704. unlink ($filename);
  705. return implode(' ', $result);
  706. }
  707. //function to calculate the weight of pages
  708. function calc_weight ($words_in_page, $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword) {
  709. global $title_weight, $domain_weight, $path_weight,$meta_weight;
  710. $weight = ($words_in_page + $word_in_title * $title_weight +
  711. $word_in_domain * $domain_weight +
  712. $word_in_path * $path_weight + $meta_keyword * $meta_weight) *10 / (0.8 +0.2*$path_depth);
  713. return $weight;
  714. }
  715. function remove_sessid($url) {
  716. return preg_replace("/(\?|&)(PHPSESSID|JSESSIONID|ASPSESSIONID|sid)=[0-9a-zA-Z]+$/", "", $url);
  717. }
  718. ?>