PageRenderTime 77ms CodeModel.GetById 30ms RepoModel.GetById 1ms app.codeStats 0ms

/search/inc/spider.inc.php

https://bitbucket.org/molusc/sma-website
PHP | 2239 lines | 1773 code | 272 blank | 194 comment | 438 complexity | 8db91d84a3e69e7c0759b1218c936bc1 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. <?php
  2. /******************************************************************************
  3. * iSearch2 - website search engine *
  4. * *
  5. * Visit the iSearch homepage at http://www.iSearchTheNet.com/isearch *
  6. * *
  7. * Copyright (C) 2002-2007 Z-Host. All rights reserved. *
  8. * *
  9. ******************************************************************************/
  /* Refuse to run when the IN_ISEARCH constant has not been defined by the including script. */
  defined('IN_ISEARCH') or die('Hacking attempt');
  /**
   * Fetch robots.txt for a domain and record the exclusion patterns it defines.
   *
   * The domain is appended to $isearch_config['robots_domains']. Every
   * "Disallow" rule in a record addressed to the "isearch" user agent (or to
   * "*" as long as no earlier record has matched) is turned into a URL regular
   * expression and appended to $isearch_config['robots_excludes']. Both lists
   * are then written, space separated, to the info table row with id 1.
   *
   * @param string $domain Host whose http://$domain/robots.txt is read.
   * @return void
   */
  function isearch_parseRobots($domain)
  {
      global $isearch_config;
      global $isearch_table_info;
      global $isearch_db;
      global $isearch_base;

      $isearch_config['robots_domains'][] = $domain;

      // isearch_readFile() overwrites $isearch_base; keep the caller's value in
      // a temporary variable so later relative URLs still resolve against it.
      $isearch_base_tmp = $isearch_base;
      $allData = isearch_readFile("http://$domain/robots.txt");
      $isearch_base = $isearch_base_tmp;

      if ($allData != '')
      {
          $disallow = array();
          $validUseragent = false;
          $matched = false;

          $lines = preg_split('/[\r\n]/', strtolower($allData));
          foreach ($lines as $line)
          {
              // Drop trailing comments and collapse runs of whitespace.
              $line = preg_replace('/#.*$/D', '', $line);
              $line = preg_replace('/[[:space:]]+/', ' ', $line);

              $parts = explode(':', $line, 2);
              if (count($parts) != 2)
              {
                  continue;
              }
              $field = trim($parts[0]);
              $value = trim($parts[1]);

              if ($field == 'user-agent')
              {
                  // A new record starts: it applies to us only when it names
                  // "isearch", or "*" while nothing has matched so far.
                  $validUseragent = false;
                  foreach (explode(' ', $value) as $useragent)
                  {
                      if (($useragent == 'isearch') || (($useragent == '*') && (!$matched)))
                      {
                          $matched = true;
                          $validUseragent = true;
                      }
                  }
              }
              else if (($validUseragent) && ($field == 'disallow'))
              {
                  if ($value === '')
                  {
                      /* An empty Disallow is an allow - remove all previous disallows */
                      $disallow = array();
                  }
                  else
                  {
                      $disallow[] = $value;
                  }
              }
          }

          // Escape ".", "?" and "+" and expand the "*" wildcard to ".*".
          // A single strtr() pass gives the same result as the sequential
          // replacements because no replacement introduces a later search char.
          $regexEscapes = array('.' => '\\.', '*' => '.*', '?' => '\\?', '+' => '\\+');
          foreach ($disallow as $rule)
          {
              if ($rule[0] != '/')
              {
                  $rule = '/' . $rule;
              }
              $isearch_config['robots_excludes'][] = strtr("^http://$domain$rule", $regexEscapes);
          }
      }

      // robots_excludes may never have been populated; implode() needs an array.
      $excludes = (isset($isearch_config['robots_excludes']) && is_array($isearch_config['robots_excludes']))
          ? $isearch_config['robots_excludes']
          : array();

      if (!mysql_query("UPDATE $isearch_table_info SET robots_domains='" . isearch_escape_string(implode(" ", $isearch_config['robots_domains'])) . "', robots_excludes='" . isearch_escape_string(implode(" ", $excludes)) . "' WHERE id='1'", $isearch_db))
      {
          isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
      }
  }
  /**
   * Empty the spider log by deleting every row from the spider log table.
   *
   * @return void
   */
  function isearch_clearLog()
  {
      global $isearch_db, $isearch_table_spider_log;

      $sql = "DELETE FROM $isearch_table_spider_log";
      mysql_query($sql, $isearch_db);
  }
  /**
   * Build an HTML rendering of the spider log.
   *
   * Rows are read in id order; each message is HTML-escaped, its spaces are
   * turned into non-breaking spaces, and it is terminated with "<BR>".
   *
   * @return string HTML for all log rows, or '' when the query fails.
   */
  function isearch_getLog()
  {
      global $isearch_db, $isearch_table_spider_log;

      $rows = mysql_query("SELECT * FROM $isearch_table_spider_log ORDER BY id", $isearch_db);
      if (!$rows)
      {
          return '';
      }

      $html = '';
      while ($entry = mysql_fetch_object($rows))
      {
          $escaped = htmlentities($entry->msg);
          $html .= str_replace(' ', '&nbsp;', $escaped) . "<BR>\n";
      }
      return $html;
  }
  /**
   * Record a message in the spider log table and optionally echo it.
   *
   * The message is stored when $level does not exceed
   * $isearch_config['log_level']. It is echoed as plain text when running from
   * the command line ($isearch_fromCommandLine) and $level does not exceed
   * $isearch_logEchoLevel; otherwise it is echoed as HTML when $level does not
   * exceed $isearch_config['log_echo_level'].
   *
   * @param string $string Message text.
   * @param int    $level  Verbosity level of the message (lower = more important).
   * @return void
   */
  function isearch_log($string, $level=1)
  {
      global $isearch_config, $isearch_db, $isearch_table_spider_log;
      global $isearch_fromCommandLine, $isearch_logEchoLevel;

      if ($isearch_config['log_level'] >= $level)
      {
          $sql = "INSERT INTO $isearch_table_spider_log (msg) VALUES ('" . isearch_escape_string($string) . "')";
          mysql_query($sql, $isearch_db);
      }

      $commandLine = isset($isearch_fromCommandLine) && $isearch_fromCommandLine;
      if ($commandLine)
      {
          if ($isearch_logEchoLevel >= $level)
          {
              echo "$string\n";
          }
          return;
      }

      if ($isearch_config['log_echo_level'] >= $level)
      {
          $escaped = htmlentities($string);
          echo str_replace(' ', '&nbsp;', $escaped) . "<BR>\n";
      }
  }
  /**
   * Clean up a string to make it suitable for storing in the search index.
   *
   * Steps, in order: optional 8-bit lower-casing, HTML tag stripping, replacing
   * backslash ; ? ! with spaces, per-character punctuation policy (dashes,
   * colons, dots, commas, underscores), removal of unwanted characters,
   * optional iconv conversion from $charset to $isearch_config['char_set'],
   * and whitespace compaction/trimming.
   *
   * Punctuation policy values read from $isearch_config['allow_*']:
   * 0 = replace with a space, 1 = keep only inside words (removed when next to
   * a space), 3 = remove everywhere; any other value leaves the character alone.
   *
   * @param string $data    Text (possibly HTML) to clean.
   * @param string $charset Character set of $data; '' disables conversion.
   * @return string The cleaned text.
   */
  function isearch_cleanString($data, $charset)
  {
      global $isearch_config;
      static $upperChars = null, $lowerChars = null;

      if ($isearch_config['char_set_8_bit'])
      {
          /* Convert to lower case, doing accented character conversion correctly.
           * The table is built from byte values (0xC0-0xDE -> 0xE0-0xFE, plus
           * 0x8A -> 0x9A and 0x8E -> 0x9E) so that it does not depend on the
           * encoding this source file happens to be saved in. As in the
           * original literal table, 0xD7 is mapped to 0xF7 as well. */
          if ($upperChars === null)
          {
              $upperChars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
              $lowerChars = 'abcdefghijklmnopqrstuvwxyz';
              for ($code = 0xc0; $code <= 0xde; $code++)
              {
                  $upperChars .= chr($code);
                  $lowerChars .= chr($code + 0x20);
              }
              $upperChars .= chr(0x8a) . chr(0x8e);
              $lowerChars .= chr(0x9a) . chr(0x9e);
          }
          $data = strtr($data, $upperChars, $lowerChars);
      }

      /* Strip out all HTML tags */
      $data = strip_tags($data);

      /* Replace some breaking chars (backslash ; ? !) with spaces */
      $data = preg_replace('/[\\\\;?!]+/', ' ', $data);

      /* Apply the configured policy to each punctuation character in turn. */
      $punctuation = array(
          'allow_dashes'      => '-',
          'allow_colons'      => ':',
          'allow_dots'        => '.',
          'allow_commas'      => ',',
          'allow_underscores' => '_',
      );
      foreach ($punctuation as $option => $char)
      {
          $mode = $isearch_config[$option];
          if ($mode == 0)
          {
              /* Replace with spaces */
              $data = str_replace($char, ' ', $data);
          }
          else if ($mode == 1)
          {
              /* Allow within words */
              $data = str_replace(' ' . $char, ' ', str_replace($char . ' ', ' ', $data));
          }
          else if ($mode == 3)
          {
              /* Remove all */
              $data = str_replace($char, '', $data);
          }
      }

      if ($isearch_config['char_set_8_bit'])
      {
          /* Strip out all characters except whitespace numeric and alpha */
          $data = preg_replace('/([^-@0-9a-z:\\.,' . chr(0xbf) . '-' . chr(0xff) . chr(0x9a) . chr(0x9e) . '\s])/', '', $data);
      }
      else
      {
          /* Remove any single quotes and backslashes. */
          $data = str_replace(array("'", '\\'), '', $data);
      }

      /* Convert from source charset to charset used on results page */
      if ((strtolower($charset) != $isearch_config['char_set']) &&
          ($charset != '') &&
          ($isearch_config['char_set'] != ''))
      {
          if (function_exists('iconv'))
          {
              isearch_log("INFO: Converting $charset -> " . $isearch_config['char_set'], 5);
              $convertedData = iconv($charset, $isearch_config['char_set'], $data);
              if ($convertedData === false)
              {
                  // Keep the unconverted text rather than losing the document.
                  isearch_log("WARNING: Unable to convert $charset -> " . $isearch_config['char_set'], 3);
              }
              else
              {
                  $data = $convertedData;
              }
          }
          else
          {
              isearch_log("WARNING: iconv not installed - unable to convert $charset -> " . $isearch_config['char_set'], 5);
          }
      }

      /* Compact all white space into a single space character */
      $data = preg_replace("/\\s+/", ' ', $data);

      /* Strip white space from beginning and end of the string */
      return trim($data);
  }
  /**
   * Read up to $length bytes from an open stream, in chunks of at most 16 KB.
   *
   * Reading stops at end of file, when $length bytes have been read, or when a
   * read makes no progress. If the stream is not at end of file afterwards a
   * truncation warning is logged.
   *
   * @param resource $handle Open stream to read from.
   * @param int      $length Maximum number of bytes to read.
   * @return string The bytes read (possibly empty).
   */
  function isearch_fread($handle, $length = 2147483647)
  {
      $bytesToRead = $length;
      $contents = '';
      while ((!feof($handle)) && ($bytesToRead > 0))
      {
          $data = fread($handle, ($bytesToRead > 16384) ? 16384 : $bytesToRead);
          if (($data === false) || ($data === ''))
          {
              // A failed or empty read without EOF (e.g. a socket timeout)
              // would otherwise spin in this loop forever.
              break;
          }
          $bytesToRead -= strlen($data);
          $contents .= $data;
      }
      if (!feof($handle))
      {
          isearch_log('WARNING: File reading was truncated at '.($length/1024).' kbytes', 3);
      }
      return $contents;
  }
  /**
   * Resolve a URL reference against the URL of the document it was found in.
   *
   * - A reference that already starts with a scheme ("http:", "mailto:", ...)
   *   is returned unchanged.
   * - A protocol-relative reference ("//host/path") takes the scheme of
   *   $relativeToUrl.
   * - A reference starting with "/" is appended to the site root of
   *   $relativeToUrl.
   * - Anything else is resolved against the directory of $relativeToUrl, with
   *   "." and ".." segments collapsed. The query and fragment of
   *   $relativeToUrl are discarded.
   *
   * @param string $newUrl        The reference to resolve.
   * @param string $relativeToUrl Absolute URL used as the base.
   * @return string|null The absolute URL, or null (after logging a warning)
   *                     when $relativeToUrl has no scheme or host.
   */
  function isearch_relativeToAbsoluteUrl($newUrl, $relativeToUrl)
  {
      $newUrl = (string) $newUrl;

      if (preg_match('#^[a-z]+:#i', $newUrl))
      {
          /* Already an absolute reference */
          return $newUrl;
      }

      $relativeParts = @parse_url($relativeToUrl);
      if ((!is_array($relativeParts)) || (!isset($relativeParts['scheme'])) || (!isset($relativeParts['host'])))
      {
          /* Unable to parse relativeToUrl */
          isearch_log("WARNING: Unable to parse relativeToUrl [$relativeToUrl]", 3);
          return;
      }

      if (substr($newUrl, 0, 2) == '//')
      {
          /* Protocol-relative reference: only the scheme is inherited */
          return $relativeParts['scheme'] . ':' . $newUrl;
      }

      unset($relativeParts['query']);
      unset($relativeParts['fragment']);

      if (substr($newUrl, 0, 1) == '/')
      {
          /* New URL begins with a slash. It is within the site */
          unset($relativeParts['path']);
          return glue_url($relativeParts) . $newUrl;
      }

      /* A relative reference (must be within this site) */
      if (isset($relativeParts['path']))
      {
          /* Remove filename (a last segment containing a dot) following the last slash */
          $path = preg_replace('#/[^/]*\.[^/]*$#D', '/', $relativeParts['path']);
          if (substr($path, -1) != '/')
          {
              $path .= '/';
          }
          $path .= $newUrl;
      }
      else
      {
          $path = '/' . $newUrl;
      }

      $path = preg_replace('#/\.$#D', '', $path);   /* Remove ending "/." */
      $path = preg_replace('#/(\./)+#', '/', $path); /* Remove any "." references */
      $path = preg_replace('#/+/#', '/', $path);     /* Remove excess slashes */

      /* Resolve any ".." references. The first element (the empty string
       * before the leading slash) is never removed, so a ".." directly under
       * the root is kept as-is. */
      $segments = explode('/', $path);
      $resolved = array(array_shift($segments));
      foreach ($segments as $segment)
      {
          if (($segment == '..') && (count($resolved) > 1))
          {
              array_pop($resolved);
          }
          else
          {
              $resolved[] = $segment;
          }
      }

      $relativeParts['path'] = implode('/', $resolved);
      return glue_url($relativeParts);
  }
  /**
   * Read a document from a URL and return its body.
   *
   * The URL is first rewritten with the configured url_search/url_replace
   * pattern, then fetched with fopen, raw sockets or CURL (chosen by
   * $isearch_config['reading_mechanism'], 0 = autodetect). Redirects are
   * followed (up to 10 hops for sockets/CURL). Inline frames, and external
   * scripts when javascript_link_search is 2, are fetched recursively and
   * spliced into the returned document.
   *
   * Side effects: sets the globals $isearch_header (response headers keyed by
   * lower-cased name) and $isearch_base (base URL for relative links, taken
   * from Content-Location / Location / the final URL / a <BASE HREF> tag).
   *
   * @param string $url   URL to read.
   * @param int    $depth Current inline-frame/script nesting depth.
   * @return string Document body, or '' on any failure.
   */
  function isearch_readFile($url, $depth=0)
  {
      global $isearch_config;
      global $isearch_version;
      global $isearch_header;
      global $isearch_base;
      global $isearch_url_fopen_detected, $isearch_curl_detected, $isearch_sockets_detected;

      $isearch_header = array();
      isearch_log("TRACE: isearch_readFile($url, $depth)", 10);

      if ($depth >= 10)
      {
          /* Inline frame depth of 10 */
          isearch_log("WARNING: Inline frame depth limit $depth exceeded", 3);
          return '';
      }

      if ($isearch_config['url_search'] != '')
      {
          // chr(1) is used as the pattern delimiter so that the configured
          // expression never needs delimiter escaping.
          $rewritten = @preg_replace(chr(1) . $isearch_config['url_search'] . chr(1), $isearch_config['url_replace'], $url);
          if ($rewritten === null)
          {
              isearch_log('WARNING: Invalid url_search pattern [' . $isearch_config['url_search'] . ']', 3);
          }
          else
          {
              $url = $rewritten;
          }
          isearch_log("INFO: Using replaced URL $url", 5);
      }

      if ($isearch_config['reading_mechanism'] == 0)
      {
          /* Autodetect */
          if (($isearch_url_fopen_detected) && (!$isearch_config['proxy_enable']))
          {
              $reading_mechanism = 1; /* fopen */
          }
          else if (($isearch_sockets_detected) && (preg_match('#^http://#i', $url)))
          {
              $reading_mechanism = 2; /* sockets */
          }
          else if ($isearch_curl_detected)
          {
              $reading_mechanism = 3; /* curl */
          }
          else
          {
              isearch_log('ERROR: Unable to detect a suitable reading mechanism.', 1);
              return '';
          }
      }
      else
      {
          $reading_mechanism = $isearch_config['reading_mechanism'];
      }

      $base = $url;

      if ($reading_mechanism == 1)
      {
          /* Use fopen/fread */
          isearch_log("INFO: Reading $url using fopen/fread", 5);
          $docData = '';
          @ini_set('user_agent', "iSearch/$isearch_version");

          if ($isearch_config['basic_authorization'] != '')
          {
              // Insert the credentials after the scheme separator only; a
              // later "//" in the path or query must not be touched.
              $authorityPos = strpos($url, '//');
              if ($authorityPos !== false)
              {
                  $url = substr_replace($url, '//' . $isearch_config['basic_authorization'] . '@', $authorityPos, 2);
              }
          }

          $fp = @fopen($url, 'r');
          if (!$fp)
          {
              isearch_log("WARNING: Unable to fopen URL [$url]", preg_match('#/robots\.txt$#D', $url) ? 9 : 3);
              return '';
          }

          $header_data = array();
          if (function_exists('stream_get_meta_data'))
          {
              /* Prior to PHP 4.3.0 use $http_response_header instead of stream_get_meta_data() */
              $meta_data = stream_get_meta_data($fp);
              if (isset($meta_data['wrapper_data']) && is_array($meta_data['wrapper_data']))
              {
                  $header_data = $meta_data['wrapper_data'];
              }
          }
          else if (isset($http_response_header))
          {
              $header_data = $http_response_header;
          }

          $header = array();
          foreach ($header_data as $headerLine)
          {
              $data = explode(': ', $headerLine, 2);
              if (count($data) == 2)
              {
                  $header[strtolower($data[0])] = $data[1];
              }
          }

          if (isset($header['content-location']))
          {
              isearch_log("INFO: Content-Location: ".$header['content-location'], 9);
              $base = isearch_relativeToAbsoluteUrl($header['content-location'], $base);
          }
          else if (isset($header['location']))
          {
              isearch_log("INFO: Location: ".$header['location'], 9);
              $base = isearch_relativeToAbsoluteUrl($header['location'], $base);
          }

          $docData = isearch_fread($fp, $isearch_config['max_file_size']);
          fclose($fp);
      }
      else
      {
          $recurse = 10;
          while (1)
          {
              /* Check URL and determine whether this is a file or directory */
              $urlParts = @parse_url($url);
              if ((!is_array($urlParts)) || (!isset($urlParts['scheme'])) || (!isset($urlParts['host'])))
              {
                  isearch_log("WARNING: Unable to parse URL [$url]", 3);
                  return '';
              }

              if (!preg_match('#^(https?|ftps?)$#i', $urlParts['scheme']))
              {
                  isearch_log("WARNING: Unsupported URL scheme " . $urlParts['scheme'] . " [$url]", 4);
                  return '';
              }

              if ($reading_mechanism == 2)
              {
                  isearch_log("INFO: Reading $url using sockets", 5);

                  $scheme = strtolower($urlParts['scheme']);
                  if ($scheme == 'http')
                  {
                      $secure = false;
                  }
                  else if ($scheme == 'https')
                  {
                      $secure = true;
                  }
                  else
                  {
                      isearch_log("WARNING: URL scheme " . $urlParts['scheme'] . " not supported by sockets. Use CURL library. [$url]", 3);
                      return '';
                  }

                  if ($isearch_config['proxy_enable'])
                  {
                      // A proxy is given the absolute URL as request target.
                      $host = $isearch_config['proxy_host'];
                      $port = $isearch_config['proxy_port'];
                      $requestTarget = $url;
                  }
                  else
                  {
                      // An origin server is given path and query only.
                      $host = $urlParts['host'];
                      $port = isset($urlParts['port']) ? $urlParts['port'] : ($secure ? 443 : 80);
                      $requestTarget = (isset($urlParts['path']) && ($urlParts['path'] != '')) ? $urlParts['path'] : '/';
                      if (isset($urlParts['query']))
                      {
                          $requestTarget .= '?' . $urlParts['query'];
                      }
                  }

                  // The Host header always names the origin server, never the proxy.
                  $hostHeader = $urlParts['host'];
                  if (isset($urlParts['port']))
                  {
                      $hostHeader .= ':' . $urlParts['port'];
                  }

                  $sock = fsockopen(($secure ? 'ssl://' : '').$host, $port, $errno, $errstr);
                  if (!$sock)
                  {
                      isearch_log("ERROR: Unable to open socket to " . $host . " " . $port . " - $errno : $errstr", 1);
                      return '';
                  }

                  $request = "GET $requestTarget HTTP/1.0\r\n";
                  $request .= "Host: $hostHeader\r\n";
                  if (($isearch_config['proxy_enable']) && ($isearch_config['proxy_user'] != ''))
                  {
                      $request .= "Proxy-Authorization: Basic " . base64_encode ($isearch_config['proxy_user'].':'.$isearch_config['proxy_pass']) . "\r\n";
                  }
                  $request .= "User-Agent: iSearch/$isearch_version\r\n";
                  if ($isearch_config['basic_authorization'] != '')
                  {
                      $request .= "Authorization: Basic " . base64_encode($isearch_config['basic_authorization']) . "\r\n";
                  }
                  $request .= "Connection: Close\r\n\r\n";

                  fputs($sock, $request);
                  $allData = isearch_fread($sock, $isearch_config['max_file_size']);
                  fclose($sock);
              }
              else
              {
                  /* Use the CURL library */
                  isearch_log("INFO: Reading $url using CURL", 5);
                  $ch = curl_init($url);
                  curl_setopt($ch, CURLOPT_USERAGENT, "iSearch/$isearch_version");
                  curl_setopt($ch, CURLOPT_HEADER, TRUE);
                  curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
                  // NOTE(review): certificate verification is disabled here;
                  // kept as in the original so existing installs keep working.
                  curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
                  curl_setopt($ch, CURLOPT_TIMEOUT, 30);
                  curl_setopt($ch, CURLOPT_COOKIEJAR, "my_cookies.txt"); // Initiates cookie file if needed
                  curl_setopt($ch, CURLOPT_COOKIEFILE, "my_cookies.txt"); // Uses cookies from previous session if exist
                  if ($isearch_config['proxy_enable'])
                  {
                      if ($isearch_config['proxy_user'] != '')
                      {
                          curl_setopt($ch, CURLOPT_PROXYUSERPWD, $isearch_config['proxy_user'].':'.$isearch_config['proxy_pass']);
                      }
                      curl_setopt($ch, CURLOPT_PROXY, $isearch_config['proxy_host']);
                      curl_setopt($ch, CURLOPT_PROXYPORT, $isearch_config['proxy_port']);
                  }
                  if ($isearch_config['basic_authorization'] != '')
                  {
                      curl_setopt($ch, CURLOPT_USERPWD, $isearch_config['basic_authorization']);
                  }

                  $allData = curl_exec($ch);
                  if ($allData === false)
                  {
                      $curlError = curl_error($ch);
                      curl_close($ch);
                      isearch_log("WARNING: CURL error : $curlError [$url]", 3);
                      return '';
                  }
                  curl_close($ch);
              }

              // Split the raw response into header block and body.
              $sections = explode("\r\n\r\n", $allData, 2);
              $headerData = $sections[0];
              $docData = isset($sections[1]) ? $sections[1] : '';

              $headerLines = explode("\r\n", $headerData);
              $status = $headerLines[0];
              isearch_log("INFO: Status Line $status", 8);

              // Walk backwards so that the first occurrence of a repeated
              // header is the one that ends up in the map.
              $header = array();
              for ($i = count($headerLines)-1; $i > 0; $i--)
              {
                  $data = explode(': ', $headerLines[$i], 2);
                  if (count($data) == 2)
                  {
                      $header[strtolower($data[0])] = $data[1];
                  }
              }

              /* Check status code: 2xx is success, 301/302/303/307/308 are
               * followed as redirects, anything else from 300 up is an error. */
              $statusParts = explode(' ', $status, 3);
              if (count($statusParts) < 2)
              {
                  isearch_log('ERROR: Unable to read status code', 1);
                  return '';
              }
              $statusCode = $statusParts[1];

              if (in_array($statusCode, array('301', '302', '303', '307', '308'), true))
              {
                  /* Redirection. Get new location */
                  if ($recurse <= 1)
                  {
                      /* Recursion limit reached */
                      isearch_log('ERROR: URL recursion limit 10 exceeded', 1);
                      return '';
                  }
                  if ((!isset($header['location'])) || (trim($header['location']) == ''))
                  {
                      isearch_log("ERROR: Redirect without Location header : $status [$url]", 1);
                      return '';
                  }
                  // Location may be relative; resolve it against the current URL.
                  $redirectUrl = isearch_relativeToAbsoluteUrl(trim($header['location']), $url);
                  if (!$redirectUrl)
                  {
                      return '';
                  }
                  $url = $redirectUrl;
                  $recurse = $recurse - 1;
              }
              else if ($statusCode >= 300)
              {
                  isearch_log("WARNING: HTTP Error : $status [$url]", preg_match('#/robots\.txt$#D', $url) ? 9 : 3);
                  return '';
              }
              else
              {
                  /* We have read the file */
                  break;
              }
          }

          $base = $url;
          if (isset($header['content-location']))
          {
              isearch_log("INFO: Content-Location: ".$header['content-location'], 9);
              // Resolved the same way as in the fopen branch; fall back to the
              // final URL when it cannot be resolved.
              $resolvedBase = isearch_relativeToAbsoluteUrl($header['content-location'], $url);
              if ($resolvedBase)
              {
                  $base = $resolvedBase;
              }
          }
          else
          {
              isearch_log("INFO: Url: ".$url, 9);
          }
      }

      if (preg_match("#<BASE\\s+[^>]*?HREF\\s*=\\s*['\"]?([^>]+?)['\"]?[\\s>]#i", $docData, $matches) == 1)
      {
          /* Found a "<BASE HREF=" tag in the document head */
          isearch_log("INFO: BASE: ".$matches[1], 9);
          $base = $matches[1];
      }

      /* Search for inline frames and replace them with frame contents */
      $regexp = "#<IFRAME[^>]*?\\sSRC\\s*=\\s*['\"]?(.*?)[\\s'\"][^>]*>#i";
      $matchCount = preg_match_all($regexp, $docData, $matches);
      for ($i = 0; $i < $matchCount; $i++)
      {
          $frameUrl = isearch_relativeToAbsoluteUrl($matches[1][$i], $base);
          isearch_log("INFO: Reading inline frame : $frameUrl", 5);
          $frameData = isearch_readFile($frameUrl, $depth+1);
          // Escape "\" and "$" so the fetched content is inserted literally
          // instead of being parsed for back-references.
          $docData = preg_replace($regexp, addcslashes($frameData, '\\$'), $docData, 1);
      }

      if ($isearch_config['javascript_link_search'] == 2)
      {
          /* Search for external JavaScript files and replace them with file contents */
          $regexp = "#<SCRIPT[^>]*?\\sSRC\\s*=\\s*['\"](.*?)['\"][^>]*>#i";
          $matchCount = preg_match_all($regexp, $docData, $matches);
          for ($i = 0; $i < $matchCount; $i++)
          {
              $jsUrl = isearch_relativeToAbsoluteUrl($matches[1][$i], $base);
              isearch_log("INFO: Reading javascript : $jsUrl", 5);
              $jsData = isearch_readFile($jsUrl, $depth+1);
              $docData = preg_replace($regexp, "<SCRIPT>\n<!--\n".addcslashes($jsData, '\\$')."\n-->\n</SCRIPT>\n", $docData, 1);
          }
      }

      // Published last: the recursive calls above overwrite both globals.
      $isearch_header = $header;
      $isearch_base = $base;

      return $docData;
  }
  /**
   * Convert a document to text/HTML by running an external program.
   *
   * The data is written to a temporary file and the command configured in
   * $isearch_config[$type . '_exec'] is run on it. For type 'pdf' the command
   * is called as "<cmd> -htmlmeta <file> -" and its output is returned as is;
   * for every other type the output is wrapped in a minimal HTML page whose
   * title is the file name part of $url. Output is taken from "<tmp>.txt"
   * when the command created it, otherwise from the command's stdout; a
   * non-empty "<tmp>.err" file is treated as failure.
   *
   * @param string $data Raw document contents.
   * @param string $type Document type; selects the configured executable.
   * @param string $url  URL of the document (used for the HTML title).
   * @return string Converted document, or '' on any failure.
   */
  function isearch_execConvert($data, $type, $url)
  {
      global $isearch_config;

      isearch_log("INFO: Exec conversion $type", 7);

      // Check the configuration before creating the temp file so that a
      // missing executable does not leave a stray file behind.
      $cmd = isset($isearch_config[$type . '_exec']) ? $isearch_config[$type . '_exec'] : '';
      if ($cmd == '')
      {
          isearch_log("ERROR: Configuration error - executable for $type not set", 1);
          return '';
      }

      // Create a temporary filename to use for the conversion
      $tmpfname = tempnam($isearch_config['tmpdir'], "iSearch");
      if ($tmpfname === false)
      {
          isearch_log("ERROR: Unable to open tmp file $tmpfname", 1);
          return '';
      }

      // Write data to the temp file
      $fh = fopen($tmpfname, 'wb');
      if (!$fh)
      {
          isearch_log("ERROR: Unable to open tmp file $tmpfname", 1);
          @unlink($tmpfname);
          return '';
      }
      fwrite($fh, $data);
      fclose($fh);

      // Execute the command
      if ($type == 'pdf')
      {
          $cmd .= ' -htmlmeta '.escapeshellarg($tmpfname).' -';
      }
      else
      {
          $cmd .= ' '.escapeshellarg($tmpfname);
      }
      $output = array();
      $retval = 0;
      exec($cmd, $output, $retval);

      // Delete the temporary file
      unlink($tmpfname);

      // Read the stderr and stdout files, if the command produced them
      $err = '';
      if (is_file("$tmpfname.err"))
      {
          $err = trim(@implode(" ", file("$tmpfname.err")));
          unlink("$tmpfname.err");
      }

      if (is_file("$tmpfname.txt"))
      {
          $text = @implode("\n", file("$tmpfname.txt"));
          unlink("$tmpfname.txt");
      }
      else
      {
          $text = implode("\n", $output);
      }

      if ($retval != 0)
      {
          isearch_log("ERROR: Executed command $cmd $tmpfname, Return Code $retval", 1);
          return '';
      }
      isearch_log("INFO: Executed command $cmd $tmpfname, Return Code $retval", 5);

      if ($err != '')
      {
          isearch_log("ERROR: Executed command $cmd $tmpfname, Error Msg: $err", 1);
          return '';
      }

      if ($text == '')
      {
          isearch_log("ERROR: Unable to read converted file", 2);
          return '';
      }

      if ($type != 'pdf')
      {
          // Wrap text in HTML; the title is whatever follows the last "/" or "\" of the URL
          $fname = preg_replace('#^.*[/\\\\]#s', '', $url);
          $text = "<HTML><HEAD><TITLE>$fname</TITLE></HEAD><BODY><PRE>$text</PRE></BODY></HTML>";
      }

      return $text;
  }
  /**
   * Convert a document by posting it to the online conversion service.
   *
   * The data is sent as a multipart/form-data POST to
   * convert.iSearchTheNet.com:80, authorised with
   * $isearch_config['online_id']. A gzip-compressed response is inflated, and
   * a response whose first word does not contain "<HTML>" is wrapped in a
   * minimal HTML page whose title is the file name part of $url.
   *
   * @param string $data Raw document contents.
   * @param string $type Document type, sent as the "type" query parameter and
   *                     as the "application/$type" content type.
   * @param string $url  URL of the document (used for the HTML title).
   * @return string The converted document; '' when the socket cannot be
   *                opened; the unmodified $data when the service does not
   *                answer with status 200.
   */
  function isearch_onlineConvert($data, $type, $url)
  {
      global $isearch_config;
      global $isearch_version;

      $host='convert.iSearchTheNet.com';
      $port=80;
      $path='/';
      $query="?type=$type&gzip=0";

      isearch_log("INFO: Online conversion $type", 7);

      $sock = fsockopen($host, $port, $errno, $errstr);
      if (!$sock)
      {
          isearch_log("ERROR: Unable to open socket $host $port - $errno : $errstr", 1);
          return '';
      }

      $boundary = '---------------------------' . md5('boundary');

      $postValues = "--$boundary\r\n";
      $postValues .= "Content-Disposition: form-data; name=\"file.1\"; filename=\"file.1\"\r\n";
      $postValues .= "Content-Type: application/$type\r\n";
      $postValues .= "\r\n";
      $postValues .= "$data\r\n";
      $postValues .= "--$boundary--\r\n\r\n";

      $request = "POST http://$host$path$query HTTP/1.0\r\n";
      $request .= "User-Agent: iSearch/$isearch_version\r\n";
      $request .= "Host: $host\r\n";
      $request .= "Authorization: ISEARCH " . $isearch_config['online_id'] . "\r\n";
      $request .= "Content-Type: multipart/form-data; boundary=$boundary\r\n";
      $request .= "Content-Length: " . strlen( $postValues ) . "\r\n";
      $request .= "\r\n";

      fputs($sock, $request.$postValues);

      /* Read status line */
      $status = fgets($sock, 1024);

      /* Read (and discard) the header, up to the first blank line */
      while (!feof($sock))
      {
          $line = trim((string) fgets($sock, 1024));
          if ($line == '')
          {
              break;
          }
      }

      $convertedData = isearch_fread($sock);
      fclose($sock);

      // fgets() yields false when nothing could be read, so guard the index.
      $statusParts = explode(' ', (string) $status, 3);
      if ((!isset($statusParts[1])) || ($statusParts[1] != '200'))
      {
          isearch_log("ERROR: Online conversion error: $status", 1);
          // NOTE(review): returns the unconverted input, unlike
          // isearch_execConvert() which returns '' - confirm callers rely on it.
          return $data;
      }

      // Gzip magic bytes 0x1f 0x8b; compare byte values, not characters.
      if ((strlen($convertedData) > 10) && (ord($convertedData[0]) == 0x1f) && (ord($convertedData[1]) == 0x8b))
      {
          isearch_log("INFO: Running gzinflate on converted data", 6);
          if (function_exists('gzdecode'))
          {
              $inflated = @gzdecode($convertedData);
          }
          else
          {
              // Skip the 10-byte gzip header and the 8-byte CRC/size trailer.
              $inflated = @gzinflate(substr($convertedData, 10, -8));
          }
          if ($inflated === false)
          {
              isearch_log("WARNING: Unable to inflate converted data", 3);
          }
          else
          {
              $convertedData = $inflated;
          }
      }

      // Check the first word of the converted data
      $firstWord = '';
      sscanf($convertedData, ' %s ', $firstWord);
      if (!preg_match('/<HTML>/i', (string) $firstWord))
      {
          // Wrap text in HTML; the title is whatever follows the last "/" or "\" of the URL
          $fname = preg_replace('#^.*[/\\\\]#s', '', $url);
          $convertedData = "<HTML><HEAD><TITLE>$fname</TITLE></HEAD><BODY><PRE>$convertedData</PRE></BODY></HTML>";
      }

      return $convertedData;
  }
  /**
   * Encode a code point as a UTF-8 byte sequence.
   *
   * @param int $ch Code point.
   * @return string One to four bytes of UTF-8, or '' when $ch is negative or
   *                greater than 0x1fffff.
   */
  function isearch_utf8_chr($ch)
  {
      if ($ch < 0)
      {
          // Invalid code point
          return '';
      }
      if ($ch <= 0x7f)
      {
          return chr($ch);
      }
      if ($ch <= 0x7ff)
      {
          return chr(($ch >> 6) + 0xc0).chr(($ch & 0x3f) + 0x80);
      }
      if ($ch <= 0xffff)
      {
          return chr(($ch >> 12) + 0xe0).chr((($ch >> 6) & 0x3f) + 0x80).chr(($ch & 0x3f) + 0x80);
      }
      if ($ch <= 0x1fffff)
      {
          // The last byte must come from $ch as well (the original read an undefined variable here).
          return chr(($ch >> 18) + 0xf0).chr((($ch >> 12) & 0x3f) + 0x80).chr((($ch >> 6) & 0x3f) + 0x80).chr(($ch & 0x3f) + 0x80);
      }
      // Invalid UTF-8 code
      return '';
  }
  /* preg_replace_callback helper: hexadecimal numeric entity -> UTF-8 bytes. */
  function _isearch_utf8HexEntity($match)
  {
      return isearch_utf8_chr(hexdec($match[1]));
  }

  /* preg_replace_callback helper: decimal numeric entity -> UTF-8 bytes. */
  function _isearch_utf8DecEntity($match)
  {
      return isearch_utf8_chr((int) $match[1]);
  }

  /* preg_replace_callback helper: hexadecimal numeric entity -> single byte. */
  function _isearch_byteHexEntity($match)
  {
      return chr(hexdec($match[1]));
  }

  /* preg_replace_callback helper: decimal numeric entity -> single byte. */
  function _isearch_byteDecEntity($match)
  {
      return chr((int) $match[1]);
  }

  /**
   * Decode HTML entities, working around limitations of old PHP versions.
   *
   * On PHP 4 numeric entities are decoded by hand (as UTF-8 when $charset is
   * 'utf-8', in which case named entities are decoded through a UTF-8
   * translation table and the result is returned directly). When
   * html_entity_decode() is not available a translation table is used;
   * otherwise the call is delegated to html_entity_decode().
   *
   * @param string $string  Text containing HTML entities.
   * @param int    $quote   Quote style flag passed to html_entity_decode().
   * @param string $charset Target character set.
   * @return string The decoded text.
   */
  function isearch_html_entity_decode($string, $quote, $charset)
  {
      $version = phpversion();
      if (substr($version, 0, 1) == '4')
      {
          if (strtolower($charset) == 'utf-8')
          {
              // PHP4 html_entity_decode does not support multi-byte charsets
              static $utf8_trans_tbl;

              $string = preg_replace_callback('/&#x([0-9a-f]+);/i', '_isearch_utf8HexEntity', $string);
              $string = preg_replace_callback('/&#([0-9]+);/', '_isearch_utf8DecEntity', $string);

              if (!isset($utf8_trans_tbl))
              {
                  $utf8_trans_tbl = array();
                  foreach (get_html_translation_table(HTML_ENTITIES) as $val=>$key)
                  {
                      $utf8_trans_tbl[$key] = utf8_encode($val);
                  }
              }
              return strtr($string, $utf8_trans_tbl);
          }

          $string = preg_replace_callback('/&#x([0-9a-f]+);?/i', '_isearch_byteHexEntity', $string);
          $string = preg_replace_callback('/&#([0-9]+);?/', '_isearch_byteDecEntity', $string);
      }

      if (!function_exists('html_entity_decode'))
      {
          // html_entity_decode was new in PHP 4.3.0
          global $isearch_htmlToAsciiTrans;
          if (!isset($isearch_htmlToAsciiTrans))
          {
              /* Translate from HTML to ASCII */
              $isearch_htmlToAsciiTrans = array_flip(get_html_translation_table(HTML_ENTITIES));
          }
          return strtr($string, $isearch_htmlToAsciiTrans);
      }

      return html_entity_decode($string, $quote, $charset);
  }
  /**
   * Parse an HTTP date into a Unix timestamp.
   *
   * Accepted formats:
   *   Sun, 06 Nov 1994 08:49:37 GMT   ; RFC 822, updated by RFC 1123
   *   Sunday, 06-Nov-94 08:49:37 GMT  ; RFC 850, obsoleted by RFC 1036
   *   Sun Nov  6 08:49:37 1994        ; ANSI C's asctime() format
   *
   * Two-digit years below 70 are taken as 20xx, other two-digit years as 19xx.
   * The time is always interpreted as GMT.
   *
   * @param string $httpDate Date string from an HTTP header.
   * @return int Unix timestamp, or 0 (after logging a warning) when the
   *             format is not recognised.
   */
  function isearch_parseHttpDate($httpDate)
  {
      static $months = array('jan'=>1, 'feb'=>2, 'mar'=>3, 'apr'=>4, 'may'=>5, 'jun'=>6, 'jul'=>7, 'aug'=>8, 'sep'=>9, 'oct'=>10, 'nov'=>11, 'dec'=>12);

      $monthNames = 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec';
      $clock = '([0-9]{2}):([0-9]{2}):([0-9]{2})';

      if (preg_match("#^[a-z]+,? +([0-9]{1,2})[ -]+($monthNames)[ -]+([0-9]{2,4}) +$clock#i", $httpDate, $matchName) == 1)
      {
          /* RFC 1123 / RFC 850: day month year time */
          list(, $day, $monthName, $year, $hour, $min, $sec) = $matchName;
      }
      else if (preg_match("#^[a-z]+,?[ -]+($monthNames)[ -]+([0-9]{1,2}) +$clock +([0-9]{2,4})#i", $httpDate, $matchName) == 1)
      {
          /* asctime(): month day time year */
          list(, $monthName, $day, $hour, $min, $sec, $year) = $matchName;
      }
      else
      {
          isearch_log('WARNING: Unknown date format : ' . $httpDate, 7);
          return 0;
      }

      $year = (int) $year;
      if ($year < 70)
      {
          $year += 2000;
      }
      else if ($year < 100)
      {
          $year += 1900;
      }

      return gmmktime((int) $hour, (int) $min, (int) $sec, $months[strtolower($monthName)], (int) $day, $year);
  }
  /* Spider a single file.
   *
   * Takes one URL in state 'new' from the "new" URL table, fetches it, harvests
   * the links it contains (unless a robots meta tag says nofollow), scores its
   * words and stores the result, then records the URL's resulting state.
   *
   * When no 'new' URL is left the index is finalised: last_update is stamped,
   * URLs that did not end up 'ok' are dropped, referrer ids are copied over,
   * the freshly built tables are swapped with the live ones (skipped in test
   * mode), the work tables are emptied and the live tables optimised.
   *
   * $verbose  When false, $isearch_config['log_echo_level'] is forced to 0.
   *
   * Returns True if there are more files to spider (also returned after a MySQL
   * error on the initial query, following a 5 second sleep), else False.
   */
  function isearch_indexAFile($verbose = True)
  {
      global $isearch_table_info, $isearch_table_urls, $isearch_table_urls_new, $isearch_table_words, $isearch_table_words_new;
      global $isearch_db;
      global $isearch_config;
      global $isearch_header;
      global $isearch_base;

      if (! $verbose)
      {
          /* Disable display of messages. */
          $isearch_config['log_echo_level'] = 0;
      }

      $resultUrls = mysql_query("SELECT * FROM $isearch_table_urls_new WHERE state='new' LIMIT 1", $isearch_db);
      if (!$resultUrls)
      {
          /* MySQL error. Sleep and try again */
          isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          sleep(5);
          return True;
      }

      if (mysql_num_rows($resultUrls) != 1)
      {
          isearch_log('INFO: Indexing completed.', 2);

          /* Indexing has completed */
          $now = time();
          if (!mysql_query("UPDATE $isearch_table_info SET last_update='$now' WHERE id='1'", $isearch_db))
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }

          /* Log every URL that is about to be dropped, with its final state */
          $result = mysql_query("SELECT url, state FROM $isearch_table_urls_new WHERE state!='ok'", $isearch_db);
          if (!$result)
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }
          else if (mysql_num_rows($result) > 0)
          {
              isearch_log('INFO: Deleting the following URLs:', 4);
              while ($item = mysql_fetch_object($result))
              {
                  isearch_log('INFO: ' . $item->url . ' (' . $item->state . ')', 4);
              }
          }

          /* Delete any unfound references */
          if (!mysql_query("DELETE FROM $isearch_table_urls_new WHERE state!='ok'", $isearch_db))
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }

          /* Update referrer_id fields */
          $result = mysql_query("SELECT id, temp_referrer_id FROM $isearch_table_urls_new", $isearch_db);
          if (!$result)
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }
          else if (mysql_num_rows($result) > 0)
          {
              mysql_query("LOCK TABLES $isearch_table_urls_new WRITE", $isearch_db);
              while ($item = mysql_fetch_object($result))
              {
                  if (!mysql_query("UPDATE $isearch_table_urls_new SET referrer_id='$item->temp_referrer_id' WHERE id='$item->id'", $isearch_db))
                  {
                      isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
                  }
              }
              mysql_query("UNLOCK TABLES", $isearch_db);
          }

          if ($isearch_config['test_mode'] == 0)
          {
              /* Swap the old and new tables. A single RENAME TABLE statement
               * rotates both pairs through temporary backup names.
               */
              $backup_words = $isearch_table_words . '_tmp_backup';
              $backup_urls = $isearch_table_urls . '_tmp_backup';
              if (!mysql_query("RENAME TABLE $isearch_table_words TO $backup_words, " .
                               "$isearch_table_words_new TO $isearch_table_words, " .
                               "$backup_words TO $isearch_table_words_new, " .
                               "$isearch_table_urls TO $backup_urls, " .
                               "$isearch_table_urls_new TO $isearch_table_urls, " .
                               "$backup_urls TO $isearch_table_urls_new", $isearch_db))
              {
                  isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
              }
          }

          /* Empty the new words table */
          if (!mysql_query("DELETE FROM $isearch_table_words_new", $isearch_db))
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }

          /* Empty the new urls table */
          if (!mysql_query("DELETE FROM $isearch_table_urls_new", $isearch_db))
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }

          /* Optimize the tables */
          if (!mysql_query("OPTIMIZE TABLE $isearch_table_urls, $isearch_table_words", $isearch_db))
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }

          return False;
      }

      $itemUrl = mysql_fetch_object($resultUrls);

      $allData = isearch_readFile($itemUrl->url);

      /* State written back for this URL unless a branch below overrides it */
      $newState = 'error';

      if ($allData == '')
      {
          isearch_log("ERROR: Unable to open URL [$itemUrl->url]", 1);
          $newState = 'notfound';
      }
      else
      {
          /* Size and signature are taken from the raw data, before any
           * PDF/Word conversion or whitespace compaction.
           */
          $size = strlen($allData);
          $sig = md5($allData);

          /* Look for a duplicate page */
          $resultSig = mysql_query("SELECT * FROM $isearch_table_urls_new WHERE sig='$sig' AND size='$size' AND NOT url='" . isearch_escape_string($itemUrl->url) . "'", $isearch_db);
          if (!$resultSig)
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }
          else if (mysql_num_rows($resultSig) > 0)
          {
              isearch_log("INFO: Duplicate URL - will not be processed [$itemUrl->url]", 4);
              $newState = 'duplicate';
          }
          else
          {
              /* No duplicates found. */
              isearch_log("INFO: Processing URL [$itemUrl->url]", 2);

              /* A document is converted when the URL ends in .doc/.pdf or
               * when the data itself starts with the "%PDF" signature.
               */
              if ((preg_match('%\.(doc|pdf)(\?.*|#.*)?$%i', $itemUrl->url, $matches)) ||
                  (preg_match('/^%(PDF)/i', $allData, $matches)))
              {
                  if (strtolower($matches[1]) == 'doc')
                  {
                      switch ($isearch_config['msword_support'])
                      {
                          case 0:
                              /* Do nothing */
                              isearch_log("WARNING: Word support disabled", 3);
                              break;
                          case 1:
                              $allData = isearch_execConvert($allData, 'msword', $itemUrl->url);
                              break;
                          case 2:
                              $allData = isearch_onlineConvert($allData, 'msword', $itemUrl->url);
                              break;
                          default:
                              isearch_log("ERROR: Illegal Word document support setting", 1);
                              break;
                      }
                  }
                  else
                  {
                      switch ($isearch_config['pdf_support'])
                      {
                          case 0:
                              /* Do nothing */
                              isearch_log("WARNING: PDF support disabled", 3);
                              break;
                          case 1:
                              $allData = isearch_execConvert($allData, 'pdf', $itemUrl->url);
                              break;
                          case 2:
                              $allData = isearch_onlineConvert($allData, 'pdf', $itemUrl->url);
                              break;
                          default:
                              isearch_log("ERROR: Illegal PDF support setting", 1);
                              break;
                      }
                  }
              }

              if (($isearch_config['keep_cache']) && ($isearch_config['test_mode'] == 0))
              {
                  $cache = $allData;
              }
              else
              {
                  $cache = '';
              }

              /* Strip out all control characters and replace with spaces.
               * Compact all white space into a single space character.
               */
              $allData = preg_replace("/\\s+/", ' ', $allData);

              /* NOTE(review): this test reads the key 'content_type' while the
               * charset detection further down reads 'content-type'. Confirm
               * against isearch_readFile() which key(s) it really fills in.
               */
              if (isset($isearch_header['content_type']) && ($isearch_header['content_type'] == 'text/plain'))
              {
                  $headData = '';
                  $bodyData = $allData;
              }
              else
              {
                  /* Split into head and body: first at </head>, otherwise at <body */
                  $tdata = spliti('</head[^>]*>', $allData, 2);
                  if (count($tdata) < 2)
                  {
                      $tdata = spliti('<body', $allData, 2);
                      if (count($tdata) < 2)
                      {
                          isearch_log('WARNING: <BODY> and </HEAD> tags not found', 4);
                          $headData = '';
                          $bodyData = $allData;
                      }
                      else
                      {
                          $headData = $tdata[0];
                          $bodyData = '<body' . $tdata[1];
                      }
                  }
                  else
                  {
                      $headData = $tdata[0];
                      $bodyData = $tdata[1];
                  }
              }

              /* Strip out HTML comments from head data */
              $headData = preg_replace('/<!--.+?-->/','',$headData);
              $headData = preg_replace("/\\s+/", ' ', $headData);

              /* Strip out all HTML tags except ones we are interested in */
              /* Includes workaround for PHP bug. See http://bugs.php.net/bug.php?id=21311 */
              $headData = strip_tags(eregi_replace("<\!DOCTYPE [^>]*>", '', $headData), '<meta><title><base>');

              $keyWords = '';
              $description = '';
              $title = '';
              $index = True;
              $follow = True;

              /* META tags are searched for in the whole document, not only the head */
              $matchCount = preg_match_all("#<META\\s+([^>]*?)\\s*>#i", $allData, $matches);
              for ($i = 0; $i < $matchCount; $i++)
              {
                  if (preg_match("#CONTENT\\s*=\\s*(['\"])(.*?)\\1#i", $matches[1][$i], $matchContent) == 1)
                  {
                      $metaContent = $matchContent[2];
                      if (preg_match("#NAME\\s*=\\s*['\"]?(.*?)['\"]?(\\s|$)#i", $matches[1][$i], $matchName) == 1)
                      {
                          // <META NAME="keywords" CONTENT="keyword list">
                          // <META NAME="description" CONTENT="description">
                          // <META NAME="robots" CONTENT="nofollow,noindex,noarchive">
                          $metaName = strtolower($matchName[1]);
                          if ($metaName == 'keywords')
                          {
                              $keyWords = ereg_replace(',', ' ', $metaContent);
                          }
                          else if ($metaName == 'description')
                          {
                              $description = $metaContent;
                          }
                          else if ($metaName == 'robots')
                          {
                              if (eregi('noindex', $metaContent))
                              {
                                  $index = False;
                              }
                              if (eregi('nofollow', $metaContent))
                              {
                                  $follow = False;
                              }
                              if (eregi('noarchive', $metaContent))
                              {
                                  $cache = '';
                              }
                          }
                      }
                      else if (preg_match("#HTTP-EQUIV\\s*=(['\"])(.*?)\\1#i", $matches[1][$i], $matchEquiv) == 1)
                      {
                          /* HTTP-EQUIV values are stored in (and may replace
                           * entries of) the global header array.
                           */
                          $isearch_header[strtolower($matchEquiv[2])] = $metaContent;
                      }
                  }
              }

              // Determine character set
              $charset = $isearch_config['char_set'];
              if ((isset($isearch_header['content-type'])) &&
                  (preg_match("#(^|\\s)\\s*CHARSET\\s*=\\s*['\"]?(.*?)['\"]?(\\s|$)#i", $isearch_header['content-type'], $matches) == 1))
              {
                  $charset = $matches[2];
              }

              $lastModified = 0;
              if (isset($isearch_header['last-modified']))
              {
                  $lastModified = isearch_parseHttpDate($isearch_header['last-modified']);
              }

              /* Stored with the URL row below; never changed in this function */
              $changefreq = '';
              $priority = -1;

              if ($follow)
              {
                  /* Restrict link harvesting to the marked "follow" regions, if any */
                  $tdata1 = spliti('(<!-- ISEARCH_BEGIN_FOLLOW -->|</nofollow>)', ' ' . $bodyData);
                  if (count($tdata1) > 1)
                  {
                      /* At least 1 found. */
                      $followData = '';

                      /* Check for an END_FOLLOW before the first BEGIN_FOLLOW */
                      $tdata2 = spliti('(<!-- ISEARCH_END_FOLLOW -->|<nofollow>)', $tdata1[0]);
                      if (count($tdata2) > 1)
                      {
                          /* An END_FOLLOW was found. Add anything before it into the follow data */
                          $followData .= $tdata2[0];
                      }

                      for ($i = 1; $i < count($tdata1); $i++)
                      {
                          $tdata2 = spliti('(<!-- ISEARCH_END_FOLLOW -->|<nofollow>)', $tdata1[$i]);
                          $followData .= $tdata2[0];
                      }
                  }
                  else
                  {
                      $followData = $bodyData;
                  }

                  $urls = array();

                  if ($isearch_config['follow_meta_refresh'] && isset($isearch_header['refresh']) &&
                      (preg_match("#; *URL *= *(.*)$#i", $isearch_header['refresh'], $matches) == 1))
                  {
                      $urls[] = $matches[1];
                  }

                  /* Do agressive link searching */
                  if ($isearch_config['aggressive_link_search'])
                  {
                      $matchCount = preg_match_all("~(https?|ftps?)://[^'\"\\s>]*~i", $allData, $matches);
                      for ($i = 0; $i < $matchCount; $i++)
                      {
                          $urls[] = $matches[0][$i];
                      }

                      /* Find any links with no quotes. The capture is greedy and
                       * excludes quote characters: a lazy quantifier at the end
                       * of the pattern would stop after a single character, and
                       * quoted values are handled by the patterns further down.
                       */
                      $matchCount = preg_match_all("~<(A|AREA)\\s+([^>]*?\\s+)*?HREF\\s*=\\s*([^\\s>'\"]+)~i", $followData, $matches);
                      for ($i = 0; $i < $matchCount; $i++)
                      {
                          $urls[] = $matches[3][$i];
                      }
                  }

                  /* Do JavaScript link searching */
                  if ($isearch_config['javascript_link_search'])
                  {
                      /* Search for window.open() calls, single quoted first... */
                      $matchCount = preg_match_all("~window.open\\s*\\(\\s*'(.+?)',~i", $allData, $matches);
                      for ($i = 0; $i < $matchCount; $i++)
                      {
                          if ($matches[1][$i] != '')
                          {
                              $urls[] = $matches[1][$i];
                          }
                      }

                      /* ...then double quoted */
                      $matchCount = preg_match_all("~window.open\\s*\\(\\s*\"(.+?)\",~i", $allData, $matches);
                      for ($i = 0; $i < $matchCount; $i++)
                      {
                          if ($matches[1][$i] != '')
                          {
                              $urls[] = $matches[1][$i];
                          }
                      }
                  }

                  /* Remove JavaScript and comments */
                  $followData = preg_replace('#(<SCRIPT[^>]*?>.*?</SCRIPT>)|(<!--.*?-->)#i', '', $followData);

                  /* Find any double quoted links */
                  $matchCount = preg_match_all("~<(A|AREA)\\s+([^>]*?\\s+)*?HREF\\s*=\\s*\"\\s*([^>]+?)\\s*\"[^>]*>~i", $followData, $matches);
                  for ($i = 0; $i < $matchCount; $i++)
                  {
                      if (!eregi('rel="nofollow"', $matches[0][$i]))
                      {
                          $urls[] = $matches[3][$i];
                      }
                  }

                  /* Find any single quoted links */
                  $matchCount = preg_match_all("~<(A|AREA)\\s+([^>]*?\\s+)*?HREF\\s*=\\s*'\\s*([^>]+?)\\s*'[^>]*>~i", $followData, $matches);
                  for ($i = 0; $i < $matchCount; $i++)
                  {
                      if (!eregi('rel="nofollow"', $matches[0][$i]))
                      {
                          $urls[] = $matches[3][$i];
                      }
                  }

                  if ($isearch_config['follow_frames'])
                  {
                      /* Search for frames, and add the referenced docs to the URL list */
                      $matchCount = preg_match_all("~<FRAME\\s+(.*?)>~i", $allData, $matches);
                      for ($i = 0; $i < $matchCount; $i++)
                      {
                          $matchCount2 = preg_match_all("~SRC\\s*=\\s*\"\\s*([^>]+?)\\s*\"~i", $matches[1][$i], $matches2);
                          for ($j = 0; $j < $matchCount2; $j++)
                          {
                              $urls[] = $matches2[1][$j];
                          }
                      }
                  }

                  /* Sorting puts identical URLs next to each other so that the
                   * comparison with the previous entry skips consecutive repeats.
                   */
                  $lastUrl = '';
                  sort($urls);
                  foreach ($urls as $url)
                  {
                      /* Drop the #fragment and encode spaces before comparing/adding */
                      $decodedUrl = str_replace(' ', '%20', ereg_replace('#.*$', '', urldecode(trim($url))));
                      if (($decodedUrl != $lastUrl) && (!eregi('^javascript:', $decodedUrl)))
                      {
                          isearch_addUrl(isearch_relativeToAbsoluteUrl($decodedUrl, $isearch_base), $itemUrl->id);
                          $lastUrl = $decodedUrl;
                      }
                  }
              }

              if ($isearch_config['test_mode'] == 1)
              {
                  /* Test mode: links were harvested above, nothing is indexed */
                  $newState = 'ok';
              }
              else if ($index)
              {
                  /* Find a "<TITLE></TITLE>" tag in the document head */
                  if (preg_match("#<TITLE>\\s*(.*?)\\s*</TITLE>#i", $headData, $matches) == 1)
                  {
                      $title = $matches[1];
                  }
                  else
                  {
                      $title = $itemUrl->url;
                  }

                  /* Replace breaking tags and other special chars with spaces */
                  $bodyData = eregi_replace('(<(hr|br|p|td|th|li)(>| [^>]*>))|(&(nb|em|en)sp;?)', ' ', $bodyData);

                  /* Replace dash entities with dashes */
                  $bodyData = eregi_replace('&([nm]dash|shy);?', '-', $bodyData);

                  /* Replace other entities */
                  $bodyData = eregi_replace('&[lr]squo;?', "'", $bodyData);
                  $bodyData = eregi_replace('&[lr]dquo;?', '"', $bodyData);

                  /* Restrict indexing to the marked "index" regions, if any */
                  $tdata1 = spliti('(<!-- ISEARCH_BEGIN_INDEX -->|</noindex>)', ' ' . $bodyData);
                  if (count($tdata1) > 1)
                  {
                      /* At least 1 found. */
                      $bodyData = '';

                      /* Check for an END_INDEX before the first BEGIN_INDEX */
                      $tdata2 = spliti('(<!-- ISEARCH_END_INDEX -->|<noindex>)', $tdata1[0], 2);
                      if (count($tdata2) > 1)
                      {
                          /* An END_INDEX was found. Add anything before it into the bodyData */
                          $bodyData .= $tdata2[0];
                      }

                      for ($i = 1; $i < count($tdata1); $i++)
                      {
                          $tdata2 = spliti('(<!-- ISEARCH_END_INDEX -->|<noindex>)', $tdata1[$i], 2);
                          $bodyData .= $tdata2[0];
                      }
                  }
                  else
                  {
                      $tdata2 = spliti('(<!-- ISEARCH_END_INDEX -->|<noindex>)', $tdata1[0], 2);
                      if (count($tdata2) > 1)
                      {
                          /* An END_INDEX was found. Add anything before it into the bodyData */
                          $bodyData = $tdata2[0];
                      }
                  }

                  /* Strip out JavaScript, HTML comments and embedded CSS
                   */
                  $bodyData = preg_replace('#(<SCRIPT[^>]*?>.*?</SCRIPT>)|(<!--.*?-->)|(<STYLE[^>]*?>.*?</STYLE>)#i', '', $bodyData);

                  /* Strip out all HTML tags except special ones */
                  $bodyData = strip_tags($bodyData, '<h1><h2><h3><h4><h5><img>');

                  if (! $isearch_config['ignore_image_alt_tags'])
                  {
                      /* Replace images with their alt text */
                      $bodyData = preg_replace('#<IMG\\s[^>]*?ALT\\s*=\\s*("|\')(.*?)\\1.*?>#i', ' \\2 ', $bodyData);
                  }

                  /* Collect the text of H1-H5 headings for the heading score.
                   * The opening tag may carry attributes (strip_tags() keeps
                   * them) and the closing tag ends with a single '>'.
                   */
                  $bigWords = '';
                  $matchCount = preg_match_all("~<H[1-5][^>]*>\\s*(.*?)\\s*</H[1-5]>~i", $bodyData, $matches);
                  for ($i = 0; $i < $matchCount; $i++)
                  {
                      $bigWords .= ' ' . $matches[1][$i];
                  }

                  /* Translate from HTML to ASCII */
                  $bodyData = isearch_html_entity_decode($bodyData, ENT_QUOTES, $isearch_config['char_set']);
                  $bigWords = isearch_html_entity_decode($bigWords, ENT_QUOTES, $isearch_config['char_set']);
                  $keyWords = isearch_html_entity_decode($keyWords, ENT_QUOTES, $isearch_config['char_set']);
                  $titleWords = isearch_html_entity_decode($title, ENT_QUOTES, $isearch_config['char_set']);

                  /* Keep stripped copy of document body */
                  $strippedBody = preg_replace("/\\s+/", ' ', strip_tags($bodyData));

                  /* Strip out unwanted characters from the strings that we
                   * search on
                   */
                  $bodyData = isearch_cleanString($bodyData, $charset);
                  $bigWords = isearch_cleanString($bigWords, $charset);
                  $keyWords = isearch_cleanString($keyWords, $charset);
                  $titleWords = isearch_cleanString($titleWords, $charset);

                  /* Turn the URL itself into a list of words */
                  $urlWords = ereg_replace('((https?://)|[+/\\\\\\.]|(%20))', ' ', $itemUrl->url);
                  $urlWords = isearch_cleanString($urlWords, $charset);

                  /* Accumulate a score per word over the five word sources:
                   * 0 = body, 1 = headings, 2 = meta keywords, 3 = title, 4 = URL.
                   */
                  $score = array();
                  for ($i = 0; $i < 5; $i++)
                  {
                      if ($i == 0)
                      {
                          $words = explode(' ', $bodyData);
                          $wordScore = $isearch_config['word_rank'];
                      }
                      else if ($i == 1)
                      {
                          $words = explode(' ', $bigWords);
                          $wordScore = $isearch_config['heading_rank'];
                      }
                      else if ($i == 2)
                      {
                          $words = explode(' ', $keyWords);
                          $wordScore = $isearch_config['keyword_rank'];
                      }
                      else if ($i == 3)
                      {
                          $words = explode(' ', $titleWords);
                          $wordScore = $isearch_config['title_rank'];
                      }
                      else if ($i == 4)
                      {
                          $words = explode(' ', $urlWords);
                          $wordScore = $isearch_config['url_rank'];
                      }

                      if ($wordScore == 0)
                      {
                          // Skip if wordScore is zero
                          continue;
                      }

                      foreach ($words as $word)
                      {
                          if (($word != '') && (!in_array($word, $isearch_config['stop_words'])) && (strlen($word) > $isearch_config['stop_words_length']))
                          {
                              if (isset($score[$word]))
                              {
                                  $score[$word] += $wordScore;
                              }
                              else
                              {
                                  $score[$word] = $wordScore;
                              }
                          }
                      }
                  }

                  if (count($score) == 0)
                  {
                      isearch_log('WARNING: No words were found', 4);
                  }
                  else
                  {
                      /* Store all word scores for this URL with one multi-row INSERT */
                      $query = "INSERT INTO $isearch_table_words_new (word, id, score) VALUES ";
                      $needComma = False;
                      $id = $itemUrl->id;
                      foreach (array_keys($score) as $word)
                      {
                          if ($needComma)
                          {
                              $query .= ',';
                          }
                          $needComma = True;
                          $query .= "('".isearch_escape_string($word)."', '$id', '".$score[$word]."')";
                      }

                      if (!mysql_query($query, $isearch_db))
                      {
                          isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
                      }
                  }

                  /* Update this URL's details in the database */
                  if (!mysql_query("UPDATE $isearch_table_urls_new SET " .
                                   "title='" . isearch_escape_string($title) . "', " .
                                   "description='" . isearch_escape_string($description) . "', " .
                                   "cache='" . isearch_escape_string($cache) . "', " .
                                   "stripped_body='" . isearch_escape_string($strippedBody) . "', " .
                                   "words=' " . isearch_escape_string($bodyData) . " ', " .
                                   "size='$size', " .
                                   "base='" . isearch_escape_string($isearch_base) . "', " .
                                   "priority='$priority', " .
                                   "changefreq='" . isearch_escape_string($changefreq) . "', " .
                                   "lastmod='$lastModified', " .
                                   "sig='" . isearch_escape_string($sig) . "' " .
                                   "WHERE id='" . $itemUrl->id . "'", $isearch_db))
                  {
                      isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
                  }

                  $newState = 'ok';
              }
              else /* if ($index) */
              {
                  $newState = 'noindex';
              }
          }
      }

      /* Record the outcome for this URL so that it is not picked up again */
      if (!mysql_query("UPDATE $isearch_table_urls_new SET state='$newState' WHERE id='" . $itemUrl->id . "'", $isearch_db))
      {
          isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
      }

      return True;
  }
  /* Glue the fragments from a parse_url together.
   *
   * $parsed  Array of URL components using the key names of parse_url():
   *          scheme, user, pass, host, port, path, query, fragment. Every key
   *          is optional; missing components are simply left out.
   *
   * Returns the reassembled URL string, or false if $parsed is not an array.
   */
  function glue_url($parsed)
  {
      if (! is_array($parsed))
      {
          return false;
      }

      $uri = '';

      if (isset($parsed['scheme']))
      {
          /* mailto URLs use a bare colon, everything else gets "://" */
          $sep = (strtolower($parsed['scheme']) == 'mailto' ? ':' : '://');
          $uri = $parsed['scheme'] . $sep;
      }

      /* Credentials. A hand-built component array may carry 'pass' without
       * 'user', so 'user' is never read without checking that it exists.
       */
      if (isset($parsed['pass']))
      {
          $user = isset($parsed['user']) ? $parsed['user'] : '';
          $uri .= $user . ':' . $parsed['pass'] . '@';
      }
      elseif (isset($parsed['user']))
      {
          $uri .= $parsed['user'] . '@';
      }

      if (isset($parsed['host']))
      {
          $uri .= $parsed['host'];
      }

      if (isset($parsed['port']))
      {
          $uri .= ':' . $parsed['port'];
      }

      if (isset($parsed['path']))
      {
          $uri .= $parsed['path'];
      }

      if (isset($parsed['query']))
      {
          $uri .= '?' . $parsed['query'];
      }

      if (isset($parsed['fragment']))
      {
          $uri .= '#' . $parsed['fragment'];
      }

      return $uri;
  }
  /* Add a URL to the search index.
   *
   * The URL is normalised (fragment removed, configured GET variables removed,
   * www handling, spaces in the path encoded, default file names stripped,
   * directory slash handling) and then filtered by file extension, the
   * allowed/excluded URL lists and the robots.txt excludes. A URL that passes
   * and is not yet known is inserted into the "new" URL table in state 'new'.
   * If it is already known and still 'new', only its temp_referrer_id is
   * updated.
   *
   * $absoluteUrl  Absolute URL of the candidate page.
   * $referrer_id  Id of the URL row on which the link was found.
   *
   * Returns nothing.
   */
  function isearch_addUrl($absoluteUrl, $referrer_id)
  {
      global $isearch_table_urls_new;
      global $isearch_db;
      global $isearch_config;

      if (eregi('^mailto:', $absoluteUrl))
      {
          isearch_log("INFO: Ignoring URL [$absoluteUrl]", 6);
          return;
      }

      isearch_log("INFO: Checking URL [$absoluteUrl]", 7);

      /* Split the absolute URL into component parts */
      $absoluteParts = @parse_url($absoluteUrl);
      if ((!isset($absoluteParts['scheme'])) || (!isset($absoluteParts['host'])))
      {
          /* Unable to parse URL */
          isearch_log("INFO: Unable to parse absoluteUrl [$absoluteUrl]", 4);
          return;
      }

      $absoluteParts['scheme'] = strtolower($absoluteParts['scheme']);
      if (!ereg('^(https?|ftps?)$', $absoluteParts['scheme']))
      {
          isearch_log("INFO: Rejecting unsupported URL format [$absoluteUrl]", 6);
          return;
      }

      /* Remove any anchor reference in the URL */
      unset($absoluteParts['fragment']);

      /* Split the GET variables and remove unnecessary ones */
      if ((count($isearch_config['remove_get_vars']) == 1) && ($isearch_config['remove_get_vars'][0] == '*'))
      {
          /* Strip all GET variables */
          unset($absoluteParts['query']);
      }
      else if (isset($absoluteParts['query']))
      {
          $queryParts = explode('&', $absoluteParts['query']);
          foreach (array_keys($queryParts) as $key)
          {
              /* The variable name is everything before the first '=' */
              $varName = eregi_replace('=.*$', '', $queryParts[$key]);
              if (in_array($varName, $isearch_config['remove_get_vars']))
              {
                  unset($queryParts[$key]);
              }
          }
          $absoluteParts['query'] = implode('&', $queryParts);
          if ($absoluteParts['query'] == '')
          {
              unset($absoluteParts['query']);
          }
      }

      /* Handle the www subdomain according to the value of $isearch_config['www_option'].
       * 1 = Leave as is
       * 2 = Strip www subdomains
       * 3 = Add www subdomain
       */
      if ($isearch_config['www_option'] == 2)
      {
          $absoluteParts['host'] = eregi_replace('^www\.', '', $absoluteParts['host']);
      }
      else if ($isearch_config['www_option'] == 3)
      {
          if (!eregi('^www\.', $absoluteParts['host']))
          {
              $absoluteParts['host'] = 'www.' . $absoluteParts['host'];
          }
      }

      /* Replace space characters with %20 */
      if (isset($absoluteParts['path']))
      {
          $absoluteParts['path'] = str_replace(' ', '%20', $absoluteParts['path']);
      }

      /* Glue URL parts together again */
      $absoluteUrl = glue_url($absoluteParts);

      /* Work out the file name (last path segment) and its lower-cased extension */
      if (isset($absoluteParts['path']))
      {
          $fileName = eregi_replace('.*/', '', $absoluteParts['path']);
          $temp = explode('.', $fileName);
          if (count($temp) < 2)
          {
              $fileExtension = '';
          }
          else
          {
              $fileExtension = strtolower($temp[count($temp)-1]);
          }
      }
      else
      {
          $fileExtension = '';
          $fileName = '/';
      }

      isearch_log("INFO: Absolute URL [$absoluteUrl]", 6);

      $allowed = False;

      if ($fileExtension == '')
      {
          /* Treat no file extension as a directory. */
          if ($isearch_config['directory_handling'] != 0)
          {
              /* Directories are allowed */
              $allowed = True;

              if ($isearch_config['directory_handling'] == 2)
              {
                  /* Add trailing slash to directories */
                  $absoluteUrl = ereg_replace('([^/])$', '\1/', $absoluteUrl);
              }
              else if ($isearch_config['directory_handling'] == 3)
              {
                  /* Strip trailing slash from directories */
                  $absoluteUrl = ereg_replace('/$', '', $absoluteUrl);
              }
          }
      }
      else
      {
          /* Check whether this is a default file name, and strip it if so. */
          foreach ($isearch_config['strip_defaults'] as $item)
          {
              if (($item != '') && ($item == $fileName))
              {
                  $absoluteUrl = str_replace("/$item", '/', $absoluteUrl);
                  $allowed = True;
                  break;
              }
          }

          /* Check whether this file extension is allowed. Always allow PDF files
           * when PDF is enabled and DOC files when Word is enabled.
           */
          if ((in_array($fileExtension, $isearch_config['allowed_ext'])) ||
              (($isearch_config['pdf_support'] != 0) && ($fileExtension == 'pdf')) ||
              (($isearch_config['msword_support'] != 0) && ($fileExtension == 'doc')))
          {
              $allowed = True;
          }
      }

      if (!$allowed)
      {
          isearch_log("INFO: Rejecting URL - it is not in allowed file extensions [$absoluteUrl]", 5);
      }
      else
      {
          /* Check whether it's already in the table of known URLs */
          $result = mysql_query("SELECT state, id FROM $isearch_table_urls_new WHERE url='".isearch_escape_string($absoluteUrl)."'", $isearch_db);
          if (!$result)
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }
          else if (mysql_num_rows($result) > 0)
          {
              /* Already in database */
              isearch_log("INFO: URL allowed, but already in database [$absoluteUrl]", 6);
              if ($item = mysql_fetch_object($result))
              {
                  /* Still waiting to be spidered: record the latest referrer,
                   * unless the page merely links to itself.
                   */
                  if (($item->state == 'new') && ($item->id != $referrer_id))
                  {
                      if (!mysql_query("UPDATE $isearch_table_urls_new SET temp_referrer_id='$referrer_id' WHERE url='".isearch_escape_string($absoluteUrl)."'", $isearch_db))
                      {
                          isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
                      }
                  }
              }
              return;
          }

          /* From here on the URL has to earn its place via the allowed lists */
          $allowed = False;

          /* Check that this URL is in the allowed list */
          foreach (isearch_expandList($isearch_config['allowed_urls_beginning']) as $item)
          {
              if (($item != '') && ($item == substr($absoluteUrl, 0, strlen($item))))
              {
                  $allowed = True;
                  break;
              }
          }

          /* Entries of allowed_urls are used as case-insensitive regular expressions */
          foreach (isearch_expandList($isearch_config['allowed_urls']) as $item)
          {
              if (($item != '') && (eregi($item, $absoluteUrl)))
              {
                  $allowed = True;
                  break;
              }
          }

          if (!$allowed)
          {
              isearch_log("INFO: Rejecting URL - it is not in allowed URL list [$absoluteUrl]", 5);
          }
          else
          {
              /* isearch_parseRobots() records the domain exactly as it is
               * passed in (host plus optional ":port"), so the "already
               * parsed" lookup has to use that same host:port form.
               */
              $host = $absoluteParts['host'];
              if (isset($absoluteParts['port']))
              {
                  $host .= ':'.$absoluteParts['port'];
              }

              if (($absoluteParts['scheme'] == 'http') && (!in_array($host, $isearch_config['robots_domains'])))
              {
                  /* Parse the robots.txt for this domain */
                  isearch_parseRobots($host);
              }

              /* Check that this URL is not in the disallowed list */
              foreach (isearch_expandList($isearch_config['exclude_urls_beginning']) as $item)
              {
                  if (($item != '') && ($item == substr($absoluteUrl, 0, strlen($item))))
                  {
                      $allowed = False;
                      isearch_log("INFO: Rejecting URL - it is in disallowed URL beginning list [$absoluteUrl]", 5);
                      break;
                  }
              }

              if ($allowed)
              {
                  foreach (isearch_expandList($isearch_config['exclude_urls']) as $item)
                  {
                      if (($item != '') && (eregi($item, $absoluteUrl)))
                      {
                          $allowed = False;
                          isearch_log("INFO: Rejecting URL - it is in disallowed URL regexp list [$absoluteUrl]", 5);
                          break;
                      }
                  }
              }

              if ($allowed)
              {
                  /* Check the robots.txt excludes list */
                  foreach ($isearch_config['robots_excludes'] as $item)
                  {
                      if (($item != '') && (eregi($item, $absoluteUrl)))
                      {
                          $allowed = False;
                          isearch_log("INFO: Rejecting URL - it is disallowed by robots.txt [$absoluteUrl]", 5);
                          break;
                      }
                  }
              }
          }
      }

      if ($allowed)
      {
          /* Add it */
          if (!mysql_query("INSERT INTO $isearch_table_urls_new (url, temp_referrer_id, state) VALUES ('".isearch_escape_string($absoluteUrl)."', '$referrer_id', 'new')", $isearch_db))
          {
              isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
          }
          else
          {
              /* Only report the URL as added when the INSERT really succeeded */
              isearch_log("INFO: URL allowed, and added [$absoluteUrl]", 4);
          }
      }
  }
  /* Expand a list of URLs. If an entry begins with an "@" it is treated as a
   * filename containing more URLs.
   *
   * Each line of such a file is trimmed and expanded in turn, so a file may
   * itself contain "@file" entries. All other entries (including empty ones)
   * are copied to the result unchanged and in order.
   *
   * $list   Array of entries to expand.
   * $depth  Current nesting depth; used internally to stop runaway recursion.
   *
   * Returns the flat array of expanded entries. Once $depth exceeds 20 an
   * error is logged and an empty array is returned for that level.
   */
  function isearch_expandList($list, $depth=0)
  {
      $items = array();

      if ($depth > 20)
      {
          isearch_log('ERROR: Too much recursion in isearch_expandList', 1);
          return $items;
      }

      foreach ($list as $item)
      {
          /* strncmp() is safe on empty entries (e.g. blank lines in a list
           * file) and does not need the curly-brace string offset syntax.
           */
          if (strncmp($item, '@', 1) != 0)
          {
              $items[] = $item;
              continue;
          }

          $fileName = substr($item, 1);

          /* A bare "@" names no file at all; treat it like an unreadable one */
          $lines = ($fileName != '') ? @file($fileName) : false;
          if (!is_array($lines))
          {
              /* NOTE(review): called without a log level, unlike the other
               * isearch_log() calls - confirm the level parameter is optional.
               */
              isearch_log('WARNING: unable to read file : ' . $fileName);
              continue;
          }

          /* Strip line endings and surrounding blanks, then expand recursively */
          $newList = array_map('trim', $lines);
          $items = array_merge($items, isearch_expandList($newList, $depth + 1));
      }

      return $items;
  }
  1807. /* Reset the search index to allow site to be re-spidered */
  1808. function isearch_reset()
  1809. {
  1810. global $isearch_table_urls_new, $isearch_table_info, $isearch_table_words_new;
  1811. global $isearch_db;
  1812. global $isearch_config;
  1813. /* Clear the spider log */
  1814. isearch_clearLog();
  1815. isearch_log('INFO: Starting spidering ' . date('dS F Y h:i:s A'), 3);
  1816. /* Delete all entries from the new databases */
  1817. if (!mysql_query("DELETE FROM $isearch_table_urls_new", $isearch_db))
  1818. {
  1819. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1820. }
  1821. if (!mysql_query("DELETE FROM $isearch_table_words_new", $isearch_db))
  1822. {
  1823. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1824. }
  1825. $urls = isearch_expandList($isearch_config['start_urls']);
  1826. /* Add all start URLs to database */
  1827. foreach ($urls as $url)
  1828. {
  1829. /* Find whether this URL refers to a directory */
  1830. $urlParts = @parse_url($url);
  1831. if ((!isset($urlParts['scheme'])) || (!isset($urlParts['host'])))
  1832. {
  1833. /* Unable to parse URL */
  1834. isearch_log("ERROR: Unable to parse start URL [$url]", 1);
  1835. }
  1836. else
  1837. {
  1838. $filePath = isset($urlParts['path']) ? $urlParts['path'] : '';
  1839. $fileName = eregi_replace('.*/', '', $filePath);
  1840. $temp = explode('.', $fileName);
  1841. if (count($temp) < 2)
  1842. {
  1843. $fileExtension = '';
  1844. }
  1845. else
  1846. {
  1847. $fileExtension = $temp[count($temp)-1];
  1848. }
  1849. if ($fileExtension == '')
  1850. {
  1851. /* Treat no file extension as a directory. Check whether there is a
  1852. * trailing slash on the URL and add it if necessary.
  1853. */
  1854. if ($isearch_config['directory_handling'] == 2)
  1855. {
  1856. /* Add trailing slash to directories */
  1857. $url = ereg_replace('([^/])$', '\1/', $url);
  1858. }
  1859. else if ($isearch_config['directory_handling'] == 3)
  1860. {
  1861. /* Strip trailing slash from directories */
  1862. $url = ereg_replace('/$', '', $url);
  1863. }
  1864. }
  1865. if (!mysql_query("INSERT INTO $isearch_table_urls_new (url, temp_referrer_id, state) VALUES ('".isearch_escape_string($url)."', '-1', 'new')", $isearch_db))
  1866. {
  1867. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1868. }
  1869. isearch_log("INFO: Added start URL [$url]", 5);
  1870. }
  1871. }
  1872. /* Reset the robots.txt exclude list */
  1873. $isearch_config['robots_domains'] = array();
  1874. $isearch_config['robots_excludes'] = array();
  1875. if (!mysql_query("UPDATE $isearch_table_info SET robots_domains='', robots_excludes='' WHERE id='1'", $isearch_db))
  1876. {
  1877. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1878. }
  1879. /* Clear the last_update time to indicate that we are currently spidering */
  1880. if (!mysql_query("UPDATE $isearch_table_info SET last_update='0' WHERE id='1'", $isearch_db))
  1881. {
  1882. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1883. }
  1884. }
  1885. /* Get the number of URLs that are in the database with the specified state (or
  1886. * all if state is empty).
  1887. */
  1888. function isearch_getUrlCount($new = False, $state = '')
  1889. {
  1890. global $isearch_table_urls, $isearch_table_urls_new;
  1891. global $isearch_db;
  1892. $count = 0;
  1893. $query = 'SELECT COUNT(*) FROM ' . ($new ? $isearch_table_urls_new : $isearch_table_urls);
  1894. if ($state != '')
  1895. {
  1896. $query .= " WHERE state='$state'";
  1897. }
  1898. $result = mysql_query($query, $isearch_db);
  1899. if (!$result)
  1900. {
  1901. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1902. }
  1903. else
  1904. {
  1905. $count = mysql_result($result, 0, 0);
  1906. }
  1907. return $count;
  1908. }
  1909. /* Copy the current spidered tables to the search tables. This makes them
  1910. * searchable.
  1911. */
  1912. function isearch_copyUrlTables($toNew = False)
  1913. {
  1914. global $isearch_table_urls, $isearch_table_urls_new;
  1915. global $isearch_table_words, $isearch_table_words_new;
  1916. global $isearch_db;
  1917. if ($toNew)
  1918. {
  1919. mysql_query("LOCK TABLES $isearch_table_urls_new WRITE, $isearch_table_words_new WRITE, $isearch_table_urls READ, $isearch_table_words READ", $isearch_db);
  1920. if (!mysql_query("DELETE FROM $isearch_table_urls_new", $isearch_db))
  1921. {
  1922. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1923. }
  1924. if (!mysql_query("DELETE FROM $isearch_table_words_new", $isearch_db))
  1925. {
  1926. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1927. }
  1928. if (!mysql_query("INSERT INTO $isearch_table_urls_new SELECT * FROM $isearch_table_urls", $isearch_db))
  1929. {
  1930. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1931. }
  1932. if (!mysql_query("INSERT INTO $isearch_table_words_new SELECT * FROM $isearch_table_words", $isearch_db))
  1933. {
  1934. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1935. }
  1936. }
  1937. else
  1938. {
  1939. mysql_query("LOCK TABLES $isearch_table_urls WRITE, $isearch_table_words WRITE, $isearch_table_urls_new READ, $isearch_table_words_new READ", $isearch_db);
  1940. if (!mysql_query("DELETE FROM $isearch_table_urls", $isearch_db))
  1941. {
  1942. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1943. }
  1944. if (!mysql_query("DELETE FROM $isearch_table_words", $isearch_db))
  1945. {
  1946. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1947. }
  1948. if (!mysql_query("INSERT INTO $isearch_table_urls SELECT * FROM $isearch_table_urls_new WHERE state='ok'", $isearch_db))
  1949. {
  1950. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1951. }
  1952. if (!mysql_query("INSERT INTO $isearch_table_words SELECT * FROM $isearch_table_words_new", $isearch_db))
  1953. {
  1954. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1955. }
  1956. }
  1957. mysql_query("UNLOCK TABLES", $isearch_db);
  1958. }
  1959. $isearch_url_fopen_detected = (bool) ini_get('allow_url_fopen');
  1960. $isearch_curl_detected = (function_exists('curl_init')) ? True : False;
  1961. $isearch_sockets_detected = (function_exists('fsockopen')) ? True : False;
  1962. ?>