PageRenderTime 56ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/search/inc/spider.inc.php

https://bitbucket.org/molusc/sma-website
PHP | 2239 lines | 1773 code | 272 blank | 194 comment | 438 complexity | 8db91d84a3e69e7c0759b1218c936bc1 MD5 | raw file
Possible License(s): BSD-3-Clause

Large files are truncated, but you can click here to view the full file

  1. <?php
  2. /******************************************************************************
  3. * iSearch2 - website search engine *
  4. * *
  5. * Visit the iSearch homepage at http://www.iSearchTheNet/isearch *
  6. * *
  7. * Copyright (C) 2002-2007 Z-Host. All rights reserved. *
  8. * *
  9. ******************************************************************************/
  /* Refuse to run unless an iSearch entry point has defined IN_ISEARCH. */
  if (defined('IN_ISEARCH') === false)
  {
      exit('Hacking attempt');
  }
  /**
   * Fetch and parse robots.txt for a domain and persist the resulting rules.
   *
   * The domain is appended to $isearch_config['robots_domains'] and every
   * Disallow rule that applies to the "isearch" user agent (or to "*" when no
   * earlier record matched) is converted to a regular expression of the form
   * "^http://<domain><path>" and appended to $isearch_config['robots_excludes'].
   * Both lists are then written, space separated, to the info table (row id 1).
   *
   * Field names and user-agent tokens are compared case-insensitively; the
   * Disallow paths keep their original case because URL paths are case
   * sensitive.
   *
   * @param string $domain Host name (without scheme) whose robots.txt is read.
   * @return void
   */
  function isearch_parseRobots($domain)
  {
      global $isearch_config;
      global $isearch_table_info;
      global $isearch_db;
      global $isearch_base;

      $isearch_config['robots_domains'][] = $domain;

      /* Make sure the list exists so the implode() below never sees null. */
      if (!isset($isearch_config['robots_excludes']))
      {
          $isearch_config['robots_excludes'] = array();
      }

      /* isearch_readFile() overwrites $isearch_base. Store it in a temp
       * variable so that other relative URLs can still be evaluated after
       * parsing robots.txt. */
      $savedBase = $isearch_base;
      $allData = isearch_readFile("http://$domain/robots.txt");
      $isearch_base = $savedBase;

      if ($allData != '')
      {
          $disallow = array();
          $validUseragent = false;
          $matched = false;

          foreach (preg_split('/[\r\n]/', $allData) as $line)
          {
              /* Strip comments and collapse runs of white space. */
              $line = preg_replace('/#.*$/', '', $line);
              $line = preg_replace('/[[:space:]]+/', ' ', $line);

              $temp = explode(':', $line, 2);
              if (count($temp) != 2)
              {
                  continue;
              }

              $field = strtolower(trim($temp[0]));
              $value = trim($temp[1]);

              if ($field == 'user-agent')
              {
                  $validUseragent = false;
                  foreach (explode(' ', strtolower($value)) as $useragent)
                  {
                      if (($useragent == 'isearch') || (($useragent == '*') && (!$matched)))
                      {
                          $matched = true;
                          $validUseragent = true;
                      }
                  }
              }
              else if (($validUseragent) && ($field == 'disallow'))
              {
                  if ($value == '')
                  {
                      /* This is an allow - remove all previous disallows */
                      $disallow = array();
                  }
                  else
                  {
                      $disallow[] = $value;
                  }
              }
          }

          foreach ($disallow as $path)
          {
              if ($path[0] != '/')
              {
                  $path = '/' . $path;
              }

              /* Escape the regex metacharacters the original handled and turn
               * the robots.txt wildcard "*" into ".*". strtr() works in a
               * single pass, so the "." introduced by ".*" is not re-escaped. */
              $isearch_config['robots_excludes'][] = strtr(
                  "^http://$domain$path",
                  array('.' => '\.', '*' => '.*', '?' => '\?', '+' => '\+')
              );
          }
      }

      if (!mysql_query("UPDATE $isearch_table_info SET robots_domains='" . isearch_escape_string(implode(" ", $isearch_config['robots_domains'])) . "', robots_excludes='" . isearch_escape_string(implode(" ", $isearch_config['robots_excludes'])) . "' WHERE id='1'", $isearch_db))
      {
          isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
      }
  }
  /**
   * Empty the spider log by deleting every row of the spider log table.
   *
   * @return void
   */
  function isearch_clearLog()
  {
      global $isearch_db, $isearch_table_spider_log;

      $sql = "DELETE FROM $isearch_table_spider_log";
      mysql_query($sql, $isearch_db);
  }
  /**
   * Build an HTML rendering of the spider log.
   *
   * Rows are read in id order; each message is entity-encoded, its spaces are
   * turned into "&nbsp;" and it is terminated with "<BR>" plus a newline.
   *
   * @return string The concatenated log, or '' when the query fails or the
   *                table is empty.
   */
  function isearch_getLog()
  {
      global $isearch_db, $isearch_table_spider_log;

      $rows = mysql_query("SELECT * FROM $isearch_table_spider_log ORDER BY id", $isearch_db);
      if (!$rows)
      {
          return '';
      }

      $html = array();
      while ($entry = mysql_fetch_object($rows))
      {
          $encoded = htmlentities($entry->msg);
          $html[] = str_replace(' ', '&nbsp;', $encoded) . "<BR>\n";
      }

      return implode('', $html);
  }
  /**
   * Record a message in the spider log table and optionally echo it.
   *
   * The message is stored when $level does not exceed the configured
   * 'log_level'. It is echoed as plain text when running from the command
   * line (threshold $isearch_logEchoLevel) and as HTML otherwise (threshold
   * 'log_echo_level').
   *
   * @param string $string Message text.
   * @param int    $level  Verbosity level of the message; lower is more important.
   * @return void
   */
  function isearch_log($string, $level=1)
  {
      global $isearch_table_spider_log;
      global $isearch_db;
      global $isearch_config;
      global $isearch_fromCommandLine;
      global $isearch_logEchoLevel;

      if ($isearch_config['log_level'] >= $level)
      {
          $escaped = isearch_escape_string($string);
          mysql_query("INSERT INTO $isearch_table_spider_log (msg) VALUES ('" . $escaped . "')", $isearch_db);
      }

      $onCommandLine = isset($isearch_fromCommandLine) && $isearch_fromCommandLine;
      if ($onCommandLine)
      {
          /* Console output: raw text, one message per line. */
          if ($isearch_logEchoLevel >= $level)
          {
              echo "$string\n";
          }
          return;
      }

      /* Browser output: entity-encode and keep spacing visible. */
      if ($isearch_config['log_echo_level'] >= $level)
      {
          $encoded = htmlentities($string);
          echo str_replace(' ', '&nbsp;', $encoded) . "<BR>\n";
      }
  }
  /**
   * Clean up a string to make it suitable for storing in the search index.
   *
   * Steps, in order:
   *  1. In 8-bit mode, lower-case A-Z and the single-byte accented capitals.
   *  2. Strip HTML tags.
   *  3. Replace backslash, ';', '?' and '!' with spaces.
   *  4. Apply the allow_dashes / allow_colons / allow_dots / allow_commas /
   *     allow_underscores settings: 0 = replace with a space, 1 = keep only
   *     inside words, 3 = remove; any other value leaves the character alone.
   *  5. In 8-bit mode drop every byte outside the permitted set; otherwise
   *     drop single quotes and backslashes.
   *  6. Convert from $charset to the configured 'char_set' with iconv when
   *     both are set and differ.
   *  7. Collapse white space and trim.
   *
   * @param string $data    Raw text (may contain HTML).
   * @param string $charset Character set of $data; '' disables conversion.
   * @return string The cleaned text.
   */
  function isearch_cleanString($data, $charset)
  {
      global $isearch_config;
      static $upperBytes = null;
      static $lowerBytes = null;

      if ($isearch_config['char_set_8_bit'])
      {
          if ($upperBytes === null)
          {
              /* Build the case table from byte codes so it does not depend on
               * the encoding this source file happens to be saved in. Bytes
               * 0xC0-0xDE map to 0xE0-0xFE (this includes 0xD7 -> 0xF7, as the
               * original table did), plus 0x8A -> 0x9A and 0x8E -> 0x9E. */
              $upperBytes = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
              $lowerBytes = 'abcdefghijklmnopqrstuvwxyz';
              for ($byte = 0xc0; $byte <= 0xde; $byte++)
              {
                  $upperBytes .= chr($byte);
                  $lowerBytes .= chr($byte + 0x20);
              }
              $upperBytes .= chr(0x8a) . chr(0x8e);
              $lowerBytes .= chr(0x9a) . chr(0x9e);
          }

          /* Convert to lower case, doing accented character conversion correctly */
          $data = strtr($data, $upperBytes, $lowerBytes);
      }

      /* Strip out all HTML tags */
      $data = strip_tags($data);

      /* Replace some breaking chars (backslash ; ? !) with spaces */
      $data = preg_replace('/[\\\\;?!]+/', ' ', $data);

      /* Per-character punctuation policy, applied in the original order. */
      $punctuation = array(
          'allow_dashes'      => '-',
          'allow_colons'      => ':',
          'allow_dots'        => '.',
          'allow_commas'      => ',',
          'allow_underscores' => '_',
      );
      foreach ($punctuation as $setting => $char)
      {
          $mode = $isearch_config[$setting];
          if ($mode == 0)
          {
              /* Replace with spaces */
              $data = str_replace($char, ' ', $data);
          }
          else if ($mode == 1)
          {
              /* Allow within words: drop it when it touches a space */
              $data = str_replace(' ' . $char, ' ', str_replace($char . ' ', ' ', $data));
          }
          else if ($mode == 3)
          {
              /* Remove all */
              $data = str_replace($char, '', $data);
          }
      }

      if ($isearch_config['char_set_8_bit'])
      {
          /* Strip out all characters except whitespace numeric and alpha */
          $data = preg_replace('/([^-@0-9a-z:\\.,' . chr(0xbf) . '-' . chr(0xff) . chr(0x9a) . chr(0x9e) . '\s])/', '', $data);
      }
      else
      {
          /* Remove any single quotes and backslashes. */
          $data = str_replace(array("'", '\\'), '', $data);
      }

      /* Convert from source charset to charset used on results page */
      if ((strtolower($charset) != $isearch_config['char_set']) &&
          ($charset != '') &&
          ($isearch_config['char_set'] != ''))
      {
          if (function_exists('iconv'))
          {
              isearch_log("INFO: Converting $charset -> " . $isearch_config['char_set'], 5);
              $convertedData = iconv($charset, $isearch_config['char_set'], $data);
              if ($convertedData === false)
              {
                  isearch_log("WARNING: Unable to convert $charset -> " . $isearch_config['char_set'], 3);
              }
              else
              {
                  $data = $convertedData;
              }
          }
          else
          {
              isearch_log("WARNING: iconv not installed - unable to convert $charset -> " . $isearch_config['char_set'], 5);
          }
      }

      /* Compact all white space into a single space character */
      $data = preg_replace("/\\s+/", ' ', $data);

      /* Strip white space from beginning and end of the string */
      return trim($data);
  }
  /**
   * Read up to $length bytes from an open stream in chunks of at most 16 KiB.
   *
   * Reading stops at end of file, once $length bytes have been read, or when
   * fread() reports a failure. A warning is logged when data remains unread
   * after the limit is hit.
   *
   * @param resource $handle Open, readable stream.
   * @param int      $length Maximum number of bytes to read.
   * @return string The bytes read (possibly '').
   */
  function isearch_fread($handle, $length = 2147483647)
  {
      $remaining = $length;
      $contents = '';

      while ((!feof($handle)) && ($remaining > 0))
      {
          $chunk = fread($handle, min(16384, $remaining));
          if ($chunk === false)
          {
              /* A failed read never advances $remaining and may never set
               * EOF; bail out instead of looping forever. */
              isearch_log('WARNING: File reading failed after ' . strlen($contents) . ' bytes', 3);
              return $contents;
          }

          $remaining -= strlen($chunk);
          $contents .= $chunk;
      }

      if (!feof($handle))
      {
          isearch_log('WARNING: File reading was truncated at '.($length/1024).' kbytes', 3);
      }

      return $contents;
  }
  /**
   * Resolve a (possibly relative) URL against a base URL.
   *
   * - A URL that already starts with "<letters>:" is returned unchanged.
   * - A protocol-relative URL ("//host/path") inherits the base URL's scheme.
   * - A URL starting with "/" replaces the base URL's path.
   * - Anything else is appended to the base URL's directory. The last path
   *   segment of the base is treated as a file name (and dropped) only when
   *   it contains a dot. "." and ".." segments and repeated slashes are then
   *   normalised.
   *
   * The query and fragment of the base URL are always discarded.
   *
   * @param string $newUrl        URL to resolve.
   * @param string $relativeToUrl Absolute base URL (must have scheme and host).
   * @return string|null The absolute URL, or null when the base URL cannot be
   *                     parsed (a warning is logged).
   */
  function isearch_relativeToAbsoluteUrl($newUrl, $relativeToUrl)
  {
      /* Already absolute */
      if (preg_match('/^[a-z]+:/i', $newUrl))
      {
          return $newUrl;
      }

      $relativeParts = @parse_url($relativeToUrl);
      if ((!is_array($relativeParts)) || (!isset($relativeParts['scheme'])) || (!isset($relativeParts['host'])))
      {
          /* Unable to parse relativeToUrl */
          isearch_log("WARNING: Unable to parse relativeToUrl [$relativeToUrl]", 3);
          return null;
      }

      if (strncmp($newUrl, '//', 2) == 0)
      {
          /* Protocol-relative reference: only the scheme comes from the base */
          return $relativeParts['scheme'] . ':' . $newUrl;
      }

      unset($relativeParts['query']);
      unset($relativeParts['fragment']);

      if (strncmp($newUrl, '/', 1) == 0)
      {
          /* New URL begins with a slash. It is within the site */
          unset($relativeParts['path']);
          return glue_url($relativeParts) . $newUrl;
      }

      /* A relative reference (must be within this site) */
      if (isset($relativeParts['path']))
      {
          /* Remove filename following the last slash */
          $path = preg_replace('#/[^/]*\.[^/]*$#D', '/', $relativeParts['path']);
          $path .= (substr($path, -1) == '/') ? $newUrl : '/' . $newUrl;
      }
      else
      {
          $path = '/' . $newUrl;
      }

      $path = preg_replace('#/\.$#D', '', $path);     /* Remove ending "/." */
      $path = preg_replace('#/(\./)+#', '/', $path);  /* Remove any "." references */
      $path = preg_replace('#/+/#', '/', $path);      /* Remove excess slashes */

      /* Resolve any ".." references. A ".." removes the segment before it,
       * except when only the leading (empty) segment is left, in which case
       * the ".." is kept as-is. */
      $resolved = array();
      foreach (explode('/', $path) as $segment)
      {
          if (($segment == '..') && (count($resolved) > 1))
          {
              array_pop($resolved);
          }
          else
          {
              $resolved[] = $segment;
          }
      }

      $relativeParts['path'] = implode('/', $resolved);
      return glue_url($relativeParts);
  }
  /**
   * Fetch a URL and return the document body with inline frames (and,
   * optionally, external scripts) expanded in place.
   *
   * The transport is chosen from $isearch_config['reading_mechanism']:
   * 0 = autodetect, 1 = fopen wrappers, 2 = raw sockets, anything else = CURL.
   * With sockets or CURL, redirects (301/302/303/307/308) are followed
   * manually, at most nine times.
   *
   * Side effects: sets the globals $isearch_header (lower-cased response
   * header names => values) and $isearch_base (URL that relative links in
   * the document should be resolved against: Content-Location / Location /
   * final URL, overridden by a <BASE HREF> tag).
   *
   * @param string $url   Absolute URL to read.
   * @param int    $depth Current inline-frame nesting depth; reading is
   *                      refused at depth 10.
   * @return string The document body, or '' on any failure.
   */
  function isearch_readFile($url, $depth=0)
  {
      global $isearch_config;
      global $isearch_version;
      global $isearch_header;
      global $isearch_base;
      global $isearch_url_fopen_detected, $isearch_curl_detected, $isearch_sockets_detected;

      $isearch_header = array();

      isearch_log("TRACE: isearch_readFile($url, $depth)", 10);

      if ($depth >= 10)
      {
          /* Inline frame depth of 10 */
          isearch_log("WARNING: Inline frame depth limit $depth exceeded", 3);
          return '';
      }

      if ($isearch_config['url_search'] != '')
      {
          /* The configured pattern was written for ereg_replace(); wrap it in
           * a delimiter that cannot occur in a sane pattern so it can be run
           * through PCRE unchanged.
           * NOTE(review): "$n" in url_replace is now also a back-reference. */
          $pattern = "\x01" . $isearch_config['url_search'] . "\x01";
          $rewritten = @preg_replace($pattern, $isearch_config['url_replace'], $url);
          if (is_string($rewritten))
          {
              $url = $rewritten;
              isearch_log("INFO: Using replaced URL $url", 5);
          }
          else
          {
              isearch_log("WARNING: Invalid url_search pattern [" . $isearch_config['url_search'] . "]", 3);
          }
      }

      if ($isearch_config['reading_mechanism'] == 0)
      {
          /* Autodetect */
          if (($isearch_url_fopen_detected) && (!$isearch_config['proxy_enable']))
          {
              $reading_mechanism = 1; /* fopen */
          }
          else if (($isearch_sockets_detected) && (preg_match('#^http://#i', $url)))
          {
              $reading_mechanism = 2; /* sockets */
          }
          else if ($isearch_curl_detected)
          {
              $reading_mechanism = 3; /* curl */
          }
          else
          {
              isearch_log('ERROR: Unable to detect a suitable reading mechanism.', 1);
              return '';
          }
      }
      else
      {
          $reading_mechanism = $isearch_config['reading_mechanism'];
      }

      $base = $url;
      $header = array();
      $docData = '';

      /* robots.txt failures are expected, so they are logged at a quieter level */
      $failureLevel = preg_match('#/robots\.txt$#D', $url) ? 9 : 3;

      if ($reading_mechanism == 1)
      {
          /* Use fopen/fread */
          isearch_log("INFO: Reading $url using fopen/fread", 5);

          @ini_set('user_agent', "iSearch/$isearch_version");

          /* Insert the credentials after the scheme only, and keep them out
           * of $url so that they never end up in the log. */
          $fetchUrl = $url;
          if ($isearch_config['basic_authorization'] != '')
          {
              $authorityPos = strpos($fetchUrl, '//');
              if ($authorityPos !== false)
              {
                  $fetchUrl = substr_replace($fetchUrl, '//' . $isearch_config['basic_authorization'] . '@', $authorityPos, 2);
              }
          }

          $fp = @fopen($fetchUrl, 'r');
          if (!$fp)
          {
              isearch_log("WARNING: Unable to fopen URL [$url]", $failureLevel);
              return '';
          }

          if (function_exists('stream_get_meta_data'))
          {
              /* Prior to PHP 4.3.0 use $http_response_header instead of stream_get_meta_data() */
              $meta_data = stream_get_meta_data($fp);
              $header_data = isset($meta_data['wrapper_data']) ? $meta_data['wrapper_data'] : array();
          }
          else
          {
              $header_data = isset($http_response_header) ? $http_response_header : array();
          }
          if (!is_array($header_data))
          {
              $header_data = array();
          }

          /* Later header lines overwrite earlier ones with the same name */
          foreach ($header_data as $headerLine)
          {
              $data = explode(': ', $headerLine, 2);
              if (count($data) == 2)
              {
                  $header[strtolower($data[0])] = $data[1];
              }
          }

          $baseHeader = '';
          if (isset($header['content-location']))
          {
              isearch_log("INFO: Content-Location: ".$header['content-location'], 9);
              $baseHeader = $header['content-location'];
          }
          else if (isset($header['location']))
          {
              isearch_log("INFO: Location: ".$header['location'], 9);
              $baseHeader = $header['location'];
          }
          if ($baseHeader != '')
          {
              $resolvedBase = isearch_relativeToAbsoluteUrl($baseHeader, $base);
              if ($resolvedBase != '')
              {
                  $base = $resolvedBase;
              }
          }

          $docData = isearch_fread($fp, $isearch_config['max_file_size']);
          fclose($fp);
      }
      else
      {
          $recurse = 10;
          while (1)
          {
              /* Check URL and determine whether this is a file or directory */
              $urlParts = @parse_url($url);
              if ((!is_array($urlParts)) || (!isset($urlParts['scheme'])) || (!isset($urlParts['host'])))
              {
                  isearch_log("WARNING: Unable to parse URL [$url]", 3);
                  return '';
              }

              if (!preg_match('/^(https?|ftps?)$/i', $urlParts['scheme']))
              {
                  isearch_log("WARNING: Unsupported URL scheme " . $urlParts['scheme'] . " [$url]", 4);
                  return '';
              }

              if ($reading_mechanism == 2)
              {
                  isearch_log("INFO: Reading $url using sockets", 5);

                  $scheme = strtolower($urlParts['scheme']);
                  if ($scheme == 'http')
                  {
                      $secure = false;
                  }
                  else if ($scheme == 'https')
                  {
                      $secure = true;
                  }
                  else
                  {
                      isearch_log("WARNING: URL scheme " . $urlParts['scheme'] . " not supported by sockets. Use CURL library. [$url]", 3);
                      return '';
                  }

                  if ($isearch_config['proxy_enable'])
                  {
                      $host = $isearch_config['proxy_host'];
                      $port = $isearch_config['proxy_port'];
                  }
                  else
                  {
                      $host = $urlParts['host'];
                      if (isset($urlParts['port']))
                      {
                          $port = $urlParts['port'];
                      }
                      else
                      {
                          $port = $secure ? 443 : 80;
                      }
                  }

                  $sock = fsockopen(($secure ? 'ssl://' : '').$host, $port, $errno, $errstr);
                  if (!$sock)
                  {
                      isearch_log("ERROR: Unable to open socket to " . $host . " " . $port . " - $errno : $errstr", 1);
                      return '';
                  }

                  /* The Host header names the origin server, never the proxy */
                  $hostHeader = $urlParts['host'];
                  if (isset($urlParts['port']))
                  {
                      $hostHeader .= ':' . $urlParts['port'];
                  }

                  $request = "GET $url HTTP/1.0\r\n";
                  $request .= "Host: $hostHeader\r\n";
                  if (($isearch_config['proxy_enable']) && ($isearch_config['proxy_user'] != ''))
                  {
                      $request .= "Proxy-Authorization: Basic " . base64_encode ($isearch_config['proxy_user'].':'.$isearch_config['proxy_pass']) . "\r\n";
                  }
                  $request .= "User-Agent: iSearch/$isearch_version\r\n";
                  if ($isearch_config['basic_authorization'] != '')
                  {
                      $request .= "Authorization: Basic " . base64_encode($isearch_config['basic_authorization']) . "\r\n";
                  }
                  $request .= "Connection: Close\r\n\r\n";

                  fputs($sock, $request);
                  $allData = isearch_fread($sock, $isearch_config['max_file_size']);
                  fclose($sock);
              }
              else
              {
                  /* Use the CURL library */
                  isearch_log("INFO: Reading $url using CURL", 5);

                  $ch = curl_init($url);
                  curl_setopt($ch, CURLOPT_USERAGENT, "iSearch/$isearch_version");
                  curl_setopt($ch, CURLOPT_HEADER, TRUE);
                  curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
                  curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
                  curl_setopt($ch, CURLOPT_TIMEOUT, 30);
                  curl_setopt($ch, CURLOPT_COOKIEJAR, "my_cookies.txt"); // Initiates cookie file if needed
                  curl_setopt($ch, CURLOPT_COOKIEFILE, "my_cookies.txt"); // Uses cookies from previous session if exist
                  if ($isearch_config['proxy_enable'])
                  {
                      if ($isearch_config['proxy_user'] != '')
                      {
                          curl_setopt($ch, CURLOPT_PROXYUSERPWD, $isearch_config['proxy_user'].':'.$isearch_config['proxy_pass']);
                      }
                      curl_setopt($ch, CURLOPT_PROXY, $isearch_config['proxy_host']);
                      curl_setopt($ch, CURLOPT_PROXYPORT, $isearch_config['proxy_port']);
                  }

                  if ($isearch_config['basic_authorization'] != '')
                  {
                      curl_setopt($ch, CURLOPT_USERPWD, $isearch_config['basic_authorization']);
                  }

                  $allData = curl_exec($ch);
                  if ($allData === false)
                  {
                      isearch_log("WARNING: CURL error : " . curl_error($ch) . " [$url]", $failureLevel);
                      curl_close($ch);
                      return '';
                  }
                  curl_close($ch);
              }

              /* Split the raw response into header block and body */
              $responseParts = explode("\r\n\r\n", $allData, 2);
              $headerData = $responseParts[0];
              $docData = isset($responseParts[1]) ? $responseParts[1] : '';

              $headerLines = explode("\r\n", $headerData);
              $status = $headerLines[0];
              isearch_log("INFO: Status Line $status", 8);

              /* Walk backwards so the first occurrence of a header wins */
              $header = array();
              for ($i = count($headerLines)-1; $i > 0; $i--)
              {
                  $data = explode(': ', $headerLines[$i], 2);
                  if (count($data) == 2)
                  {
                      $header[strtolower($data[0])] = $data[1];
                  }
              }

              /* Status line is "HTTP/x.y <code> <reason>" */
              $statusParts = explode(' ', $status, 3);
              if (count($statusParts) < 2)
              {
                  isearch_log('ERROR: Unable to read status code', 1);
                  return '';
              }

              $statusCode = $statusParts[1];
              if (in_array($statusCode, array('301', '302', '303', '307', '308')))
              {
                  /* Redirection. Get new location */
                  if ($recurse <= 1)
                  {
                      /* Recursion limit reached */
                      isearch_log('ERROR: URL recursion limit 10 exceeded', 1);
                      return '';
                  }

                  if ((!isset($header['location'])) || (trim($header['location']) == ''))
                  {
                      isearch_log("WARNING: Redirect without Location header : $status [$url]", $failureLevel);
                      return '';
                  }

                  /* Location may be relative; resolve it against the URL just read */
                  $redirectUrl = isearch_relativeToAbsoluteUrl(trim($header['location']), $url);
                  if ($redirectUrl == '')
                  {
                      isearch_log("WARNING: Unable to resolve redirect location [" . $header['location'] . "]", $failureLevel);
                      return '';
                  }

                  $url = $redirectUrl;
                  $recurse = $recurse - 1;
              }
              else if ($statusCode >= 300)
              {
                  isearch_log("WARNING: HTTP Error : $status [$url]", preg_match('#/robots\.txt$#D', $url) ? 9 : 3);
                  return '';
              }
              else
              {
                  /* We have read the file */
                  break;
              }
          }

          if (isset($header['content-location']))
          {
              isearch_log("INFO: Content-Location: ".$header['content-location'], 9);
              $resolvedBase = isearch_relativeToAbsoluteUrl($header['content-location'], $url);
              $base = ($resolvedBase != '') ? $resolvedBase : $url;
          }
          else
          {
              isearch_log("INFO: Url: ".$url, 9);
              $base = $url;
          }
      }

      if (preg_match("#<BASE\\s+[^>]*?HREF\\s*=\\s*['\"]?([^>]+?)['\"]?[\\s>]#i", $docData, $matches) == 1)
      {
          /* Found a "<BASE HREF=" tag in the document head */
          isearch_log("INFO: BASE: ".$matches[1], 9);
          $base = $matches[1];
      }

      /* preg_replace() expands "\n" and "$n" in the replacement string, so
       * fetched content must have backslashes and dollars escaped first. */
      $replacementEscapes = array('\\' => '\\\\', '$' => '\\$');

      /* Search for inline frames and replace them with frame contents */
      $regexp = "#<IFRAME[^>]*?\\sSRC\\s*=\\s*['\"]?(.*?)[\\s'\"][^>]*>#i";
      $matchCount = preg_match_all($regexp, $docData, $matches);
      for ($i = 0; $i < $matchCount; $i++)
      {
          $frameUrl = isearch_relativeToAbsoluteUrl($matches[1][$i], $base);
          isearch_log("INFO: Reading inline frame : $frameUrl", 5);
          $frameData = ($frameUrl != '') ? isearch_readFile($frameUrl, $depth+1) : '';
          $docData = preg_replace($regexp, strtr($frameData, $replacementEscapes), $docData, 1);
      }

      if ($isearch_config['javascript_link_search'] == 2)
      {
          /* Search for external JavaScript files and replace them with file contents */
          $regexp = "#<SCRIPT[^>]*?\\sSRC\\s*=\\s*['\"](.*?)['\"][^>]*>#i";
          $matchCount = preg_match_all($regexp, $docData, $matches);
          for ($i = 0; $i < $matchCount; $i++)
          {
              $jsUrl = isearch_relativeToAbsoluteUrl($matches[1][$i], $base);
              isearch_log("INFO: Reading javascript : $jsUrl", 5);
              $jsData = ($jsUrl != '') ? isearch_readFile($jsUrl, $depth+1) : '';
              $docData = preg_replace($regexp, "<SCRIPT>\n<!--\n".strtr($jsData, $replacementEscapes)."\n-->\n</SCRIPT>\n", $docData, 1);
          }
      }

      /* The recursive calls above overwrote the globals; publish ours last */
      $isearch_header = $header;
      $isearch_base = $base;

      return $docData;
  }
  /**
   * Convert a document to text/HTML by running an external program on it.
   *
   * The data is written to a temporary file in $isearch_config['tmpdir'] and
   * the command configured as $isearch_config["<type>_exec"] is run with that
   * file as its argument (for 'pdf': "-htmlmeta <file> -"). Output is taken
   * from "<tmpfile>.txt" when the program created it, otherwise from stdout;
   * a non-empty "<tmpfile>.err" is treated as a failure.
   *
   * Output of non-PDF converters is wrapped in a minimal HTML page whose
   * title is the last path component of $url.
   * NOTE(review): $fname and $text are inserted into that page unescaped.
   *
   * @param string $data Raw document bytes.
   * @param string $type Converter key, e.g. 'pdf'.
   * @param string $url  URL the document came from (used for the title).
   * @return string Converted document, or '' on any failure.
   */
  function isearch_execConvert($data, $type, $url)
  {
      global $isearch_config;

      isearch_log("INFO: Exec conversion $type", 7);

      /* Validate the configuration before creating the temporary file so an
       * unconfigured converter does not leave a stray file behind. */
      $cmd = isset($isearch_config[$type . '_exec']) ? $isearch_config[$type . '_exec'] : '';
      if ($cmd == '')
      {
          isearch_log("ERROR: Configuration error - executable for $type not set", 1);
          return '';
      }

      // Create a temporary filename to use for the conversion
      $tmpfname = tempnam($isearch_config['tmpdir'], "iSearch");
      if ($tmpfname === false)
      {
          isearch_log("ERROR: Unable to create tmp file in " . $isearch_config['tmpdir'], 1);
          return '';
      }

      // Write data to the temp file
      $fh = fopen($tmpfname, 'wb');
      if (!$fh)
      {
          isearch_log("ERROR: Unable to open tmp file $tmpfname", 1);
          @unlink($tmpfname);
          return '';
      }
      $written = fwrite($fh, $data);
      fclose($fh);
      if ($written === false)
      {
          isearch_log("ERROR: Unable to write tmp file $tmpfname", 1);
          @unlink($tmpfname);
          return '';
      }

      // Execute the command
      if ($type == 'pdf')
      {
          $cmd .= ' -htmlmeta '.escapeshellarg($tmpfname).' -';
      }
      else
      {
          $cmd .= ' '.escapeshellarg($tmpfname);
      }

      $output = array();
      $retval = 0;
      exec($cmd, $output, $retval);

      // Delete the temporary file
      unlink($tmpfname);

      // Read the stderr and stdout files
      $err = '';
      if (is_file("$tmpfname.err"))
      {
          $errLines = @file("$tmpfname.err");
          $err = is_array($errLines) ? trim(implode(" ", $errLines)) : '';
          unlink("$tmpfname.err");
      }

      if (is_file("$tmpfname.txt"))
      {
          $textLines = @file("$tmpfname.txt");
          $text = is_array($textLines) ? implode("\n", $textLines) : '';
          unlink("$tmpfname.txt");
      }
      else
      {
          $text = implode("\n", $output);
      }

      if ($retval != 0)
      {
          isearch_log("ERROR: Executed command $cmd $tmpfname, Return Code $retval", 1);
          return '';
      }
      isearch_log("INFO: Executed command $cmd $tmpfname, Return Code $retval", 5);

      if ($err != '')
      {
          isearch_log("ERROR: Executed command $cmd $tmpfname, Error Msg: $err", 1);
          return '';
      }

      if ($text == '')
      {
          isearch_log("ERROR: Unable to read converted file", 2);
          return '';
      }

      if ($type != 'pdf')
      {
          // Wrap text in HTML, titled with everything after the last / or \
          $fname = preg_replace('#^.*[/\\\\]#s', '', $url);
          $text = "<HTML><HEAD><TITLE>$fname</TITLE></HEAD><BODY><PRE>$text</PRE></BODY></HTML>";
      }

      return $text;
  }
  /**
   * Convert a document by POSTing it to the iSearch online conversion service.
   *
   * The data is sent as a multipart/form-data upload to
   * convert.iSearchTheNet.com, authorised with $isearch_config['online_id'].
   * A gzip-compressed reply is inflated. A reply whose first word does not
   * contain "<HTML>" is wrapped in a minimal HTML page titled with the last
   * path component of $url.
   *
   * @param string $data Raw document bytes.
   * @param string $type Document type; sent as "application/<type>".
   * @param string $url  URL the document came from (used for the title).
   * @return string The converted document. Returns '' when the socket cannot
   *                be opened or the reply cannot be inflated, and the
   *                unconverted $data when the service answers with a status
   *                other than 200.
   */
  function isearch_onlineConvert($data, $type, $url)
  {
      global $isearch_config;
      global $isearch_version;

      $host='convert.iSearchTheNet.com';
      $port=80;
      $path='/';
      $query="?type=$type&gzip=0";

      isearch_log("INFO: Online conversion $type", 7);

      $sock = fsockopen($host, $port, $errno, $errstr);
      if (!$sock)
      {
          isearch_log("ERROR: Unable to open socket $host $port - $errno : $errstr", 1);
          return '';
      }

      $boundary = '---------------------------' . md5('boundary');

      $postValues = "--$boundary\r\n";
      $postValues .= "Content-Disposition: form-data; name=\"file.1\"; filename=\"file.1\"\r\n";
      $postValues .= "Content-Type: application/$type\r\n";
      $postValues .= "\r\n";
      $postValues .= "$data\r\n";
      $postValues .= "--$boundary--\r\n\r\n";

      $request = "POST http://$host$path$query HTTP/1.0\r\n";
      $request .= "User-Agent: iSearch/$isearch_version\r\n";
      $request .= "Host: $host\r\n";
      $request .= "Authorization: ISEARCH " . $isearch_config['online_id'] . "\r\n";
      $request .= "Content-Type: multipart/form-data; boundary=$boundary\r\n";
      $request .= "Content-Length: " . strlen( $postValues ) . "\r\n";
      $request .= "\r\n";

      fputs($sock, $request.$postValues);

      /* Read status line */
      $status = fgets($sock, 1024);

      /* Read (and discard) the header, up to the first blank line */
      while (!feof($sock))
      {
          $line = trim(fgets($sock, 1024));
          if ($line == '')
          {
              break;
          }
      }

      $convertedData = isearch_fread($sock);
      fclose($sock);

      $statusParts = explode(' ', (string) $status, 3);
      if ((!isset($statusParts[1])) || ($statusParts[1] != '200'))
      {
          isearch_log("ERROR: Online conversion error: $status", 1);
          /* NOTE(review): the original hands back the unconverted input here */
          return $data;
      }

      /* gzip streams start with the magic bytes 0x1f 0x8b. Compare the byte
       * values; comparing the one-character strings with ints never matches. */
      if ((strlen($convertedData) > 10) && (ord($convertedData[0]) == 0x1f) && (ord($convertedData[1]) == 0x8b))
      {
          isearch_log("INFO: Running gzinflate on converted data", 6);

          /* Skip the 10 byte gzip header; first try without the 8 byte
           * CRC/size trailer, then with the trailer left in place. */
          $inflated = @gzinflate(substr($convertedData, 10, -8));
          if ($inflated === false)
          {
              $inflated = @gzinflate(substr($convertedData, 10));
          }
          if ($inflated === false)
          {
              isearch_log("ERROR: Unable to inflate converted data", 1);
              return '';
          }
          $convertedData = $inflated;
      }

      // Check the first word of the converted data
      $firstWord = '';
      sscanf($convertedData, ' %s ', $firstWord);
      if (!preg_match('/<HTML>/i', (string) $firstWord))
      {
          // Wrap text in HTML, titled with everything after the last / or \
          $fname = preg_replace('#^.*[/\\\\]#s', '', $url);
          $convertedData = "<HTML><HEAD><TITLE>$fname</TITLE></HEAD><BODY><PRE>$convertedData</PRE></BODY></HTML>";
      }

      return $convertedData;
  }
  /**
   * Encode a Unicode code point as a UTF-8 byte sequence.
   *
   * @param int $ch Code point, 0 .. 0x1FFFFF.
   * @return string One to four bytes of UTF-8, or '' when $ch is outside the
   *                supported range.
   */
  function isearch_utf8_chr($ch)
  {
      if ($ch < 0)
      {
          // Invalid UTF-8 code
          return '';
      }

      if ($ch <= 0x7f)
      {
          return chr($ch);
      }

      if ($ch <= 0x7ff)
      {
          return chr(($ch >> 6) + 0xc0)
               . chr(($ch & 0x3f) + 0x80);
      }

      if ($ch <= 0xffff)
      {
          return chr(($ch >> 12) + 0xe0)
               . chr((($ch >> 6) & 0x3f) + 0x80)
               . chr(($ch & 0x3f) + 0x80);
      }

      if ($ch <= 0x1fffff)
      {
          /* The last byte is taken from $ch (the original read an undefined
           * variable here, which always produced 0x80). */
          return chr(($ch >> 18) + 0xf0)
               . chr((($ch >> 12) & 0x3f) + 0x80)
               . chr((($ch >> 6) & 0x3f) + 0x80)
               . chr(($ch & 0x3f) + 0x80);
      }

      // Invalid UTF-8 code
      return '';
  }
  /**
   * Decode HTML entities, with fallbacks for old PHP versions.
   *
   * - PHP before 5 with a UTF-8 target: numeric and named entities are
   *   decoded by hand, because that html_entity_decode() does not support
   *   multi-byte character sets.
   * - PHP before 5 otherwise: numeric entities are decoded to single bytes
   *   first, then processing continues below.
   * - html_entity_decode() missing (it was new in PHP 4.3.0): named entities
   *   are decoded with the flipped translation table.
   * - Everything else: plain html_entity_decode().
   *
   * @param string $string  Text containing HTML entities.
   * @param int    $quote   Quote-style flags passed to html_entity_decode().
   * @param string $charset Target character set.
   * @return string The decoded text.
   */
  function isearch_html_entity_decode($string, $quote, $charset)
  {
      static $utf8_trans_tbl;

      if (version_compare(PHP_VERSION, '5.0.0', '<'))
      {
          if (strtolower($charset) == 'utf-8')
          {
              // PHP4 html_entity_decode does not support multi-byte charsets
              $string = preg_replace_callback('/&#x([0-9a-f]+);/i', 'isearch_entityHexToUtf8', $string);
              $string = preg_replace_callback('/&#([0-9]+);/', 'isearch_entityDecToUtf8', $string);
              if (!isset($utf8_trans_tbl))
              {
                  /* The translation table maps character => entity; build
                   * the reverse map with UTF-8 encoded characters. */
                  $utf8_trans_tbl = array();
                  foreach (get_html_translation_table(HTML_ENTITIES) as $val=>$key)
                  {
                      $utf8_trans_tbl[$key] = utf8_encode($val);
                  }
              }
              return strtr($string, $utf8_trans_tbl);
          }

          $string = preg_replace_callback('/&#x([0-9a-f]+);?/i', 'isearch_entityHexToByte', $string);
          $string = preg_replace_callback('/&#([0-9]+);?/', 'isearch_entityDecToByte', $string);
      }

      if (!function_exists('html_entity_decode'))
      {
          // html_entity_decode was new in PHP 4.3.0
          global $isearch_htmlToAsciiTrans;
          if (!isset($isearch_htmlToAsciiTrans))
          {
              /* Translate from HTML to ASCII */
              $isearch_htmlToAsciiTrans = array_flip(get_html_translation_table(HTML_ENTITIES));
          }
          return strtr($string, $isearch_htmlToAsciiTrans);
      }

      return html_entity_decode($string, $quote, $charset);
  }

  /* preg_replace_callback helper: hexadecimal numeric entity -> UTF-8 bytes. */
  function isearch_entityHexToUtf8($match)
  {
      return isearch_utf8_chr(hexdec($match[1]));
  }

  /* preg_replace_callback helper: decimal numeric entity -> UTF-8 bytes. */
  function isearch_entityDecToUtf8($match)
  {
      return isearch_utf8_chr((int) $match[1]);
  }

  /* preg_replace_callback helper: hexadecimal numeric entity -> single byte. */
  function isearch_entityHexToByte($match)
  {
      return chr(hexdec($match[1]));
  }

  /* preg_replace_callback helper: decimal numeric entity -> single byte. */
  function isearch_entityDecToByte($match)
  {
      return chr((int) $match[1]);
  }
  /**
   * Parse an HTTP date into a Unix timestamp (interpreted as GMT).
   *
   * Accepts one of the following formats:
   *   Sun, 06 Nov 1994 08:49:37 GMT   ; RFC 822, updated by RFC 1123
   *   Sunday, 06-Nov-94 08:49:37 GMT  ; RFC 850, obsoleted by RFC 1036
   *   Sun Nov  6 08:49:37 1994        ; ANSI C's asctime() format
   *
   * Two digit years below 70 are taken as 20xx, other two digit years as 19xx.
   *
   * @param string $httpDate Date string from an HTTP header.
   * @return int Unix timestamp, or 0 when the format is not recognised (a
   *             warning is logged at level 7).
   */
  function isearch_parseHttpDate($httpDate)
  {
      static $months;
      if (!isset($months))
      {
          $months = array('jan'=>1, 'feb'=>2, 'mar'=>3, 'apr'=>4, 'may'=>5, 'jun'=>6, 'jul'=>7, 'aug'=>8, 'sep'=>9, 'oct'=>10, 'nov'=>11, 'dec'=>12);
      }

      $monthNames = 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec';

      /* RFC 1123 / RFC 850: weekday, day month year time */
      $dayFirst = "#^[a-z]+,? +([0-9]{1,2})[ -]+($monthNames)[ -]+([0-9]{2,4}) +([0-9]{2}):([0-9]{2}):([0-9]{2})#i";

      /* asctime(): weekday month day time year. The day-of-month group was
       * missing from the original pattern, so this format never matched. */
      $monthFirst = "#^[a-z]+,?[ -]+($monthNames) +([0-9]{1,2}) +([0-9]{2}):([0-9]{2}):([0-9]{2}) +([0-9]{2,4})#i";

      if (preg_match($dayFirst, $httpDate, $matchName) == 1)
      {
          $day = $matchName[1];
          $monthName = strtolower($matchName[2]);
          $year = $matchName[3];
          $hour = $matchName[4];
          $min = $matchName[5];
          $sec = $matchName[6];
      }
      else if (preg_match($monthFirst, $httpDate, $matchName) == 1)
      {
          $monthName = strtolower($matchName[1]);
          $day = $matchName[2];
          $hour = $matchName[3];
          $min = $matchName[4];
          $sec = $matchName[5];
          $year = $matchName[6];
      }
      else
      {
          isearch_log('WARNING: Unknown date format : ' . $httpDate, 7);
          return 0;
      }

      $year = (int) $year;
      if ($year < 70)
      {
          $year += 2000;
      }
      else if ($year < 100)
      {
          $year += 1900;
      }

      return gmmktime((int) $hour, (int) $min, (int) $sec, $months[$monthName], (int) $day, $year);
  }
  911. /* Spider a single file. Returns true if there are more files to spider, else false */
  912. function isearch_indexAFile($verbose = True)
  913. {
  914. global $isearch_table_info, $isearch_table_urls, $isearch_table_urls_new, $isearch_table_words, $isearch_table_words_new;
  915. global $isearch_db;
  916. global $isearch_config;
  917. global $isearch_header;
  918. global $isearch_base;
  919. if (! $verbose)
  920. {
  921. /* Disable display of messages. */
  922. $isearch_config['log_echo_level'] = 0;
  923. }
  924. $resultUrls = mysql_query("SELECT * FROM $isearch_table_urls_new WHERE state='new' LIMIT 1", $isearch_db);
  925. if (!$resultUrls)
  926. {
  927. /* MySQL error. Sleep and try again */
  928. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  929. sleep(5);
  930. return True;
  931. }
  932. if (mysql_num_rows($resultUrls) != 1)
  933. {
  934. isearch_log('INFO: Indexing completed.', 2);
  935. /* Indexing has completed */
  936. $now = time();
  937. if (!mysql_query("UPDATE $isearch_table_info SET last_update='$now' WHERE id='1'", $isearch_db))
  938. {
  939. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  940. }
  941. $result = mysql_query("SELECT url, state FROM $isearch_table_urls_new WHERE state!='ok'", $isearch_db);
  942. if (!$result)
  943. {
  944. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  945. }
  946. else if (mysql_num_rows($result) > 0)
  947. {
  948. isearch_log('INFO: Deleting the following URLs:', 4);
  949. while ($item = mysql_fetch_object($result))
  950. {
  951. isearch_log('INFO: ' . $item->url . ' (' . $item->state . ')', 4);
  952. }
  953. }
  954. /* Delete any unfound references */
  955. if (!mysql_query("DELETE FROM $isearch_table_urls_new WHERE state!='ok'", $isearch_db))
  956. {
  957. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  958. }
  959. /* Update referrer_id fields */
  960. $result = mysql_query("SELECT id, temp_referrer_id FROM $isearch_table_urls_new", $isearch_db);
  961. if (!$result)
  962. {
  963. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  964. }
  965. else if (mysql_num_rows($result) > 0)
  966. {
  967. mysql_query("LOCK TABLES $isearch_table_urls_new WRITE", $isearch_db);
  968. while ($item = mysql_fetch_object($result))
  969. {
  970. if (!mysql_query("UPDATE $isearch_table_urls_new SET referrer_id='$item->temp_referrer_id' WHERE id='$item->id'", $isearch_db))
  971. {
  972. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  973. }
  974. }
  975. mysql_query("UNLOCK TABLES", $isearch_db);
  976. }
  977. if ($isearch_config['test_mode'] == 0)
  978. {
  979. /* Swap the old and new tables */
  980. $backup_words = $isearch_table_words . '_tmp_backup';
  981. $backup_urls = $isearch_table_urls . '_tmp_backup';
  982. if (!mysql_query("RENAME TABLE $isearch_table_words TO $backup_words, " .
  983. "$isearch_table_words_new TO $isearch_table_words, " .
  984. "$backup_words TO $isearch_table_words_new, " .
  985. "$isearch_table_urls TO $backup_urls, " .
  986. "$isearch_table_urls_new TO $isearch_table_urls, " .
  987. "$backup_urls TO $isearch_table_urls_new", $isearch_db))
  988. {
  989. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  990. }
  991. }
  992. /* Empty the new words table */
  993. if (!mysql_query("DELETE FROM $isearch_table_words_new", $isearch_db))
  994. {
  995. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  996. }
  997. /* Empty the new urls table */
  998. if (!mysql_query("DELETE FROM $isearch_table_urls_new", $isearch_db))
  999. {
  1000. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1001. }
  1002. /* Optimize the tables */
  1003. if (!mysql_query("OPTIMIZE TABLE $isearch_table_urls, $isearch_table_words", $isearch_db))
  1004. {
  1005. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1006. }
  1007. return False;
  1008. }
  1009. $itemUrl = mysql_fetch_object($resultUrls);
  1010. $allData = isearch_readFile($itemUrl->url);
  1011. $newState = 'error';
  1012. if ($allData == '')
  1013. {
  1014. isearch_log("ERROR: Unable to open URL [$itemUrl->url]", 1);
  1015. $newState = 'notfound';
  1016. }
  1017. else
  1018. {
  1019. $size = strlen($allData);
  1020. $sig = md5($allData);
  1021. /* Look for a duplicate page */
  1022. $resultSig = mysql_query("SELECT * FROM $isearch_table_urls_new WHERE sig='$sig' AND size='$size' AND NOT url='" . isearch_escape_string($itemUrl->url) . "'", $isearch_db);
  1023. if (!$resultSig)
  1024. {
  1025. isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
  1026. }
  1027. else if (mysql_num_rows($resultSig) > 0)
  1028. {
  1029. isearch_log("INFO: Duplicate URL - will not be processed [$itemUrl->url]", 4);
  1030. $newState = 'duplicate';
  1031. }
  1032. else
  1033. {
  1034. /* No duplicates found. */
  1035. isearch_log("INFO: Processing URL [$itemUrl->url]", 2);
  1036. if ((preg_match('%\.(doc|pdf)(\?.*|#.*)?$%i', $itemUrl->url, $matches)) ||
  1037. (preg_match('/^%(PDF)/i', $allData, $matches)))
  1038. {
  1039. if (strtolower($matches[1]) == 'doc')
  1040. {
  1041. switch ($isearch_config['msword_support'])
  1042. {
  1043. case 0:
  1044. /* Do nothing */
  1045. isearch_log("WARNING: Word support disabled", 3);
  1046. break;
  1047. case 1:
  1048. $allData = isearch_execConvert($allData, 'msword', $itemUrl->url);
  1049. break;
  1050. case 2:
  1051. $allData = isearch_onlineConvert($allData, 'msword', $itemUrl->url);
  1052. break;
  1053. default:
  1054. isearch_log("ERROR: Illegal Word document support setting", 1);
  1055. break;
  1056. }
  1057. }
  1058. else
  1059. {
  1060. switch ($isearch_config['pdf_support'])
  1061. {
  1062. case 0:
  1063. /* Do nothing */
  1064. isearch_log("WARNING: PDF support disabled", 3);
  1065. break;
  1066. case 1:
  1067. $allData = isearch_execConvert($allData, 'pdf', $itemUrl->url);
  1068. break;
  1069. case 2:
  1070. $allData = isearch_onlineConvert($allData, 'pdf', $itemUrl->url);
  1071. break;
  1072. default:
  1073. isearch_log("ERROR: Illegal PDF support setting", 1);
  1074. break;
  1075. }
  1076. }
  1077. }
  1078. if (($isearch_config['keep_cache']) && ($isearch_config['test_mode'] == 0))
  1079. {
  1080. $cache = $allData;
  1081. }
  1082. else
  1083. {
  1084. $cache = '';
  1085. }
  1086. /* Strip out all control characters and replace with spaces.
  1087. * Compact all white space into a single space character.
  1088. */
  1089. $allData = preg_replace("/\\s+/", ' ', $allData);
  1090. if (isset($isearch_header['content_type']) && ($isearch_header['content_type'] == 'text/plain'))
  1091. {
  1092. $headData = '';
  1093. $bodyData = $allData;
  1094. }
  1095. else
  1096. {
  1097. $tdata = spliti('</head[^>]*>', $allData, 2);
  1098. if (count($tdata) < 2)
  1099. {
  1100. $tdata = spliti('<body', $allData, 2);
  1101. if (count($tdata) < 2)
  1102. {
  1103. isearch_log('WARNING: <BODY> and </HEAD> tags not found', 4);
  1104. $headData = '';
  1105. $bodyData = $allData;
  1106. }
  1107. else
  1108. {
  1109. $headData = $tdata[0];
  1110. $bodyData = '<body' . $tdata[1];
  1111. }
  1112. }
  1113. else
  1114. {
  1115. $headData = $tdata[0];
  1116. $bodyData = $tdata[1];
  1117. }
  1118. }
  1119. /* Strip out HTML comments from head data */
  1120. $headData = preg_replace('/<!--.+?-->/','',$headData);
  1121. $headData = preg_replace("/\\s+/", ' ', $headData);
  1122. /* Strip out all HTML tags except ones we are interested in */
  1123. /* Includes workaround for PHP bug. See http://bugs.php.net/bug.php?id=21311 */
  1124. $headData = strip_tags(eregi_replace("<\!DOCTYPE [^>]*>", '', $headData), '<meta><title><base>');
  1125. $keyWords = '';
  1126. $description = '';
  1127. $title = '';
  1128. $index = True;
  1129. $follow = True;
  1130. $matchCount = preg_match_all("#<META\\s+([^>]*?)\\s*>#i", $allData, $matches);
  1131. for ($i = 0; $i < $matchCount; $i++)
  1132. {
  1133. if (preg_match("#CONTENT\\s*=\\s*(['\"])(.*?)\\1#i", $matches[1][$i], $matchContent) == 1)
  1134. {
  1135. $metaContent = $matchContent[2];
  1136. if (preg_match("#NAME\\s*=\\s*['\"]?(.*?)['\"]?(\\s|$)#i", $matches[1][$i], $matchName) == 1)
  1137. {
  1138. // <META NAME="keywords" CONTENT="keyword list">
  1139. // <META NAME="description" CONTENT="description">
  1140. // <META NAME="robots" CONTENT="nofollow,noindex,noarchive">
  1141. $metaName = strtolower($matchName[1]);
  1142. if ($metaName == 'keywords')
  1143. {
  1144. $keyWords = ereg_replace(',', ' ', $metaContent);
  1145. }
  1146. else if ($metaName == 'description')
  1147. {
  1148. $description = $metaContent;
  1149. }
  1150. else if ($metaName == 'robots')
  1151. {
  1152. if (eregi('noindex', $metaContent))
  1153. {
  1154. $index = False;
  1155. }
  1156. if (eregi('nofollow', $metaContent))
  1157. {
  1158. $follow = False;
  1159. }
  1160. if (eregi('noarchive', $metaContent))
  1161. {
  1162. $cache = '';
  1163. }
  1164. }
  1165. }
  1166. else if (preg_match("#HTTP-EQUIV\\s*=(['\"])(.*?)\\1#i", $matches[1][$i], $matchEquiv) == 1)
  1167. {
  1168. $isearch_header[strtolower($matchEquiv[2])] = $metaContent;
  1169. }
  1170. }
  1171. }
  1172. // Determine character set
  1173. $charset = $isearch_config['char_set'];
  1174. if ((isset($isearch_header['content-type'])) &&
  1175. (preg_match("#(^|\\s)\\s*CHARSET\\s*=\\s*['\"]?(.*?)['\"]?(\\s|$)#i", $isearch_header['content-type'], $matches) == 1))
  1176. {
  1177. $charset = $matches[2];
  1178. }
  1179. $lastModified = 0;
  1180. if (isset($isearch_header['last-modified']))
  1181. {
  1182. $lastModified = isearch_parseHttpDate($isearch_header['last-modified']);
  1183. }
  1184. $changefreq = '';
  1185. $priority = -1;
  1186. if ($follow)
  1187. {
  1188. $tdata1 = spliti('(<!-- ISEARCH_BEGIN_FOLLOW -->|</nofollow>)', ' ' . $bodyData);
  1189. if (count($tdata1) > 1)
  1190. {
  1191. /* At least 1 found. */
  1192. $followData = '';
  1193. /* Check for an END_FOLLOW before the first BEGIN_FOLLOW */
  1194. $tdata2 = spliti('(<!-- ISEARCH_END_FOLLOW -->|<nofollow>)', $tdata1[0]);
  1195. if (count($tdata2) > 1)
  1196. {
  1197. /* An END_FOLLOW was found. Add anything before it into the follow data */
  1198. $followData .= $tdata2[0];
  1199. }
  1200. for ($i = 1; $i < count($tdata1); $i++)
  1201. {
  1202. $tdata2 = spliti('(<!-- ISEARCH_END_FOLLOW -->|<nofollow>)', $tdata1[$i]);
  1203. $followData .= $tdata2[0];
  1204. }
  1205. }
  1206. else
  1207. {
  1208. $followData = $bodyData;
  1209. }
  1210. $urls = array();
  1211. if ($isearch_config['follow_meta_refresh'] && isset($isearch_header['refresh']) &&
  1212. (preg_match("#; *URL *= *(.*)$#i", $isearch_header['refresh'], $matches) == 1))
  1213. {
  1214. $urls[] = $matches[1];
  1215. }
  1216. /* Do aggressive link searching */
  1217. if ($isearch_config['aggressive_link_search'])
  1218. {
  1219. $matchCount = preg_match_all("~(https?|ftps?)://[^'\"\\s>]*~i", $allData, $matches);
  1220. for ($i = 0; $i < $matchCount; $i++)
  1221. {
  1222. $urls[] = $matches[0][$i];
  1223. }
  1224. /* Find any links with no quotes */
  1225. $matchCount = preg_match_all("~<(A|AREA)\\s+([^>]*?\\s+)*?HREF\\s*=\\s*([^\\s>]+?)~i", $followData, $matches);
  1226. for ($i = 0; $i < $matchCount; $i++)
  1227. {
  1228. $urls[] = $matches[3][$i];
  1229. }
  1230. }
  1231. /* Do JavaScript link searching */
  1232. if ($isearch_config['javascript_link_search'])
  1233. {
  1234. /* Search for window.open() calls */
  1235. $matchCount = preg_match_all("~window.open\\s*\\(\\s*'(.+?)',~i", $allData, $matches);
  1236. for ($i = 0; $i < $matchCount; $i++)
  1237. {
  1238. if ($matches[1][$i] != '')
  1239. {
  1240. $urls[] = $matches[1][$i];
  1241. }
  1242. }
  1243. $matchCount = preg_match_all("~window.open\\s*\\(\\s*\"(.+?)\",~i", $allData, $matches);
  1244. for ($i = 0; $i < $matchCount; $i++)
  1245. {
  1246. if ($matches[1][$i] != '')
  1247. {
  1248. $urls[] = $matches[1][$i];
  1249. }
  1250. }
  1251. }
  1252. /* Remove JavaScript and comments */
  1253. $followData = preg_replace('#(<SCRIPT[^>]*?>.*?</SCRIPT>)|(<!--.*?-->)#i', '', $followData);
  1254. /* Find any double quoted links */
  1255. $matchCount = preg_match_all("~<(A|AREA)\\s+([^>]*?\\s+…

Large files are truncated, but you can click here to view the full file