/search/inc/spider.inc.php
PHP | 2239 lines | 1773 code | 272 blank | 194 comment | 438 complexity | 8db91d84a3e69e7c0759b1218c936bc1 MD5 | raw file
Possible License(s): BSD-3-Clause
Large files are truncated, but you can click here to view the full file
- <?php
-
- /******************************************************************************
- * iSearch2 - website search engine *
- * *
- * Visit the iSearch homepage at http://www.iSearchTheNet/isearch *
- * *
- * Copyright (C) 2002-2007 Z-Host. All rights reserved. *
- * *
- ******************************************************************************/
-
/* Refuse to run unless the including script has defined the IN_ISEARCH constant. */
if (defined('IN_ISEARCH') === False)
{
    exit('Hacking attempt');
}
-
/**
 * Fetch and parse robots.txt for a domain and store the resulting exclusions.
 *
 * Reads http://$domain/robots.txt through isearch_readFile(), collects the
 * "Disallow" rules of every record addressed to the "isearch" user agent (or
 * to "*" as long as no record has matched before), turns each rule into an
 * anchored URL pattern appended to $isearch_config['robots_excludes'], and
 * finally writes the domain list and the exclusion list to the info table.
 *
 * @param string $domain Host part used to build the robots.txt URL.
 */
function isearch_parseRobots($domain)
{
    global $isearch_config;
    global $isearch_table_info;
    global $isearch_db;
    global $isearch_base;

    $isearch_config['robots_domains'][] = $domain;

    // isearch_readFile() overwrites $isearch_base. Save it in a temp variable
    // and restore it afterwards so that other relative URLs can still be
    // evaluated after parsing robots.txt.
    $savedBase = $isearch_base;
    $allData = isearch_readFile("http://$domain/robots.txt");
    $isearch_base = $savedBase;

    if ($allData != '')
    {
        $disallow = array();
        $validUseragent = False;
        $matched = False;

        $lines = preg_split('/\r|\n/', strtolower($allData));
        foreach ($lines as $line)
        {
            /* Drop comments and normalise white space */
            $line = preg_replace('/#.*$/', '', $line);
            $line = preg_replace('/[[:space:]]+/', ' ', $line);

            $temp = explode(':', $line, 2);
            if (count($temp) != 2)
            {
                continue;
            }

            $field = trim($temp[0]);
            $value = trim($temp[1]);
            if ($field == 'user-agent')
            {
                /* A new record starts - decide whether it applies to us */
                $validUseragent = False;
                foreach (explode(' ', $value) as $useragent)
                {
                    if (($useragent == 'isearch') || (($useragent == '*') && (!$matched)))
                    {
                        $matched = True;
                        $validUseragent = True;
                    }
                }
            }
            else if (($validUseragent) && ($field == 'disallow'))
            {
                if ($value == '')
                {
                    /* This is an allow - remove all previous disallows */
                    $disallow = array();
                }
                else
                {
                    $disallow[] = $value;
                }
            }
        }

        foreach ($disallow as $rule)
        {
            if ($rule[0] != '/')
            {
                $rule = '/' . $rule;
            }

            /* Escape regex metacharacters and expand the "*" wildcard. The
             * replacements run in array order, so the "." inserted for "*"
             * is not escaped again. */
            $isearch_config['robots_excludes'][] = str_replace(
                array('.', '*', '?', '+'),
                array('\.', '.*', '\?', '\+'),
                "^http://$domain$rule");
        }
    }

    /* The exclusion list may never have been initialised when no rule exists */
    $excludes = (isset($isearch_config['robots_excludes']) && is_array($isearch_config['robots_excludes']))
        ? $isearch_config['robots_excludes']
        : array();

    if (!mysql_query("UPDATE $isearch_table_info SET robots_domains='" . isearch_escape_string(implode(" ", $isearch_config['robots_domains'])) . "', robots_excludes='" . isearch_escape_string(implode(" ", $excludes)) . "' WHERE id='1'", $isearch_db))
    {
        isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
    }
}
-
-
/**
 * Remove every row from the spider log table.
 */
function isearch_clearLog()
{
    global $isearch_table_spider_log, $isearch_db;

    $sql = "DELETE FROM $isearch_table_spider_log";
    mysql_query($sql, $isearch_db);
}
-
-
/**
 * Return the contents of the spider log as an HTML fragment.
 *
 * Rows are read in id order; each message is HTML-escaped and terminated with
 * a <BR> line break.
 *
 * @return string HTML text, or '' when the query fails or the log is empty.
 */
function isearch_getLog()
{
    global $isearch_table_spider_log;
    global $isearch_db;

    $log = '';

    $result = mysql_query("SELECT * FROM $isearch_table_spider_log ORDER BY id", $isearch_db);
    if ($result)
    {
        while ($item = mysql_fetch_object($result))
        {
            /* Spaces become non-breaking so the browser keeps the spacing of
             * the message. NOTE(review): the replacement previously read as a
             * plain space (a no-op); '&nbsp;' is the presumed intent. */
            $log .= str_replace(' ', '&nbsp;', htmlentities($item->msg)) . "<BR>\n";
        }
    }

    return $log;
}
-
-
/**
 * Record a message in the spider log table and optionally echo it.
 *
 * The message is stored when $level does not exceed
 * $isearch_config['log_level']. It is echoed as plain text when running from
 * the command line ($isearch_fromCommandLine) and $level does not exceed
 * $isearch_logEchoLevel, otherwise as HTML when $level does not exceed
 * $isearch_config['log_echo_level'].
 *
 * @param string $string Message text.
 * @param int    $level  Verbosity level of the message (lower = more important).
 */
function isearch_log($string, $level=1)
{
    global $isearch_table_spider_log;
    global $isearch_db;
    global $isearch_config;
    global $isearch_fromCommandLine;
    global $isearch_logEchoLevel;

    if ($level <= $isearch_config['log_level'])
    {
        mysql_query("INSERT INTO $isearch_table_spider_log (msg) VALUES ('" . isearch_escape_string($string) . "')", $isearch_db);
    }

    if (isset($isearch_fromCommandLine) && $isearch_fromCommandLine)
    {
        if ($level <= $isearch_logEchoLevel)
        {
            echo "$string\n";
        }
    }
    else if ($level <= $isearch_config['log_echo_level'])
    {
        /* Spaces become non-breaking so the browser keeps the spacing of the
         * message. NOTE(review): the replacement previously read as a plain
         * space (a no-op); '&nbsp;' is the presumed intent. */
        echo str_replace(' ', '&nbsp;', htmlentities($string)) . "<BR>\n";
    }
}
-
-
/**
 * Clean up a string to make it suitable for storing in the search index.
 *
 * Steps, in order: optional 8-bit lower-casing, HTML tag removal, replacement
 * of breaking characters, per-character punctuation policy, removal of
 * unwanted characters, optional charset conversion with iconv and white space
 * compaction.
 *
 * Punctuation policy values read from $isearch_config['allow_*']:
 *   0 = replace with a space, 1 = keep only inside words, 3 = remove,
 *   any other value = leave untouched.
 *
 * @param string $data    Raw text (may contain HTML).
 * @param string $charset Character set of $data; '' skips the conversion.
 * @return string Cleaned text.
 */
function isearch_cleanString($data, $charset)
{
    global $isearch_config;

    if ($isearch_config['char_set_8_bit'])
    {
        /* Convert to lower case, doing accented character conversion
         * correctly. The tables are built from byte values (A-Z, 0xC0-0xDE,
         * 0x8A, 0x8E -> the byte 0x20 / 0x10 above) rather than from literal
         * accented characters, so they do not depend on the encoding this
         * source file is saved in. */
        $upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
        $lower = 'abcdefghijklmnopqrstuvwxyz';
        for ($code = 0xc0; $code <= 0xde; $code++)
        {
            $upper .= chr($code);
            $lower .= chr($code + 0x20);
        }
        $data = strtr($data, $upper . chr(0x8a) . chr(0x8e), $lower . chr(0x9a) . chr(0x9e));
    }

    /* Strip out all HTML tags */
    $data = strip_tags($data);

    /* Replace some breaking chars (backslash ; ? !) with spaces */
    $data = preg_replace('/[\\\\;?!]+/', ' ', $data);

    /* Apply the configured policy to each punctuation character */
    $punctuation = array(
        'allow_dashes'      => '-',
        'allow_colons'      => ':',
        'allow_dots'        => '.',
        'allow_commas'      => ',',
        'allow_underscores' => '_'
    );
    foreach ($punctuation as $option => $char)
    {
        $mode = $isearch_config[$option];
        if ($mode == 0)
        {
            /* Replace with spaces */
            $data = str_replace($char, ' ', $data);
        }
        else if ($mode == 1)
        {
            /* Allow within words: drop it when next to a space */
            $data = str_replace(' ' . $char, ' ', str_replace($char . ' ', ' ', $data));
        }
        else if ($mode == 3)
        {
            /* Remove All */
            $data = str_replace($char, '', $data);
        }
    }

    if ($isearch_config['char_set_8_bit'])
    {
        /* Strip out all characters except whitespace numeric and alpha */
        $data = preg_replace('/([^-@0-9a-z:\\.,' . chr(0xbf) . '-' . chr(0xff) . chr(0x9a) . chr(0x9e) . '\s])/', '', $data);
    }
    else
    {
        /* Remove any single quotes and backslashes. */
        $data = str_replace(array("'", '\\'), '', $data);
    }

    /* Convert from source charset to charset used on results page */
    if ((strtolower($charset) != $isearch_config['char_set']) &&
        ($charset != '') &&
        ($isearch_config['char_set'] != ''))
    {
        if (function_exists('iconv'))
        {
            isearch_log("INFO: Converting $charset -> " . $isearch_config['char_set'], 5);
            $convertedData = iconv($charset, $isearch_config['char_set'], $data);
            if ($convertedData === False)
            {
                /* Keep the unconverted text rather than losing it */
                isearch_log("WARNING: Unable to convert $charset -> " . $isearch_config['char_set'], 3);
            }
            else
            {
                $data = $convertedData;
            }
        }
        else
        {
            isearch_log("WARNING: iconv not installed - unable to convert $charset -> " . $isearch_config['char_set'], 5);
        }
    }

    /* Compact all white space into a single space character */
    $data = preg_replace("/\\s+/", ' ', $data);

    /* Strip white space from beginning and end of the string */
    return trim($data);
}
-
-
/**
 * Read up to $length bytes from a stream in chunks of at most 16 kbytes.
 *
 * Reading stops at end of file, when $length bytes have been read, or when a
 * read makes no progress. A warning is logged when the stream is not at end
 * of file afterwards.
 *
 * @param resource $handle Open stream.
 * @param int      $length Maximum number of bytes to read.
 * @return string The bytes read (possibly empty).
 */
function isearch_fread($handle, $length = 2147483647)
{
    $bytesToRead = $length;

    $contents = '';
    while ((!feof($handle)) && ($bytesToRead > 0))
    {
        $data = fread($handle, ($bytesToRead > 16384) ? 16384 : $bytesToRead);
        if (($data === False) || ($data === ''))
        {
            /* Read error or timeout: feof() stays false in that case, so
             * without this check the loop would never terminate. */
            break;
        }
        $bytesToRead -= strlen($data);
        $contents .= $data;
    }
    if (!feof($handle))
    {
        isearch_log('WARNING: File reading was truncated at '.($length/1024).' kbytes', 3);
    }
    return $contents;
}
-
-
/**
 * Convert a URL reference into an absolute URL.
 *
 * - A reference that starts with "scheme:" is returned unchanged.
 * - A reference that starts with "/" is appended to the scheme/host part of
 *   $relativeToUrl.
 * - Anything else is resolved against the directory of $relativeToUrl, with
 *   "." and ".." segments and repeated slashes removed.
 *
 * Query and fragment of $relativeToUrl are discarded. URLs are rebuilt with
 * glue_url().
 *
 * @param string $newUrl        Reference found in a document or header.
 * @param string $relativeToUrl URL the reference is relative to.
 * @return string|null Absolute URL; null (after logging a warning) when
 *                     $relativeToUrl has no scheme or host.
 */
function isearch_relativeToAbsoluteUrl($newUrl, $relativeToUrl)
{
    /* Already absolute */
    if (preg_match('#^[a-z]+:#i', $newUrl))
    {
        return $newUrl;
    }

    $relativeParts = @parse_url($relativeToUrl);
    if ((!isset($relativeParts['scheme'])) || (!isset($relativeParts['host'])))
    {
        /* Unable to parse relativeToUrl */
        isearch_log("WARNING: Unable to parse relativeToUrl [$relativeToUrl]", 3);
        return;
    }
    unset($relativeParts['query']);
    unset($relativeParts['fragment']);

    if (strncmp($newUrl, '/', 1) == 0)
    {
        /* New URL begins with a slash. It is within the site */
        unset($relativeParts['path']);
        return glue_url($relativeParts) . $newUrl;
    }

    /* A relative reference (must be within this site) */
    if (isset($relativeParts['path']))
    {
        /* Remove filename following the last slash (a last segment that
         * contains a dot is taken to be a file name) */
        $path = preg_replace('#/[^/]*\.[^/]*$#D', '/', $relativeParts['path']);
        if (substr($path, -1) != '/')
        {
            $path .= '/';
        }
        $path .= $newUrl;
    }
    else
    {
        $path = '/' . $newUrl;
    }
    $path = preg_replace('#/\.$#D', '', $path);    /* Remove ending "/." */
    $path = preg_replace('#/(\./)+#', '/', $path); /* Remove any "." references */
    $path = preg_replace('#/+/#', '/', $path);     /* Remove excess slashes */

    /* Resolve any ".." references. $resolved[0] is the (empty) text before
     * the leading slash; a ".." directly after it has nothing to remove and
     * is kept. */
    $segments = explode('/', $path);
    $resolved = array($segments[0]);
    for ($i = 1; $i < count($segments); $i++)
    {
        if (($segments[$i] == '..') && (count($resolved) > 1))
        {
            array_pop($resolved);
        }
        else
        {
            $resolved[] = $segments[$i];
        }
    }
    $relativeParts['path'] = implode('/', $resolved);

    return glue_url($relativeParts);
}
-
-
/**
 * Download a document and return its body.
 *
 * The URL is first rewritten with the url_search / url_replace configuration
 * (if set), then fetched with fopen, raw sockets or cURL depending on
 * $isearch_config['reading_mechanism'] (0 = autodetect). Redirects (301/302)
 * are followed up to a fixed limit in socket and cURL mode. Inline frames,
 * and external scripts when javascript_link_search is 2, are fetched
 * recursively and inserted into the returned document.
 *
 * Side effects: sets the globals $isearch_header (lower-cased response
 * headers) and $isearch_base (base URL for resolving relative links).
 *
 * @param string $url   URL to read.
 * @param int    $depth Current nesting depth of frame/script inclusion.
 * @return string Document body, or '' on any failure.
 */
function isearch_readFile($url, $depth=0)
{
    global $isearch_config;
    global $isearch_version;
    global $isearch_header;
    global $isearch_base;
    global $isearch_url_fopen_detected, $isearch_curl_detected, $isearch_sockets_detected;

    $isearch_header = array();

    isearch_log("TRACE: isearch_readFile($url, $depth)", 10);

    if ($depth >= 10)
    {
        /* Inline frame depth of 10 */
        isearch_log("WARNING: Inline frame depth limit $depth exceeded", 3);
        return '';
    }

    if ($isearch_config['url_search'] != '')
    {
        /* The configured expression is wrapped in a control-character
         * delimiter so it needs no escaping. "$" is escaped in the
         * replacement because only "\digit" was a back reference in the
         * POSIX-style replacement this option was written for.
         * NOTE(review): POSIX and PCRE syntax differ in corner cases (e.g.
         * backslashes inside bracket expressions) - verify existing
         * configurations. */
        $rewritten = preg_replace(
            chr(1) . $isearch_config['url_search'] . chr(1),
            str_replace('$', '\\$', $isearch_config['url_replace']),
            $url);
        if (is_string($rewritten))
        {
            $url = $rewritten;
        }
        isearch_log("INFO: Using replaced URL $url", 5);
    }

    if ($isearch_config['reading_mechanism'] == 0)
    {
        /* Autodetect */
        if (($isearch_url_fopen_detected) && (!$isearch_config['proxy_enable']))
        {
            $reading_mechanism = 1; /* fopen */
        }
        else if (($isearch_sockets_detected) && (preg_match('#^http://#i', $url)))
        {
            $reading_mechanism = 2; /* sockets */
        }
        else if ($isearch_curl_detected)
        {
            $reading_mechanism = 3; /* curl */
        }
        else
        {
            isearch_log('ERROR: Unable to detect a suitable reading mechanism.', 1);
            return '';
        }
    }
    else
    {
        $reading_mechanism = $isearch_config['reading_mechanism'];
    }

    $base = $url;
    $header = array();
    $docData = '';

    if ($reading_mechanism == 1)
    {
        /* Use fopen/fread */
        isearch_log("INFO: Reading $url using fopen/fread", 5);
        @ini_set('user_agent', "iSearch/$isearch_version");
        if ($isearch_config['basic_authorization'] != '')
        {
            /* Insert the credentials after the first "//" only; a later
             * "//" inside the path or query must stay untouched. */
            $authorityPos = strpos($url, '//');
            if ($authorityPos !== False)
            {
                $url = substr_replace($url, '//'.$isearch_config['basic_authorization'].'@', $authorityPos, 2);
            }
        }
        $fp = @fopen($url, 'r');
        if (!$fp)
        {
            isearch_log("WARNING: Unable to fopen URL [$url]", preg_match('#/robots\.txt$#D', $url) ? 9 : 3);
            return '';
        }

        $header_data = array();
        if (function_exists('stream_get_meta_data'))
        {
            /* wrapper_data is only present for wrappers that supply headers */
            $meta_data = stream_get_meta_data($fp);
            if (isset($meta_data['wrapper_data']) && is_array($meta_data['wrapper_data']))
            {
                $header_data = $meta_data['wrapper_data'];
            }
        }
        else if (isset($http_response_header) && is_array($http_response_header))
        {
            /* Prior to PHP 4.3.0 use $http_response_header instead of stream_get_meta_data() */
            $header_data = $http_response_header;
        }

        foreach ($header_data as $headerLine)
        {
            $data = explode(': ', $headerLine, 2);
            if (count($data) == 2)
            {
                $header[strtolower($data[0])] = $data[1];
            }
        }

        if (isset($header['content-location']))
        {
            isearch_log("INFO: Content-Location: ".$header['content-location'], 9);
            $base = isearch_relativeToAbsoluteUrl($header['content-location'], $base);
        }
        else if (isset($header['location']))
        {
            isearch_log("INFO: Location: ".$header['location'], 9);
            $base = isearch_relativeToAbsoluteUrl($header['location'], $base);
        }

        $docData = isearch_fread($fp, $isearch_config['max_file_size']);
        fclose($fp);
    }
    else
    {
        $recurse = 10;
        while (1)
        {
            /* Check URL and determine whether this is a file or directory */
            $urlParts = @parse_url($url);
            if ((!isset($urlParts['scheme'])) || (!isset($urlParts['host'])))
            {
                isearch_log("WARNING: Unable to parse URL [$url]", 3);
                return '';
            }

            if (!preg_match('#^(https?|ftps?)$#Di', $urlParts['scheme']))
            {
                isearch_log("WARNING: Unsupported URL scheme " . $urlParts['scheme'] . " [$url]", 4);
                return '';
            }

            if ($reading_mechanism == 2)
            {
                isearch_log("INFO: Reading $url using sockets", 5);

                if ($urlParts['scheme'] == 'http')
                {
                    $secure = False;
                }
                else if ($urlParts['scheme'] == 'https')
                {
                    $secure = True;
                }
                else
                {
                    isearch_log("WARNING: URL scheme " . $urlParts['scheme'] . " not supported by sockets. Use CURL library. [$url]", 3);
                    return '';
                }

                if ($isearch_config['proxy_enable'])
                {
                    $host = $isearch_config['proxy_host'];
                    $port = $isearch_config['proxy_port'];
                }
                else
                {
                    $host = $urlParts['host'];
                    if (isset($urlParts['port']))
                    {
                        $port = $urlParts['port'];
                    }
                    else
                    {
                        $port = $secure ? 443 : 80;
                    }
                }

                $sock = fsockopen(($secure ? 'ssl://' : '').$host, $port, $errno, $errstr);
                if (!$sock)
                {
                    isearch_log("ERROR: Unable to open socket to " . $host . " " . $port . " - $errno : $errstr", 1);
                    return '';
                }

                /* The Host header names the origin server, which differs
                 * from the connected host when a proxy is in use. */
                $hostHeader = $urlParts['host'];
                if (isset($urlParts['port']))
                {
                    $hostHeader .= ':' . $urlParts['port'];
                }

                $request = "GET $url HTTP/1.0\r\n";
                $request .= "Host: $hostHeader\r\n";
                if (($isearch_config['proxy_enable']) && ($isearch_config['proxy_user'] != ''))
                {
                    $request .= "Proxy-Authorization: Basic " . base64_encode ($isearch_config['proxy_user'].':'.$isearch_config['proxy_pass']) . "\r\n";
                }

                $request .= "User-Agent: iSearch/$isearch_version\r\n";
                if ($isearch_config['basic_authorization'] != '')
                {
                    $request .= "Authorization: Basic " . base64_encode($isearch_config['basic_authorization']) . "\r\n";
                }
                $request .= "Connection: Close\r\n\r\n";

                fputs($sock, $request);

                $allData = isearch_fread($sock, $isearch_config['max_file_size']);
                fclose($sock);
            }
            else
            {
                /* Use the CURL library */
                isearch_log("INFO: Reading $url using CURL", 5);
                $ch = curl_init($url);

                // curl_setopt($ch, CURLOPT_VERBOSE, 1);
                curl_setopt($ch, CURLOPT_USERAGENT, "iSearch/$isearch_version");
                curl_setopt($ch, CURLOPT_HEADER, TRUE);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
                /* NOTE(review): certificate verification is disabled here,
                 * so HTTPS peers are not authenticated. */
                curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
                curl_setopt($ch, CURLOPT_TIMEOUT, 30);

                curl_setopt($ch, CURLOPT_COOKIEJAR, "my_cookies.txt"); // Initiates cookie file if needed
                curl_setopt($ch, CURLOPT_COOKIEFILE, "my_cookies.txt"); // Uses cookies from previous session if exist

                if ($isearch_config['proxy_enable'])
                {
                    if ($isearch_config['proxy_user'] != '')
                    {
                        curl_setopt($ch, CURLOPT_PROXYUSERPWD, $isearch_config['proxy_user'].':'.$isearch_config['proxy_pass']);
                    }
                    curl_setopt($ch, CURLOPT_PROXY, $isearch_config['proxy_host']);
                    curl_setopt($ch, CURLOPT_PROXYPORT, $isearch_config['proxy_port']);
                }
                if ($isearch_config['basic_authorization'] != '')
                {
                    curl_setopt($ch, CURLOPT_USERPWD, $isearch_config['basic_authorization']);
                }

                $allData = curl_exec($ch);

                curl_close($ch);
            }

            /* Split the raw response into header block and body. A failed
             * transfer yields an empty status line, reported below. */
            $responseParts = explode("\r\n\r\n", (string) $allData, 2);
            $headerData = $responseParts[0];
            $docData = isset($responseParts[1]) ? $responseParts[1] : '';
            $headerLines = explode("\r\n", $headerData);
            $status = $headerLines[0];
            isearch_log("INFO: Status Line $status", 8);

            /* Walk backwards so the first occurrence of a header wins */
            $header = array();
            for ($i = count($headerLines)-1; $i > 0; $i--)
            {
                $data = explode(': ', $headerLines[$i], 2);
                if (count($data) == 2)
                {
                    $header[strtolower($data[0])] = $data[1];
                }
            }

            /* Check status code:
             * "200" ; OK
             * "201" ; Created
             * "202" ; Accepted
             * "204" ; No Content
             * "301" ; Moved Permanently
             * "302" ; Moved Temporarily
             * "304" ; Not Modified
             * "400" ; Bad Request
             * "401" ; Unauthorized
             * "403" ; Forbidden
             * "404" ; Not Found
             * "500" ; Internal Server Error
             * "501" ; Not Implemented
             * "502" ; Bad Gateway
             * "503" ; Service Unavailable
             */
            $statusParts = explode(' ', $status, 3);
            if (count($statusParts) < 2)
            {
                isearch_log('ERROR: Unable to read status code', 1);
                return '';
            }

            $statusCode = $statusParts[1];

            if (($statusCode == '301') || ($statusCode == '302'))
            {
                /* Redirection. Get new location */
                if ($recurse <= 1)
                {
                    /* Recursion limit reached */
                    isearch_log('ERROR: URL recursion limit 10 exceeded', 1);
                    return '';
                }

                if ((!isset($header['location'])) || ($header['location'] == ''))
                {
                    isearch_log("WARNING: Redirect without Location header : $status [$url]", 3);
                    return '';
                }

                /* The Location value may be relative to the current URL */
                $redirectUrl = isearch_relativeToAbsoluteUrl($header['location'], $url);
                if (!$redirectUrl)
                {
                    return '';
                }

                $url = $redirectUrl;
                $recurse = $recurse - 1;
            }
            else if ($statusCode >= 300)
            {
                isearch_log("WARNING: HTTP Error : $status [$url]", preg_match('#/robots\.txt$#D', $url) ? 9 : 3);
                return '';
            }
            else
            {
                /* We have read the file */
                break;
            }
        }

        if (isset($header['content-location']))
        {
            isearch_log("INFO: Content-Location: ".$header['content-location'], 9);
            $base = $header['content-location'];
        }
        else
        {
            isearch_log("INFO: Url: ".$url, 9);
            $base = $url;
        }
    }

    if (preg_match("#<BASE\\s+[^>]*?HREF\\s*=\\s*['\"]?([^>]+?)['\"]?[\\s>]#i", $docData, $matches) == 1)
    {
        /* Found a "<BASE HREF=" tag in the document head */
        isearch_log("INFO: BASE: ".$matches[1], 9);
        $base = $matches[1];
    }

    /* Fetched content is inserted literally: "\" and "$" must be escaped or
     * preg_replace() would treat "$1", "\0" etc. in it as back references. */
    $literalReplacement = array('\\' => '\\\\', '$' => '\\$');

    /* Search for inline frames and replace them with frame contents */
    $regexp = "#<IFRAME[^>]*?\\sSRC\\s*=\\s*['\"]?(.*?)[\\s'\"][^>]*>#i";
    $matchCount = preg_match_all($regexp, $docData, $matches);
    for ($i = 0; $i < $matchCount; $i++)
    {
        $frameUrl = isearch_relativeToAbsoluteUrl($matches[1][$i], $base);
        isearch_log("INFO: Reading inline frame : $frameUrl", 5);
        $frameData = isearch_readFile($frameUrl, $depth+1);
        $docData = preg_replace($regexp, strtr($frameData, $literalReplacement), $docData, 1);
    }

    if ($isearch_config['javascript_link_search'] == 2)
    {
        /* Search for external JavaScript files and replace them with file contents */
        $regexp = "#<SCRIPT[^>]*?\\sSRC\\s*=\\s*['\"](.*?)['\"][^>]*>#i";
        $matchCount = preg_match_all($regexp, $docData, $matches);
        for ($i = 0; $i < $matchCount; $i++)
        {
            $jsUrl = isearch_relativeToAbsoluteUrl($matches[1][$i], $base);
            isearch_log("INFO: Reading javascript : $jsUrl", 5);
            $jsData = isearch_readFile($jsUrl, $depth+1);
            $docData = preg_replace($regexp, strtr("<SCRIPT>\n<!--\n".$jsData."\n-->\n</SCRIPT>\n", $literalReplacement), $docData, 1);
        }
    }

    /* Set last: the recursive calls above overwrite both globals */
    $isearch_header = $header;
    $isearch_base = $base;

    return $docData;
}
-
-
/**
 * Convert a binary document to text/HTML with an external program.
 *
 * The data is written to a temporary file in $isearch_config['tmpdir'] and
 * the command configured in $isearch_config[$type . '_exec'] is run on it.
 * Output is taken from "<tmpfile>.txt" when the program created it, otherwise
 * from its standard output; "<tmpfile>.err" is treated as an error message.
 * For types other than 'pdf' the text is wrapped in a minimal HTML page whose
 * title is the file name part of $url.
 *
 * @param string $data Raw document contents.
 * @param string $type Converter key, e.g. 'pdf' or 'msword'.
 * @param string $url  URL the document came from (used for the title).
 * @return string Converted document, or '' on any failure.
 */
function isearch_execConvert($data, $type, $url)
{
    global $isearch_config;

    isearch_log("INFO: Exec conversion $type", 7);

    // Validate the configuration first so that no temporary file is left
    // behind when the executable is not set
    $cmd = isset($isearch_config[$type . '_exec']) ? $isearch_config[$type . '_exec'] : '';
    if ($cmd == '')
    {
        isearch_log("ERROR: Configuration error - executable for $type not set", 1);
        return '';
    }

    // Create a temporary filename to use for the conversion
    $tmpfname = tempnam($isearch_config['tmpdir'], "iSearch");
    if ($tmpfname === False)
    {
        isearch_log("ERROR: Unable to create tmp file", 1);
        return '';
    }

    // Write data to the temp file
    $fh = fopen($tmpfname, 'wb');
    if (!$fh)
    {
        isearch_log("ERROR: Unable to open tmp file $tmpfname", 1);
        unlink($tmpfname);
        return '';
    }
    fwrite($fh, $data);
    fclose($fh);

    // Execute the command
    if ($type == 'pdf')
    {
        $cmd .= ' -htmlmeta '.escapeshellarg($tmpfname).' -';
    }
    else
    {
        $cmd .= ' '.escapeshellarg($tmpfname);
    }

    $output = array();
    exec($cmd, $output, $retval);

    // Delete the temporary file
    unlink($tmpfname);

    // Read the stderr and stdout files
    if (is_file("$tmpfname.err"))
    {
        $err = trim(@implode(" ", file("$tmpfname.err")));
        unlink("$tmpfname.err");
    }
    else
    {
        $err = '';
    }

    if (is_file("$tmpfname.txt"))
    {
        $text = @implode("\n", file("$tmpfname.txt"));
        unlink("$tmpfname.txt");
    }
    else
    {
        $text = implode("\n", $output);
    }

    if ($retval != 0)
    {
        isearch_log("ERROR: Executed command $cmd $tmpfname, Return Code $retval", 1);
        return '';
    }
    isearch_log("INFO: Executed command $cmd $tmpfname, Return Code $retval", 5);

    if ($err != '')
    {
        isearch_log("ERROR: Executed command $cmd $tmpfname, Error Msg: $err", 1);
        return '';
    }

    if ($text == '')
    {
        isearch_log("ERROR: Unable to read converted file", 2);
        return '';
    }

    if ($type != 'pdf')
    {
        // Wrap text in HTML, titled with the part of the URL after the last
        // slash or backslash
        $fname = preg_replace('#^.*[/\\\\]#s', '', $url);
        $text = "<HTML><HEAD><TITLE>$fname</TITLE></HEAD><BODY><PRE>$text</PRE></BODY></HTML>";
    }

    return $text;
}
-
-
/**
 * Convert a binary document to text/HTML using the online conversion service.
 *
 * The data is sent as a multipart/form-data POST to convert.iSearchTheNet.com,
 * authorised with $isearch_config['online_id']. A gzip-compressed reply is
 * inflated. A reply whose first word does not contain "<HTML>" is wrapped in
 * a minimal HTML page whose title is the file name part of $url.
 *
 * @param string $data Raw document contents.
 * @param string $type Document type, sent as query parameter and MIME subtype.
 * @param string $url  URL the document came from (used for the title).
 * @return string Converted document; '' when the connection or inflation
 *                fails; the unmodified $data when the service does not
 *                answer with status 200.
 */
function isearch_onlineConvert($data, $type, $url)
{
    global $isearch_config;
    global $isearch_version;

    $host='convert.iSearchTheNet.com';
    $port=80;
    $path='/';
    $query="?type=$type&gzip=0";

    isearch_log("INFO: Online conversion $type", 7);

    $sock = fsockopen($host, $port, $errno, $errstr);
    if (!$sock)
    {
        isearch_log("ERROR: Unable to open socket $host $port - $errno : $errstr", 1);
        return '';
    }

    $boundary = '---------------------------' . md5('boundary');

    $postValues = "--$boundary\r\n";
    $postValues .= "Content-Disposition: form-data; name=\"file.1\"; filename=\"file.1\"\r\n";
    $postValues .= "Content-Type: application/$type\r\n";
    $postValues .= "\r\n";
    $postValues .= "$data\r\n";
    $postValues .= "--$boundary--\r\n\r\n";

    $request = "POST http://$host$path$query HTTP/1.0\r\n";
    $request .= "User-Agent: iSearch/$isearch_version\r\n";
    $request .= "Host: $host\r\n";
    $request .= "Authorization: ISEARCH " . $isearch_config['online_id'] . "\r\n";
    $request .= "Content-Type: multipart/form-data; boundary=$boundary\r\n";
    $request .= "Content-Length: " . strlen( $postValues ) . "\r\n";
    // $request .= "Connection: Close\r\n";
    $request .= "\r\n";

    fputs($sock, $request.$postValues);

    /* Read status line */
    $status = fgets($sock, 1024);

    /* Read (and discard) the header, up to the first empty line */
    while (!feof($sock))
    {
        $line = trim(fgets($sock, 1024));
        if ($line == '')
        {
            break;
        }
    }
    $convertedData = isearch_fread($sock);
    fclose($sock);

    /* fgets() returns false when nothing could be read */
    $statusParts = explode(' ', (string) $status, 3);
    if ((count($statusParts) < 2) || ($statusParts[1] != '200'))
    {
        isearch_log("ERROR: Online conversion error: $status", 1);
        return $data;
    }

    /* gzip magic number 0x1f 0x8b; the bytes must be compared by ordinal
     * value - comparing the one-character strings with integers never
     * matches. */
    if ((strlen($convertedData) > 10) && (ord($convertedData[0]) == 0x1f) && (ord($convertedData[1]) == 0x8b))
    {
        isearch_log("INFO: Running gzinflate on converted data", 6);
        /* Skip the 10 byte gzip header; assumes no optional header fields */
        $inflated = gzinflate(substr($convertedData,10));
        if ($inflated === False)
        {
            isearch_log("ERROR: Unable to inflate converted data", 1);
            return '';
        }
        $convertedData = $inflated;
    }

    // Check the first word of the converted data
    $firstWord = '';
    sscanf($convertedData, ' %s ', $firstWord);
    if (!preg_match('/<HTML>/i', (string) $firstWord))
    {
        // Wrap text in HTML, titled with the part of the URL after the last
        // slash or backslash
        $fname = preg_replace('#^.*[/\\\\]#s', '', $url);
        $convertedData = "<HTML><HEAD><TITLE>$fname</TITLE></HEAD><BODY><PRE>$convertedData</PRE></BODY></HTML>";
    }

    return $convertedData;
}
-
/**
 * Encode a code point as a UTF-8 byte sequence.
 *
 * @param int $ch Code point.
 * @return string 1 to 4 bytes, or '' when $ch is above 0x1fffff.
 */
function isearch_utf8_chr($ch)
{
    if ($ch <= 0x7f)
    {
        return chr($ch);
    }

    if ($ch <= 0x7ff)
    {
        return chr(($ch >> 6) + 0xc0).chr(($ch & 0x3f) + 0x80);
    }

    if ($ch <= 0xffff)
    {
        return chr(($ch >> 12) + 0xe0).chr((($ch >> 6) & 0x3f) + 0x80).chr(($ch & 0x3f) + 0x80);
    }

    if ($ch <= 0x1fffff)
    {
        /* The last byte carries the low six bits of $ch */
        return chr(($ch >> 18) + 0xf0).chr((($ch >> 12) & 0x3f) + 0x80).chr((($ch >> 6) & 0x3f) + 0x80).chr(($ch & 0x3f) + 0x80);
    }

    // Invalid UTF-8 code
    return '';
}
-
/**
 * Decode HTML entities, with fallbacks for old PHP versions.
 *
 * On PHP 4 numeric entities are decoded by hand (as UTF-8 sequences when
 * $charset is UTF-8) because html_entity_decode() does not support
 * multi-byte charsets there. When html_entity_decode() does not exist, a
 * translation table is used for named entities. Otherwise the call is passed
 * on to html_entity_decode().
 *
 * @param string $string  Text containing HTML entities.
 * @param int    $quote   Quote style flag passed to html_entity_decode().
 * @param string $charset Target character set.
 * @return string Decoded text.
 */
function isearch_html_entity_decode($string, $quote, $charset)
{
    $version = phpversion();
    if (substr($version, 0, 1) == '4')
    {
        /* PHP 4 only. The "e" pattern modifier used below exists in PHP 4
         * and this branch is never entered on later versions. */
        if (strtolower($charset) == 'utf-8')
        {
            // PHP4 html_entity_decode does not support multi-byte charsets
            static $utf8_trans_tbl;

            $string = preg_replace('/&#x([0-9a-f]+);/ei', 'isearch_utf8_chr(hexdec("\\1"))', $string);
            $string = preg_replace('/&#([0-9]+);/e', 'isearch_utf8_chr(\\1)', $string);

            if (!isset($utf8_trans_tbl))
            {
                $utf8_trans_tbl = array();

                /* Map each entity name to the UTF-8 form of its character */
                foreach (get_html_translation_table(HTML_ENTITIES) as $val=>$key)
                {
                    $utf8_trans_tbl[$key] = utf8_encode($val);
                }
            }

            return strtr($string, $utf8_trans_tbl);
        }

        $string = preg_replace('/&#x([0-9a-f]+);?/ei', 'chr(hexdec("\\1"))', $string);
        $string = preg_replace('/&#([0-9]+);?/e', 'chr("\\1")', $string);
    }

    if (!function_exists('html_entity_decode'))
    {
        // html_entity_decode was new in PHP 4.3.0

        global $isearch_htmlToAsciiTrans;
        if (!isset($isearch_htmlToAsciiTrans))
        {
            /* Translate from HTML to ASCII */
            $isearch_htmlToAsciiTrans = array_flip(get_html_translation_table(HTML_ENTITIES));
        }
        return strtr($string, $isearch_htmlToAsciiTrans);
    }

    return html_entity_decode($string, $quote, $charset);
}
-
-
/**
 * Parse an HTTP date into a Unix timestamp (GMT).
 *
 * Accepts one of the following formats:
 *   Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123
 *   Sunday, 06-Nov-94 08:49:37 GMT ; RFC 850, obsoleted by RFC 1036
 *   Sun Nov  6 08:49:37 1994       ; ANSI C's asctime() format
 *
 * Two-digit years below 70 are taken as 20xx, other two-digit years as 19xx.
 *
 * @param string $httpDate Date string from an HTTP header.
 * @return int Timestamp, or 0 (after logging a warning) for an unknown format.
 */
function isearch_parseHttpDate($httpDate)
{
    static $months;

    if (!isset($months))
    {
        $months = array('jan'=>1, 'feb'=>2, 'mar'=>3, 'apr'=>4, 'may'=>5, 'jun'=>6, 'jul'=>7, 'aug'=>8, 'sep'=>9, 'oct'=>10, 'nov'=>11, 'dec'=>12);
    }

    $monthNames = 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec';

    if (preg_match("#^[a-z]+,? +([0-9]{1,2})[ -]+($monthNames)[ -]+([0-9]{2,4}) +([0-9]{2}):([0-9]{2}):([0-9]{2})#i", $httpDate, $matchName) == 1)
    {
        /* RFC 1123 / RFC 850: day month year time */
        $day = $matchName[1];
        $monthName = $matchName[2];
        $year = $matchName[3];
        $hour = $matchName[4];
        $min = $matchName[5];
        $sec = $matchName[6];
    }
    else if (preg_match("#^[a-z]+,?[ -]+($monthNames) +([0-9]{1,2}) +([0-9]{2}):([0-9]{2}):([0-9]{2}) +([0-9]{2,4})#i", $httpDate, $matchName) == 1)
    {
        /* asctime(): month day time year */
        $monthName = $matchName[1];
        $day = $matchName[2];
        $hour = $matchName[3];
        $min = $matchName[4];
        $sec = $matchName[5];
        $year = $matchName[6];
    }
    else
    {
        isearch_log('WARNING: Unknown date format : ' . $httpDate, 7);
        return 0;
    }

    $year = (int) $year;
    if ($year < 70)
    {
        $year += 2000;
    }
    else if ($year < 100)
    {
        $year += 1900;
    }

    return gmmktime((int) $hour, (int) $min, (int) $sec, $months[strtolower($monthName)], (int) $day, $year);
}
-
-
- /* Spider a single file. Returns true if there are more files to spider, else false */
- function isearch_indexAFile($verbose = True)
- {
- global $isearch_table_info, $isearch_table_urls, $isearch_table_urls_new, $isearch_table_words, $isearch_table_words_new;
- global $isearch_db;
- global $isearch_config;
- global $isearch_header;
- global $isearch_base;
-
- if (! $verbose)
- {
- /* Disable display of messages. */
- $isearch_config['log_echo_level'] = 0;
- }
-
- $resultUrls = mysql_query("SELECT * FROM $isearch_table_urls_new WHERE state='new' LIMIT 1", $isearch_db);
- if (!$resultUrls)
- {
- /* MySQL error. Sleep and try again */
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- sleep(5);
- return True;
- }
-
- if (mysql_num_rows($resultUrls) != 1)
- {
- isearch_log('INFO: Indexing completed.', 2);
-
- /* Indexing has completed */
- $now = time();
- if (!mysql_query("UPDATE $isearch_table_info SET last_update='$now' WHERE id='1'", $isearch_db))
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
-
- $result = mysql_query("SELECT url, state FROM $isearch_table_urls_new WHERE state!='ok'", $isearch_db);
- if (!$result)
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
- else if (mysql_num_rows($result) > 0)
- {
- isearch_log('INFO: Deleting the following URLs:', 4);
- while ($item = mysql_fetch_object($result))
- {
- isearch_log('INFO: ' . $item->url . ' (' . $item->state . ')', 4);
- }
- }
-
- /* Delete any unfound references */
- if (!mysql_query("DELETE FROM $isearch_table_urls_new WHERE state!='ok'", $isearch_db))
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
-
- /* Update referrer_id fields */
- $result = mysql_query("SELECT id, temp_referrer_id FROM $isearch_table_urls_new", $isearch_db);
- if (!$result)
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
- else if (mysql_num_rows($result) > 0)
- {
- mysql_query("LOCK TABLES $isearch_table_urls_new WRITE", $isearch_db);
- while ($item = mysql_fetch_object($result))
- {
- if (!mysql_query("UPDATE $isearch_table_urls_new SET referrer_id='$item->temp_referrer_id' WHERE id='$item->id'", $isearch_db))
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
- }
- mysql_query("UNLOCK TABLES", $isearch_db);
- }
-
- if ($isearch_config['test_mode'] == 0)
- {
- /* Swap the old and new tables */
- $backup_words = $isearch_table_words . '_tmp_backup';
- $backup_urls = $isearch_table_urls . '_tmp_backup';
- if (!mysql_query("RENAME TABLE $isearch_table_words TO $backup_words, " .
- "$isearch_table_words_new TO $isearch_table_words, " .
- "$backup_words TO $isearch_table_words_new, " .
- "$isearch_table_urls TO $backup_urls, " .
- "$isearch_table_urls_new TO $isearch_table_urls, " .
- "$backup_urls TO $isearch_table_urls_new", $isearch_db))
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
- }
-
- /* Empty the new words table */
- if (!mysql_query("DELETE FROM $isearch_table_words_new", $isearch_db))
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
-
- /* Empty the new urls table */
- if (!mysql_query("DELETE FROM $isearch_table_urls_new", $isearch_db))
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
-
- /* Optimize the tables */
- if (!mysql_query("OPTIMIZE TABLE $isearch_table_urls, $isearch_table_words", $isearch_db))
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
-
- return False;
- }
-
- $itemUrl = mysql_fetch_object($resultUrls);
-
- $allData = isearch_readFile($itemUrl->url);
-
- $newState = 'error';
-
- if ($allData == '')
- {
- isearch_log("ERROR: Unable to open URL [$itemUrl->url]", 1);
- $newState = 'notfound';
- }
- else
- {
- $size = strlen($allData);
- $sig = md5($allData);
-
- /* Look for a duplicate page */
- $resultSig = mysql_query("SELECT * FROM $isearch_table_urls_new WHERE sig='$sig' AND size='$size' AND NOT url='" . isearch_escape_string($itemUrl->url) . "'", $isearch_db);
- if (!$resultSig)
- {
- isearch_log('ERROR: MySQL error : ' . mysql_error() . " in " . __FILE__ . " line " . __LINE__, 1);
- }
- else if (mysql_num_rows($resultSig) > 0)
- {
- isearch_log("INFO: Duplicate URL - will not be processed [$itemUrl->url]", 4);
- $newState = 'duplicate';
- }
- else
- {
- /* No duplicates found. */
-
- isearch_log("INFO: Processing URL [$itemUrl->url]", 2);
-
-
- if ((preg_match('%\.(doc|pdf)(\?.*|#.*)?$%i', $itemUrl->url, $matches)) ||
- (preg_match('/^%(PDF)/i', $allData, $matches)))
- {
- if (strtolower($matches[1]) == 'doc')
- {
- switch ($isearch_config['msword_support'])
- {
- case 0:
- /* Do nothing */
- isearch_log("WARNING: Word support disabled", 3);
- break;
- case 1:
- $allData = isearch_execConvert($allData, 'msword', $itemUrl->url);
- break;
- case 2:
- $allData = isearch_onlineConvert($allData, 'msword', $itemUrl->url);
- break;
- default:
- isearch_log("ERROR: Illegal Word document support setting", 1);
- break;
- }
- }
- else
- {
- switch ($isearch_config['pdf_support'])
- {
- case 0:
- /* Do nothing */
- isearch_log("WARNING: PDF support disabled", 3);
- break;
- case 1:
- $allData = isearch_execConvert($allData, 'pdf', $itemUrl->url);
- break;
- case 2:
- $allData = isearch_onlineConvert($allData, 'pdf', $itemUrl->url);
- break;
- default:
- isearch_log("ERROR: Illegal PDF support setting", 1);
- break;
- }
- }
- }
-
-
- if (($isearch_config['keep_cache']) && ($isearch_config['test_mode'] == 0))
- {
- $cache = $allData;
- }
- else
- {
- $cache = '';
- }
-
- /* Collapse each run of whitespace (spaces, tabs, line breaks)
- * into a single space character.
- */
- $allData = preg_replace("/\\s+/", ' ', $allData);
-
- if (isset($isearch_header['content_type']) && ($isearch_header['content_type'] == 'text/plain'))
- {
- $headData = '';
- $bodyData = $allData;
- }
- else
- {
- $tdata = spliti('</head[^>]*>', $allData, 2);
- if (count($tdata) < 2)
- {
- $tdata = spliti('<body', $allData, 2);
- if (count($tdata) < 2)
- {
- isearch_log('WARNING: <BODY> and </HEAD> tags not found', 4);
- $headData = '';
- $bodyData = $allData;
- }
- else
- {
- $headData = $tdata[0];
- $bodyData = '<body' . $tdata[1];
- }
- }
- else
- {
- $headData = $tdata[0];
- $bodyData = $tdata[1];
- }
- }
-
- /* Strip out HTML comments from head data */
- $headData = preg_replace('/<!--.+?-->/','',$headData);
- $headData = preg_replace("/\\s+/", ' ', $headData);
-
- /* Strip out all HTML tags except ones we are interested in */
- /* Includes workaround for PHP bug. See http://bugs.php.net/bug.php?id=21311 */
- $headData = strip_tags(eregi_replace("<\!DOCTYPE [^>]*>", '', $headData), '<meta><title><base>');
-
- $keyWords = '';
- $description = '';
- $title = '';
- $index = True;
- $follow = True;
-
- $matchCount = preg_match_all("#<META\\s+([^>]*?)\\s*>#i", $allData, $matches);
- for ($i = 0; $i < $matchCount; $i++)
- {
- if (preg_match("#CONTENT\\s*=\\s*(['\"])(.*?)\\1#i", $matches[1][$i], $matchContent) == 1)
- {
- $metaContent = $matchContent[2];
- if (preg_match("#NAME\\s*=\\s*['\"]?(.*?)['\"]?(\\s|$)#i", $matches[1][$i], $matchName) == 1)
- {
- // <META NAME="keywords" CONTENT="keyword list">
- // <META NAME="description" CONTENT="description">
- // <META NAME="robots" CONTENT="nofollow,noindex,noarchive">
-
- $metaName = strtolower($matchName[1]);
- if ($metaName == 'keywords')
- {
- $keyWords = ereg_replace(',', ' ', $metaContent);
- }
- else if ($metaName == 'description')
- {
- $description = $metaContent;
- }
- else if ($metaName == 'robots')
- {
- if (eregi('noindex', $metaContent))
- {
- $index = False;
- }
- if (eregi('nofollow', $metaContent))
- {
- $follow = False;
- }
- if (eregi('noarchive', $metaContent))
- {
- $cache = '';
- }
- }
- }
- else if (preg_match("#HTTP-EQUIV\\s*=(['\"])(.*?)\\1#i", $matches[1][$i], $matchEquiv) == 1)
- {
- $isearch_header[strtolower($matchEquiv[2])] = $metaContent;
- }
- }
- }
-
- // Determine character set
- $charset = $isearch_config['char_set'];
- if ((isset($isearch_header['content-type'])) &&
- (preg_match("#(^|\\s)\\s*CHARSET\\s*=\\s*['\"]?(.*?)['\"]?(\\s|$)#i", $isearch_header['content-type'], $matches) == 1))
- {
- $charset = $matches[2];
- }
-
- $lastModified = 0;
- if (isset($isearch_header['last-modified']))
- {
- $lastModified = isearch_parseHttpDate($isearch_header['last-modified']);
- }
-
- $changefreq = '';
- $priority = -1;
-
- if ($follow)
- {
- $tdata1 = spliti('(<!-- ISEARCH_BEGIN_FOLLOW -->|</nofollow>)', ' ' . $bodyData);
- if (count($tdata1) > 1)
- {
- /* At least 1 found. */
-
- $followData = '';
-
- /* Check for an END_FOLLOW before the first BEGIN_FOLLOW */
- $tdata2 = spliti('(<!-- ISEARCH_END_FOLLOW -->|<nofollow>)', $tdata1[0]);
- if (count($tdata2) > 1)
- {
- /* An END_FOLLOW was found. Add anything before it into the follow data */
- $followData .= $tdata2[0];
- }
-
- for ($i = 1; $i < count($tdata1); $i++)
- {
- $tdata2 = spliti('(<!-- ISEARCH_END_FOLLOW -->|<nofollow>)', $tdata1[$i]);
- $followData .= $tdata2[0];
- }
- }
- else
- {
- $followData = $bodyData;
- }
-
- $urls = array();
- if ($isearch_config['follow_meta_refresh'] && isset($isearch_header['refresh']) &&
- (preg_match("#; *URL *= *(.*)$#i", $isearch_header['refresh'], $matches) == 1))
- {
- $urls[] = $matches[1];
- }
-
- /* Do aggressive link searching */
- if ($isearch_config['aggressive_link_search'])
- {
- $matchCount = preg_match_all("~(https?|ftps?)://[^'\"\\s>]*~i", $allData, $matches);
- for ($i = 0; $i < $matchCount; $i++)
- {
- $urls[] = $matches[0][$i];
- }
-
- /* Find any links with no quotes */
- $matchCount = preg_match_all("~<(A|AREA)\\s+([^>]*?\\s+)*?HREF\\s*=\\s*([^\\s>]+?)~i", $followData, $matches);
- for ($i = 0; $i < $matchCount; $i++)
- {
- $urls[] = $matches[3][$i];
- }
- }
-
- /* Do JavaScript link searching */
- if ($isearch_config['javascript_link_search'])
- {
- /* Search for window.open() calls */
- $matchCount = preg_match_all("~window.open\\s*\\(\\s*'(.+?)',~i", $allData, $matches);
- for ($i = 0; $i < $matchCount; $i++)
- {
- if ($matches[1][$i] != '')
- {
- $urls[] = $matches[1][$i];
- }
- }
- $matchCount = preg_match_all("~window.open\\s*\\(\\s*\"(.+?)\",~i", $allData, $matches);
- for ($i = 0; $i < $matchCount; $i++)
- {
- if ($matches[1][$i] != '')
- {
- $urls[] = $matches[1][$i];
- }
- }
- }
-
- /* Remove JavaScript and comments */
- $followData = preg_replace('#(<SCRIPT[^>]*?>.*?</SCRIPT>)|(<!--.*?-->)#i', '', $followData);
-
- /* Find any double quoted links */
- $matchCount = preg_match_all("~<(A|AREA)\\s+([^>]*?\\s+…
Large files files are truncated, but you can click here to view the full file