PageRenderTime 27ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/cms/modules/search/admin/spider.php

https://github.com/akash6190/pragyan
PHP | 633 lines | 491 code | 114 blank | 28 comment | 167 complexity | 72646c469f1e1aa46ddb3a7f80907c65 MD5 | raw file
  1. <?php
  2. if(!defined('__PRAGYAN_CMS'))
  3. {
  4. header($_SERVER['SERVER_PROTOCOL'].' 403 Forbidden');
  5. echo "<h1>403 Forbidden<h1><h4>You are not authorized to access the page.</h4>";
  6. echo '<hr/>'.$_SERVER['SERVER_SIGNATURE'];
  7. exit(1);
  8. }
  9. /*******************************************
  10. * Sphider Version 1.3.*
  11. * This program is licensed under the GNU GPL.
  12. * By Ando Saabas ando(a t)cs.ioc.ee
  13. *
  14. * Thanks to Antoine Bajolet for ideas and
  15. * several code pieces
  16. ********************************************/
  17. set_time_limit (0);
  18. global $sourceFolder;
  19. $include_dir = "$sourceFolder/modules/search/include";
  20. require_once ("$include_dir/commonfuncs.php");
  21. $all = 0;
  22. extract (getHttpVars());
  23. $settings_dir = "$sourceFolder/modules/search/settings";
  24. require_once ("$settings_dir/conf.php");
  25. $admin_dir = "$sourceFolder/modules/search/admin";
  26. include "$admin_dir/messages.php";
  27. include "$admin_dir/spiderfuncs.php";
  28. error_reporting (E_ALL ^ E_NOTICE ^ E_WARNING);
  29. $delay_time = 0;
  30. $command_line = 0;
  31. if (isset($_SERVER['argv']) && $_SERVER['argc'] >= 2) {
  32. $command_line = 1;
  33. $ac = 1; //argument counter
  34. while ($ac < (count($_SERVER['argv']))) {
  35. $arg = $_SERVER['argv'][$ac];
  36. if ($arg == '-all') {
  37. $all = 1;
  38. break;
  39. } else if ($arg == '-u') {
  40. $url = $_SERVER['argv'][$ac+1];
  41. $ac= $ac+2;
  42. } else if ($arg == '-f') {
  43. $soption = 'full';
  44. $ac++;
  45. } else if ($arg == '-d') {
  46. $soption = 'level';
  47. $maxlevel = $_SERVER['argv'][$ac+1];;
  48. $ac= $ac+2;
  49. } else if ($arg == '-l') {
  50. $domaincb = 1;
  51. $ac++;
  52. } else if ($arg == '-r') {
  53. $reindex = 1;
  54. $ac++;
  55. } else if ($arg == '-m') {
  56. $in = str_replace("\\n", chr(10), $_SERVER['argv'][$ac+1]);
  57. $ac= $ac+2;
  58. } else if ($arg == '-n') {
  59. $out = str_replace("\\n", chr(10), $_SERVER['argv'][$ac+1]);
  60. $ac= $ac+2;
  61. } else {
  62. commandline_help();
  63. die();
  64. }
  65. }
  66. }
  67. if (isset($soption) && $soption == 'full') {
  68. $maxlevel = -1;
  69. }
  70. if (!isset($domaincb)) {
  71. $domaincb = 0;
  72. }
  73. if(!isset($reindex)) {
  74. $reindex=0;
  75. }
  76. if(!isset($maxlevel)) {
  77. $maxlevel=0;
  78. }
  79. if ($keep_log) {
  80. if ($log_format=="html") {
  81. $log_file = $log_dir."/".Date("ymdHi").".html";
  82. } else {
  83. $log_file = $log_dir."/".Date("ymdHi").".log";
  84. }
  85. if (!$log_handle = fopen($log_file, 'w')) {
  86. die ("Logging option is set, but cannot open file for logging.");
  87. }
  88. }
  89. if ($all == 1) {
  90. index_all();
  91. } else {
  92. if ($reindex == 1 && $command_line == 1) {
  93. $result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where url='$url'");
  94. echo mysql_error();
  95. if($row=mysql_fetch_row($result)) {
  96. $url = $row[0];
  97. $maxlevel = $row[1];
  98. $in= $row[2];
  99. $out = $row[3];
  100. $domaincb = $row[4];
  101. if ($domaincb=='') {
  102. $domaincb=0;
  103. }
  104. if ($maxlevel == -1) {
  105. $soption = 'full';
  106. } else {
  107. $soption = 'level';
  108. }
  109. }
  110. }
  111. if (!isset($in)) {
  112. $in = "";
  113. }
  114. if (!isset($out)) {
  115. $out = "";
  116. }
  117. index_site($url, $reindex, $maxlevel, $soption, $in, $out, $domaincb);
  118. }
  119. $tmp_urls = Array();
  120. function microtime_float(){
  121. list($usec, $sec) = explode(" ", microtime());
  122. return ((float)$usec + (float)$sec);
  123. }
  124. function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) {
  125. global $entities, $min_delay;
  126. global $command_line;
  127. global $min_words_per_page;
  128. global $supdomain;
  129. global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr;
  130. $needsReindex = 1;
  131. $deletable = 0;
  132. $url_status = url_status($url);
  133. $thislevel = $level - 1;
  134. if (strstr($url_status['state'], "Relocation")) {
  135. $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
  136. if ($url <> '') {
  137. $result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
  138. echo mysql_error();
  139. $rows = mysql_numrows($result);
  140. if ($rows == 0) {
  141. mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
  142. echo mysql_error();
  143. }
  144. }
  145. $url_status['state'] == "redirected";
  146. }
  147. /*
  148. if ($indexdate <> '' && $url_status['date'] <> '') {
  149. if ($indexdate > $url_status['date']) {
  150. $url_status['state'] = "Date checked. Page contents not changed";
  151. $needsReindex = 0;
  152. }
  153. }*/
  154. ini_set("user_agent", $user_agent);
  155. if ($url_status['state'] == 'ok') {
  156. $OKtoIndex = 1;
  157. $file_read_error = 0;
  158. if (time() - $delay_time < $min_delay) {
  159. sleep ($min_delay- (time() - $delay_time));
  160. }
  161. $delay_time = time();
  162. if (!fst_lt_snd(phpversion(), "4.3.0")) {
  163. $file = file_get_contents($url);
  164. if ($file === FALSE) {
  165. $file_read_error = 1;
  166. }
  167. } else {
  168. $fl = @fopen($url, "r");
  169. if ($fl) {
  170. while ($buffer = @fgets($fl, 4096)) {
  171. $file .= $buffer;
  172. }
  173. } else {
  174. $file_read_error = 1;
  175. }
  176. fclose ($fl);
  177. }
  178. if ($file_read_error) {
  179. $contents = getFileContents($url);
  180. $file = $contents['file'];
  181. }
  182. $pageSize = number_format(strlen($file)/1024, 2, ".", "");
  183. printPageSizeReport($pageSize);
  184. if ($url_status['content'] != 'text') {
  185. $file = extract_text($file, $url_status['content']);
  186. }
  187. printStandardReport('starting', $command_line);
  188. $newmd5sum = md5($file);
  189. if ($md5sum == $newmd5sum) {
  190. printStandardReport('md5notChanged',$command_line);
  191. $OKtoIndex = 0;
  192. } else if (isDuplicateMD5($newmd5sum)) {
  193. $OKtoIndex = 0;
  194. printStandardReport('duplicate',$command_line);
  195. }
  196. if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) {
  197. $urlparts = parse_url($url);
  198. $newdomain = $urlparts['host'];
  199. $type = 0;
  200. /* if ($newdomain <> $domain)
  201. $domainChanged = 1;
  202. if ($domaincb==1) {
  203. $start = strlen($newdomain) - strlen($supdomain);
  204. if (substr($newdomain, $start) == $supdomain) {
  205. $domainChanged = 0;
  206. }
  207. }*/
  208. // remove link to css file
  209. //get all links from file
  210. $data = clean_file($file, $url, $url_status['content']);
  211. if ($data['noindex'] == 1) {
  212. $OKtoIndex = 0;
  213. $deletable = 1;
  214. printStandardReport('metaNoindex',$command_line);
  215. }
  216. $wordarray = unique_array(explode(" ", $data['content']));
  217. if ($data['nofollow'] != 1) {
  218. $links = get_links($file, $url, $can_leave_domain, $data['base']);
  219. $links = distinct_array($links);
  220. $all_links = count($links);
  221. $numoflinks = 0;
  222. //if there are any, add to the temp table, but only if there isnt such url already
  223. if (is_array($links)) {
  224. reset ($links);
  225. while ($thislink = each($links)) {
  226. if ($tmp_urls[$thislink[1]] != 1) {
  227. $tmp_urls[$thislink[1]] = 1;
  228. $numoflinks++;
  229. mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$thislink[1]', '$level', '$sessid')") or die (mysql_error()."-spider.php L:276");
  230. }
  231. }
  232. }
  233. } else {
  234. printStandardReport('noFollow',$command_line);
  235. }
  236. if ($OKtoIndex == 1) {
  237. $title = $data['title'];
  238. $host = $data['host'];
  239. $path = $data['path'];
  240. $fulltxt = $data['fulltext'];
  241. $desc = substr($data['description'], 0,254);
  242. $url_parts = parse_url($url);
  243. $domain_for_db = $url_parts['host'];
  244. if (isset($domain_arr[$domain_for_db])) {
  245. $dom_id = $domain_arr[$domain_for_db];
  246. } else {
  247. mysql_query("insert into ".$mysql_table_prefix."domains (domain) values ('$domain_for_db')");
  248. $dom_id = mysql_insert_id();
  249. $domain_arr[$domain_for_db] = $dom_id;
  250. }
  251. $wordarray = calc_weights ($wordarray, $title, $host, $path, $data['keywords']);
  252. //if there are words to index, add the link to the database, get its id, and add the word + their relation
  253. if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
  254. if ($md5sum == '') {
  255. mysql_query ("insert into ".$mysql_table_prefix."links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('$site_id', '$url', '$title', '$desc', '$fulltxt', curdate(), '$pageSize', '$newmd5sum', $thislevel)") or die( mysql_error()."-spider.php L:307");
  256. $result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'") or die( mysql_error()."-spider.php L:308");
  257. $row = mysql_fetch_row($result);
  258. $link_id = $row[0];
  259. save_keywords($wordarray, $link_id, $dom_id);
  260. printStandardReport('indexed', $command_line);
  261. }else if (($md5sum <> '') && ($md5sum <> $newmd5sum)) { //if page has changed, start updating
  262. $result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'") or die( mysql_error()."-spider.php L:317");
  263. $row = mysql_fetch_row($result);
  264. $link_id = $row[0];
  265. for ($i=0;$i<=15; $i++) {
  266. $char = dechex($i);
  267. mysql_query ("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id") or die( mysql_error()."-spider.php L:322");
  268. }
  269. save_keywords($wordarray, $link_id, $dom_id);
  270. $query = "update ".$mysql_table_prefix."links set title='$title', description ='$desc', fulltxt = '$fulltxt', indexdate=now(), size = '$pageSize', md5sum='$newmd5sum', level=$thislevel where link_id=$link_id";
  271. mysql_query($query) or die( mysql_error()."-spider.php L:327");
  272. printStandardReport('re-indexed', $command_line);
  273. }
  274. }else {
  275. printStandardReport('minWords', $command_line);
  276. }
  277. }
  278. }
  279. } else {
  280. $deletable = 1;
  281. printUrlStatus($url_status['state'], $command_line);
  282. }
  283. if ($reindex ==1 && $deletable == 1) {
  284. check_for_removal($url);
  285. } else if ($reindex == 1) {
  286. }
  287. if (!isset($all_links)) {
  288. $all_links = 0;
  289. }
  290. if (!isset($numoflinks)) {
  291. $numoflinks = 0;
  292. }
  293. printLinksReport($numoflinks, $all_links, $command_line);
  294. }
  295. function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) {
  296. global $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords;
  297. if (!isset($all_keywords)) {
  298. $result = mysql_query("select keyword_ID, keyword from ".$mysql_table_prefix."keywords");
  299. echo mysql_error();
  300. while($row=mysql_fetch_array($result)) {
  301. $all_keywords[addslashes($row[1])] = $row[0];
  302. }
  303. }
  304. $compurl = parse_url($url);
  305. if ($compurl['path'] == '')
  306. $url = $url . "/";
  307. $t = microtime();
  308. $a = getenv("REMOTE_ADDR");
  309. $sessid = md5 ($t.$a);
  310. $urlparts = parse_url($url);
  311. $domain = $urlparts['host'];
  312. if (isset($urlparts['port'])) {
  313. $port = (int)$urlparts['port'];
  314. }else {
  315. $port = 80;
  316. }
  317. $result = mysql_query("select site_id from ".$mysql_table_prefix."sites where url='$url'");
  318. echo mysql_error();
  319. $row = mysql_fetch_row($result);
  320. $site_id = $row[0];
  321. if ($site_id != "" && $reindex == 1) {
  322. mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
  323. echo mysql_error();
  324. $result = mysql_query("select url, level from ".$mysql_table_prefix."links where site_id = $site_id");
  325. while ($row = mysql_fetch_array($result)) {
  326. $site_link = $row['url'];
  327. $link_level = $row['level'];
  328. if ($site_link != $url) {
  329. mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$site_link', $link_level, '$sessid')");
  330. }
  331. }
  332. $qry = "update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
  333. "disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id";
  334. mysql_query ($qry);
  335. echo mysql_error();
  336. } else if ($site_id == '') {
  337. mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " .
  338. "values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', $can_leave_domain)");
  339. echo mysql_error();
  340. $result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'");
  341. $row = mysql_fetch_row($result);
  342. $site_id = $row[0];
  343. } else {
  344. mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
  345. "disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id");
  346. echo mysql_error();
  347. }
  348. $result = mysql_query("select site_id, temp_id, level, count, num from ".$mysql_table_prefix."pending where site_id='$site_id'");
  349. echo mysql_error();
  350. $row = mysql_fetch_row($result);
  351. $pending = $row[0];
  352. $level = 0;
  353. $domain_arr = get_domains();
  354. if ($pending == '') {
  355. mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
  356. echo mysql_error();
  357. } else if ($pending != '') {
  358. printStandardReport('continueSuspended',$command_line);
  359. mysql_query("select temp_id, level, count from ".$mysql_table_prefix."pending where site_id='$site_id'");
  360. echo mysql_error();
  361. $sessid = $row[1];
  362. $level = $row[2];
  363. $pend_count = $row[3] + 1;
  364. $num = $row[4];
  365. $pending = 1;
  366. $tmp_urls = get_temp_urls($sessid);
  367. }
  368. if ($reindex != 1) {
  369. mysql_query ("insert into ".$mysql_table_prefix."pending (site_id, temp_id, level, count) values ('$site_id', '$sessid', '0', '0')");
  370. echo mysql_error();
  371. }
  372. $time = time();
  373. $omit = check_robot_txt($url);
  374. printHeader ($omit, $url, $command_line);
  375. $mainurl = $url;
  376. $num = 0;
  377. while (($level <= $maxlevel && $soption == 'level') || ($soption == 'full')) {
  378. if ($pending == 1) {
  379. $count = $pend_count;
  380. $pending = 0;
  381. } else
  382. $count = 0;
  383. $links = array();
  384. $result = mysql_query("select distinct link from ".$mysql_table_prefix."temp where level=$level && id='$sessid' order by link");
  385. echo mysql_error();
  386. $rows = mysql_num_rows($result);
  387. if ($rows == 0) {
  388. break;
  389. }
  390. $i = 0;
  391. while ($row = mysql_fetch_array($result)) {
  392. $links[] = $row['link'];
  393. }
  394. reset ($links);
  395. while ($count < count($links)) {
  396. $num++;
  397. $thislink = $links[$count];
  398. $urlparts = parse_url($thislink);
  399. reset ($omit);
  400. $forbidden = 0;
  401. foreach ($omit as $omiturl) {
  402. $omiturl = trim($omiturl);
  403. $omiturl_parts = parse_url($omiturl);
  404. if ($omiturl_parts['scheme'] == '') {
  405. $check_omit = $urlparts['host'] . $omiturl;
  406. } else {
  407. $check_omit = $omiturl;
  408. }
  409. if (strpos($thislink, $check_omit)) {
  410. printRobotsReport($num, $thislink, $command_line);
  411. check_for_removal($thislink);
  412. $forbidden = 1;
  413. break;
  414. }
  415. }
  416. if (!check_include($thislink, $url_inc, $url_not_inc )) {
  417. printUrlStringReport($num, $thislink, $command_line);
  418. check_for_removal($thislink);
  419. $forbidden = 1;
  420. }
  421. if ($forbidden == 0) {
  422. printRetrieving($num, $thislink, $command_line);
  423. $query = "select md5sum, indexdate from ".$mysql_table_prefix."links where url='$thislink'";
  424. $result = mysql_query($query);
  425. echo mysql_error();
  426. $rows = mysql_num_rows($result);
  427. if ($rows == 0) {
  428. if($thislink != "/")
  429. index_url($thislink, $level+1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex);
  430. mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
  431. echo mysql_error();
  432. }else if ($rows <> 0 && $reindex == 1) {
  433. $row = mysql_fetch_array($result);
  434. $md5sum = $row['md5sum'];
  435. $indexdate = $row['indexdate'];
  436. index_url($thislink, $level+1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex);
  437. mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
  438. echo mysql_error();
  439. }else {
  440. printStandardReport('inDatabase',$command_line);
  441. }
  442. }
  443. $count++;
  444. }
  445. $level++;
  446. }
  447. mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid'");
  448. echo mysql_error();
  449. mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'");
  450. echo mysql_error();
  451. printStandardReport('completed',$command_line);
  452. }
  453. function index_all() {
  454. global $mysql_table_prefix;
  455. $result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites");
  456. echo mysql_error();
  457. while ($row=mysql_fetch_row($result)) {
  458. $url = $row[0];
  459. $depth = $row[1];
  460. $include = $row[2];
  461. $not_include = $row[3];
  462. $can_leave_domain = $row[4];
  463. if ($can_leave_domain=='') {
  464. $can_leave_domain=0;
  465. }
  466. if ($depth == -1) {
  467. $soption = 'full';
  468. } else {
  469. $soption = 'level';
  470. }
  471. index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain);
  472. }
  473. }
  474. function get_temp_urls ($sessid) {
  475. global $mysql_table_prefix;
  476. $result = mysql_query("select link from ".$mysql_table_prefix."temp where id='$sessid'");
  477. echo mysql_error();
  478. $tmp_urls = Array();
  479. while ($row=mysql_fetch_row($result)) {
  480. $tmp_urls[$row[0]] = 1;
  481. }
  482. return $tmp_urls;
  483. }
  484. function get_domains () {
  485. global $mysql_table_prefix;
  486. $result = mysql_query("select domain_id, domain from ".$mysql_table_prefix."domains");
  487. echo mysql_error();
  488. $domains = Array();
  489. while ($row=mysql_fetch_row($result)) {
  490. $domains[$row[1]] = $row[0];
  491. }
  492. return $domains;
  493. }
  494. function commandline_help() {
  495. print "Usage: php spider.php <options>\n\n";
  496. print "Options:\n";
  497. print " -all\t\t Reindex everything in the database\n";
  498. print " -u <url>\t Set url to index\n";
  499. print " -f\t\t Set indexing depth to full (unlimited depth)\n";
  500. print " -d <num>\t Set indexing depth to <num>\n";
  501. print " -l\t\t Allow spider to leave the initial domain\n";
  502. print " -r\t\t Set spider to reindex a site\n";
  503. print " -m <string>\t Set the string(s) that an url must include (use \\n as a delimiter between multiple strings)\n";
  504. print " -n <string>\t Set the string(s) that an url must not include (use \\n as a delimiter between multiple strings)\n";
  505. }
  506. printStandardReport('quit',$command_line);
  507. if ($email_log) {
  508. $indexed = ($all==1) ? 'ALL' : $url;
  509. $log_report = "";
  510. if ($log_handle) {
  511. $log_report = "Log saved into $log_file";
  512. }
  513. mail($admin_email, "Sphider indexing report", "Sphider has finished indexing $indexed at ".date("y-m-d H:i:s").". ".$log_report);
  514. }
  515. if ( $log_handle) {
  516. fclose($log_handle);
  517. }
  518. ?>