PageRenderTime 228ms CodeModel.GetById 176ms app.highlight 40ms RepoModel.GetById 1ms app.codeStats 0ms

/typo3/sysext/indexed_search/class.crawler.php

https://bitbucket.org/linxpinx/mercurial
PHP | 1003 lines | 523 code | 152 blank | 328 comment | 56 complexity | bb76f4ce467c6dc916054088914e1964 MD5 | raw file
   1<?php
   2/***************************************************************
   3*  Copyright notice
   4*
   5*  (c) 2001-2010 Kasper Skaarhoj (kasperYYYY@typo3.com)
   6*  All rights reserved
   7*
   8*  This script is part of the TYPO3 project. The TYPO3 project is
   9*  free software; you can redistribute it and/or modify
  10*  it under the terms of the GNU General Public License as published by
  11*  the Free Software Foundation; either version 2 of the License, or
  12*  (at your option) any later version.
  13*
  14*  The GNU General Public License can be found at
  15*  http://www.gnu.org/copyleft/gpl.html.
  16*  A copy is found in the textfile GPL.txt and important notices to the license
  17*  from the author is found in LICENSE.txt distributed with these scripts.
  18*
  19*
  20*  This script is distributed in the hope that it will be useful,
  21*  but WITHOUT ANY WARRANTY; without even the implied warranty of
  22*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23*  GNU General Public License for more details.
  24*
  25*  This copyright notice MUST APPEAR in all copies of the script!
  26***************************************************************/
  27/**
  28 * Crawler hook for indexed search. Works with the "crawler" extension
  29 *
  30 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
  31 */
  32/**
  33 * [CLASS/FUNCTION INDEX of SCRIPT]
  34 *
  35 *
  36 *
  37 *   87: class tx_indexedsearch_crawler
  38 *  106:     function crawler_init(&$pObj)
  39 *  219:     function crawler_execute($params,&$pObj)
  40 *  285:     function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
  41 *  345:     function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
  42 *  414:     function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
  43 *  458:     function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)
  44 *  513:     function cleanUpOldRunningConfigurations()
  45 *
  46 *              SECTION: Helper functions
  47 *  579:     function checkUrl($url,$urlLog,$baseUrl)
  48 *  602:     function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
  49 *  645:     function indexSingleRecord($r,$cfgRec,$rl=NULL)
  50 *  694:     function loadIndexerClass()
  51 *  706:     function getUidRootLineForClosestTemplate($id)
  52 *  739:     function generateNextIndexingTime($cfgRec)
  53 *  778:     function checkDeniedSuburls($url, $url_deny)
  54 *  798:     function addQueueEntryForHook($cfgRec, $title)
  55 *
  56 *              SECTION: Hook functions for TCEmain (indexing of records)
  57 *  830:     function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
  58 *
  59 *
  60 *  879: class tx_indexedsearch_files
  61 *  888:     function crawler_execute($params,&$pObj)
  62 *  913:     function loadIndexerClass()
  63 *
  64 * TOTAL FUNCTIONS: 18
  65 * (This index is automatically created/updated by the extension "extdeveval")
  66 *
  67 */
  68
  69
  70
  71
  72# To make sure the backend charset is available:
  73if (!is_object($GLOBALS['LANG']))	{
  74	$GLOBALS['LANG'] = t3lib_div::makeInstance('language');
  75	$GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
  76}
  77
  78
  79/**
  80 * Crawler hook for indexed search. Works with the "crawler" extension
  81 *
  82 * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>
  83 * @package TYPO3
  84 * @subpackage tx_indexedsearch
  85 */
  86class tx_indexedsearch_crawler {
  87
		// Static: number of seconds between two queued indexing operations. Multiplied with $instanceCounter when entries for files, external URLs and subpages (types 2, 3 and 4) are scheduled.
	var $secondsPerExternalUrl = 3;

		// Internal, dynamic: incremented for every file, URL or subpage handled by types 2, 3 and 4, so that each queue entry is scheduled later than the previous one.
	var $instanceCounter = 0;

		// Internal, static: the object reference to this class, passed to the crawler as callback with every queue entry.
	var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';
  96
  97	/**
  98	 * Initialization of crawler hook.
  99	 * This function is asked for each instance of the crawler and we must check if something is timed to happen and if so put entry(s) in the crawlers log to start processing.
 100	 * In reality we select indexing configurations and evaluate if any of them needs to run.
 101	 *
 102	 * @param	object		Parent object (tx_crawler lib)
 103	 * @return	void
 104	 */
 105	function crawler_init(&$pObj){
 106
 107			// Select all indexing configuration which are waiting to be activated:
 108		$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
 109			'*',
 110			'index_config',
 111			'hidden=0
 112				AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
 113				AND timer_next_indexing<' . $GLOBALS['EXEC_TIME'] . '
 114				AND set_id=0
 115				'.t3lib_BEfunc::deleteClause('index_config')
 116		);
 117
 118			// For each configuration, check if it should be executed and if so, start:
 119		foreach($indexingConfigurations as $cfgRec)	{
 120
 121				// Generate a unique set-ID:
 122			$setId = t3lib_div::md5int(microtime());
 123
 124				// Get next time:
 125			$nextTime = $this->generateNextIndexingTime($cfgRec);
 126
 127				// Start process by updating index-config record:
 128			$field_array = array (
 129				'set_id' => $setId,
 130				'timer_next_indexing' => $nextTime,
 131				'session_data' => '',
 132			);
 133			$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
 134
 135				// Based on configuration type:
 136			switch($cfgRec['type'])	{
 137				case 1:	// RECORDS:
 138
 139						// Parameters:
 140					$params = array(
 141						'indexConfigUid' => $cfgRec['uid'],
 142						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
 143						'url' => 'Records (start)',	// Just for show.
 144					);
 145						//
 146					$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
 147				break;
 148				case 2:	// FILES:
 149
 150						// Parameters:
 151					$params = array(
 152						'indexConfigUid' => $cfgRec['uid'],		// General
 153						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),	// General
 154						'url' => $cfgRec['filepath'],	// Partly general... (for URL and file types)
 155						'depth' => 0	// Specific for URL and file types
 156					);
 157
 158					$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
 159				break;
 160				case 3:	// External URL:
 161
 162						// Parameters:
 163					$params = array(
 164						'indexConfigUid' => $cfgRec['uid'],		// General
 165						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),	// General
 166						'url' => $cfgRec['externalUrl'],	// Partly general... (for URL and file types)
 167						'depth' => 0	// Specific for URL and file types
 168					);
 169
 170					$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
 171				break;
 172				case 4:	// Page tree
 173
 174						// Parameters:
 175					$params = array(
 176						'indexConfigUid' => $cfgRec['uid'],		// General
 177						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),	// General
 178						'url' => intval($cfgRec['alternative_source_pid']),	// Partly general... (for URL and file types and page tree (root))
 179						'depth' => 0	// Specific for URL and file types and page tree
 180					);
 181
 182					$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
 183				break;
 184				case 5:	// Meta configuration, nothing to do:
 185					# NOOP
 186				break;
 187				default:
 188					if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])	{
 189						$hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
 190
 191						if (is_object($hookObj))	{
 192
 193								// Parameters:
 194							$params = array(
 195								'indexConfigUid' => $cfgRec['uid'],		// General
 196								'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].'/CUSTOM]'),	// General
 197								'url' => $hookObj->initMessage($message),
 198							);
 199
 200							$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
 201						}
 202					}
 203				break;
 204			}
 205		}
 206
 207			// Finally, look up all old index configurations which are finished and needs to be reset and done.
 208		$this->cleanUpOldRunningConfigurations();
 209	}
 210
	/**
	 * Call back function for execution of a log element.
	 * Loads the indexing configuration named in the parameters, dispatches to the handler for its type and stores the (possibly modified) session data back into the configuration record.
	 *
	 * @param	array		$params Params from log element. Must contain $params['indexConfigUid']
	 * @param	object		$pObj Parent object (tx_crawler lib)
	 * @return	array		Result array; the key "log" holds the parameters
	 */
	function crawler_execute($params,&$pObj) {

			// Without a configuration uid there is nothing to execute:
		if (!$params['indexConfigUid']) {
			return array('log' => $params);
		}

			// Load the indexing configuration record:
		list($cfgRec) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'*',
			'index_config',
			'uid=' . intval($params['indexConfigUid'])
		);
		if (!is_array($cfgRec)) {
			return array('log' => $params);
		}

			// Session data is stored serialized in the configuration record between calls:
		$session_data = unserialize($cfgRec['session_data']);

		switch ($cfgRec['type']) {
			case 1:	// Records
				$this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
			break;
			case 2:	// Files
				$this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
			break;
			case 3:	// External URL
				$this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
			break;
			case 4:	// Page tree
				$this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
			break;
			case 5:	// Meta; crawler_init() never queues entries for this type
			break;
			default:	// Custom type, handled by a hook object registered in TYPO3_CONF_VARS
				$hookReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']];
				if ($hookReference) {
					$hookObj = t3lib_div::getUserObj($hookReference);

					if (is_object($hookObj)) {
							// Keep the crawler object at hand; addQueueEntryForHook() reads it from $this->pObj:
						$this->pObj = $pObj;
						$hookObj->indexOperation($cfgRec, $session_data, $params, $this);
					}
				}
			break;
		}

			// Save the session data, which the handler may have modified:
		$updatedFields = array(
			'session_data' => serialize($session_data)
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $updatedFields);

		return array('log' => $params);
	}
 274
	/**
	 * Indexing records from a table.
	 * Indexes one batch of records (ordered by uid, starting after the uid remembered in the session data) and queues an entry for the following batch as long as records were found.
	 *
	 * @param	array		$cfgRec Indexing Configuration Record
	 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
	 * @param	array		$params Parameters from the log queue.
	 * @param	object		$pObj Parent object (from "crawler" extension!)
	 * @return	void
	 */
	function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj) {

			// The table must be configured and known to TCA:
		$table = $cfgRec['table2index'];
		if (!$table || !isset($GLOBALS['TCA'][$table])) {
			return;
		}

			// First call of the session: start before the first record
		if (!is_array($session_data)) {
			$session_data = array(
				'uid' => 0
			);
		}

			// Source page (the alternative source pid wins if set) and batch size (default 100):
		$alternativePid = intval($cfgRec['alternative_source_pid']);
		$pid = $alternativePid ? $alternativePid : $cfgRec['pid'];
		$batchSize = $cfgRec['recordsbatch'] ? t3lib_div::intInRange($cfgRec['recordsbatch'], 1) : 100;

			// Root line to relate the indexed records to:
		$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

			// Select the next batch of records:
		$where = 'pid = '.intval($pid).'
							AND uid > '.intval($session_data['uid']).
			t3lib_BEfunc::deleteClause($table).
			t3lib_BEfunc::BEenableFields($table);
		$recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $table, $where, '', 'uid', $batchSize);

			// An empty batch means the table is done; no follow-up entry is queued:
		if (!count($recs)) {
			return;
		}

		$lastUid = 0;
		foreach ($recs as $r) {
			$this->indexSingleRecord($r, $cfgRec, $rl);

				// Remember how far we got:
			$lastUid = $r['uid'];
			$session_data['uid'] = $lastUid;
		}

			// Queue an entry for the next batch of records:
		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => 'Records from UID#'.($lastUid+1).'-?',
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
		);
		$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
	}
 335
	/**
	 * Indexing files from fileadmin.
	 * If $params['url'] points to a file, the file is indexed. If it points to a directory, a queue entry is added for each matching file in it and - as long as the configured depth is not reached - for each subdirectory.
	 *
	 * @param	array		$cfgRec Indexing Configuration Record
	 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
	 * @param	array		$params Parameters from the log queue.
	 * @param	object		$pObj Parent object (from "crawler" extension!)
	 * @return	void
	 */
	function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj) {

			// Make the path absolute and stop if it is not an allowed one:
		$readpath = $params['url'];
		if (!t3lib_div::isAbsPath($readpath)) {
			$readpath = t3lib_div::getFileAbsFileName($readpath);
		}
		if (!t3lib_div::isAllowedAbsPath($readpath)) {
			return;
		}

			// A file is indexed right away:
		if (@is_file($readpath)) {

				// Root line (needs to be provided when indexing external files):
			$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

			$this->loadIndexerClass();

				// (Re)-index the file on the page of the configuration:
			$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
			$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
			$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
			$indexerObj->hash['phash'] = -1;	// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

				// The indexer gets the path relative to PATH_site:
			$indexerObj->indexRegularDocument(substr($readpath, strlen(PATH_site)), TRUE);
			return;
		}

			// Anything that is neither file nor directory is ignored:
		if (!@is_dir($readpath)) {
			return;
		}

			// Files in the directory, limited to the configured extensions:
		$extList = implode(',', t3lib_div::trimExplode(',', $cfgRec['extensions'], 1));
		$fileArr = array();
		$files = t3lib_div::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);

			// Subdirectories are only followed while the configured depth is not reached:
		$directoryList = t3lib_div::get_dirs($readpath);
		if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
			foreach ($directoryList as $subdir) {
				if ((string)$subdir!='') {
					$files[] = $readpath.$subdir.'/';
				}
			}
		}
		$files = t3lib_div::removePrefixPathFromList($files, PATH_site);

			// One queue entry per item, each scheduled a bit later than the previous one:
		foreach ($files as $path) {
			$this->instanceCounter++;

				// Never queue the path which is being processed right now:
			if ($path === $params['url']) {
				continue;
			}

			$nparams = array(
				'indexConfigUid' => $cfgRec['uid'],
				'url' => $path,
				'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
				'depth' => $params['depth']+1
			);
			$pObj->addQueueEntry_callBack(
				$cfgRec['set_id'],
				$nparams,
				$this->callBack,
				$cfgRec['pid'],
				$GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
			);
		}
	}
 410
	/**
	 * Indexing External URLs.
	 * Indexes the URL in $params['url'] and - as long as the configured depth is not reached - queues every link found on it which passes checkUrl() and is not denied by the "url_deny" list.
	 *
	 * @param	array		$cfgRec Indexing Configuration Record
	 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
	 * @param	array		$params Parameters from the log queue.
	 * @param	object		$pObj Parent object (from "crawler" extension!)
	 * @return	void
	 */
	function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj) {

			// First call of the session: the URL log starts with the URL we begin with
		if (!is_array($session_data)) {
			$session_data = array(
				'urlLog' => array($params['url'])
			);
		}

			// Index the URL and collect the links found on it:
		$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
		$subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);

			// Links are only followed while the configured depth is not reached:
		if (!($params['depth'] < $cfgRec['depth'])) {
			return;
		}

		foreach ($subUrls as $candidate) {

				// Skip URLs which are rejected by checkUrl() or match the deny list:
			$url = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
			if (!$url || $this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
				continue;
			}

				// Log the URL so it is not queued a second time:
			$this->instanceCounter++;
			$session_data['urlLog'][] = $url;

				// Queue it, scheduled a bit later than the previous entry:
			$nparams = array(
				'indexConfigUid' => $cfgRec['uid'],
				'url' => $url,
				'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
				'depth' => $params['depth']+1
			);
			$pObj->addQueueEntry_callBack(
				$cfgRec['set_id'],
				$nparams,
				$this->callBack,
				$cfgRec['pid'],
				$GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
			);
		}
	}
 460
	/**
	 * Page tree indexing type.
	 * Passes the URLs of the page whose uid is in $params['url'] to the crawler for re-indexing and - as long as the configured depth is not reached - queues an entry for each of its subpages.
	 *
	 * @param	array		$cfgRec Indexing Configuration Record
	 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
	 * @param	array		$params Parameters from the log queue.
	 * @param	object		$pObj Parent object (from "crawler" extension!)
	 * @return	void
	 */
	function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj) {

			// For this type the "url" parameter carries the page uid:
		$pageUid = intval($params['url']);

			// Ask the crawler for the URL sets of the page:
		$pageRow = t3lib_BEfunc::getRecord('pages', $pageUid);
		$res = $pObj->getUrlsForPageRow($pageRow);

		$duplicateTrack = array();	// Registry for duplicates
		$downloadUrls = array();	// Dummy.

			// Submit the URLs of each set with the processing instruction "tx_indexedsearch_reindex"; the returned list is not needed here:
		if (count($res)) {
			foreach ($res as $vv) {
				$pObj->urlListFromUrlArray(
					$vv,
					$pageRow,
					$GLOBALS['EXEC_TIME'],
					30,
					1,
					0,
					$duplicateTrack,
					$downloadUrls,
					array('tx_indexedsearch_reindex')
				);
			}
		}

			// Subpages are only followed while the configured depth is not reached:
		if (!($params['depth'] < $cfgRec['depth'])) {
			return;
		}

			// Select the non-deleted subpages:
		$recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'uid,title',
			'pages',
			'pid = '.intval($pageUid).
				t3lib_BEfunc::deleteClause('pages')
		);
		if (!count($recs)) {
			return;
		}

			// One queue entry per subpage, each scheduled a bit later than the previous one:
		foreach ($recs as $r) {
			$this->instanceCounter++;
			$session_data['urlLog'][] = 'pages:'.$r['uid'].': '.$r['title'];

			$nparams = array(
				'indexConfigUid' => $cfgRec['uid'],
				'url' => $r['uid'],
				'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
				'depth' => $params['depth']+1
			);
			$pObj->addQueueEntry_callBack(
				$cfgRec['set_id'],
				$nparams,
				$this->callBack,
				$cfgRec['pid'],
				$GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
			);
		}
	}
 535
	/**
	 * Look up all running index configurations which are finished and reset them.
	 * A configuration counts as finished when the crawler queue holds no entry of its set-ID which still waits for execution (exec_time=0). For such a configuration the index rows left over from earlier sets are deleted, and "set_id" and "session_data" are cleared.
	 *
	 * @return	void
	 */
	function cleanUpOldRunningConfigurations() {

			// Tables holding index data per phash (list copied from class.tx_indexedsearch_modfunc1.php):
		$indexTables = array('index_phash', 'index_rel', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');

			// Lookup running index configurations:
		$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'uid,set_id',
			'index_config',
			'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
		);
		if (!is_array($runningIndexingConfigurations)) {
				// Query failed, nothing to clean up
			return;
		}

		foreach ($runningIndexingConfigurations as $cfgRec) {
			$cfgUid = intval($cfgRec['uid']);
			$setId = intval($cfgRec['set_id']);

				// Entries still waiting for execution mean the process has not ended yet:
			$queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
				'*',
				'tx_crawler_queue',
				'set_id=' . $setId . ' AND exec_time=0'
			);
			if ($queued_items) {
				continue;
			}

				// Lookup phash rows of this configuration which were written by another set than the current one:
			$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
				'phash',
				'index_phash',
				'freeIndexUid=' . $cfgUid . ' AND freeIndexSetId!=' . $setId
			);

				// Remove the old registrations from all index tables:
			if (is_array($oldPhashRows)) {
				foreach ($oldPhashRows as $pHashRow) {
					foreach ($indexTables as $table) {
						$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . intval($pHashRow['phash']));
					}
				}
			}

				// End process by updating index-config record:
			$field_array = array (
				'set_id' => 0,
				'session_data' => '',
			);
			$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . $cfgUid, $field_array);
		}
	}
 586
 587
 588
 589
 590
 591
 592
 593	/*****************************************
 594	 *
 595	 * Helper functions
 596	 *
 597	 *****************************************/
 598
	/**
	 * Check if an input URL is allowed to be indexed.
	 * A trailing double slash is reduced to a single one and everything from the first "#" on is cut off. The resulting URL is accepted if it contains no "../", starts with the base URL and is not present in the URL log already.
	 *
	 * @param	string		$url URL string to check
	 * @param	array		$urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
	 * @param	string		$baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
	 * @return	string		Returns the cleaned URL if OK, otherwise FALSE
	 */
	function checkUrl($url,$urlLog,$baseUrl) {
		$url = preg_replace('/\/\/$/','/',$url);
		list($url) = explode('#',$url);

		if (strpos($url, '../') === FALSE
			&& t3lib_div::isFirstPartOfStr($url, $baseUrl)
			&& !in_array($url, $urlLog)) {
			return $url;
		}

			// Rejected; returned explicitly so the result matches the documented contract
		return FALSE;
	}
 619
	/**
	 * Indexing External URL.
	 * Indexes the document at the URL and returns all hyperlinks found in it as absolute URLs. Relative links are resolved against the base href of the document or, if it has none, against the directory of the URL itself.
	 *
	 * @param	string		$url URL, http://....
	 * @param	integer		$pageId Page id to relate indexing to.
	 * @param	array		$rl Rootline array to relate indexing to
	 * @param	integer		$cfgUid Configuration UID
	 * @param	integer		$setId Set ID value
	 * @return	array		URLs found on this page
	 */
	function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

			// Load indexer if not yet.
		$this->loadIndexerClass();

			// Index external URL:
		$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
		$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
		$indexerObj->hash['phash'] = -1;	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

		$indexerObj->indexExternalUrl($url);

			// parse_url() omits missing components (and returns FALSE for seriously malformed URLs), so every part is read with a fallback:
		$url_qParts = parse_url($url);
		$urlScheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
		$urlHost = isset($url_qParts['host']) ? $url_qParts['host'] : '';
		$urlPath = isset($url_qParts['path']) ? $url_qParts['path'] : '';

		$baseAbsoluteHref = $urlScheme . '://' . $urlHost;
		$baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
		if (!$baseHref) {
				// No base href in the document: use the directory part of the current URL
			$baseHref = $baseAbsoluteHref . substr($urlPath, 0, (int)strrpos($urlPath, '/'));
		}
		$baseHref = rtrim($baseHref, '/');

			// Get URLs on this page:
		$subUrls = array();
		$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);

		foreach ($list as $linkInfo) {

				// Decode entities:
			$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

				// Links without a scheme are relative and have to be made absolute:
			$qParts = parse_url($subUrl);
			if (empty($qParts['scheme'])) {
				$relativeUrl = t3lib_div::resolveBackPath($subUrl);

					// substr() instead of the {0} offset syntax: the latter is a parse error as of PHP 8 and fails on an empty string
				if (substr($relativeUrl, 0, 1) === '/') {
						// Relative to the document root of the host
					$subUrl = $baseAbsoluteHref . $relativeUrl;
				} else {
						// Relative to the base href
					$subUrl = $baseHref . '/' . $relativeUrl;
				}
			}

			$subUrls[] = $subUrl;
		}

		return $subUrls;
	}
 678
	/**
	 * Indexing Single Record.
	 * The first field of the configured field list is indexed as title, the remaining fields are concatenated (separated by spaces) and indexed as content. Tags are stripped from both.
	 *
	 * @param	array		$r Record to index
	 * @param	array		$cfgRec Configuration Record
	 * @param	array		$rl Rootline array to relate indexing to. If not an array, it is looked up from the page of the configuration.
	 * @return	void
	 */
	function indexSingleRecord($r,$cfgRec,$rl=NULL) {

			// Load indexer if not yet.
		$this->loadIndexerClass();

			// Init:
		$rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
		$fieldList = t3lib_div::trimExplode(',',$cfgRec['fieldlist'],1);
		$languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'];
		$sys_language_uid = $languageField ? $r[$languageField] : 0;

			// (Re)-Indexing a row from a table. The GET parameters come from the configuration, with ###UID### replaced by the uid of the record:
		$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
		parse_str(str_replace('###UID###',$r['uid'],$cfgRec['get_params']),$GETparams);
		$indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
		$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
		$indexerObj->forceIndexing = TRUE;

			// Both start out empty so that an empty field list yields an empty title instead of an undefined variable:
		$theTitle = '';
		$theContent = '';
		foreach($fieldList as $k => $v)	{
			if (!$k)	{
				$theTitle = $r[$v];
			} else {
				$theContent.= $r[$v].' ';
			}
		}

			// Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid()). A space is put in front of every "<" so that words separated by tags only are not glued together when the tags are stripped:
		$indexerObj->backend_indexAsTYPO3Page(
			strip_tags(str_replace('<', ' <', $theTitle)),
			'',
			'',
			strip_tags(str_replace('<', ' <', $theContent)),
			$GLOBALS['LANG']->charSet,	// Requires that $GLOBALS['LANG'] is an initialized language object
			$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
			$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
			$r['uid']
		);
	}
 727
	/**
	 * Include indexer class (class.indexer.php of the extension "indexed_search").
	 *
	 * @return	void
	 */
	function loadIndexerClass() {
			// The included file is executed in the scope of this function. NOTE(review): $TYPO3_CONF_VARS is presumably made available here because the file reads it at top level - confirm before removing.
		global $TYPO3_CONF_VARS;

		require_once t3lib_extMgm::extPath('indexed_search') . 'class.indexer.php';
	}
 737
	/**
	 * Get rootline for closest TypoScript template root.
	 * Algorithm same as used in Web > Template, Object browser
	 *
	 * @param	integer		$id The page id to traverse rootline back from
	 * @return	array		Uid values of the root line as set up by the template parser, with the keys of that root line preserved
	 */
	function getUidRootLineForClosestTemplate($id) {
		global $TYPO3_CONF_VARS;

			// Template parser, without logging of time-performance information:
		$templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
		$templateParser->tt_track = 0;
		$templateParser->init();

			// Run the parser through the root line of the page; this fills $templateParser->rootLine:
		$pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
		$pageRootLine = $pageSelect->getRootLine($id);
		$templateParser->runThroughTemplates($pageRootLine, 0);

			// Reduce each root line entry to its uid:
		$uids = array();
		foreach ($templateParser->rootLine as $level => $page) {
			$uids[$level] = $page['uid'];
		}

		return $uids;
	}
 765
	/**
	 * Generate the unix time stamp for next visit.
	 * The result is the first point in time at or after the current execution time which lies a whole number of frequency intervals after the offset time.
	 *
	 * @param	array		$cfgRec Index configuration record; "timer_frequency", "timer_offset" and "timer_next_indexing" are used
	 * @return	integer		The next time stamp
	 */
	function generateNextIndexingTime($cfgRec) {
		$currentTime = $GLOBALS['EXEC_TIME'];

			// Find a midnight time to use for offset calculation. For frequencies up to a day it does not matter which day is used, so yesterday is taken. For longer frequencies the day of the last scheduled indexing is used, so that e.g. the day-of-week of a weekly schedule is respected regardless of when the script is run.
		if ($cfgRec['timer_frequency']<=24*3600) {
			$aMidNight = mktime(0,0,0)-1*24*3600;
		} else {
			$lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $GLOBALS['EXEC_TIME'];
				// Four-digit year: mktime() maps two-digit years through a pivot (70 and above become 19xx), so 'y' would not always round-trip
			$aMidNight = mktime(0,0,0, intval(date('n',$lastTime)), intval(date('j',$lastTime)), intval(date('Y',$lastTime)));
		}

			// Offset time (midnight plus the configured offset, at most one day) and frequency in seconds (at least 1, so the division below is safe):
		$lastSureOffset = $aMidNight+t3lib_div::intInRange($cfgRec['timer_offset'],0,86400);
		$frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'],1);

			// Number of whole frequency blocks needed to get from the offset time to the current time:
		$frequencyBlocksUntilNextTime = ceil(($currentTime-$lastSureOffset)/$frequencySeconds);

			// Next time is the offset time plus that many frequency blocks:
		$nextTime = $lastSureOffset + $frequencyBlocksUntilNextTime*$frequencySeconds;

		return $nextTime;
	}
 795
	/**
	 * Checks if $url begins with any of the URLs in the $url_deny "list" and if so, returns TRUE.
	 *
	 * @param	string		$url URL to test
	 * @param	string		$url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
	 * @return	boolean		TRUE if there is a matching URL (hence, do not index!)
	 */
	function checkDeniedSuburls($url, $url_deny) {
		if (trim($url_deny)) {
			$url_denyArray = t3lib_div::trimExplode(LF,$url_deny,1);
			foreach ($url_denyArray as $testurl) {
					// No output here: the result is reported through the return value only
				if (t3lib_div::isFirstPartOfStr($url,$testurl)) {
					return TRUE;
				}
			}
		}
		return FALSE;
	}
 815
	/**
	 * Adding entry in queue for Hook.
	 * Uses the crawler object which crawler_execute() stored in $this->pObj before calling the hook.
	 *
	 * @param	array		$cfgRec Configuration record
	 * @param	string		$title Title/URL
	 * @return	void
	 */
	function addQueueEntryForHook($cfgRec, $title) {
		$configurationUid = $cfgRec['uid'];

		$nparams = array(
				// This must ALWAYS be the cfgRec uid!
			'indexConfigUid' => $configurationUid,
			'url' => $title,
				// Just for information. It is good style to show that it was an indexing configuration that added the entry.
			'procInstructions' => array('[Index Cfg UID#'.$configurationUid.']')
		);

		$this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
	}
 832
	/**
	 * Deletes all data stored by indexed search for a given page.
	 * The phash values of the page are looked up in "index_section" and all rows with these values are removed from the index tables.
	 *
	 * @param	integer		$id Uid of the page to delete all pHash
	 * @return	void
	 */
	function deleteFromIndex($id) {

			// Lookup the phash rows registered for the page:
		$phashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . intval($id));
		if (!count($phashRows)) {
			return;
		}

			// Collect the phash values:
		$pHashesToDelete = array();
		foreach ($phashRows as $phashRow) {
			$pHashesToDelete[] = $phashRow['phash'];
		}

			// Delete them from every index table with one query per table:
		$where_clause = 'phash IN (' . implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($pHashesToDelete)) . ')';
		$tables = array('index_debug', 'index_fulltext', 'index_grlist', 'index_phash', 'index_rel', 'index_section');
		foreach ($tables as $table) {
			$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where_clause);
		}
	}
 857
 858
 859
 860
 861
 862
 863
 864	/*************************
 865	 *
 866	 * Hook functions for TCEmain (indexing of records)
 867	 *
 868	 *************************/
 869
 870	/**
	 * TCEmain hook function that cleans up the search index when a page is deleted
 872	 *
 873	 * @param	string		TCEmain command
 874	 * @param	string		Table name
 875	 * @param	string		Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 876	 * @param	mixed		Target value (ignored)
 877	 * @param	object		Reference to tcemain calling object
 878	 * @return	void
 879	 */
	function processCmdmap_preProcess($command, $table, $id, $value, $pObj) {

			// Only the deletion of a page is of interest here; $value and $pObj are not used.
		$isPageDeletion = ($table == 'pages' && $command == 'delete');

		if ($isPageDeletion)	{
				// Clean up the index
			$this->deleteFromIndex($id);
		}
	}
 887
 888	/**
 889	 * TCEmain hook function for on-the-fly indexing of database records
 890	 *
 891	 * @param	string		Status "new" or "update"
 892	 * @param	string		Table name
 893	 * @param	string		Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 894	 * @param	array		Field array of updated fields in the operation
 895	 * @param	object		Reference to tcemain calling object
 896	 * @return	void
 897	 */
	function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj) {

			// Check if any fields are actually updated; without changes there is nothing to (re-)index:
		if (!count($fieldArray))	{
			return;
		}

			// Translate new ids.
		if ($status=='new')	{
			$id = $pObj->substNEWwithIDs[$id];

		} elseif ($table=='pages' && $status=='update' && ((array_key_exists('hidden',$fieldArray) && $fieldArray['hidden']==1) || (array_key_exists('no_search',$fieldArray) && $fieldArray['no_search']==1)))	{

				// If the page should be hidden or not indexed after update, delete index for this page
			$this->deleteFromIndex($id);
		}

			// Get full record; if it does not exist there is nothing to index:
		$currentRecord = t3lib_BEfunc::getRecord($table,$id);
		if (!is_array($currentRecord))	{
			return;
		}

			// Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
		$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'*',
			'index_config',
			'hidden=0
				AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
				AND set_id=0
				AND type=1
				AND table2index='.$GLOBALS['TYPO3_DB']->fullQuoteStr($table,'index_config').'
				AND (
						(alternative_source_pid=0 AND pid='.intval($currentRecord['pid']).')
						OR (alternative_source_pid='.intval($currentRecord['pid']).')
					)
				AND records_indexonchange=1
				'.t3lib_BEfunc::deleteClause('index_config')
		);

			// exec_SELECTgetRows() yields NULL instead of an array when the query fails,
			// so only iterate over a real result to avoid an "invalid argument" error in foreach.
		if (is_array($indexingConfigurations))	{
			foreach($indexingConfigurations as $cfgRec)	{
				$this->indexSingleRecord($currentRecord,$cfgRec);
			}
		}
	}
 940}
 941
 942
 943/**
 944 * Crawler hook for indexed search. Works with the "crawler" extension
 945 * This hook is specifically used to index external files found on pages through the crawler extension.
 946 *
 947 * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>
 948 * @package TYPO3
 949 * @subpackage tx_indexedsearch
 950 * @see tx_indexedsearch_indexer::extractLinks()
 951 */
 952class tx_indexedsearch_files {
 953
 954	/**
 955	 * Call back function for execution of a log element
 956	 *
 957	 * @param	array		Params from log element.
 958	 * @param	object		Parent object (tx_crawler lib)
 959	 * @return	array		Result array
 960	 */
 961	function crawler_execute($params,&$pObj)	{
 962
 963			// Load indexer if not yet.
 964		$this->loadIndexerClass();
 965
 966		if (is_array($params['conf']))	{
 967
 968				// Initialize the indexer class:
 969			$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
 970			$indexerObj->conf = $params['conf'];
 971			$indexerObj->init();
 972
 973				// Index document:
 974			if ($params['alturl'])	{
 975				$fI = pathinfo($params['document']);
 976				$ext = strtolower($fI['extension']);
 977				$indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $ext);
 978			} else {
 979				$indexerObj->indexRegularDocument($params['document'], TRUE);
 980			}
 981
 982				// Return OK:
 983			return array('content' => array());
 984		}
 985	}
 986
 987	/**
 988	 * Include indexer class.
 989	 *
 990	 * @return	void
 991	 */
 992	function loadIndexerClass()	{
 993		global $TYPO3_CONF_VARS;
 994		require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
 995	}
 996}
 997
 998
 999if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php'])	{
1000	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
1001}
1002
1003?>