PageRenderTime 76ms CodeModel.GetById 35ms RepoModel.GetById 1ms app.codeStats 0ms

/typo3/sysext/indexed_search/class.crawler.php

https://bitbucket.org/linxpinx/mercurial
PHP | 1003 lines | 523 code | 152 blank | 328 comment | 56 complexity | bb76f4ce467c6dc916054088914e1964 MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-2.0, Unlicense, LGPL-2.1, Apache-2.0
  1. <?php
  2. /***************************************************************
  3. * Copyright notice
  4. *
  5. * (c) 2001-2010 Kasper Skaarhoj (kasperYYYY@typo3.com)
  6. * All rights reserved
  7. *
  8. * This script is part of the TYPO3 project. The TYPO3 project is
  9. * free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * The GNU General Public License can be found at
  15. * http://www.gnu.org/copyleft/gpl.html.
  16. * A copy is found in the textfile GPL.txt and important notices to the license
  17. * from the author is found in LICENSE.txt distributed with these scripts.
  18. *
  19. *
  20. * This script is distributed in the hope that it will be useful,
  21. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  22. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  23. * GNU General Public License for more details.
  24. *
  25. * This copyright notice MUST APPEAR in all copies of the script!
  26. ***************************************************************/
  27. /**
  28. * Crawler hook for indexed search. Works with the "crawler" extension
  29. *
  30. * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
  31. */
  32. /**
  33. * [CLASS/FUNCTION INDEX of SCRIPT]
  34. *
  35. *
  36. *
  37. * 87: class tx_indexedsearch_crawler
  38. * 106: function crawler_init(&$pObj)
  39. * 219: function crawler_execute($params,&$pObj)
  40. * 285: function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
  41. * 345: function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
  42. * 414: function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
  43. * 458: function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)
  44. * 513: function cleanUpOldRunningConfigurations()
  45. *
  46. * SECTION: Helper functions
  47. * 579: function checkUrl($url,$urlLog,$baseUrl)
  48. * 602: function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
  49. * 645: function indexSingleRecord($r,$cfgRec,$rl=NULL)
  50. * 694: function loadIndexerClass()
  51. * 706: function getUidRootLineForClosestTemplate($id)
  52. * 739: function generateNextIndexingTime($cfgRec)
  53. * 778: function checkDeniedSuburls($url, $url_deny)
  54. * 798: function addQueueEntryForHook($cfgRec, $title)
  55. *
  56. * SECTION: Hook functions for TCEmain (indexing of records)
  57. * 830: function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
  58. *
  59. *
  60. * 879: class tx_indexedsearch_files
  61. * 888: function crawler_execute($params,&$pObj)
  62. * 913: function loadIndexerClass()
  63. *
  64. * TOTAL FUNCTIONS: 18
  65. * (This index is automatically created/updated by the extension "extdeveval")
  66. *
  67. */
  68. # To make sure the backend charset is available:
  // Create and initialize the global language object only when no object is
  // registered yet; code further down reads $GLOBALS['LANG']->charSet.
  if (is_object($GLOBALS['LANG']) === FALSE) {
      $GLOBALS['LANG'] = t3lib_div::makeInstance('language');
      // Initialize with the language key stored in the backend user's configuration.
      $GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
  }
  73. /**
  74. * Crawler hook for indexed search. Works with the "crawler" extension
  75. *
  76. * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
  77. * @package TYPO3
  78. * @subpackage tx_indexedsearch
  79. */
  80. class tx_indexedsearch_crawler {
  81. // Static:
  82. var $secondsPerExternalUrl = 3; // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
  83. // Internal, dynamic:
  84. var $instanceCounter = 0; // Counts up for each added URL (type 3)
  85. // Internal, static:
  86. var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'; // The object reference to this class.
  87. /**
  88. * Initialization of crawler hook.
  89. * This function is asked for each instance of the crawler and we must check if something is timed to happen and if so put entry(s) in the crawlers log to start processing.
  90. * In reality we select indexing configurations and evaluate if any of them needs to run.
  91. *
  92. * @param object Parent object (tx_crawler lib)
  93. * @return void
  94. */
  function crawler_init(&$pObj){
      // Select all indexing configurations which are waiting to be activated:
      // visible, start time reached, next indexing time passed and not running (set_id=0).
      $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
          '*',
          'index_config',
          'hidden=0
              AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
              AND timer_next_indexing<' . $GLOBALS['EXEC_TIME'] . '
              AND set_id=0
              ' . t3lib_BEfunc::deleteClause('index_config')
      );

      // Guard: if the lookup did not yield an array (e.g. the query failed) there is
      // nothing to start. We still fall through to the clean-up at the end.
      if (!is_array($indexingConfigurations)) {
          $indexingConfigurations = array();
      }

      // For each configuration, mark it as running and put a start entry in the crawler queue:
      foreach ($indexingConfigurations as $cfgRec) {

          // Generate a unique set-ID:
          $setId = t3lib_div::md5int(microtime());

          // Get next time:
          $nextTime = $this->generateNextIndexingTime($cfgRec);

          // Start process by updating index-config record (a non-zero set_id marks it as running):
          $field_array = array(
              'set_id' => $setId,
              'timer_next_indexing' => $nextTime,
              'session_data' => '',
          );
          $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);

          // Build the queue parameters based on configuration type.
          // $params stays NULL when nothing has to be queued for this type.
          $params = NULL;
          switch ($cfgRec['type']) {
              case 1:    // RECORDS:
                  $params = array(
                      'indexConfigUid' => $cfgRec['uid'],
                      'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                      'url' => 'Records (start)',    // Just for show.
                  );
                  break;
              case 2:    // FILES:
                  $params = array(
                      'indexConfigUid' => $cfgRec['uid'],    // General
                      'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),    // General
                      'url' => $cfgRec['filepath'],    // Partly general... (for URL and file types)
                      'depth' => 0    // Specific for URL and file types
                  );
                  break;
              case 3:    // External URL:
                  $params = array(
                      'indexConfigUid' => $cfgRec['uid'],    // General
                      'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),    // General
                      'url' => $cfgRec['externalUrl'],    // Partly general... (for URL and file types)
                      'depth' => 0    // Specific for URL and file types
                  );
                  break;
              case 4:    // Page tree
                  $params = array(
                      'indexConfigUid' => $cfgRec['uid'],    // General
                      'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),    // General
                      'url' => intval($cfgRec['alternative_source_pid']),    // Partly general... (for URL and file types and page tree (root))
                      'depth' => 0    // Specific for URL and file types and page tree
                  );
                  break;
              case 5:    // Meta configuration, nothing to do:
                  # NOOP
                  break;
              default:
                  // Custom types are delegated to a hook object registered under the type key.
                  if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
                      $hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
                      if (is_object($hookObj)) {
                          // $message was never defined before being handed to the hook; define it
                          // explicitly so no "undefined variable" notice is raised.
                          // NOTE(review): the hook's use of this argument is not visible here - confirm.
                          $message = NULL;
                          $params = array(
                              'indexConfigUid' => $cfgRec['uid'],    // General
                              'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'),    // General
                              'url' => $hookObj->initMessage($message),
                          );
                      }
                  }
                  break;
          }

          if (is_array($params)) {
              $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
          }
      }

      // Finally, look up all old index configurations which are finished and needs to be reset and done.
      $this->cleanUpOldRunningConfigurations();
  }
  183. /**
  184. * Call back function for execution of a log element
  185. *
  186. * @param array Params from log element. Must contain $params['indexConfigUid']
  187. * @param object Parent object (tx_crawler lib)
  188. * @return array Result array
  189. */
  function crawler_execute($params,&$pObj) {

      // Indexer configuration ID must exist:
      if (empty($params['indexConfigUid'])) {
          return array('log' => $params);
      }

      // Load the indexing configuration record. The result is checked before it is
      // used so that a missing record (or a failed query) does not raise notices.
      $cfgRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
          '*',
          'index_config',
          'uid=' . intval($params['indexConfigUid'])
      );
      $cfgRec = (is_array($cfgRows) && isset($cfgRows[0])) ? $cfgRows[0] : NULL;
      if (!is_array($cfgRec)) {
          return array('log' => $params);
      }

      // Unpack session data. The value comes from index_config.session_data, which this
      // method itself writes at the end of every call.
      // NOTE(review): unserialize() must never see untrusted input - confirm nothing else writes this field.
      $session_data = unserialize($cfgRec['session_data']);

      // Select which type:
      switch ($cfgRec['type']) {
          case 1:    // Records:
              $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
              break;
          case 2:    // Files
              $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
              break;
          case 3:    // External URL:
              $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
              break;
          case 4:    // Page tree:
              $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
              break;
          case 5:    // Meta
              # NOOP (should never enter here!)
              break;
          default:
              // Custom types are delegated to a hook object registered under the type key.
              if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
                  $hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
                  if (is_object($hookObj)) {
                      $this->pObj = $pObj;    // For addQueueEntryForHook()
                      $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
                  }
              }
              break;
      }

      // Save process data which might be modified (it is passed by reference above):
      $field_array = array(
          'session_data' => serialize($session_data)
      );
      $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);

      return array('log' => $params);
  }
  238. /**
  239. * Indexing records from a table
  240. *
  241. * @param array Indexing Configuration Record
  242. * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
  243. * @param array Parameters from the log queue.
  244. * @param object Parent object (from "crawler" extension!)
  245. * @return void
  246. */
  function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj) {

      // Only proceed when a table is configured and that table is registered in TCA:
      if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
          return;
      }

      // Init session data array if not already; 'uid' holds the last record uid processed:
      if (!is_array($session_data)) {
          $session_data = array(
              'uid' => 0
          );
      }

      // Init: records are read from the alternative source page if set, else from the configuration's page.
      $pid = intval($cfgRec['alternative_source_pid']) ? intval($cfgRec['alternative_source_pid']) : $cfgRec['pid'];
      $numberOfRecords = $cfgRec['recordsbatch'] ? t3lib_div::intInRange($cfgRec['recordsbatch'], 1) : 100;

      // Get root line:
      $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

      // Select the next batch: records with a uid above the last processed one, ordered by uid.
      $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
          '*',
          $cfgRec['table2index'],
          'pid = ' . intval($pid) . '
              AND uid > ' . intval($session_data['uid']) .
              t3lib_BEfunc::deleteClause($cfgRec['table2index']) .
              t3lib_BEfunc::BEenableFields($cfgRec['table2index']),
          '',
          'uid',
          $numberOfRecords
      );

      // Nothing left (or the lookup did not yield an array): no follow-up entry is queued.
      if (!is_array($recs) || !count($recs)) {
          return;
      }

      // Traverse:
      foreach ($recs as $r) {
          // Index single record:
          $this->indexSingleRecord($r, $cfgRec, $rl);

          // Update the UID we last processed:
          $session_data['uid'] = $r['uid'];
      }

      // Finally, set entry for next indexing of batch of records. The label is built from the
      // last processed uid kept in the session data rather than from the leftover loop variable.
      $nparams = array(
          'indexConfigUid' => $cfgRec['uid'],
          'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
          'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']')
      );
      $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
  }
  290. /**
  291. * Indexing files from fileadmin
  292. *
  293. * @param array Indexing Configuration Record
  294. * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
  295. * @param array Parameters from the log queue.
  296. * @param object Parent object (from "crawler" extension!)
  297. * @return void
  298. */
  function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj) {

      // Prepare path, making it absolute and checking:
      $readpath = $params['url'];
      if (!t3lib_div::isAbsPath($readpath)) {
          $readpath = t3lib_div::getFileAbsFileName($readpath);
      }
      if (!t3lib_div::isAllowedAbsPath($readpath)) {
          return;
      }

      if (@is_file($readpath)) {    // If file, index it!

          // Get root line (need to provide this when indexing external files)
          $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

          // Load indexer if not yet.
          $this->loadIndexerClass();

          // (Re)-Indexing file on page.
          $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
          $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
          $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
          $indexerObj->hash['phash'] = -1;    // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

          // Index document (path is passed relative to PATH_site):
          $indexerObj->indexRegularDocument(substr($readpath, strlen(PATH_site)), TRUE);
          return;
      }

      if (!@is_dir($readpath)) {
          return;
      }

      // Directory: read content and create new pending items for log.
      // Select files and directories in path:
      $extList = implode(',', t3lib_div::trimExplode(',', $cfgRec['extensions'], 1));
      $fileArr = array();
      $files = t3lib_div::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
      if (!is_array($files)) {
          $files = array();
      }

      // Sub-directories are only queued while the configured depth is not yet reached:
      $directoryList = t3lib_div::get_dirs($readpath);
      if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
          // Make sure exactly one slash separates the directory from the sub-directory name;
          // plain concatenation produced a wrong path whenever $readpath had no trailing slash.
          $directoryPrefix = rtrim($readpath, '/') . '/';
          foreach ($directoryList as $subdir) {
              if ((string)$subdir != '') {
                  $files[] = $directoryPrefix . $subdir . '/';
              }
          }
      }

      $files = t3lib_div::removePrefixPathFromList($files, PATH_site);
      // NOTE(review): the helper is presumed to return a non-array on a prefix mismatch - confirm.
      if (!is_array($files)) {
          return;
      }

      // Traverse the items and create log entries:
      foreach ($files as $path) {
          // Counted for every item (also skipped ones) so that execution times are spread out.
          $this->instanceCounter++;
          if ($path !== $params['url']) {
              // Parameters:
              $nparams = array(
                  'indexConfigUid' => $cfgRec['uid'],
                  'url' => $path,
                  'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                  'depth' => $params['depth'] + 1
              );
              $pObj->addQueueEntry_callBack(
                  $cfgRec['set_id'],
                  $nparams,
                  $this->callBack,
                  $cfgRec['pid'],
                  $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
              );
          }
      }
  }
  355. /**
  356. * Indexing External URLs
  357. *
  358. * @param array Indexing Configuration Record
  359. * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
  360. * @param array Parameters from the log queue.
  361. * @param object Parent object (from "crawler" extension!)
  362. * @return void
  363. */
  function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj) {

      // First call of a set: start the URL log with the URL being processed now.
      if (!is_array($session_data)) {
          $session_data = array(
              'urlLog' => array($params['url'])
          );
      }

      // Index the URL itself and collect the links found on it:
      $rootLineUids = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
      $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLineUids, $cfgRec['uid'], $cfgRec['set_id']);

      // Stop here once the configured crawl depth has been reached:
      if (!($params['depth'] < $cfgRec['depth'])) {
          return;
      }

      // Queue every acceptable, not yet logged and not denied sub URL:
      foreach ($foundUrls as $candidateUrl) {
          $acceptedUrl = $this->checkUrl($candidateUrl, $session_data['urlLog'], $cfgRec['externalUrl']);
          if (!$acceptedUrl) {
              continue;
          }
          if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
              continue;
          }

          $this->instanceCounter++;
          $session_data['urlLog'][] = $acceptedUrl;

          // Parameters for the follow-up queue entry, one level deeper:
          $queueParams = array(
              'indexConfigUid' => $cfgRec['uid'],
              'url' => $acceptedUrl,
              'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
              'depth' => $params['depth']+1
          );

          // Each entry is scheduled secondsPerExternalUrl seconds after the previous one:
          $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
          $pObj->addQueueEntry_callBack(
              $cfgRec['set_id'],
              $queueParams,
              $this->callBack,
              $cfgRec['pid'],
              $scheduledTime
          );
      }
  }
  400. /**
  401. * Page tree indexing type
  402. *
  403. * @param array Indexing Configuration Record
  404. * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
  405. * @param array Parameters from the log queue.
  406. * @param object Parent object (from "crawler" extension!)
  407. * @return void
  408. */
  function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj) {

      // Base page uid (for this type the queue entry's "url" carries a page uid):
      $pageUid = intval($params['url']);

      // Get array of URLs from page:
      $pageRow = t3lib_BEfunc::getRecord('pages', $pageUid);
      $res = $pObj->getUrlsForPageRow($pageRow);

      $duplicateTrack = array();    // Registry for duplicates
      $downloadUrls = array();      // Dummy.

      // Submit URLs. The call is made for its effect inside the crawler object;
      // its return value was never used here.
      if (is_array($res) && count($res)) {
          foreach ($res as $vv) {
              $pObj->urlListFromUrlArray(
                  $vv,
                  $pageRow,
                  $GLOBALS['EXEC_TIME'],
                  30,
                  1,
                  0,
                  $duplicateTrack,
                  $downloadUrls,
                  array('tx_indexedsearch_reindex')
              );
          }
      }

      // Add subpages to log now, unless the configured depth has been reached:
      if (!($params['depth'] < $cfgRec['depth'])) {
          return;
      }

      // Subpages selected
      $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
          'uid,title',
          'pages',
          'pid = ' . intval($pageUid) .
              t3lib_BEfunc::deleteClause('pages')
      );
      if (!is_array($recs) || !count($recs)) {
          return;
      }

      // Make sure the session data is an array before appending to its URL log,
      // in line with how the other indexing types initialize it.
      if (!is_array($session_data)) {
          $session_data = array(
              'urlLog' => array()
          );
      }

      // Traverse subpages and add to queue:
      foreach ($recs as $r) {
          $this->instanceCounter++;
          $url = 'pages:' . $r['uid'] . ': ' . $r['title'];
          $session_data['urlLog'][] = $url;

          // Parameters (the sub page uid travels in "url"):
          $nparams = array(
              'indexConfigUid' => $cfgRec['uid'],
              'url' => $r['uid'],
              'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
              'depth' => $params['depth'] + 1
          );
          $pObj->addQueueEntry_callBack(
              $cfgRec['set_id'],
              $nparams,
              $this->callBack,
              $cfgRec['pid'],
              $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
          );
      }
  }
  466. /**
  467. * Look up all old index configurations which are finished and needs to be reset and done
  468. *
  469. * @return void
  470. */
  function cleanUpOldRunningConfigurations() {

      // Lookup running index configurations (a non-zero set_id marks a running configuration):
      $runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
          'uid,set_id',
          'index_config',
          'set_id!=0' . t3lib_BEfunc::deleteClause('index_config')
      );
      if (!is_array($runningIndexingConfigurations)) {
          return;
      }

      // Tables holding per-phash index data (code copied from class.tx_indexedsearch_modfunc1.php).
      // Built once, since the list does not change per row.
      $tableArr = explode(',', 'index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');

      // For each running configuration, look up how many log entries there are which are scheduled
      // for execution and if none, clear the "set_id" (means; Processing was DONE)
      foreach ($runningIndexingConfigurations as $cfgRec) {

          // Look for ended processes:
          $queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
              '*',
              'tx_crawler_queue',
              'set_id=' . intval($cfgRec['set_id']) . ' AND exec_time=0'
          );

          // A FALSE count means the count could not be determined; do not mistake that for
          // "zero items left", because the steps below delete index data.
          if ($queued_items === FALSE || $queued_items) {
              continue;
          }

          // Lookup old phash rows, i.e. rows of this configuration written by another set than the current one:
          $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
              'phash',
              'index_phash',
              'freeIndexUid=' . intval($cfgRec['uid']) . ' AND freeIndexSetId!=' . intval($cfgRec['set_id'])
          );
          if (is_array($oldPhashRows)) {
              foreach ($oldPhashRows as $pHashRow) {
                  // Removing old registrations for all tables
                  foreach ($tableArr as $table) {
                      $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . intval($pHashRow['phash']));
                  }
              }
          }

          // End process by updating index-config record:
          $field_array = array(
              'set_id' => 0,
              'session_data' => '',
          );
          $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);
      }
  }
  509. /*****************************************
  510. *
  511. * Helper functions
  512. *
  513. *****************************************/
  514. /**
  515. * Checks whether an input URL is allowed to be indexed: it must not contain "../", must start with the base URL and must not already be present in the URL log.
  516. *
  517. * @param string URL string to check
  518. * @param array Array of already indexed URLs (input url is looked up here and must not exist already)
  519. * @param string Base URL of the indexing process (input URL must be "inside" the base URL!)
  520. * @return string Returns the URL (trailing double slash collapsed, "#fragment" removed) if OK, otherwise nothing (NULL)
  521. */
  function checkUrl($url,$urlLog,$baseUrl) {
      // Collapse a trailing double slash into one and cut off any "#fragment" part:
      $candidate = preg_replace('/\/\/$/','/',$url);
      $pieces = explode('#',$candidate);
      $candidate = $pieces[0];

      // Reject URLs containing back paths:
      if (strstr($candidate,'../')) {
          return NULL;
      }
      // Reject URLs which do not start with the base URL:
      if (!t3lib_div::isFirstPartOfStr($candidate,$baseUrl)) {
          return NULL;
      }
      // Reject URLs which are already in the log:
      if (in_array($candidate,$urlLog)) {
          return NULL;
      }

      return $candidate;
  }
  533. /**
  534. * Indexing External URL
  535. *
  536. * @param string URL, http://....
  537. * @param integer Page id to relate indexing to.
  538. * @param array Rootline array to relate indexing to
  539. * @param integer Configuration UID
  540. * @param integer Set ID value
  541. * @return array URLs found on this page
  542. */
  function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

      // Load indexer if not yet.
      $this->loadIndexerClass();

      // Index external URL:
      $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
      $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
      $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
      $indexerObj->hash['phash'] = -1;    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

      $indexerObj->indexExternalUrl($url);

      // Split the indexed URL; parse_url() may return FALSE or omit components,
      // so every component is read defensively.
      $url_qParts = parse_url($url);
      if (!is_array($url_qParts)) {
          $url_qParts = array();
      }
      $scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
      $host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
      $path = isset($url_qParts['path']) ? $url_qParts['path'] : '';

      // Scheme + host (+ port) is the prefix for links starting with "/".
      // The port has to be kept, otherwise links found on "host:port" pages would
      // be resolved against the default port of the host.
      $baseAbsoluteHref = $scheme . '://' . $host;
      if (isset($url_qParts['port'])) {
          $baseAbsoluteHref .= ':' . $url_qParts['port'];
      }

      $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
      if (!$baseHref) {
          // Extract base href from current URL: everything up to (not including) the last slash of the path.
          $baseHref = $baseAbsoluteHref;
          $lastSlashPosition = strrpos($path, '/');
          if ($lastSlashPosition !== FALSE) {
              $baseHref .= substr($path, 0, $lastSlashPosition);
          }
      }
      $baseHref = rtrim($baseHref, '/');

      // Get URLs on this page:
      $subUrls = array();
      $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
      if (!is_array($list)) {
          $list = array();
      }

      // Traverse links:
      foreach ($list as $linkInfo) {

          // Decode entities:
          $subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

          $qParts = parse_url($subUrl);
          if (!is_array($qParts) || empty($qParts['scheme'])) {
              // No scheme: treat the link as relative and make it absolute.
              $relativeUrl = t3lib_div::resolveBackPath($subUrl);
              // substr() instead of the removed {0} string offset; also safe for an empty string.
              if (substr($relativeUrl, 0, 1) === '/') {
                  $subUrl = $baseAbsoluteHref . $relativeUrl;
              } else {
                  $subUrl = $baseHref . '/' . $relativeUrl;
              }
          }

          $subUrls[] = $subUrl;
      }

      return $subUrls;
  }
  581. /**
  582. * Indexing Single Record
  583. *
  584. * @param array Record to index
  585. * @param array Configuration Record
  586. * @param array Rootline array to relate indexing to
  587. * @return void
  588. */
  function indexSingleRecord($r,$cfgRec,$rl=NULL) {

      // Load indexer if not yet.
      $this->loadIndexerClass();

      // Init: fall back to looking up the root line when none was passed in.
      $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
      $fieldList = t3lib_div::trimExplode(',', $cfgRec['fieldlist'], 1);
      $languageField = isset($GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'])
          ? $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField']
          : '';
      $sys_language_uid = $languageField ? $r[$languageField] : 0;

      // (Re)-Indexing a row from a table. The GET parameters come from the configuration,
      // with the ###UID### marker replaced by the record uid.
      $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
      parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
      $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
      $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
      $indexerObj->forceIndexing = TRUE;

      // The first field of the list is the title, all following fields are concatenated as content.
      // Both start out empty so an empty field list does not leave the title undefined.
      $theTitle = '';
      $theContent = '';
      foreach ($fieldList as $k => $v) {
          if (!$k) {
              $theTitle = $r[$v];
          } else {
              $theContent .= $r[$v] . ' ';
          }
      }

      // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
      // A space is inserted before every "<" so words separated only by tags do not merge when tags are stripped.
      $indexerObj->backend_indexAsTYPO3Page(
          strip_tags(str_replace('<', ' <', $theTitle)),
          '',
          '',
          strip_tags(str_replace('<', ' <', $theContent)),
          $GLOBALS['LANG']->charSet,    // Requires that $GLOBALS['LANG'] is an initialized object
          $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
          $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
          $r['uid']
      );
  }
  623. /**
  624. * Include indexer class.
  625. *
  626. * @return void
  627. */
  function loadIndexerClass() {
      // The included file is evaluated in this function's scope, so the global is
      // imported here. NOTE(review): presumably class.indexer.php reads
      // $TYPO3_CONF_VARS at file level - confirm before removing this line.
      global $TYPO3_CONF_VARS;

      $indexerClassFile = t3lib_extMgm::extPath('indexed_search').'class.indexer.php';
      require_once($indexerClassFile);
  }
  632. /**
  633. * Get rootline for closest TypoScript template root.
  634. * Algorithm same as used in Web > Template, Object browser
  635. *
  636. * @param integer The page id to traverse rootline back from
  637. * @return array Array where the root lines uid values are found.
  638. */
  function getUidRootLineForClosestTemplate($id) {
      global $TYPO3_CONF_VARS;

      // Template parser, with time-performance logging switched off:
      $templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
      $templateParser->tt_track = 0;
      $templateParser->init();

      // Fetch the root line of the page and run the templates found along it.
      // This generates the constants/config + hierarchy info for the template.
      $pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
      $pageRootLine = $pageSelect->getRootLine($id);
      $templateParser->runThroughTemplates($pageRootLine,0);

      // Reduce the parser's resulting root line to its uid values, keeping the keys:
      $uidsByLevel = array();
      foreach ($templateParser->rootLine as $level => $pageRecord) {
          $uidsByLevel[$level] = $pageRecord['uid'];
      }

      return $uidsByLevel;
  }
  655. /**
  656. * Generate the unix time stamp for next visit.
  657. *
  658. * @param array Index configuration record
  659. * @return integer The next time stamp
  660. */
  function generateNextIndexingTime($cfgRec) {
      $currentTime = $GLOBALS['EXEC_TIME'];

      // Now, find a midnight time to use for offset calculation. This has to differ depending on whether
      // we have frequencies within a day or more than a day; Less than a day, we don't care which day to
      // use for offset, more than a day we want to respect the currently entered day as offset regardless
      // of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
      if ($cfgRec['timer_frequency'] <= 24*3600) {
          // Midnight of yesterday:
          $aMidNight = mktime(0, 0, 0) - 1*24*3600;
      } else {
          $lastTime = intval($cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $GLOBALS['EXEC_TIME']);
          // Use the four-digit year ('Y') so the result does not depend on mktime()'s
          // interpretation of two-digit years.
          $aMidNight = mktime(
              0, 0, 0,
              intval(date('m', $lastTime)),
              intval(date('d', $lastTime)),
              intval(date('Y', $lastTime))
          );
      }

      // Find last offset time plus frequency in seconds:
      $lastSureOffset = $aMidNight + t3lib_div::intInRange($cfgRec['timer_offset'], 0, 86400);
      $frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'], 1);

      // Now, find out how many blocks of the length of frequency there is until the next time:
      $frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);

      // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
      // ceil() yields a float; cast so that an integer time stamp is returned as documented.
      $nextTime = $lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds;

      return intval($nextTime);
  }
  679. /**
  680. * Checks whether $url starts with any of the URLs in the $url_deny "list" and, if so, returns TRUE.
  681. *
  682. * @param string URL to test
  683. * @param string String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
  684. * @return boolean TRUE if there is a matching URL (hence, do not index!)
  685. */
  function checkDeniedSuburls($url, $url_deny) {
      // An empty deny list denies nothing:
      if (!trim($url_deny)) {
          return FALSE;
      }

      // One deny prefix per line; empty lines are dropped:
      $url_denyArray = t3lib_div::trimExplode(LF, $url_deny, 1);
      foreach ($url_denyArray as $testurl) {
          if (t3lib_div::isFirstPartOfStr($url, $testurl)) {
              // The leftover debug "echo" of the URL and the deny list was removed here;
              // a pure check function should not write to the output.
              return TRUE;
          }
      }

      return FALSE;
  }
  698. /**
  699. * Adding entry in queue for Hook
  700. *
  701. * @param array Configuration record
  702. * @param string Title/URL
  703. * @return void
  704. */
  function addQueueEntryForHook($cfgRec, $title) {
      $configurationUid = $cfgRec['uid'];

      $queueParameters = array(
          // This must ALWAYS be the cfgRec uid!
          'indexConfigUid' => $configurationUid,
          'url' => $title,
          // Just for information: shows that an indexing configuration added the entry.
          'procInstructions' => array('[Index Cfg UID#'.$configurationUid.']')
      );

      // $this->pObj is the crawler object stored by crawler_execute() before a hook is called.
      $this->pObj->addQueueEntry_callBack(
          $cfgRec['set_id'],
          $queueParameters,
          $this->callBack,
          $cfgRec['pid']
      );
  }
  713. /**
  714. * Deletes all data stored by indexed search for a given page
  715. *
  716. * @param integer Uid of the page to delete all pHash
  717. * @return void
  718. */
  function deleteFromIndex($id) {

      // Lookup old phash rows registered for the page in index_section:
      $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . intval($id));

      // Nothing indexed for this page (or the lookup did not yield an array): nothing to delete.
      if (!is_array($oldPhashRows) || !count($oldPhashRows)) {
          return;
      }

      $pHashesToDelete = array();
      foreach ($oldPhashRows as $pHashRow) {
          $pHashesToDelete[] = $pHashRow['phash'];
      }

      // One DELETE per index table, matching all collected phash values at once:
      $where_clause = 'phash IN (' . implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($pHashesToDelete)) . ')';
      $tables = explode(',', 'index_debug,index_fulltext,index_grlist,index_phash,index_rel,index_section');
      foreach ($tables as $table) {
          $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where_clause);
      }
  }
  734. /*************************
  735. *
  736. * Hook functions for TCEmain (indexing of records)
  737. *
  738. *************************/
  739. /**
  740. * TCEmain hook function which removes a page's data from the index when the page is deleted
  741. *
  742. * @param string TCEmain command
  743. * @param string Table name
  744. * @param string Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
  745. * @param mixed Target value (ignored)
  746. * @param object Reference to tcemain calling object
  747. * @return void
  748. */
  function processCmdmap_preProcess($command, $table, $id, $value, $pObj) {
      // Only the deletion of a page is of interest here; every other command is ignored.
      if ($command != 'delete' || $table != 'pages') {
          return;
      }

      // Clean up the index for the page that is about to be deleted.
      $this->deleteFromIndex($id);
  }
  755. /**
  756. * TCEmain hook function for on-the-fly indexing of database records
  757. *
  758. * @param string Status "new" or "update"
  759. * @param string Table name
  760. * @param string Record ID. If the record is new, it is a string pointing to an index inside t3lib_tcemain::substNEWwithIDs
  761. * @param array Field array of updated fields in the operation
  762. * @param object Reference to tcemain calling object
  763. * @return void
  764. */
  function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj) {
      // Nothing to do when no field was actually written.
      // empty() also covers a NULL field array without calling count() on a non-countable value.
      if (empty($fieldArray)) {
          return;
      }

      if ($status == 'new') {
          // Translate the NEW... placeholder into the real uid.
          // A missing mapping yields NULL (as before), but no longer raises an undefined index notice.
          $id = isset($pObj->substNEWwithIDs[$id]) ? $pObj->substNEWwithIDs[$id] : NULL;
      } elseif ($table == 'pages' && $status == 'update') {
          $pageGetsHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
          $pageLeavesSearch = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;

          // If the page should be hidden or not indexed after the update, delete the index for this page.
          if ($pageGetsHidden || $pageLeavesSearch) {
              $this->deleteFromIndex($id);
          }
      }

      // Get the full record; without it there is nothing to index.
      $currentRecord = t3lib_BEfunc::getRecord($table, $id);
      if (!is_array($currentRecord)) {
          return;
      }

      // Select all (not running) indexing configurations of type "record" (1) which point to this
      // table and are either located on the same page as the record or point to it as source PID.
      $recordPid = intval($currentRecord['pid']);
      $conditions = array(
          'hidden=0',
          '(starttime=0 OR starttime<=' . intval($GLOBALS['EXEC_TIME']) . ')',
          'set_id=0',
          'type=1',
          'table2index=' . $GLOBALS['TYPO3_DB']->fullQuoteStr($table, 'index_config'),
          '((alternative_source_pid=0 AND pid=' . $recordPid . ') OR (alternative_source_pid=' . $recordPid . '))',
          'records_indexonchange=1'
      );
      $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
          '*',
          'index_config',
          implode(' AND ', $conditions) . ' ' . t3lib_BEfunc::deleteClause('index_config')
      );

      // NOTE(review): exec_SELECTgetRows() presumably returns NULL on SQL errors - confirm.
      // The is_array() guard keeps foreach from being run on a non-array value.
      if (is_array($indexingConfigurations)) {
          foreach ($indexingConfigurations as $cfgRec) {
              $this->indexSingleRecord($currentRecord, $cfgRec);
          }
      }
  }
  800. }
  801. /**
  802. * Crawler hook for indexed search. Works with the "crawler" extension
  803. * This hook is specifically used to index external files found on pages through the crawler extension.
  804. *
  805. * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
  806. * @package TYPO3
  807. * @subpackage tx_indexedsearch
  808. * @see tx_indexedsearch_indexer::extractLinks()
  809. */
  class tx_indexedsearch_files {

      /**
       * Call back function for execution of a log element.
       * Indexes the document named in $params['document'] using the indexer configuration in $params['conf'].
       *
       * @param array Params from log element. Keys read here: 'conf', 'document' and optionally 'alturl'.
       * @param object Parent object (tx_crawler lib)
       * @return array Result array. Nothing is returned when $params['conf'] is not an array.
       */
      function crawler_execute($params,&$pObj) {
          // Load indexer if not yet.
          $this->loadIndexerClass();

          // Without an indexer configuration there is nothing to do (no explicit return value, as before).
          if (!isset($params['conf']) || !is_array($params['conf'])) {
              return;
          }

          // Initialize the indexer class:
          $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
          $indexerObj->conf = $params['conf'];
          $indexerObj->init();

          // Index document:
          if (!empty($params['alturl'])) {
              // PATHINFO_EXTENSION yields an empty string for documents without an extension,
              // which avoids reading an undefined 'extension' key from the pathinfo() array.
              $ext = strtolower(pathinfo($params['document'], PATHINFO_EXTENSION));
              $indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $ext);
          } else {
              $indexerObj->indexRegularDocument($params['document'], TRUE);
          }

          // Return OK:
          return array('content' => array());
      }

      /**
       * Include indexer class.
       *
       * @return void
       */
      function loadIndexerClass() {
          // Do not remove: a file included inside a function is evaluated in that function's scope.
          // NOTE(review): class.indexer.php presumably reads $TYPO3_CONF_VARS (XCLASS footer) - confirm.
          global $TYPO3_CONF_VARS;

          require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
      }
  }
  // XCLASS support: load an extending class file if one has been registered for this script.
  if (defined('TYPO3_MODE')) {
      if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']) {
          include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
      }
  }
  851. ?>