PageRenderTime 58ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/typo3/sysext/indexed_search/class.indexer.php

https://bitbucket.org/linxpinx/mercurial
PHP | 2253 lines | 1202 code | 318 blank | 733 comment | 158 complexity | bf71260307be889c9de476dc5d5bb85f MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-2.0, Unlicense, LGPL-2.1, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. <?php
  2. /***************************************************************
  3. * Copyright notice
  4. *
  5. * (c) 2001-2010 Kasper Skaarhoj (kasperYYYY@typo3.com)
  6. * All rights reserved
  7. *
  8. * This script is part of the TYPO3 project. The TYPO3 project is
  9. * free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * The GNU General Public License can be found at
  15. * http://www.gnu.org/copyleft/gpl.html.
  16. * A copy is found in the textfile GPL.txt and important notices to the license
  17. * from the author is found in LICENSE.txt distributed with these scripts.
  18. *
  19. *
  20. * This script is distributed in the hope that it will be useful,
  21. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  22. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  23. * GNU General Public License for more details.
  24. *
  25. * This copyright notice MUST APPEAR in all copies of the script!
  26. ***************************************************************/
  27. /**
  28. * This class is a search indexer for TYPO3
  29. *
  30. * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
  31. * Originally Christian Jul Jensen <christian@jul.net> helped as well.
  32. */
  33. /**
  34. * [CLASS/FUNCTION INDEX of SCRIPT]
  35. *
  36. *
  37. *
  38. * 141: class tx_indexedsearch_indexer
  39. * 207: function hook_indexContent(&$pObj)
  40. *
  41. * SECTION: Backend API
  42. * 308: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
  43. * 347: function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
  44. * 365: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
  45. *
  46. * SECTION: Initialization
  47. * 416: function init()
  48. * 468: function initializeExternalParsers()
  49. *
  50. * SECTION: Indexing; TYPO3 pages (HTML content)
  51. * 509: function indexTypo3PageContent()
  52. * 596: function splitHTMLContent($content)
  53. * 642: function getHTMLcharset($content)
  54. * 657: function convertHTMLToUtf8($content,$charset='')
  55. * 685: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
  56. * 712: function typoSearchTags(&$body)
  57. * 741: function extractLinks($content)
  58. * 812: function extractHyperLinks($string)
  59. *
  60. * SECTION: Indexing; external URL
  61. * 871: function indexExternalUrl($externalUrl)
  62. * 902: function getUrlHeaders($url)
  63. *
  64. * SECTION: Indexing; external files (PDF, DOC, etc)
  65. * 948: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
  66. * 1054: function readFileContent($ext,$absFile,$cPKey)
  67. * 1071: function fileContentParts($ext,$absFile)
  68. * 1089: function splitRegularContent($content)
  69. *
  70. * SECTION: Analysing content, Extracting words
  71. * 1122: function charsetEntity2utf8(&$contentArr, $charset)
  72. * 1145: function processWordsInArrays($contentArr)
  73. * 1170: function procesWordsInArrays($contentArr)
  74. * 1180: function bodyDescription($contentArr)
  75. * 1202: function indexAnalyze($content)
  76. * 1223: function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
  77. * 1242: function analyzeBody(&$retArr,$content)
  78. * 1262: function metaphone($word,$retRaw=FALSE)
  79. *
  80. * SECTION: SQL; TYPO3 Pages
  81. * 1304: function submitPage()
  82. * 1378: function submit_grlist($hash,$phash_x)
  83. * 1398: function submit_section($hash,$hash_t3)
  84. * 1416: function removeOldIndexedPages($phash)
  85. *
  86. * SECTION: SQL; External media
  87. * 1459: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
  88. * 1525: function submitFile_grlist($hash)
  89. * 1539: function submitFile_section($hash)
  90. * 1553: function removeOldIndexedFiles($phash)
  91. *
  92. * SECTION: SQL Helper functions
  93. * 1589: function checkMtimeTstamp($mtime,$phash)
  94. * 1625: function checkContentHash()
  95. * 1642: function checkExternalDocContentHash($hashGr,$content_md5h)
  96. * 1656: function is_grlist_set($phash_x)
  97. * 1669: function update_grlist($phash,$phash_x)
  98. * 1684: function updateTstamp($phash,$mtime=0)
  99. * 1699: function updateSetId($phash)
  100. * 1714: function updateParsetime($phash,$parsetime)
  101. * 1727: function updateRootline()
  102. * 1742: function getRootLineFields(&$fieldArr)
  103. * 1761: function removeLoginpagesWithContentHash()
  104. * 1778: function includeCrawlerClass()
  105. *
  106. * SECTION: SQL; Submitting words
  107. * 1805: function checkWordList($wl)
  108. * 1842: function submitWords($wl,$phash)
  109. * 1866: function freqMap($freq)
  110. *
  111. * SECTION: Hashing
  112. * 1899: function setT3Hashes()
  113. * 1925: function setExtHashes($file,$subinfo=array())
  114. * 1949: function md5inthash($str)
  115. * 1959: function makeCHash($paramArray)
  116. *
  117. * SECTION: Internal logging functions
  118. * 1991: function log_push($msg,$key)
  119. * 2000: function log_pull()
  120. * 2011: function log_setTSlogMessage($msg, $errorNum=0)
  121. *
  122. * SECTION: tslib_fe hooks:
  123. * 2036: function fe_headerNoCache(&$params, $ref)
  124. *
  125. * TOTAL FUNCTIONS: 59
  126. * (This index is automatically created/updated by the extension "extdeveval")
  127. *
  128. */
  129. /**
  130. * Indexing class for TYPO3 frontend
  131. *
  132. * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
  133. * @package TYPO3
  134. * @subpackage tx_indexedsearch
  135. */
  136. class tx_indexedsearch_indexer {
  137. // Messages:
  138. var $reasons = array(
  139. -1 => 'mtime matched the document, so no changes detected and no content updated',
  140. -2 => 'The minimum age was not exceeded',
  141. 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
  142. 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
  143. 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
  144. 4 => 'Page has never been indexed (is not represented in the index_phash table).'
  145. );
  146. // HTML code blocks to exclude from indexing:
  147. var $excludeSections = 'script,style';
  148. // Supported Extensions for external files:
  149. var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods.
  150. // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
  151. var $defaultGrList = '0,-1';
  152. // Min/Max times:
  153. var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
  154. var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
  155. var $maxExternalFiles = 0; // Max number of external files to index.
  156. var $forceIndexing = FALSE; // If true, indexing is forced despite of hashes etc.
  157. var $crawlerActive = FALSE; // Set when crawler is detected (internal)
  158. // INTERNALS:
  159. var $defaultContentArray=array(
  160. 'title' => '',
  161. 'description' => '',
  162. 'keywords' => '',
  163. 'body' => '',
  164. );
  165. var $wordcount = 0;
  166. var $externalFileCounter = 0;
  167. var $conf = array(); // Configuration set internally (see init functions for required keys and their meaning)
  168. var $indexerConfig = array(); // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
  169. var $hash = array(); // Hash array, contains phash and phash_grouping
  170. var $file_phash_arr = array(); // Hash array for files
  171. var $contentParts = array(); // Content of TYPO3 page
  172. var $content_md5h = '';
  173. var $internal_log = array(); // Internal log
  174. var $indexExternalUrl_content = '';
  175. var $cHashParams = array(); // cHashparams array
  176. var $freqRange = 32000;
  177. var $freqMax = 0.1;
  178. // Objects:
  179. /**
  180. * Charset class object
  181. *
  182. * @var t3lib_cs
  183. */
  184. var $csObj;
  185. /**
  186. * Metaphone object, if any
  187. *
  188. * @var user_DoubleMetaPhone
  189. */
  190. var $metaphoneObj;
  191. /**
  192. * Lexer object for word splitting
  193. *
  194. * @var tx_indexedsearch_lexer
  195. */
  196. var $lexerObj;
  /**
   * Frontend hook: decides whether the page held by the parent object should be indexed
   * and, if so, configures this indexer from the parent object and indexes the page content.
   *
   * Indexing is skipped (and a message is logged) when frontend indexing is disabled and no
   * crawler is forcing a re-index, when the page has the "No Search" flag, when the page is
   * rendered with "no_cache", or when sys_language_uid differs from sys_language_content.
   * Nothing at all happens if config.index_enable is not set.
   *
   * @param object $pObj Parent Object (frontend TSFE object), passed by reference
   * @return void
   */
  function hook_indexContent(&$pObj) {
      // Indexer configuration from Extension Manager interface:
      $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

      // Crawler activation:
      // Requirements are that the crawler is loaded, a crawler session is running and
      // re-indexing is requested as processing instruction. The is_array() check keeps
      // in_array() from being called with a non-array when no processing instructions are set.
      if (t3lib_extMgm::isLoaded('crawler')
          && $pObj->applicationData['tx_crawler']['running']
          && is_array($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
          && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {

          // Setting simple log message:
          $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

          // Setting variables:
          $this->crawlerActive = TRUE; // Crawler active flag
          $this->forceIndexing = TRUE; // Force indexing despite timestamps etc.
      }

      // Indexing must be enabled for the page at all:
      if (!$pObj->config['config']['index_enable']) {
          return;
      }

      $this->log_push('Index page','');

      // Determine if page should be indexed, and if so, configure and initialize indexer.
      // Each branch below is one reason NOT to index; only the final branch indexes.
      if ($indexerConfig['disableFrontendIndexing'] && !$this->crawlerActive) {
          $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
      } elseif ($pObj->page['no_search']) {
          $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
      } elseif ($pObj->no_cache) {
          $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
      } elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
          $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
      } else {
          // Setting up internal configuration from config array:
          $this->conf = array();

          // Information about page for which the indexing takes place
          $this->conf['id'] = $pObj->id; // Page id
          $this->conf['type'] = $pObj->type; // Page type
          $this->conf['sys_language_uid'] = $pObj->sys_language_uid; // sys_language UID of the language of the indexing.
          $this->conf['MP'] = $pObj->MP; // MP variable, if any (Mount Points)
          $this->conf['gr_list'] = $pObj->gr_list; // Group list
          $this->conf['cHash'] = $pObj->cHash; // cHash string for additional parameters
          $this->conf['cHash_array'] = $pObj->cHash_array; // Array of the additional parameters
          $this->conf['crdate'] = $pObj->page['crdate']; // The creation date of the TYPO3 page
          $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1; // reg1 of the caching table. Not known what practical use this has.

          // Root line uids
          $this->conf['rootline_uids'] = array();
          foreach($pObj->config['rootLine'] as $rlkey => $rldat) {
              $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
          }

          // Content of page:
          $this->conf['content'] = $pObj->content; // Content string (HTML of TYPO3 page)
          $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle); // Alternative title for indexing
          $this->conf['metaCharset'] = $pObj->metaCharset; // Character set of content (will be converted to utf-8 during indexing)
          $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED']; // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

          // Configuration of behavior:
          $this->conf['index_externals'] = $pObj->config['config']['index_externals']; // Whether to index external documents like PDF, DOC etc. (if possible)
          $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd']; // Length of description text (max 250, default 200)
          $this->conf['index_metatags'] = isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : true;

          // Set to zero:
          $this->conf['recordUid'] = 0;
          $this->conf['freeIndexUid'] = 0;
          $this->conf['freeIndexSetId'] = 0;

          // Init and start indexing:
          $this->init();
          $this->indexTypo3PageContent();
      }

      $this->log_pull();
  }
  264. /****************************
  265. *
  266. * Backend API
  267. *
  268. ****************************/
  /**
   * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached).
   * Replaces $this->conf with a fresh configuration built from the arguments plus fixed defaults, then calls init().
   *
   * @param integer $id The page uid, &id=
   * @param integer $type The page type, &type=
   * @param integer $sys_language_uid sys_language uid, typically &L=
   * @param string $MP The MP variable (Mount Points), &MP=
   * @param array $uidRL Rootline array of only UIDs.
   * @param array $cHash_array Array of GET variables to register with this indexing
   * @param boolean $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend!
   * @return void
   */
  function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {
      // cHash string for additional parameters; only calculated on request:
      if ($createCHash) {
          $cHashValue = t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array));
      } else {
          $cHashValue = '';
      }

      // Setting up internal configuration in one go:
      $this->conf = array(
          // Information about page for which the indexing takes place
          'id' => $id, // Page id (integer)
          'type' => $type, // Page type (integer)
          'sys_language_uid' => $sys_language_uid, // sys_language UID of the language of the indexing (integer)
          'MP' => $MP, // MP variable, if any (Mount Points) (string)
          'gr_list' => '0,-1', // Group list (hardcoded for now...)
          // cHash values:
          'cHash' => $cHashValue,
          'cHash_array' => $cHash_array, // Array of the additional parameters
          // Defaults:
          'freeIndexUid' => 0,
          'freeIndexSetId' => 0,
          'page_cache_reg1' => '',
          // Root line uids
          'rootline_uids' => $uidRL,
          // Configuration of behavior:
          'index_externals' => 1, // Whether to index external documents like PDF, DOC etc. (if possible)
          'index_descrLgd' => 200, // Length of description text (max 250, default 200)
          'index_metatags' => true, // Whether to index document keywords and description (if present)
      );

      // Init and start indexing:
      $this->init();
  }
  /**
   * Stores the free-index uid and set id in the indexer configuration.
   * Can be called right after backend_initIndexer().
   *
   * @param integer $freeIndexUid Free index UID
   * @param integer $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
   * @return void
   */
  function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
      $freeIndexSettings = array(
          'freeIndexUid' => $freeIndexUid,
          'freeIndexSetId' => $freeIndexSetId,
      );
      foreach ($freeIndexSettings as $confKey => $confValue) {
          $this->conf[$confKey] = $confValue;
      }
  }
  /**
   * Indexing records as the content of a TYPO3 page.
   * Wraps the given texts in a minimal HTML document (all values HTML-escaped) and indexes
   * that document through indexTypo3PageContent().
   *
   * @param string $title Title equivalent
   * @param string $keywords Keywords equivalent
   * @param string $description Description equivalent
   * @param string $content The main content to index
   * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
   * @param integer $mtime Last modification time, in seconds
   * @param integer $crdate The creation date of the content, in seconds
   * @param integer $recordUid The record UID that the content comes from (for registration with the indexed rows)
   * @return void
   */
  function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {
      // Escape everything that is placed into the fake HTML document:
      $escapedTitle = htmlspecialchars($title);
      $escapedKeywords = htmlspecialchars($keywords);
      $escapedDescription = htmlspecialchars($description);
      $escapedBody = htmlspecialchars($content);

      // Meta data of the content:
      $this->conf['mtime'] = $mtime; // Most recent modification time (seconds) of the content
      $this->conf['crdate'] = $crdate; // The creation date of the TYPO3 content
      $this->conf['recordUid'] = $recordUid; // UID of the record, if applicable

      // Construct fake HTML for parsing (content string, treated like the HTML of a TYPO3 page):
      $this->conf['content'] = '
<html>
<head>
<title>'.$escapedTitle.'</title>
<meta name="keywords" content="'.$escapedKeywords.'" />
<meta name="description" content="'.$escapedDescription.'" />
</head>
<body>
'.$escapedBody.'
</body>
</html>';

      // Initializing charset:
      $this->conf['metaCharset'] = $charset; // Character set of content (will be converted to utf-8 during indexing)
      $this->conf['indexedDocTitle'] = ''; // Alternative title for indexing

      // Index content as if it was a TYPO3 page:
      $this->indexTypo3PageContent();
  }
  353. /********************************
  354. *
  355. * Initialization
  356. *
  357. *******************************/
  /**
   * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
   *
   * Sets up the cHash parameters, the page hashes (via setT3Hashes()), the age/file limits read
   * from the Extension Manager configuration, the external parsers (if enabled), the lexer,
   * the optional metaphone object and the charset object.
   *
   * @return void
   */
  function init() {
      global $TYPO3_CONF_VARS;

      // Initializing:
      $this->cHashParams = $this->conf['cHash_array'];
      if (is_array($this->cHashParams) && count($this->cHashParams)) {
          if ($this->conf['cHash']) {
              $this->cHashParams['cHash'] = $this->conf['cHash']; // Add this so that URL's come out right...
          }
          unset($this->cHashParams['encryptionKey']); // encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
      }

      // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
      $this->setT3Hashes();

      // Indexer configuration from Extension Manager interface:
      $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
      $this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
      $this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
      $this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
      $this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

      // Initialize external document parsers:
      // Example configuration, see ext_localconf.php of this file!
      if ($this->conf['index_externals']) {
          $this->initializeExternalParsers();
      }

      // Initialize lexer (class that deconstructs the text into words):
      // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
      $lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ?
          $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] :
          'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
      $this->lexerObj = t3lib_div::getUserObj($lexerObjRef);
      // Only touch the lexer if the reference resolved to an object: assigning a property on a
      // non-object either raises an error or silently creates a placeholder object, depending on the PHP version.
      if (is_object($this->lexerObj)) {
          $this->lexerObj->debug = $this->indexerConfig['debugMode'];
      }

      // Initialize metaphone hook:
      // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
      if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
          $this->metaphoneObj = t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
          // Same guard as for the lexer: a misconfigured hook must not yield a placeholder object.
          if (is_object($this->metaphoneObj)) {
              $this->metaphoneObj->pObj = $this;
          }
      }

      // Init charset class:
      $this->csObj = t3lib_div::makeInstance('t3lib_cs');
  }
  /**
   * Initialize external parsers.
   *
   * Instantiates every parser registered in
   * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] (keys are file extensions),
   * hands it a back-reference to this indexer and keeps it in $this->external_parsers only if
   * its initParser() call returns a true value. References that do not resolve to an object
   * are skipped instead of causing a method call on a non-object.
   *
   * @return void
   * @access private
   * @see init()
   */
  function initializeExternalParsers() {
      global $TYPO3_CONF_VARS;

      if (!is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
          return;
      }

      foreach ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
          $this->external_parsers[$extension] = t3lib_div::getUserObj($_objRef);

          // A broken parser reference must not abort the whole initialization:
          if (!is_object($this->external_parsers[$extension])) {
              unset($this->external_parsers[$extension]);
              continue;
          }

          $this->external_parsers[$extension]->pObj = $this;

          // Init parser and if it returns false, unset its entry again:
          if (!$this->external_parsers[$extension]->initParser($extension)) {
              unset($this->external_parsers[$extension]);
          }
      }
  }
  420. /********************************
  421. *
  422. * Indexing; TYPO3 pages (HTML content)
  423. *
  424. *******************************/
  /**
   * Start indexing of the TYPO3 page configured in $this->conf.
   *
   * The page is (re-)indexed when checkMtimeTstamp() returns a positive reason code, when no
   * gr_list entry exists for the page hash, or when indexing is forced. If a page with the same
   * content hash is already indexed (and the reason code is not 1), only timestamp, set id,
   * gr_list and rootline are updated.
   *
   * @return void
   */
  function indexTypo3PageContent() {
      $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
      $is_grlist = $this->is_grlist_set($this->hash['phash']);

      if ($check > 0 || !$is_grlist || $this->forceIndexing) {

          // Setting message:
          if ($this->forceIndexing) {
              $this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
          } elseif ($check > 0) {
              $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
          } else {
              $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
          }

          // Divide into title,keywords,description and body:
          $this->log_push('Split content','');
          $this->contentParts = $this->splitHTMLContent($this->conf['content']);
          if ($this->conf['indexedDocTitle']) {
              $this->contentParts['title'] = $this->conf['indexedDocTitle'];
          }
          $this->log_pull();

          // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
          // Note: the separator is passed first; the reversed (array, separator) order is not accepted by PHP 8.
          $this->content_md5h = $this->md5inthash(implode('', $this->contentParts));

          // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
          // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
          // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
          $checkCHash = $this->checkContentHash();
          if (!is_array($checkCHash) || $check===1) {
              $Pstart=t3lib_div::milliseconds();

              $this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
              $this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
              $this->log_pull();

              // Splitting words
              $this->log_push('Extract words from content','');
              $splitInWords = $this->processWordsInArrays($this->contentParts);
              $this->log_pull();

              // Analyse the indexed words.
              $this->log_push('Analyse the extracted words','');
              $indexArr = $this->indexAnalyze($splitInWords);
              $this->log_pull();

              // Submitting page (phash) record
              $this->log_push('Submitting page','');
              $this->submitPage();
              $this->log_pull();

              // Check words and submit to word list if not there
              $this->log_push('Check word list and submit words','');
              $this->checkWordList($indexArr);
              $this->submitWords($indexArr,$this->hash['phash']);
              $this->log_pull();

              // Set parsetime
              $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

              // Checking external files if configured for.
              $this->log_push('Checking external files','');
              if ($this->conf['index_externals']) {
                  $this->extractLinks($this->conf['content']);
              }
              $this->log_pull();
          } else {
              $this->updateTstamp($this->hash['phash'],$this->conf['mtime']); // Update the timestamp
              $this->updateSetId($this->hash['phash']);
              $this->update_grlist($checkCHash['phash'],$this->hash['phash']); // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
              $this->updateRootline();
              $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
          }
      } else {
          $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
      }
  }
  /**
   * Splits HTML content and returns an associative array, with title, a list of metatags, and the text of the body.
   *
   * Everything before the first "<body" is treated as the head part. If the document has no
   * "<body" at all, the whole document is treated as head (so title and meta tags are still
   * found) and the body is empty.
   *
   * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the title by ":")
   * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
   * @see splitRegularContent()
   */
  function splitHTMLContent($content) {
      // divide head from body ( u-ouh :) )
      $contentArr = $this->defaultContentArray;
      $bodyPart = stristr($content,'<body');
      if ($bodyPart === FALSE) {
          // Without this branch the head would be substr($content, 0, -0), i.e. an empty string.
          $contentArr['body'] = '';
          $headPart = $content;
      } else {
          $contentArr['body'] = $bodyPart;
          $headPart = substr($content,0,-strlen($bodyPart));
      }

      // get title: if the title contains ":" only the part after the first ":" is used
      $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
      $titleParts = explode(':',$contentArr['title'],2);
      $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

      // get keywords and description metatags
      if ($this->conf['index_metatags']) {
          $meta = array();
          // Collect the attribute strings of all meta tags; each call consumes one tag from $headPart
          for ($i=0; $this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]); $i++) { /*nothing*/ }
          for ($i=0; isset($meta[$i]); $i++) {
              $attributes = t3lib_div::get_tag_attributes($meta[$i]);
              // Meta tags without name/content (e.g. http-equiv ones) are treated as empty
              $metaName = isset($attributes['name']) ? $attributes['name'] : '';
              $metaContent = isset($attributes['content']) ? $attributes['content'] : '';
              if (stristr($metaName, 'keywords')) {
                  $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
              }
              if (stristr($metaName, 'description')) {
                  $contentArr['description'] .= ',' . $metaContent;
              }
          }
      }

      // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
      $this->typoSearchTags($contentArr['body']);

      // Get rid of unwanted sections (ie. scripting and style stuff) in body
      $tagList = explode(',',$this->excludeSections);
      foreach ($tagList as $tag) {
          while ($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
      }

      // remove tags, but first make sure we don't concatenate words by doing it
      $contentArr['body'] = str_replace('<',' <',$contentArr['body']);
      $contentArr['body'] = trim(strip_tags($contentArr['body']));
      $contentArr['keywords'] = trim($contentArr['keywords']);
      $contentArr['description'] = trim($contentArr['description']);

      // Return array
      return $contentArr;
  }
  /**
   * Extract the charset value from a HTML meta tag.
   *
   * Two declarations are recognised, in this order:
   * - <meta http-equiv="Content-Type" content="...; charset=XYZ">
   * - the short form <meta charset="XYZ"> (charset must be the first attribute)
   *
   * @param string $content HTML content
   * @return string The charset value if found, otherwise NULL.
   */
  function getHTMLcharset($content) {
      // Classic http-equiv declaration:
      if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) {
          if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2)) {
              return $reg2[1];
          }
      }

      // Short form. Requiring "charset" directly after "<meta" avoids matching the text
      // "charset=" inside the content attribute of an unrelated meta tag.
      if (preg_match('/<meta[[:space:]]+charset[[:space:]]*=[[:space:]]*["\']?([[:alnum:]-]+)/i',$content,$reg)) {
          return $reg[1];
      }

      return NULL;
  }
  /**
   * Converts a HTML document to utf-8: first the character set, then the HTML entities.
   *
   * @param string $content HTML content, any charset
   * @param string $charset Optional charset (otherwise extracted from HTML)
   * @return string Converted HTML
   */
  function convertHTMLToUtf8($content,$charset='') {
      // Fall back to the charset declared inside the document:
      if (!$charset) {
          $charset = $this->getHTMLcharset($content);
      }
      $sourceCharset = $this->csObj->parse_charset($charset);

      // Convert charset, unless it is unknown or already utf-8:
      $needsConversion = $sourceCharset && $sourceCharset!=='utf-8';
      if ($needsConversion) {
          $content = $this->csObj->utf8_encode($content, $sourceCharset);
      }

      // Convert entities, assuming document is now UTF-8:
      return $this->csObj->entities_to_utf8($content, TRUE);
  }
  /**
   * Finds first occurrence of embracing tags and returns the embraced content and the original string with
   * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding
   * <title> of document or removing <script>-sections
   *
   * The tag name is matched case-insensitively. If the opening tag has no matching end tag, the
   * tag content is blank and only the text following the opening tag is returned in $stringAfter.
   * If the opening tag is never closed with ">", $tagContent and $stringAfter are empty strings.
   *
   * @param string $string String to search in
   * @param string $tagName Tag name, eg. "script"
   * @param string $tagContent Passed by reference: Content inside found tag
   * @param string $stringAfter Passed by reference: Content after found tag
   * @param string $paramList Passed by reference: Attributes of the found tag.
   * @return boolean Returns false if tag was not found, otherwise true.
   */
  function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
      $endTag = '</'.$tagName.'>';
      $startTag = '<'.$tagName;

      // stristr used because we want a case-insensitive search for the tag.
      $fromStartTag = stristr($string,$startTag);
      if (!$fromStartTag) {
          return false; // if the tag was not found, return false
      }

      // Split "<attributes>" from whatever follows the opening tag. explode() yields a single
      // element when there is no ">" at all, so the second part is defaulted to ''.
      $tagParts = explode('>',substr($fromStartTag,strlen($startTag)),2);
      $paramList = $tagParts[0];
      $afterOpeningTag = isset($tagParts[1]) ? $tagParts[1] : '';

      $fromEndTag = stristr($afterOpeningTag,$endTag);
      if ($fromEndTag) {
          // Everything in front of the opening tag; its length is what stristr() cut off.
          $stringBefore = substr($string, 0, strlen($string)-strlen($fromStartTag));
          $tagContent = substr($afterOpeningTag,0,strlen($afterOpeningTag)-strlen($fromEndTag));
          $stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
      } else { // If there was no ending tag, the tagContent is blank and anything after the tag itself is returned.
          $tagContent = '';
          $stringAfter = $afterOpeningTag;
      }

      return true;
  }
  /**
   * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
   * Content following a <!--TYPO3SEARCH_begin--> marker is kept; content following a
   * <!--TYPO3SEARCH_end--> marker is dropped. If the body contains no marker it is left untouched.
   *
   * @param string $body HTML Content, passed by reference
   * @return boolean Returns true if a TYPO3SEARCH_ tag was found, otherwise false.
   */
  function typoSearchTags(&$body) {
      $segments = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

      // A single segment means that no marker was present:
      if (count($segments) <= 1) {
          return false;
      }

      $body = '';
      foreach ($segments as $segment) {
          // First piece is the marker keyword ("begin"/"end"), second piece the content after "-->"
          $pieces = explode('-->',$segment,2);
          switch (trim($pieces[0])) {
              case 'begin':
                  $body .= $pieces[1];
                  $pending = '';
                  break;
              case 'end':
                  // Only non-empty when no "begin" marker preceded this "end" marker
                  $body .= $pending;
                  break;
              default:
                  $pending = $segment;
          }
      }

      return true;
  }
  /**
   * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
   *
   * Links with a URL scheme (and no local path) are passed to indexExternalUrl() when
   * "indexExternalURLs" is enabled. Links without a query string that resolve to an existing
   * local file are either queued in the "crawler" extension (if enabled and loaded) or
   * indexed directly via indexRegularDocument().
   *
   * @param string $content HTML content
   * @return void
   */
  function extractLinks($content) {
      // Get links:
      $list = $this->extractHyperLinks($content);

      // Stays NULL unless the crawler is configured and loaded; NULL means "index directly".
      $crawler = NULL;
      if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && t3lib_extMgm::isLoaded('crawler')) {
          $this->includeCrawlerClass();
          $crawler = t3lib_div::makeInstance('tx_crawler_lib');
      }

      // Traverse links:
      foreach ($list as $linkInfo) {
          // localPath means: This file is sent by a download script. While the indexed URL has
          // to point to $linkInfo['href'], the absolute path to the file is specified here!
          $hasLocalPath = !empty($linkInfo['localPath']);

          // Decode entities:
          if ($hasLocalPath) {
              $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['localPath']);
          } else {
              $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
          }

          // Parse URL:
          $qParts = parse_url($linkSource);

          // Check for jumpurl (TYPO3 specific thing...)
          if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
              parse_str($qParts['query'], $getP);
              // "jumpurl=" may match a differently named parameter or an array value.
              $linkSource = (isset($getP['jumpurl']) && is_string($getP['jumpurl'])) ? $getP['jumpurl'] : '';
              $qParts = parse_url($linkSource); // parse again due to new linkSource!
          }

          if (!$hasLocalPath && !empty($qParts['scheme'])) {
              if (!empty($this->indexerConfig['indexExternalURLs'])) {
                  // Index external URL (http or otherwise)
                  $this->indexExternalUrl($linkSource);
              }
          } elseif (empty($qParts['query'])) {
              $linkSource = urldecode($linkSource);
              if (t3lib_div::isAllowedAbsPath($linkSource)) {
                  $localFile = $linkSource;
              } else {
                  $localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
              }

              if ($localFile && @is_file($localFile)) {
                  // Index local file:
                  if (is_object($crawler)) {
                      // Hand the file over to the crawler queue instead of indexing it now.
                      $params = array('document' => $linkSource);
                      if ($hasLocalPath) {
                          $params['alturl'] = $linkInfo['href'];
                      }
                      $params['conf'] = $this->conf;
                      unset($params['conf']['content']);
                      $crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
                      $this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
                  } elseif ($hasLocalPath) {
                      // Index under the public href, but read the content from the local path.
                      $fI = pathinfo($linkSource);
                      $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
                      $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
                  } else {
                      $this->indexRegularDocument($linkSource);
                  }
              }
          }
      }
  }
  /**
   * Extracts all links to external documents from the HTML content string.
   * Anchors without an href and pure fragment links ("#...") are ignored.
   *
   * @param string $html
   * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
   * @see extractLinks()
   */
  function extractHyperLinks($html) {
      $htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
      $htmlParts = $htmlParser->splitTags('a', $html);
      $hyperLinksData = array();
      foreach ($htmlParts as $index => $tagData) {
          // Only the odd-indexed parts are inspected as tags.
          if (($index % 2) === 0) {
              continue;
          }
          $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
          $firstTagName = $htmlParser->getFirstTagName($tagData);
          if (strtolower($firstTagName) != 'a') {
              continue;
          }
          // <a name="..."> style anchors carry no href at all.
          if (empty($tagAttributes[0]['href'])) {
              continue;
          }
          $href = $tagAttributes[0]['href'];
          // Square-bracket offset: the curly-brace form is no longer valid syntax in PHP 8.
          if (substr((string) $href, 0, 1) != '#') {
              $hyperLinksData[] = array(
                  'tag' => $tagData,
                  'href' => $href,
                  'localPath' => $this->createLocalPath($href)
              );
          }
      }
      return $hyperLinksData;
  }
  /**
   * Extracts the "base href" from content string.
   * The first <base> tag carrying a non-empty href wins.
   *
   * @param string $html Content to analyze
   * @return string The base href or an empty string if not found
   */
  public function extractBaseHref($html) {
      $href = '';
      $htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
      $htmlParts = $htmlParser->splitTags('base', $html);
      foreach ($htmlParts as $index => $tagData) {
          // Only the odd-indexed parts are inspected as tags.
          if (($index % 2) !== 0) {
              $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
              $firstTagName = $htmlParser->getFirstTagName($tagData);
              // A <base> tag without href (e.g. only "target") must not turn the result into NULL.
              if (strtolower($firstTagName) == 'base' && isset($tagAttributes[0]['href'])) {
                  $href = $tagAttributes[0]['href'];
                  if ($href) {
                      break;
                  }
              }
          }
      }
      return $href;
  }
  754. /******************************************
  755. *
  756. * Indexing; external URL
  757. *
  758. ******************************************/
  /**
   * Index External URLs HTML content.
   *
   * The URL is only fetched and indexed when its response headers announce "text/html".
   * The fetched content is written to a temporary file, indexed through
   * indexRegularDocument() and the temporary file is removed again.
   *
   * @param string $externalUrl URL, eg. "http://typo3.org/"
   * @return void
   * @see indexRegularDocument()
   */
  function indexExternalUrl($externalUrl) {
      // Get headers:
      $urlHeaders = $this->getUrlHeaders($externalUrl);

      // HTTP header field names are case-insensitive ("Content-Type", "content-type", ...),
      // so the lookup must not depend on the exact spelling the server used.
      $contentType = '';
      if (is_array($urlHeaders)) {
          foreach ($urlHeaders as $headerName => $headerValue) {
              if (strcasecmp(trim((string) $headerName), 'Content-Type') === 0) {
                  $contentType = (string) $headerValue;
              }
          }
      }

      if (stristr($contentType, 'text/html')) {
          $content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
          if (strlen((string) $content)) {
              // Create temporary file:
              $tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
              if ($tmpFile) {
                  t3lib_div::writeFile($tmpFile, $content);
                  // Index that file:
                  $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html'); // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
                  unlink($tmpFile);
              }
          }
      }
  }
  /**
   * Getting HTTP response headers of URL.
   *
   * Each header line is split at the first colon; the part before it becomes the array key,
   * the remainder (including any leading whitespace) the value. Lines without a colon, such
   * as the status line, are stored as key with a NULL value.
   *
   * @param string $url The URL
   * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
   */
  function getUrlHeaders($url) {
      $content = t3lib_div::getURL($url,2); // Try to get the headers only
      if (!strlen((string) $content)) {
          return FALSE;
      }

      // Compile headers:
      $headers = t3lib_div::trimExplode(LF,$content,1);
      $retVal = array();
      foreach ($headers as $line) {
          // NOTE(review): trimExplode() is called with its third argument set to 1, which
          // presumably already drops empty lines, so this guard may never trigger - confirm.
          if (!strlen(trim($line))) {
              break; // Stop at the first empty line (= end of header)
          }
          $headerParts = explode(':', $line, 2);
          $retVal[$headerParts[0]] = isset($headerParts[1]) ? $headerParts[1] : NULL;
      }
      return $retVal;
  }
  /**
   * Tries to map a link target to a local file by asking a fixed chain of resolver
   * methods in order; the first non-empty answer is used.
   *
   * @param string $sourcePath
   * @return string Absolute path to file if file is local, else empty string
   */
  protected function createLocalPath($sourcePath) {
      static $pathFunctions = array(
          'createLocalPathFromT3vars',
          'createLocalPathUsingAbsRefPrefix',
          'createLocalPathUsingDomainURL',
          'createLocalPathFromAbsoluteURL',
          'createLocalPathFromRelativeURL'
      );

      $resolvedPath = '';
      foreach ($pathFunctions as $resolver) {
          $resolvedPath = $this->$resolver($sourcePath);
          if ($resolvedPath != '') {
              return $resolvedPath;
          }
      }
      // No resolver matched: hand back the (empty) result of the last one.
      return $resolvedPath;
  }
  /**
   * Looks the link target up in the file map registered in
   * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] (keyed by the short MD5
   * of the link). Useful for download extensions that hide the actual file name but still
   * want the file to be indexed.
   *
   * @param string $sourcePath
   * @return string Registered file path if it exists as a file, else empty string
   */
  protected function createLocalPathFromT3vars($sourcePath) {
      $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
      if (!is_array($registeredFiles)) {
          return '';
      }

      $hash = t3lib_div::shortMD5($sourcePath);
      // Note: self::isAllowedLocalFile is intentionally not used here because this
      // method is allowed to index files outside of the web site (for example,
      // protected downloads)
      if (!isset($registeredFiles[$hash]) || !is_file($registeredFiles[$hash])) {
          return '';
      }
      return $registeredFiles[$hash];
  }
  /**
   * Attempts to create a local file path by matching a current request URL.
   *
   * @param string $sourcePath
   * @return string Local path if the URL starts with the site URL and points to an allowed file, else empty string
   */
  protected function createLocalPathUsingDomainURL($sourcePath) {
      $localPath = '';
      $baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
      $baseURLLength = strlen($baseURL);
      // An empty site URL would "prefix-match" every path, so require a real prefix
      // (same guard as createLocalPathUsingAbsRefPrefix()).
      if ($baseURLLength > 0 && substr($sourcePath, 0, $baseURLLength) == $baseURL) {
          $sourcePath = substr($sourcePath, $baseURLLength);
          $localPath = PATH_site . $sourcePath;
          if (!self::isAllowedLocalFile($localPath)) {
              $localPath = '';
          }
      }
      return $localPath;
  }
  /**
   * Resolves a link that starts with the configured absRefPrefix to a file below
   * PATH_site. Needs a tslib_fe instance in $GLOBALS['TSFE']; without it nothing is resolved.
   *
   * @param string $sourcePath
   * @return string Local path if the prefix matches and the file is allowed, else empty string
   */
  protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
      if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
          return '';
      }

      $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
      $prefixLength = strlen($prefix);
      if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
          return '';
      }

      // Strip the prefix and anchor the remainder at the site root.
      $candidate = PATH_site . substr($sourcePath, $prefixLength);
      return self::isAllowedLocalFile($candidate) ? $candidate : '';
  }
  /**
   * Attempts to create a local file path from the absolute URL without
   * schema (i.e. a path starting with "/").
   *
   * @param string $sourcePath
   * @return string Local path if it points to an allowed file, else empty string
   */
  protected function createLocalPathFromAbsoluteURL($sourcePath) {
      $localPath = '';
      // substr() instead of the curly-brace offset: that syntax is gone in PHP 8 and an
      // offset read would raise a notice on an empty string.
      if (substr((string) $sourcePath, 0, 1) === '/') {
          $sourcePath = substr($sourcePath, 1);
          $localPath = PATH_site . $sourcePath;
          if (!self::isAllowedLocalFile($localPath)) {
              $localPath = '';
          }
      }
      return $localPath;
  }
  /**
   * Resolves a relative URL against PATH_site.
   *
   * @param string $sourcePath
   * @return string Local path if the URL is relative and points to an allowed file, else empty string
   */
  protected function createLocalPathFromRelativeURL($sourcePath) {
      if (!self::isRelativeURL($sourcePath)) {
          return '';
      }
      $candidate = PATH_site . $sourcePath;
      return self::isAllowedLocalFile($candidate) ? $candidate : '';
  }
  /**
   * Checks if URL is relative, i.e. it has no scheme and its path does not start with "/".
   * URLs that parse_url() cannot handle are treated as having neither scheme nor path.
   *
   * @param string $url
   * @return boolean
   */
  static protected function isRelativeURL($url) {
      $urlParts = @parse_url($url);
      // parse_url() returns FALSE for seriously malformed URLs and omits absent components.
      $scheme = (is_array($urlParts) && isset($urlParts['scheme'])) ? $urlParts['scheme'] : '';
      $path = (is_array($urlParts) && isset($urlParts['path'])) ? $urlParts['path'] : '';
      // substr() instead of the curly-brace offset, which is no longer valid in PHP 8.
      return ($scheme == '' && substr($path, 0, 1) !== '/');
  }
  /**
   * Tells whether the path, after resolving back paths, lies below PATH_site
   * and refers to an existing file.
   *
   * @param string $filePath
   * @return boolean
   */
  static protected function isAllowedLocalFile($filePath) {
      $resolvedPath = t3lib_div::resolveBackPath($filePath);
      $siteRootLength = strlen(PATH_site);
      $startsWithSiteRoot = (substr($resolvedPath, 0, $siteRootLength) == PATH_site);
      $existsAsFile = is_file($resolvedPath);
      return $startsWithSiteRoot && $existsAsFile;
  }
  952. /******************************************
  953. *
  954. * Indexing; external files (PDF, DOC, etc)
  955. *
  956. ******************************************/
  /**
   * Indexing a regular document given as $file (relative to PATH_site, local file)
   *
   * For every content part reported by fileContentParts() the method decides (by mtime check,
   * the external file counter and the content hash) whether indexing is needed, and if so
   * splits, analyses and submits the words. A section record is submitted for each part
   * regardless of whether it was re-indexed.
   *
   * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
   * @param boolean $force If set, indexing is forced (despite content hashes, mtime etc).
   * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
   * @param string $altExtension File extension for temporary file.
   * @return void
   */
  function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {
      // Init
      $fI = pathinfo($file);
      if ($altExtension) {
          $ext = $altExtension;
      } else {
          // Files without an extension have no "extension" key in pathinfo().
          $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
      }

      // Create abs-path:
      if (!$contentTmpFile) {
          if (!t3lib_div::isAbsPath($file)) { // Relative, prepend PATH_site:
              $absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
          } else { // Absolute, pass-through:
              $absFile = $file;
          }
          $absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
      } else {
          $absFile = $contentTmpFile;
      }

      // Indexing the document:
      if ($absFile && @is_file($absFile)) {
          if (!empty($this->external_parsers[$ext])) {
              $mtime = filemtime($absFile);
              $cParts = $this->fileContentParts($ext,$absFile);

              foreach ($cParts as $cPKey) {
                  $this->internal_log = array();
                  $this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
                  $Pstart = t3lib_div::milliseconds();
                  $subinfo = array('key' => $cPKey); // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
                  $phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
                  $check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

                  if ($check > 0 || $force) {
                      if ($check > 0) {
                          $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
                      } else {
                          $this->log_setTSlogMessage('Indexing forced by flag',1);
                      }

                      // Check external file counter:
                      if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                          // Divide into title,keywords,description and body:
                          $this->log_push('Split content','');
                          $contentParts = $this->readFileContent($ext,$absFile,$cPKey);
                          $this->log_pull();

                          if (is_array($contentParts)) {
                              // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                              // Separator first: the reversed argument order is rejected by PHP 8.
                              $content_md5h = $this->md5inthash(implode('', $contentParts));

                              if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                                  // Increment counter:
                                  $this->externalFileCounter++;

                                  // Splitting words
                                  $this->log_push('Extract words from content','');
                                  $splitInWords = $this->processWordsInArrays($contentParts);
                                  $this->log_pull();

                                  // Analyse the indexed words.
                                  $this->log_push('Analyse the extracted words','');
                                  $indexArr = $this->indexAnalyze($splitInWords);
                                  $this->log_pull();

                                  // Submitting page (phash) record
                                  $this->log_push('Submitting page','');
                                  $size = filesize($absFile);
                                  $ctime = filemtime($absFile); // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                                  $this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
                                  $this->log_pull();

                                  // Check words and submit to word list if not there
                                  $this->log_push('Check word list and submit words','');
                                  $this->checkWordList($indexArr);
                                  $this->submitWords($indexArr,$phash_arr['phash']);
                                  $this->log_pull();

                                  // Set parsetime
                                  $this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
                              } else {
                                  $this->updateTstamp($phash_arr['phash'],$mtime); // Update the timestamp
                                  $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
                              }
                          } else {
                              $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                          }
                      } else {
                          $this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
                      }
                  } else {
                      $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
                  }

                  // Checking and setting sections:
                  # $this->submitFile_grlist($phash_arr['phash']); // Setting a gr_list record if there is none already (set for default fe_group)
                  $this->submitFile_section($phash_arr['phash']); // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
                  $this->log_pull();
              }
          } else {
              $this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
          }
      } else {
          $this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
      }
  }
  /**
   * Reads the content of an external file being indexed.
   * The content from the external parser MUST be returned in utf-8!
   *
   * @param string $ext File extension, eg. "pdf", "doc" etc.
   * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
   * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
   * @return array Standard content array (title, description, keywords, body keys), or NULL if no parser object is registered for the extension
   */
  function readFileContent($ext,$absFile,$cPKey) {
      // Defined up front so that a missing parser yields a clean NULL instead of an
      // undefined variable.
      $contentArr = NULL;

      // Consult relevant external document parser:
      if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
          $contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
      }
      return $contentArr;
  }
  /**
   * Asks the external parser registered for the extension into which sections the
   * document should be divided. Without a parser object a single section "0" is assumed.
   *
   * @param string $ext File extension
   * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
   * @return array Array of pointers to sections that the document should be divided into
   */
  function fileContentParts($ext,$absFile) {
      if (!is_object($this->external_parsers[$ext])) {
          // No parser available: treat the document as one undivided part.
          return array(0);
      }
      return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
  }
  /**
   * Wraps non-HTML content (from external files for instance) in a content array:
   * a copy of $this->defaultContentArray with the "body" key set to the given content.
   *
   * @param string $content Input content (non-HTML) to index.
   * @return array Content array based on the default content array, having the key "body" set
   * @see splitHTMLContent()
   */
  function splitRegularContent($content) {
      $parts = $this->defaultContentArray;
      $parts['body'] = $content;

      return $parts;
  }
  1090. /**********************************
  1091. *
  1092. * Analysing content, Extracting words
  1093. *
  1094. **********************************/
  /**
   * Converts every non-empty value of the content array to utf-8 (unless the charset
   * already is "utf-8") and afterwards decodes numeric / HTML entities into real characters.
   *
   * @param array $contentArr Standard content array, passed by reference and modified in place
   * @param string $charset Charset of the input content (converted to utf-8)
   * @return void
   */
  function charsetEntity2utf8(&$contentArr, $charset) {
      $needsCharsetConversion = ($charset !== 'utf-8');

      foreach (array_keys($contentArr) as $key) {
          // Empty values are left untouched.
          if (!strlen($contentArr[$key])) {
              continue;
          }
          if ($needsCharsetConversion) {
              $contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
          }
          // decode all numeric / html-entities in the string to real characters:
          $contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
      }
  }
  /**
   * Turns each part of a content array (from the split*Content functions) into an array
   * of words using the lexer object. Duplicate words are removed from the title, keywords
   * and description parts.
   *
   * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
   * @return array Content input array modified so each key is now an array of words
   */
  function processWordsInArrays($contentArr) {
      // split all parts to words
      $wordLists = array();
      foreach ($contentArr as $partName => $partContent) {
          $wordLists[$partName] = $this->lexerObj->split2Words($partContent);
      }

      // For title, keywords, and description we don't want duplicates:
      foreach (array('title', 'keywords', 'description') as $uniquePart) {
          $wordLists[$uniquePart] = array_unique($wordLists[$uniquePart]);
      }

      // Return modified array:
      return $wordLists;
  }
  1132. /**
  1133. * Processing words in the array from split*Content -functions
  1134. * This function is only a wrapper because the function has been removed (see above).
  1135. *
  1136. * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
  1137. * @return array Content input array modified so each key is not a unique array of words
  1138. * @deprecated since TYPO3 4.0, this function will be removed in TYPO3 4.5.
  1139. */
  1140. function procesWordsInArrays($contentArr) {
  1141. t3lib_div::l

Large files are truncated, but you can click here to view the full file