PageRenderTime 31ms CodeModel.GetById 28ms RepoModel.GetById 2ms app.codeStats 0ms

/typo3/sysext/indexed_search/class.external_parser.php

https://bitbucket.org/linxpinx/mercurial
PHP | 677 lines | 425 code | 69 blank | 183 comment | 79 complexity | ee6761db770ed0a6158a3126cff4677e MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-2.0, Unlicense, LGPL-2.1, Apache-2.0
  1. <?php
  2. /***************************************************************
  3. * Copyright notice
  4. *
  5. * (c) 2001-2010 Kasper Skaarhoj (kasperYYYY@typo3.com)
  6. * All rights reserved
  7. *
  8. * This script is part of the TYPO3 project. The TYPO3 project is
  9. * free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * The GNU General Public License can be found at
  15. * http://www.gnu.org/copyleft/gpl.html.
  16. * A copy is found in the textfile GPL.txt and important notices to the license
  17. * from the author is found in LICENSE.txt distributed with these scripts.
  18. *
  19. *
  20. * This script is distributed in the hope that it will be useful,
  21. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  22. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  23. * GNU General Public License for more details.
  24. *
  25. * This copyright notice MUST APPEAR in all copies of the script!
  26. ***************************************************************/
  27. /**
  28. * External standard parsers for indexed_search
  29. *
  30. * @author Kasper Skårhøj <kasperYYYY@typo3.com>
  31. * @coauthor Olivier Simah <noname_paris@yahoo.fr>
  32. */
  33. /**
  34. * [CLASS/FUNCTION INDEX of SCRIPT]
  35. *
  36. *
  37. *
  38. * 75: class tx_indexed_search_extparse
  39. * 94: function initParser($extension)
  40. * 214: function softInit($extension)
  41. * 247: function searchTypeMediaTitle($extension)
  42. * 323: function isMultiplePageExtension($extension)
  43. *
  44. * SECTION: Reading documents (for parsing)
  45. * 354: function readFileContent($ext,$absFile,$cPKey)
  46. * 521: function fileContentParts($ext,$absFile)
  47. * 560: function splitPdfInfo($pdfInfoArray)
  48. * 579: function removeEndJunk($string)
  49. *
  50. * SECTION: Backend analyzer
  51. * 606: function getIcon($extension)
  52. *
  53. * TOTAL FUNCTIONS: 9
  54. * (This index is automatically created/updated by the extension "extdeveval")
  55. *
  56. */
  57. /**
  58. * External standard parsers for indexed_search
  59. * MUST RETURN utf-8 content!
  60. *
  61. * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
  62. * @package TYPO3
  63. * @subpackage tx_indexedsearch
  64. */
  65. class tx_indexed_search_extparse {
  66. // Default PDF split mode. initParser() replaces it with the "pdf_mode" value from the extension configuration when PDF support is set up.
  67. var $pdf_mode = -20; // Zero: whole PDF file is indexed in one (assuming t3lib_div::intInRange() clamps 0 up to 1 - TODO confirm). Positive value: target number of pages per section; fileContentParts() turns it into ceil(pages / value) evenly sized sections. Negative value: the absolute value is the number of sections. Eg. 10 pages in 3 sections gives 1-3,4-6,7-10
  68. // These arrays are filled by initParser():
  69. var $app = array(); // Tool name (eg. "pdfinfo") => path of the executable
  70. var $ext2itemtype_map = array(); // File extension => item_type (the "main" extension, eg. "htm" => "html")
  71. var $supportedExtensions = array(); // File extension => TRUE for every extension initParser() accepted
  72. var $pObj; // Reference to parent object (indexer class)
  73. protected $langObject; // Language object used by sL(): $GLOBALS['TSFE'] in FE mode, otherwise $GLOBALS['LANG']
  /**
   * Creates the external parser object and picks the language object that
   * sL() resolves labels with.
   *
   * When TYPO3_MODE is 'FE' the frontend object $GLOBALS['TSFE'] is used,
   * in every other mode $GLOBALS['LANG'].
   */
  public function __construct() {
    if (TYPO3_MODE == 'FE') {
      $this->langObject = $GLOBALS['TSFE'];
    } else {
      $this->langObject = $GLOBALS['LANG'];
    }
  }
  /**
   * Initialize external parser for parsing content.
   *
   * Reads the serialized "indexed_search" extension configuration, checks that the
   * external tool(s) required for the extension are configured and present, and on
   * success registers the extension in $this->supportedExtensions and
   * $this->ext2itemtype_map. The reason for rejecting an extension is written to the
   * parent object's TS log.
   *
   * @param string File extension (the known extensions are matched in lowercase)
   * @return boolean Returns true if extension is supported/enabled, otherwise false.
   */
  function initParser($extension) {
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

    // Extensions excluded by configuration are rejected before anything else.
    // isset() keeps this quiet when the configuration could not be unserialized.
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? $indexerConfig['ignoreExtensions'] : '';
    $ignoreExtensions = t3lib_div::trimExplode(',', strtolower($ignoreList), 1);
    if (in_array($extension, $ignoreExtensions)) {
      $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:ignoreExtensions'), $extension), 1);
      return FALSE;
    }

    $extOK = FALSE;
    $mainExtension = '';

    // Switch on file extension:
    switch ($extension) {
      case 'pdf':
        // PDF needs both pdftotext and pdfinfo
        $extOK = $this->initExternalTools($indexerConfig, 'pdftools', array('pdftotext', 'pdfinfo'), 'pdfTools');
        if ($extOK) {
          // PDF mode:
          $this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'], -100, 100);
        }
        break;
      case 'doc':
        // MS Word via catdoc
        $extOK = $this->initExternalTools($indexerConfig, 'catdoc', array('catdoc'), 'catdoc');
        break;
      case 'pps': // MS PowerPoint(?)
      case 'ppt': // MS PowerPoint
        $extOK = $this->initExternalTools($indexerConfig, 'ppthtml', array('ppthtml'), 'ppthtml');
        break;
      case 'xls': // MS Excel
        $extOK = $this->initExternalTools($indexerConfig, 'xlhtml', array('xlhtml'), 'xlhtml');
        break;
      case 'sxc': // Open Office Calc.
      case 'sxi': // Open Office Impress
      case 'sxw': // Open Office Writer
      case 'ods': // Oasis OpenDocument Spreadsheet
      case 'odp': // Oasis OpenDocument Presentation
      case 'odt': // Oasis OpenDocument Text
        $extOK = $this->initExternalTools($indexerConfig, 'unzip', array('unzip'), 'unzip');
        break;
      case 'rtf':
        // RTF via unrtf
        $extOK = $this->initExternalTools($indexerConfig, 'unrtf', array('unrtf'), 'unrtf');
        break;
      case 'txt': // Raw text
      case 'csv': // Raw text
      case 'xml': // PHP strip-tags()
      case 'tif': // PHP EXIF
        $extOK = TRUE;
        break;
      case 'html': // PHP strip-tags()
      case 'htm': // PHP strip-tags()
        $extOK = TRUE;
        $mainExtension = 'html'; // making "html" the common "item_type"
        break;
      case 'jpg': // PHP EXIF
      case 'jpeg': // PHP EXIF
        $extOK = TRUE;
        $mainExtension = 'jpeg'; // making "jpeg" the common item_type
        break;
    }

    if (!$extOK) {
      // Explicit FALSE: the method used to fall off the end and return NULL here.
      return FALSE;
    }

    $this->supportedExtensions[$extension] = TRUE;
    $this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
    return TRUE;
  }

  /**
   * Checks the configured directory of a set of external tools and registers the
   * executables in $this->app (tool name => full path).
   *
   * Logs "<labelPrefix>Disabled" (level 1) when no directory is configured and
   * "<labelPrefix>NotFound" (level 3) when an executable is missing.
   *
   * @param mixed Unserialized extension configuration (normally an array)
   * @param string Configuration key holding the directory of the tool(s)
   * @param array Executable base names (without ".exe") expected in that directory
   * @param string Prefix of the locallang keys used for the log messages
   * @return boolean True if the tools were registered, otherwise false.
   */
  protected function initExternalTools($indexerConfig, $configKey, array $toolNames, $labelPrefix) {
    $labelBase = 'LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix;

    if (empty($indexerConfig[$configKey])) {
      $this->pObj->log_setTSlogMessage($this->sL($labelBase . 'Disabled'), 1);
      return FALSE;
    }

    // If windows, apply extension to tool name:
    $exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
    $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';

    // With safe_mode enabled the existence check is skipped and the tools are accepted as configured.
    if (!ini_get('safe_mode')) {
      foreach ($toolNames as $toolName) {
        if (!@is_file($toolPath . $toolName . $exe)) {
          $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelBase . 'NotFound'), $toolPath), 3);
          return FALSE;
        }
      }
    }

    foreach ($toolNames as $toolName) {
      $this->app[$toolName] = $toolPath . $toolName . $exe;
    }
    return TRUE;
  }
  /**
   * Initialize external parser for backend modules
   * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
   *
   * @param string File extension to initialize for.
   * @return boolean Returns true if the extension is one of the known file types, otherwise false.
   */
  function softInit($extension) {
    $knownExtensions = array(
      'pdf',  // PDF
      'doc',  // MS Word files
      'pps',  // MS PowerPoint
      'ppt',  // MS PowerPoint
      'xls',  // MS Excel
      'sxc',  // Open Office Calc.
      'sxi',  // Open Office Impress
      'sxw',  // Open Office Writer
      'ods',  // Oasis OpenDocument Spreadsheet
      'odp',  // Oasis OpenDocument Presentation
      'odt',  // Oasis OpenDocument Text
      'rtf',  // RTF documents
      'txt',  // ASCII Text documents
      'html', // HTML
      'htm',  // HTML
      'csv',  // Comma Separated Values
      'xml',  // Generic XML
      'jpg',  // Jpeg images (EXIF comment)
      'jpeg', // Jpeg images (EXIF comment)
      'tif'   // TIF images (EXIF comment)
    );

    // Strict lookup so the result is always a real boolean
    // (the former switch returned NULL for unknown extensions).
    return in_array((string)$extension, $knownExtensions, TRUE);
  }
  /**
   * Return title of entry in media type selector box.
   *
   * No title is returned for extensions that are on the configured ignore list,
   * that need an external tool which is not configured, or that are unknown or
   * duplicates ("htm", "jpg") of another entry.
   *
   * @param string File extension
   * @return mixed String with label value of entry in media type search selector box (frontend plugin), or false if there is no entry for the extension.
   */
  function searchTypeMediaTitle($extension) {
    // Read indexer-config
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

    // Ignore extensions
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? $indexerConfig['ignoreExtensions'] : '';
    $ignoreExtensions = t3lib_div::trimExplode(',', strtolower($ignoreList), 1);
    if (in_array($extension, $ignoreExtensions)) {
      return FALSE;
    }

    // extension => array(label key suffix, configuration key of the required tool).
    // An empty tool key means the type is handled by PHP itself and is always offered.
    $labelMap = array(
      'pdf'  => array('PDF', 'pdftools'),
      'doc'  => array('DOC', 'catdoc'),
      'pps'  => array('PP', 'ppthtml'),  // MS PowerPoint(?)
      'ppt'  => array('PP', 'ppthtml'),  // MS PowerPoint
      'xls'  => array('XLS', 'xlhtml'),  // MS Excel
      'sxc'  => array('SXC', 'unzip'),   // Open Office Calc.
      'sxi'  => array('SXI', 'unzip'),   // Open Office Impress
      'sxw'  => array('SXW', 'unzip'),   // Open Office Writer
      'ods'  => array('ODS', 'unzip'),   // Oasis OpenDocument Spreadsheet
      'odp'  => array('ODP', 'unzip'),   // Oasis OpenDocument Presentation
      'odt'  => array('ODT', 'unzip'),   // Oasis OpenDocument Text
      'rtf'  => array('RTF', 'unrtf'),
      'jpeg' => array('Images', ''),     // PHP EXIF
      'tif'  => array('Images', ''),     // PHP EXIF
      'html' => array('HTML', ''),       // PHP strip-tags()
      'txt'  => array('TXT', ''),        // Raw text
      'csv'  => array('CSV', ''),        // Raw text
      'xml'  => array('XML', '')         // PHP strip-tags()
    );

    $lookupKey = (string)$extension;
    if (!isset($labelMap[$lookupKey])) {
      // NO entry (duplicates such as "htm"/"jpg", or unknown extension)
      return FALSE;
    }

    list($labelSuffix, $requiredTool) = $labelMap[$lookupKey];
    if ($requiredTool !== '' && empty($indexerConfig[$requiredTool])) {
      // The external tool for this type is not configured.
      return FALSE;
    }

    return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.' . $labelSuffix), $extension);
  }
  /**
   * Returns true if the input extension (item_type) is potentially a multi-page extension.
   * Only "pdf" qualifies.
   *
   * @param string Extension / item_type string
   * @return boolean True if multi-page, otherwise false
   */
  function isMultiplePageExtension($extension) {
    // Always a real boolean; the former switch returned NULL for everything but "pdf".
    return ((string)$extension === 'pdf');
  }
  /**
   * Resolves a label by delegating to the sL() method of the language object
   * selected in the constructor.
   *
   * @param string $reference: Reference/key of the label
   * @param boolean $useHtmlSpecialChar: Passed through unchanged as second argument (default: false)
   * @return string Whatever the language object returns for the reference/key
   */
  protected function sL($reference, $useHtmlSpecialChar = false) {
    $label = $this->langObject->sL($reference, $useHtmlSpecialChar);
    return $label;
  }
  351. /************************
  352. *
  353. * Reading documents (for parsing)
  354. *
  355. ************************/
  /**
   * Reads the content of an external file being indexed.
   *
   * Depending on the extension the file is converted with the external tool registered
   * by initParser() or read directly with PHP, and the text is handed to the parent
   * object's split methods to build the content array.
   *
   * @param string File extension, eg. "pdf", "doc" etc.
   * @param string Absolute filename of file (must exist and be validated OK before calling function)
   * @param string Pointer to section (zero for all other than PDF which will have an indication of pages, "low-high", into which the document should be split.)
   * @return mixed Standard content array (title, description, keywords, body keys); false if the extension is not supported; NULL if nothing could be extracted.
   */
  function readFileContent($ext, $absFile, $cPKey) {
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
      return FALSE;
    }

    // Stays NULL when a tool is missing or produced nothing.
    $contentArr = NULL;
    $quotedFile = escapeshellarg($absFile);

    // Switch by file extension
    switch ($ext) {
      case 'pdf':
        if (!empty($this->app['pdfinfo'])) {
          // Getting pdf-info:
          $pdfInfo = $this->splitPdfInfo($this->execToLines($this->app['pdfinfo'] . ' ' . $quotedFile));
          if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
            // The page range ends up on a shell command line, so only integers are let through.
            $range = explode('-', (string)$cPKey);
            $low = intval($range[0]);
            $high = isset($range[1]) ? intval($range[1]) : 0;

            // Get pdf content:
            $tempFileName = t3lib_div::tempnam('Typo3_indexer'); // Create temporary name
            @unlink($tempFileName); // Delete if exists, just to be safe.
            $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . $quotedFile . ' ' . escapeshellarg($tempFileName);
            exec($cmd);

            $content = '';
            if (@is_file($tempFileName)) {
              $content = (string)t3lib_div::getUrl($tempFileName);
              unlink($tempFileName);
            } else {
              $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsFailed'), $absFile), 2);
            }
            if (strlen($content)) {
              $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
            }
          }
        }
        break;
      case 'doc':
        if (!empty($this->app['catdoc'])) {
          $content = implode(LF, $this->execToLines($this->app['catdoc'] . ' -d utf-8 ' . $quotedFile));
          $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
        }
        break;
      case 'pps':
      case 'ppt':
        if (!empty($this->app['ppthtml'])) {
          $content = implode(LF, $this->execToLines($this->app['ppthtml'] . ' ' . $quotedFile));
          $content = $this->pObj->convertHTMLToUtf8($content);
          $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
          $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
        }
        break;
      case 'xls':
        if (!empty($this->app['xlhtml'])) {
          $content = implode(LF, $this->execToLines($this->app['xlhtml'] . ' -nc -te ' . $quotedFile));
          $content = $this->pObj->convertHTMLToUtf8($content);
          $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
          $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
        }
        break;
      case 'sxi':
      case 'sxc':
      case 'sxw':
      case 'ods':
      case 'odp':
      case 'odt':
        if (!empty($this->app['unzip'])) {
          // Read content.xml and meta.xml out of the archive:
          $content_xml = implode(LF, $this->execToLines($this->app['unzip'] . ' -p ' . $quotedFile . ' content.xml'));
          $meta_xml = implode(LF, $this->execToLines($this->app['unzip'] . ' -p ' . $quotedFile . ' meta.xml'));

          // A space is put in front of every tag so words of adjacent elements don't run together once tags are stripped.
          $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
          $contentArr = $this->pObj->splitRegularContent($utf8_content);
          $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!

          // Meta information (the result of xml2tree() is only used when it is an array with the expected branch)
          $metaTree = t3lib_div::xml2tree($meta_xml);
          $metaContent = NULL;
          if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
            $metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
          }
          if (is_array($metaContent)) {
            if (!empty($metaContent['dc:title'][0]['values'][0])) {
              $contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
            }
            $metaSubject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
            $metaDescription = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
            $contentArr['description'] = $metaSubject . ' ' . $metaDescription;

            // Keywords collected:
            if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
              if (!isset($contentArr['keywords'])) {
                $contentArr['keywords'] = '';
              }
              foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                $contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
              }
            }
          }
        }
        break;
      case 'rtf':
        if (!empty($this->app['unrtf'])) {
          $fileContent = implode(LF, $this->execToLines($this->app['unrtf'] . ' ' . $quotedFile));
          $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
          $contentArr = $this->pObj->splitHTMLContent($fileContent);
        }
        break;
      case 'txt':
      case 'csv': // Raw text
        $content = t3lib_div::getUrl($absFile);
        // TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
        $content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
        $contentArr = $this->pObj->splitRegularContent($content);
        $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
        break;
      case 'html':
      case 'htm':
        $fileContent = t3lib_div::getUrl($absFile);
        $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
        $contentArr = $this->pObj->splitHTMLContent($fileContent);
        break;
      case 'xml': // PHP strip-tags()
        $fileContent = t3lib_div::getUrl($absFile);
        // Finding charset in the XML declaration (only the first 200 bytes are inspected); utf-8 if none is declared:
        $reg = array();
        preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
        $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
        // Converting content:
        $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
        $contentArr = $this->pObj->splitRegularContent($fileContent);
        $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
        break;
      case 'jpg': // PHP EXIF
      case 'jpeg': // PHP EXIF
      case 'tif': // PHP EXIF
        $exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
        $comment = '';
        if ($exif) {
          // The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
          $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
          $exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
          $comment = trim($exifComment . ' ' . $exifDescription);
        }
        $contentArr = $this->pObj->splitRegularContent($comment);
        $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
        break;
      default:
        return FALSE;
    }

    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && empty($contentArr['title'])) {
      $contentArr['title'] = str_replace('_', ' ', basename($absFile)); // Substituting "_" for " " because many filenames may have this instead of a space char.
    }
    return $contentArr;
  }

  /**
   * Runs a shell command and returns the lines it wrote to standard output.
   *
   * @param string Complete command line; arguments must already be escaped by the caller
   * @return array Output lines as collected by exec()
   */
  protected function execToLines($cmd) {
    $lines = array();
    exec($cmd, $lines);
    return $lines;
  }
  /**
   * Creates an array with pointers to divisions of document.
   * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
   *
   * For PDF files the page count reported by pdfinfo is divided into evenly sized
   * "low-high" page ranges. A positive $this->pdf_mode gives ceil(pages / pdf_mode)
   * ranges, otherwise the absolute value of pdf_mode (limited to 1..pages) is the
   * number of ranges.
   *
   * @param string File extension
   * @param string Absolute filename (must exist and be validated OK before calling function)
   * @return array Array of pointers to sections that the document should be divided into
   */
  function fileContentParts($ext, $absFile) {
    $cParts = array(0);

    // Without a registered pdfinfo tool the command line would start with the file name itself, so nothing is executed.
    if ($ext != 'pdf' || empty($this->app['pdfinfo'])) {
      return $cParts;
    }

    // Getting pdf-info:
    $res = array();
    exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $res);
    $pdfInfo = $this->splitPdfInfo($res);

    $pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
    if ($pages <= 0) {
      return $cParts;
    }

    // Calculate number of ranges
    if ($this->pdf_mode > 0) {
      $iter = (int)ceil($pages / $this->pdf_mode);
    } else {
      $iter = t3lib_div::intInRange(abs($this->pdf_mode), 1, $pages);
    }

    // Traverse and create intervals.
    // Multiplying before dividing keeps exact boundaries exact; multiplying by a
    // pre-divided float can in principle round just below an integer boundary.
    $cParts = array();
    for ($a = 0; $a < $iter; $a++) {
      $low = (int)floor(($a * $pages) / $iter) + 1;
      $high = (int)floor((($a + 1) * $pages) / $iter);
      $cParts[] = $low . '-' . $high;
    }
    return $cParts;
  }
  /**
   * Turns the output lines of the pdfinfo tool into a key/value array.
   *
   * Every line is split at its first colon. The part before it, trimmed and
   * lowercased, becomes the key and the trimmed remainder becomes the value.
   * Lines without a colon or with an empty key are skipped.
   *
   * @param array Array of PDF content, coming from the pdfinfo tool
   * @return array Result array (empty if the input is not an array)
   * @access private
   * @see fileContentParts()
   */
  function splitPdfInfo($pdfInfoArray) {
    $info = array();
    if (!is_array($pdfInfoArray)) {
      return $info;
    }

    foreach ($pdfInfoArray as $infoLine) {
      $colonPos = strpos($infoLine, ':');
      if ($colonPos === FALSE) {
        continue;
      }
      $label = trim(substr($infoLine, 0, $colonPos));
      if (!$label) {
        continue;
      }
      $info[strtolower($label)] = trim(substr($infoLine, $colonPos + 1));
    }
    return $info;
  }
  /**
   * Strips the run of line feeds and char(12) (form feed) characters that tends to
   * end the output of external tools, then trims the remaining string.
   *
   * @param string String to clean up
   * @return string Cleaned string
   */
  function removeEndJunk($string) {
    $endJunkPattern = '/['.LF.chr(12).']*$/';
    $withoutJunk = preg_replace($endJunkPattern, '', $string);
    return trim($withoutJunk);
  }
  588. /************************
  589. *
  590. * Backend analyzer
  591. *
  592. ************************/
  /**
   * Builds the icon reference for a file extension.
   * "htm" shares the icon of "html" and "jpeg" the icon of "jpg".
   *
   * @param string File extension, lowercase.
   * @return string Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
   */
  function getIcon($extension) {
    switch ($extension) {
      case 'htm':
        $iconName = 'html';
        break;
      case 'jpeg':
        $iconName = 'jpg';
        break;
      default:
        $iconName = $extension;
    }
    return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
  }
  604. }
  // XCLASS hook: when running inside TYPO3 and an extension file is registered
  // for this script in $TYPO3_CONF_VARS, include it (once).
  if (defined('TYPO3_MODE')) {
    if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
      include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
    }
  }
  608. ?>