PageRenderTime 99ms CodeModel.GetById 57ms app.highlight 33ms RepoModel.GetById 1ms app.codeStats 1ms

/typo3/sysext/indexed_search/class.external_parser.php

https://bitbucket.org/linxpinx/mercurial
PHP | 677 lines | 425 code | 69 blank | 183 comment | 79 complexity | ee6761db770ed0a6158a3126cff4677e MD5 | raw file
  1<?php
  2/***************************************************************
  3*  Copyright notice
  4*
  5*  (c) 2001-2010 Kasper Skaarhoj (kasperYYYY@typo3.com)
  6*  All rights reserved
  7*
  8*  This script is part of the TYPO3 project. The TYPO3 project is
  9*  free software; you can redistribute it and/or modify
 10*  it under the terms of the GNU General Public License as published by
 11*  the Free Software Foundation; either version 2 of the License, or
 12*  (at your option) any later version.
 13*
 14*  The GNU General Public License can be found at
 15*  http://www.gnu.org/copyleft/gpl.html.
 16*  A copy is found in the textfile GPL.txt and important notices to the license
 17*  from the author is found in LICENSE.txt distributed with these scripts.
 18*
 19*
 20*  This script is distributed in the hope that it will be useful,
 21*  but WITHOUT ANY WARRANTY; without even the implied warranty of
 22*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 23*  GNU General Public License for more details.
 24*
 25*  This copyright notice MUST APPEAR in all copies of the script!
 26***************************************************************/
 27/**
 28 * External standard parsers for indexed_search
 29 *
 30 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 31 * @coauthor	Olivier Simah <noname_paris@yahoo.fr>
 32 */
 33/**
 34 * [CLASS/FUNCTION INDEX of SCRIPT]
 35 *
 36 *
 37 *
 38 *   75: class tx_indexed_search_extparse
 39 *   94:     function initParser($extension)
 40 *  214:     function softInit($extension)
 41 *  247:     function searchTypeMediaTitle($extension)
 42 *  323:     function isMultiplePageExtension($extension)
 43 *
 44 *              SECTION: Reading documents (for parsing)
 45 *  354:     function readFileContent($ext,$absFile,$cPKey)
 46 *  521:     function fileContentParts($ext,$absFile)
 47 *  560:     function splitPdfInfo($pdfInfoArray)
 48 *  579:     function removeEndJunk($string)
 49 *
 50 *              SECTION: Backend analyzer
 51 *  606:     function getIcon($extension)
 52 *
 53 * TOTAL FUNCTIONS: 9
 54 * (This index is automatically created/updated by the extension "extdeveval")
 55 *
 56 */
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67/**
 68 * External standard parsers for indexed_search
 69 * MUST RETURN utf-8 content!
 70 *
 71 * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>
 72 * @package TYPO3
 73 * @subpackage tx_indexedsearch
 74 */
 75class tx_indexed_search_extparse {
 76
 77		// This value is also overridden from config.
 78	var $pdf_mode = -20;	// zero: whole PDF file is indexed in one. positive value: Indicates number of pages at a time, eg. "5" would means 1-5,6-10,.... Negative integer would indicate (abs value) number of groups. Eg "3" groups of 10 pages would be 1-4,5-8,9-10
 79
 80		// This array is configured in initialization:
 81	var $app = array();
 82	var $ext2itemtype_map = array();
 83	var $supportedExtensions = array();
 84
 85	var $pObj;		// Reference to parent object (indexer class)
 86	protected $langObject;	// Reference to LANG-Object
 87
	/**
	 * Constructor: selects the language object that sL() uses to resolve labels.
	 * In frontend mode (TYPO3_MODE is "FE") this is $GLOBALS['TSFE'], otherwise $GLOBALS['LANG'].
	 */
	public function __construct() {
		if (TYPO3_MODE == 'FE') {
			$this->langObject = $GLOBALS['TSFE'];
		} else {
			$this->langObject = $GLOBALS['LANG'];
		}
	}
 95
	/**
	 * Initialize external parser for parsing content.
	 *
	 * Extensions that need an external command line tool are only accepted if the tool
	 * directory is configured and the tool binaries exist there (the existence check is
	 * skipped when safe_mode is on). On success the tool paths are stored in $this->app,
	 * the extension is registered in $this->supportedExtensions and its item_type in
	 * $this->ext2itemtype_map. Failures are reported through the log of the parent object.
	 *
	 * @param	string		File extension
	 * @return	boolean		Returns true if extension is supported/enabled, otherwise false.
	 */
	function initParser($extension)	{

			// Read indexer-config:
		$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

			// If windows, apply extension to tool name:
		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';

			// Ignore extensions
		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
		if (in_array($extension, $ignoreExtensions))	{
			$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:ignoreExtensions'), $extension), 1);
			return FALSE;
		}

			// Extensions parsed by external tools:
			// extension => array(config key with the tool directory, prefix of the "...NotFound"/"...Disabled" locallang labels, tool binaries)
		$pdfTools = array('pdftools', 'pdfTools', array('pdfinfo', 'pdftotext'));
		$catdoc = array('catdoc', 'catdoc', array('catdoc'));
		$ppthtml = array('ppthtml', 'ppthtml', array('ppthtml'));
		$xlhtml = array('xlhtml', 'xlhtml', array('xlhtml'));
		$unzip = array('unzip', 'unzip', array('unzip'));
		$unrtf = array('unrtf', 'unrtf', array('unrtf'));
		$externalTools = array(
			'pdf' => $pdfTools,		// PDF
			'doc' => $catdoc,		// MS Word
			'pps' => $ppthtml,		// MS PowerPoint(?)
			'ppt' => $ppthtml,		// MS PowerPoint
			'xls' => $xlhtml,		// MS Excel
			'sxc' => $unzip,		// Open Office Calc.
			'sxi' => $unzip,		// Open Office Impress
			'sxw' => $unzip,		// Open Office Writer
			'ods' => $unzip,		// Oasis OpenDocument Spreadsheet
			'odp' => $unzip,		// Oasis OpenDocument Presentation
			'odt' => $unzip,		// Oasis OpenDocument Text
			'rtf' => $unrtf,		// RTF documents
		);

			// Extensions parsed with PHP functions only: extension => item_type
		$nativeItemTypes = array(
			'txt' => 'txt',		// Raw text
			'csv' => 'csv',		// Raw text
			'xml' => 'xml',		// PHP strip-tags()
			'tif' => 'tif',		// PHP EXIF
			'html' => 'html',	// PHP strip-tags()
			'htm' => 'html',	// making "html" the common "item_type"
			'jpg' => 'jpeg',	// making "jpeg" the common item_type
			'jpeg' => 'jpeg',	// PHP EXIF
		);

		if (isset($externalTools[$extension]))	{
			list($configKey, $labelPrefix, $binaries) = $externalTools[$extension];
			$labelBase = 'LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix;

				// No tool directory configured means the parser is disabled:
			if (!$indexerConfig[$configKey])	{
				$this->pObj->log_setTSlogMessage($this->sL($labelBase . 'Disabled'), 1);
				return FALSE;
			}

			$toolPath = rtrim($indexerConfig[$configKey], '/') . '/';

				// All binaries must exist, unless safe_mode is on (then the check is skipped):
			$toolsFound = TRUE;
			if (!ini_get('safe_mode'))	{
				foreach ($binaries as $binary)	{
					if (!@is_file($toolPath . $binary . $exe))	{
						$toolsFound = FALSE;
						break;
					}
				}
			}
			if (!$toolsFound)	{
				$this->pObj->log_setTSlogMessage(sprintf($this->sL($labelBase . 'NotFound'), $toolPath), 3);
				return FALSE;
			}

			foreach ($binaries as $binary)	{
				$this->app[$binary] = $toolPath . $binary . $exe;
			}
			if ($extension === 'pdf')	{
					// PDF mode:
				$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
			}
			$itemType = $extension;
		} elseif (isset($nativeItemTypes[$extension]))	{
			$itemType = $nativeItemTypes[$extension];
		} else {
				// Unknown extension: return an explicit boolean as documented.
			return FALSE;
		}

			// Extension was OK:
		$this->supportedExtensions[$extension] = TRUE;
		$this->ext2itemtype_map[$extension] = $itemType;
		return TRUE;
	}
214
215	/**
216	 * Initialize external parser for backend modules
217	 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
218	 *
219	 * @param	string		File extension to initialize for.
220	 * @return	boolean		Returns true if the extension is supported and enabled, otherwise false.
221	 */
222	function softInit($extension)	{
223		switch($extension)	{
224			case 'pdf':		// PDF
225			case 'doc':		// MS Word files
226			case 'pps':		// MS PowerPoint
227			case 'ppt':		// MS PowerPoint
228			case 'xls':		// MS Excel
229			case 'sxc':		// Open Office Calc.
230			case 'sxi':		// Open Office Impress
231			case 'sxw':		// Open Office Writer
232			case 'ods':		// Oasis OpenDocument Spreadsheet
233			case 'odp':		// Oasis OpenDocument Presentation
234			case 'odt':		// Oasis OpenDocument Text
235			case 'rtf':		// RTF documents
236			case 'txt':		// ASCII Text documents
237			case 'html':	// HTML
238			case 'htm':		// HTML
239			case 'csv':		// Comma Separated Values
240			case 'xml':		// Generic XML
241			case 'jpg':		// Jpeg images (EXIF comment)
242			case 'jpeg':	// Jpeg images (EXIF comment)
243			case 'tif':		// TIF images (EXIF comment)
244				return TRUE;
245			break;
246		}
247	}
248
249	/**
250	 * Return title of entry in media type selector box.
251	 *
252	 * @param	string		File extension
253	 * @return	string		String with label value of entry in media type search selector box (frontend plugin).
254	 */
255	function searchTypeMediaTitle($extension)	{
256
257			// Read indexer-config
258		$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
259
260			// Ignore extensions
261		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
262		if (in_array($extension, $ignoreExtensions))	{
263			return FALSE;
264		}
265
266			// Switch on file extension:
267		switch($extension)	{
268			case 'pdf':
269					// PDF
270				if ($indexerConfig['pdftools'])	{
271					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.PDF'), $extension);
272				}
273			break;
274			case 'doc':
275					// Catdoc
276				if ($indexerConfig['catdoc'])	{
277					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.DOC'), $extension);
278				}
279			break;
280			case 'pps':		// MS PowerPoint(?)
281			case 'ppt':		// MS PowerPoint
282					// ppthtml
283				if ($indexerConfig['ppthtml'])	{
284					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.PP'), $extension);
285				}
286			break;
287			case 'xls':		// MS Excel
288					// Xlhtml
289				if ($indexerConfig['xlhtml'])	{
290					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.XLS'), $extension);
291				}
292			break;
293			case 'sxc':		// Open Office Calc.
294			if ($indexerConfig['unzip'])	{
295					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.SXC'), $extension);
296				}
297			break;
298			case 'sxi':		// Open Office Impress
299			if ($indexerConfig['unzip'])	{
300					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.SXI'), $extension);
301				}
302			break;
303			case 'sxw':		// Open Office Writer
304			if ($indexerConfig['unzip'])	{
305					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.SXW'), $extension);
306				}
307			break;
308			case 'ods':		// Oasis OpenDocument Spreadsheet
309			if ($indexerConfig['unzip'])	{
310					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.ODS'), $extension);
311				}
312			break;
313			case 'odp':		// Oasis OpenDocument Presentation
314				if ($indexerConfig['unzip'])	{
315					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.ODP'), $extension);
316				}
317			break;
318			case 'odt':		// Oasis OpenDocument Text
319				if ($indexerConfig['unzip'])	{
320					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.ODT'), $extension);
321				}
322			break;
323			case 'rtf':
324					// Catdoc
325				if ($indexerConfig['unrtf'])	{
326					return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.RTF'), $extension);
327				}
328			break;
329			case 'jpeg':	// PHP EXIF
330			case 'tif':		// PHP EXIF
331				return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.Images'), $extension);
332			break;
333			case 'html':	// PHP strip-tags()
334				return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.HTML'), $extension);
335			break;
336			case 'txt':		// Raw text
337				return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.TXT'), $extension);
338			break;
339			case 'csv':		// Raw text
340				return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.CSV'), $extension);
341			break;
342			case 'xml':		// PHP strip-tags()
343				return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.XML'), $extension);
344			break;
345				// NO entry (duplicates or blank):
346			case 'htm':		// PHP strip-tags()
347			case 'jpg':		// PHP EXIF
348			default:
349			break;
350		}
351	}
352
353	/**
354	 * Returns true if the input extension (item_type) is a potentially a multi-page extension
355	 *
356	 * @param	string		Extension / item_type string
357	 * @return	boolean		Return true if multi-page
358	 */
359	function isMultiplePageExtension($extension)	{
360			// Switch on file extension:
361		switch((string)$extension)	{
362			case 'pdf':
363				return TRUE;
364			break;
365		}
366	}
367
368	/**
369	 * Wraps the "splitLabel function" of the language object.
370	 *
371	 * @param	string		$reference: Reference/key of the label
372	 * @param	boolean		$useHtmlSpecialChar: Convert special chars to HTML entities (default: false)
373	 * @return	string		The label of the reference/key to be fetched
374	 */
375	protected function sL($reference, $useHtmlSpecialChar = false) {
376		return $this->langObject->sL($reference, $useHtmlSpecialChar);
377	}
378
379
380
381
382
383
384
385
386
387	/************************
388	 *
389	 * Reading documents (for parsing)
390	 *
391	 ************************/
392
393	/**
394	 * Reads the content of an external file being indexed.
395	 *
396	 * @param	string		File extension, eg. "pdf", "doc" etc.
397	 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
398	 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
399	 * @return	array		Standard content array (title, description, keywords, body keys)
400	 */
401	function readFileContent($ext,$absFile,$cPKey)	{
402		unset($contentArr);
403
404			// Return immediately if initialization didn't set support up:
405		if (!$this->supportedExtensions[$ext])	return FALSE;
406
407			// Switch by file extension
408		switch ($ext)	{
409			case 'pdf':
410				if ($this->app['pdfinfo'])	{
411						// Getting pdf-info:
412					$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
413					exec($cmd,$res);
414					$pdfInfo = $this->splitPdfInfo($res);
415					unset($res);
416					if (intval($pdfInfo['pages']))	{
417						list($low,$high) = explode('-',$cPKey);
418
419							// Get pdf content:
420						$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
421						@unlink ($tempFileName);	// Delete if exists, just to be safe.
422						$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . $tempFileName;
423						exec($cmd);
424						if (@is_file($tempFileName))	{
425							$content = t3lib_div::getUrl($tempFileName);
426							unlink($tempFileName);
427						} else {
428							$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsFailed'), $absFile), 2);
429						}
430						if (strlen($content))	{
431							$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
432						}
433					}
434				}
435			break;
436			case 'doc':
437				if ($this->app['catdoc'])	{
438					$cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
439					exec($cmd,$res);
440					$content = implode(LF,$res);
441					unset($res);
442					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
443				}
444			break;
445			case 'pps':
446			case 'ppt':
447				if ($this->app['ppthtml'])	{
448					$cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
449					exec($cmd,$res);
450					$content = implode(LF,$res);
451					unset($res);
452					$content = $this->pObj->convertHTMLToUtf8($content);
453					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
454					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
455				}
456			break;
457			case 'xls':
458				if ($this->app['xlhtml'])	{
459					$cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
460					exec($cmd,$res);
461					$content = implode(LF,$res);
462					unset($res);
463					$content = $this->pObj->convertHTMLToUtf8($content);
464					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
465					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
466				}
467			break;
468			case 'sxi':
469			case 'sxc':
470			case 'sxw':
471			case 'ods':
472			case 'odp':
473			case 'odt':
474				if ($this->app['unzip'])	{
475						// Read content.xml:
476					$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
477					exec($cmd,$res);
478					$content_xml = implode(LF,$res);
479					unset($res);
480
481						// Read meta.xml:
482					$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
483					exec($cmd, $res);
484					$meta_xml = implode(LF,$res);
485					unset($res);
486
487					$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml)));
488					$contentArr = $this->pObj->splitRegularContent($utf8_content);
489					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
490
491						// Meta information
492					$metaContent = t3lib_div::xml2tree($meta_xml);
493					$metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
494					if (is_array($metaContent))	{
495						$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
496						$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];
497
498							// Keywords collected:
499						if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword']))	{
500							foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat)	{
501								$contentArr['keywords'].= $kwDat['values'][0].' ';
502							}
503						}
504					}
505				}
506			break;
507			case 'rtf':
508				if ($this->app['unrtf'])	{
509					$cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
510					exec($cmd,$res);
511					$fileContent = implode(LF,$res);
512					unset($res);
513					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
514					$contentArr = $this->pObj->splitHTMLContent($fileContent);
515				}
516			break;
517			case 'txt':
518			case 'csv':		// Raw text
519				$content = t3lib_div::getUrl($absFile);
520					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
521				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
522				$contentArr = $this->pObj->splitRegularContent($content);
523				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
524			break;
525			case 'html':
526			case 'htm':
527				$fileContent = t3lib_div::getUrl($absFile);
528				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
529				$contentArr = $this->pObj->splitHTMLContent($fileContent);
530			break;
531			case 'xml':		// PHP strip-tags()
532				$fileContent = t3lib_div::getUrl($absFile);
533
534					// Finding charset:
535				preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i',substr($fileContent,0,200),$reg);
536				$charset = $reg[1] ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
537
538					// Converting content:
539				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
540				$contentArr = $this->pObj->splitRegularContent($fileContent);
541				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
542			break;
543			case 'jpg':		// PHP EXIF
544			case 'jpeg':	// PHP EXIF
545			case 'tif':		// PHP EXIF
546				if (function_exists('exif_read_data'))	{
547					$exif = exif_read_data($absFile, 'IFD0');
548				} else {
549					$exif = FALSE;
550				}
551
552				if ($exif)	{
553					$comment = trim($exif['COMMENT'][0].' '.$exif['ImageDescription']);	// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
554				} else {
555					$comment = '';
556				}
557				$contentArr = $this->pObj->splitRegularContent($comment);
558				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
559			break;
560			default:
561				return false;
562			break;
563		}
564			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
565		if (is_array($contentArr) && !$contentArr['title'])	{
566			$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
567		}
568
569		return $contentArr;
570	}
571
572	/**
573	 * Creates an array with pointers to divisions of document.
574	 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
575	 *
576	 * @param	string		File extension
577	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
578	 * @return	array		Array of pointers to sections that the document should be divided into
579	 */
580	function fileContentParts($ext,$absFile)	{
581		$cParts = array(0);
582		switch ($ext)	{
583			case 'pdf':
584					// Getting pdf-info:
585				$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
586				exec($cmd,$res);
587				$pdfInfo = $this->splitPdfInfo($res);
588				unset($res);
589
590				if (intval($pdfInfo['pages']))	{
591					$cParts = array();
592
593						// Calculate mode
594					if ($this->pdf_mode>0)	{
595						$iter = ceil($pdfInfo['pages']/$this->pdf_mode);
596					} else {
597						$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pdfInfo['pages']);
598					}
599
600						// Traverse and create intervals.
601					for ($a=0;$a<$iter;$a++)	{
602						$low = floor($a*($pdfInfo['pages']/$iter))+1;
603						$high = floor(($a+1)*($pdfInfo['pages']/$iter));
604						$cParts[] = $low.'-'.$high;
605					}
606				}
607			break;
608		}
609		return $cParts;
610	}
611
612	/**
613	 * Analysing PDF info into a useable format.
614	 *
615	 * @param	array		Array of PDF content, coming from the pdfinfo tool
616	 * @return	array		Result array
617	 * @access private
618	 * @see fileContentParts()
619	 */
620	function splitPdfInfo($pdfInfoArray)	{
621		$res = array();
622		if (is_array($pdfInfoArray))	{
623			foreach($pdfInfoArray as $line)	{
624				$parts = explode(':',$line,2);
625				if (count($parts)>1 && trim($parts[0]))	{
626					$res[strtolower(trim($parts[0]))] = trim($parts[1]);
627				}
628			}
629		}
630		return $res;
631	}
632
633	/**
634	 * Removes some strange char(12) characters and line breaks that then to occur in the end of the string from external files.
635	 *
636	 * @param	string		String to clean up
637	 * @return	string		String
638	 */
639	function removeEndJunk($string)	{
640		return trim(preg_replace('/['.LF.chr(12).']*$/','',$string));
641	}
642
643
644
645
646
647
648
649
650
651
652
653
654	/************************
655	 *
656	 * Backend analyzer
657	 *
658	 ************************/
659
660	/**
661	 * Return icon for file extension
662	 *
663	 * @param	string		File extension, lowercase.
664	 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
665	 */
666	function getIcon($extension)	{
667		if ($extension=='htm')	$extension = 'html';
668		if ($extension=='jpeg')	$extension = 'jpg';
669		return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
670	}
671}
672
673if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])    {
674	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
675}
676
677?>