/zf/library/Zend/Search/Lucene/Document/Html.php
PHP | 481 lines | 233 code | 64 blank | 184 comment | 43 complexity | be6cbd7a479d9f800302112dead47774 MD5 | raw file
Possible License(s): MIT, BSD-3-Clause, Apache-2.0, LGPL-2.1, LGPL-3.0, BSD-2-Clause
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id: Html.php 24144 2011-06-14 22:06:56Z adamlundrigan $
 */


/** Zend_Search_Lucene_Document */
require_once 'Zend/Search/Lucene/Document.php';


/**
 * HTML document.
 *
 * Parses an HTML string or file into a DOM tree, registers 'title', 'body'
 * and one field per named meta tag on the document, collects the links found
 * in the markup and offers helpers to highlight words inside the body.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
{
    /**
     * List of document links
     *
     * @var array
     */
    private $_links = array();

    /**
     * List of document header links
     *
     * @var array
     */
    private $_headerLinks = array();

    /**
     * Stored DOM representation
     *
     * @var DOMDocument
     */
    private $_doc;

    /**
     * Exclude nofollow links flag
     *
     * If true then links with rel='nofollow' attribute are not included into
     * document links.
     *
     * @var boolean
     */
    private static $_excludeNoFollowLinks = false;

    /**
     * List of inline tags
     *
     * Text found directly inside one of these tags is not followed by a
     * separating space when the body text is collected.
     *
     * @var array
     */
    private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
                                 'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
                                 'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
                                 'q', 'sub', 'sup');

    /**
     * Object constructor
     *
     * @param string  $data             HTML string (may be an HTML fragment), or a file name if $isFile is true
     * @param boolean $isFile           True if $data is the name of a file to read the HTML from
     * @param boolean $storeContent     True to store the body text in the 'body' field, false to only index it
     * @param string  $defaultEncoding  HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @throws Zend_Search_Lucene_Exception  If $isFile is true and the file cannot be read
     */
    private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
    {
        $this->_doc = new DOMDocument();
        $this->_doc->substituteEntities = true;

        if ($isFile) {
            $htmlData = file_get_contents($data);
            if ($htmlData === false) {
                // Fail loudly instead of silently indexing an empty document
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception("Unable to read HTML file '$data'.");
            }
        } else {
            $htmlData = $data;
        }

        // DOMDocument::loadHTML() rejects empty input (ValueError on PHP 8, which '@' cannot
        // suppress), so empty data skips this first pass and is handled as a fragment below.
        if ((string) $htmlData !== '') {
            @$this->_doc->loadHTML($htmlData);
        }

        if ($this->_doc->encoding === null) {
            // Document encoding is not recognized

            /** @todo improve HTML vs HTML fragment recognition */
            if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
                // It's an HTML document
                // Add additional HEAD section and recognize document
                $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

                @$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
                                     . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
                                     . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));

                // Remove additional HEAD section
                $xpath = new DOMXPath($this->_doc);
                $head  = $xpath->query('/html/head')->item(0);
                if ($head !== null) {
                    $head->parentNode->removeChild($head);
                }
            } else {
                // It's an HTML fragment
                @$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                                     . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                                     . '</body></html>');
            }

        }
        /** @todo Add correction of wrong HTML encoding recognition processing
         * The case is:
         * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
         * even $this->_doc->encoding demonstrates another recognized encoding
         */

        $xpath = new DOMXPath($this->_doc);

        $docTitle = '';
        $titleNodes = $xpath->query('/html/head/title');
        foreach ($titleNodes as $titleNode) {
            // title should always have only one entry, but we process all nodeset entries
            $docTitle .= $titleNode->nodeValue . ' ';
        }
        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));

        $metaNodes = $xpath->query('/html/head/meta[@name]');
        foreach ($metaNodes as $metaNode) {
            $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
                                                           $metaNode->getAttribute('content'),
                                                           'UTF-8'));
        }

        $docBody = '';
        $bodyNodes = $xpath->query('/html/body');
        foreach ($bodyNodes as $bodyNode) {
            // body should always have only one entry, but we process all nodeset entries
            $this->_retrieveNodeText($bodyNode, $docBody);
        }
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8'));
        }

        // Collect hyperlinks: all <a> elements first, then all <area> elements
        foreach (array('a', 'area') as $linkTagName) {
            foreach ($this->_doc->getElementsByTagName($linkTagName) as $linkNode) {
                $href = $linkNode->getAttribute('href');
                if ($href === '') {
                    continue;
                }
                if (self::$_excludeNoFollowLinks && $this->_isNoFollowLink($linkNode)) {
                    continue;
                }
                $this->_links[] = $href;
            }
        }
        $this->_links = array_unique($this->_links);

        $linkNodes = $xpath->query('/html/head/link');
        foreach ($linkNodes as $linkNode) {
            if (($href = $linkNode->getAttribute('href')) != '') {
                $this->_headerLinks[] = $href;
            }
        }
        $this->_headerLinks = array_unique($this->_headerLinks);
    }

    /**
     * Check whether the rel attribute of a link element contains the 'nofollow' token
     *
     * The attribute value is treated as a case-insensitive, whitespace-separated
     * token list, so rel="nofollow noopener" is recognized as well as rel="nofollow".
     *
     * @param DOMElement $linkNode
     * @return boolean
     */
    private function _isNoFollowLink(DOMElement $linkNode)
    {
        $relTokens = preg_split('/\s+/', strtolower($linkNode->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);

        return in_array('nofollow', $relTokens, true);
    }

    /**
     * Set exclude nofollow links flag
     *
     * @param boolean $newValue
     */
    public static function setExcludeNoFollowLinks($newValue)
    {
        self::$_excludeNoFollowLinks = $newValue;
    }

    /**
     * Get exclude nofollow links flag
     *
     * @return boolean
     */
    public static function getExcludeNoFollowLinks()
    {
        return self::$_excludeNoFollowLinks;
    }

    /**
     * Get node text
     *
     * Appends the text of $node and its descendants to $text. Content of
     * 'script' elements is skipped, and a space is appended after every text
     * node whose parent is not one of the inline tags.
     *
     * @param DOMNode $node
     * @param string &$text
     */
    private function _retrieveNodeText(DOMNode $node, &$text)
    {
        if ($node->nodeType == XML_TEXT_NODE) {
            $text .= $node->nodeValue;
            if (!in_array($node->parentNode->tagName, $this->_inlineTags, true)) {
                $text .= ' ';
            }
        } else if ($node->nodeType == XML_ELEMENT_NODE  &&  $node->nodeName != 'script') {
            foreach ($node->childNodes as $childNode) {
                $this->_retrieveNodeText($childNode, $text);
            }
        }
    }

    /**
     * Get document HREF links
     *
     * @return array
     */
    public function getLinks()
    {
        return $this->_links;
    }

    /**
     * Get document header links
     *
     * @return array
     */
    public function getHeaderLinks()
    {
        return $this->_headerLinks;
    }

    /**
     * Load HTML document from a string
     *
     * @param string  $data
     * @param boolean $storeContent
     * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
    {
        return new Zend_Search_Lucene_Document_Html($data, false, $storeContent, $defaultEncoding);
    }

    /**
     * Load HTML document from a file
     *
     * @param string  $file
     * @param boolean $storeContent
     * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     * @throws Zend_Search_Lucene_Exception  If the file cannot be read
     */
    public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
    {
        return new Zend_Search_Lucene_Document_Html($file, true, $storeContent, $defaultEncoding);
    }


    /**
     * Highlight text in text node
     *
     * @param DOMText  $node
     * @param array    $wordsToHighlight  Map whose keys are the term texts to highlight
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters (first non-optional parameter is a text to transform)
     * @throws Zend_Search_Lucene_Exception
     */
    protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params)
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $analyzer->setInput($node->nodeValue, 'UTF-8');

        $matchedTokens = array();

        while (($token = $analyzer->nextToken()) !== null) {
            if (isset($wordsToHighlight[$token->getTermText()])) {
                $matchedTokens[] = $token;
            }
        }

        if (count($matchedTokens) == 0) {
            return;
        }

        // Process matches from last to first so that splitting the node
        // does not invalidate the offsets of the tokens still to be handled
        $matchedTokens = array_reverse($matchedTokens);

        foreach ($matchedTokens as $token) {
            // Cut text after matched token
            $node->splitText($token->getEndOffset());

            // Cut matched node
            $matchedWordNode = $node->splitText($token->getStartOffset());

            // Retrieve HTML string representation for highlighted word
            $fullCallbackparamsList = $params;
            array_unshift($fullCallbackparamsList, $matchedWordNode->nodeValue);
            $highlightedWordNodeSetHtml = call_user_func_array($callback, $fullCallbackparamsList);

            // Transform HTML string to a DOM representation and automatically transform retrieved string
            // into valid XHTML (It's automatically done by loadHTML() method)
            $highlightedWordNodeSetDomDocument = new DOMDocument('1.0', 'UTF-8');
            $success = @$highlightedWordNodeSetDomDocument->
                                loadHTML('<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
                                       . $highlightedWordNodeSetHtml
                                       . '</body></html>');
            if (!$success) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHtml'.");
            }
            $highlightedWordNodeSetXpath = new DOMXPath($highlightedWordNodeSetDomDocument);
            $highlightedWordNodeSet      = $highlightedWordNodeSetXpath->query('/html/body')->item(0)->childNodes;

            for ($count = 0; $count < $highlightedWordNodeSet->length; $count++) {
                $nodeToImport = $highlightedWordNodeSet->item($count);
                $node->parentNode->insertBefore($this->_doc->importNode($nodeToImport, true /* deep copy */),
                                                $matchedWordNode);
            }

            $node->parentNode->removeChild($matchedWordNode);
        }
    }


    /**
     * Highlight words in content of the specified node
     *
     * Recurses into every child that is neither a text node nor a 'script'
     * node, then highlights the direct text children of $contextNode.
     *
     * @param DOMNode  $contextNode
     * @param array    $wordsToHighlight
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters (first non-optional parameter is a text to transform)
     */
    protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params)
    {
        $textNodes = array();

        if (!$contextNode->hasChildNodes()) {
            return;
        }

        foreach ($contextNode->childNodes as $childNode) {
            if ($childNode->nodeType == XML_TEXT_NODE) {
                // process node later to leave childNodes structure untouched
                $textNodes[] = $childNode;
            } else {
                // Process node if it's not a script node
                if ($childNode->nodeName != 'script') {
                    $this->_highlightNodeRecursive($childNode, $wordsToHighlight, $callback, $params);
                }
            }
        }

        foreach ($textNodes as $textNode) {
            $this->_highlightTextNode($textNode, $wordsToHighlight, $callback, $params);
        }
    }

    /**
     * Standard callback method used to highlight words.
     *
     * @param  string $stringToHighlight  Text to wrap
     * @param  string $colour             Value used for the CSS background-color
     * @return string
     * @internal
     */
    public function applyColour($stringToHighlight, $colour)
    {
        return '<b style="color:black;background-color:' . $colour . '">' . $stringToHighlight . '</b>';
    }

    /**
     * Highlight text with specified color
     *
     * @param string|array $words
     * @param string $colour
     * @return string
     */
    public function highlight($words, $colour = '#66ffff')
    {
        return $this->highlightExtended($words, array($this, 'applyColour'), array($colour));
    }



    /**
     * Highlight text using specified View helper or callback function.
     *
     * @param string|array $words  Words to highlight. Words could be organized using the array or string.
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters passed through into it
     *                             (first non-optional parameter is an HTML fragment for highlighting)
     * @return string              HTML of the whole document after highlighting
     * @throws Zend_Search_Lucene_Exception
     */
    public function highlightExtended($words, $callback, $params = array())
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        if (!is_array($words)) {
            $words = array($words);
        }

        $wordsToHighlightList = array();
        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        foreach ($words as $wordString) {
            $wordsToHighlightList[] = $analyzer->tokenize($wordString);
        }

        // array_merge() without arguments is an error before PHP 7.4,
        // so an empty words list is handled explicitly
        if (count($wordsToHighlightList) == 0) {
            $wordsToHighlight = array();
        } else {
            $wordsToHighlight = call_user_func_array('array_merge', $wordsToHighlightList);
        }

        if (count($wordsToHighlight) == 0) {
            return $this->_doc->saveHTML();
        }

        $wordsToHighlightFlipped = array();
        foreach ($wordsToHighlight as $id => $token) {
            $wordsToHighlightFlipped[$token->getTermText()] = $id;
        }

        if (!is_callable($callback)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('$callback parameter must be a View Helper name, View Helper object or callback.');
        }

        $xpath = new DOMXPath($this->_doc);

        $matchedNodes = $xpath->query("/html/body");
        foreach ($matchedNodes as $matchedNode) {
            $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
        }

        // Return the highlighted document, as documented and as relied on by highlight()
        return $this->_doc->saveHTML();
    }


    /**
     * Get HTML
     *
     * @return string
     */
    public function getHTML()
    {
        return $this->_doc->saveHTML();
    }

    /**
     * Get HTML body
     *
     * Serializes every child node of the body element and concatenates the
     * results. An empty string is returned if the document has no body element.
     *
     * @return string
     */
    public function getHtmlBody()
    {
        $xpath = new DOMXPath($this->_doc);
        $body  = $xpath->query('/html/body')->item(0);
        if ($body === null) {
            return '';
        }
        $bodyNodes = $body->childNodes;

        $outputFragments = array();
        for ($count = 0; $count < $bodyNodes->length; $count++) {
            $outputFragments[] = $this->_doc->saveXML($bodyNodes->item($count));
        }

        return implode($outputFragments);
    }
}