PageRenderTime 43ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/concreteOLD/libraries/3rdparty/Zend/Search/Lucene/Document/Html.php

https://bitbucket.org/selfeky/xclusivescardwebsite
PHP | 481 lines | 233 code | 64 blank | 184 comment | 43 complexity | e3f34c21974b2dafea7de2dbffd3c980 MD5 | raw file
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Document
  18. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Html.php 23775 2011-03-01 17:25:24Z ralph $
  21. */
  22. /** Zend_Search_Lucene_Document */
  23. require_once 'Zend/Search/Lucene/Document.php';
  24. /**
  25. * HTML document.
  26. *
  27. * @category Zend
  28. * @package Zend_Search_Lucene
  29. * @subpackage Document
  30. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  31. * @license http://framework.zend.com/license/new-bsd New BSD License
  32. */
  33. class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
  34. {
  35. /**
  36. * List of document links
  37. *
  38. * @var array
  39. */
  40. private $_links = array();
  41. /**
  42. * List of document header links
  43. *
  44. * @var array
  45. */
  46. private $_headerLinks = array();
  47. /**
  48. * Stored DOM representation
  49. *
  50. * @var DOMDocument
  51. */
  52. private $_doc;
  53. /**
  54. * Exclud nofollow links flag
  55. *
  56. * If true then links with rel='nofollow' attribute are not included into
  57. * document links.
  58. *
  59. * @var boolean
  60. */
  61. private static $_excludeNoFollowLinks = false;
  62. /**
  63. *
  64. * List of inline tags
  65. *
  66. * @var array
  67. */
  68. private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
  69. 'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
  70. 'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
  71. 'q', 'sub', 'sup');
  72. /**
  73. * Object constructor
  74. *
  75. * @param string $data HTML string (may be HTML fragment, )
  76. * @param boolean $isFile
  77. * @param boolean $storeContent
  78. * @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
  79. */
  80. private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
  81. {
  82. $this->_doc = new DOMDocument();
  83. $this->_doc->substituteEntities = true;
  84. if ($isFile) {
  85. $htmlData = file_get_contents($data);
  86. } else {
  87. $htmlData = $data;
  88. }
  89. @$this->_doc->loadHTML($htmlData);
  90. if ($this->_doc->encoding === null) {
  91. // Document encoding is not recognized
  92. /** @todo improve HTML vs HTML fragment recognition */
  93. if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
  94. // It's an HTML document
  95. // Add additional HEAD section and recognize document
  96. $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);
  97. @$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
  98. . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
  99. . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));
  100. // Remove additional HEAD section
  101. $xpath = new DOMXPath($this->_doc);
  102. $head = $xpath->query('/html/head')->item(0);
  103. $head->parentNode->removeChild($head);
  104. } else {
  105. // It's an HTML fragment
  106. @$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
  107. . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
  108. . '</body></html>');
  109. }
  110. }
  111. /** @todo Add correction of wrong HTML encoding recognition processing
  112. * The case is:
  113. * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
  114. * even $this->_doc->encoding demonstrates another recognized encoding
  115. */
  116. $xpath = new DOMXPath($this->_doc);
  117. $docTitle = '';
  118. $titleNodes = $xpath->query('/html/head/title');
  119. foreach ($titleNodes as $titleNode) {
  120. // title should always have only one entry, but we process all nodeset entries
  121. $docTitle .= $titleNode->nodeValue . ' ';
  122. }
  123. $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));
  124. $metaNodes = $xpath->query('/html/head/meta[@name]');
  125. foreach ($metaNodes as $metaNode) {
  126. $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
  127. $metaNode->getAttribute('content'),
  128. 'UTF-8'));
  129. }
  130. $docBody = '';
  131. $bodyNodes = $xpath->query('/html/body');
  132. foreach ($bodyNodes as $bodyNode) {
  133. // body should always have only one entry, but we process all nodeset entries
  134. $this->_retrieveNodeText($bodyNode, $docBody);
  135. }
  136. if ($storeContent) {
  137. $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8'));
  138. } else {
  139. $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8'));
  140. }
  141. $linkNodes = $this->_doc->getElementsByTagName('a');
  142. foreach ($linkNodes as $linkNode) {
  143. if (($href = $linkNode->getAttribute('href')) != '' &&
  144. (!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' )
  145. ) {
  146. $this->_links[] = $href;
  147. }
  148. }
  149. $linkNodes = $this->_doc->getElementsByTagName('area');
  150. foreach ($linkNodes as $linkNode) {
  151. if (($href = $linkNode->getAttribute('href')) != '' &&
  152. (!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' )
  153. ) {
  154. $this->_links[] = $href;
  155. }
  156. }
  157. $this->_links = array_unique($this->_links);
  158. $linkNodes = $xpath->query('/html/head/link');
  159. foreach ($linkNodes as $linkNode) {
  160. if (($href = $linkNode->getAttribute('href')) != '') {
  161. $this->_headerLinks[] = $href;
  162. }
  163. }
  164. $this->_headerLinks = array_unique($this->_headerLinks);
  165. }
  166. /**
  167. * Set exclude nofollow links flag
  168. *
  169. * @param boolean $newValue
  170. */
  171. public static function setExcludeNoFollowLinks($newValue)
  172. {
  173. self::$_excludeNoFollowLinks = $newValue;
  174. }
  175. /**
  176. * Get exclude nofollow links flag
  177. *
  178. * @return boolean
  179. */
  180. public static function getExcludeNoFollowLinks()
  181. {
  182. return self::$_excludeNoFollowLinks;
  183. }
  184. /**
  185. * Get node text
  186. *
  187. * We should exclude scripts, which may be not included into comment tags, CDATA sections,
  188. *
  189. * @param DOMNode $node
  190. * @param string &$text
  191. */
  192. private function _retrieveNodeText(DOMNode $node, &$text)
  193. {
  194. if ($node->nodeType == XML_TEXT_NODE) {
  195. $text .= $node->nodeValue;
  196. if(!in_array($node->parentNode->tagName, $this->_inlineTags)) {
  197. $text .= ' ';
  198. }
  199. } else if ($node->nodeType == XML_ELEMENT_NODE && $node->nodeName != 'script') {
  200. foreach ($node->childNodes as $childNode) {
  201. $this->_retrieveNodeText($childNode, $text);
  202. }
  203. }
  204. }
  205. /**
  206. * Get document HREF links
  207. *
  208. * @return array
  209. */
  210. public function getLinks()
  211. {
  212. return $this->_links;
  213. }
  214. /**
  215. * Get document header links
  216. *
  217. * @return array
  218. */
  219. public function getHeaderLinks()
  220. {
  221. return $this->_headerLinks;
  222. }
  223. /**
  224. * Load HTML document from a string
  225. *
  226. * @param string $data
  227. * @param boolean $storeContent
  228. * @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
  229. * @return Zend_Search_Lucene_Document_Html
  230. */
  231. public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
  232. {
  233. return new Zend_Search_Lucene_Document_Html($data, false, $storeContent, $defaultEncoding);
  234. }
  235. /**
  236. * Load HTML document from a file
  237. *
  238. * @param string $file
  239. * @param boolean $storeContent
  240. * @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
  241. * @return Zend_Search_Lucene_Document_Html
  242. */
  243. public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
  244. {
  245. return new Zend_Search_Lucene_Document_Html($file, true, $storeContent, $defaultEncoding);
  246. }
  247. /**
  248. * Highlight text in text node
  249. *
  250. * @param DOMText $node
  251. * @param array $wordsToHighlight
  252. * @param callback $callback Callback method, used to transform (highlighting) text.
  253. * @param array $params Array of additionall callback parameters (first non-optional parameter is a text to transform)
  254. * @throws Zend_Search_Lucene_Exception
  255. */
  256. protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params)
  257. {
  258. /** Zend_Search_Lucene_Analysis_Analyzer */
  259. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  260. $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
  261. $analyzer->setInput($node->nodeValue, 'UTF-8');
  262. $matchedTokens = array();
  263. while (($token = $analyzer->nextToken()) !== null) {
  264. if (isset($wordsToHighlight[$token->getTermText()])) {
  265. $matchedTokens[] = $token;
  266. }
  267. }
  268. if (count($matchedTokens) == 0) {
  269. return;
  270. }
  271. $matchedTokens = array_reverse($matchedTokens);
  272. foreach ($matchedTokens as $token) {
  273. // Cut text after matched token
  274. $node->splitText($token->getEndOffset());
  275. // Cut matched node
  276. $matchedWordNode = $node->splitText($token->getStartOffset());
  277. // Retrieve HTML string representation for highlihted word
  278. $fullCallbackparamsList = $params;
  279. array_unshift($fullCallbackparamsList, $matchedWordNode->nodeValue);
  280. $highlightedWordNodeSetHtml = call_user_func_array($callback, $fullCallbackparamsList);
  281. // Transform HTML string to a DOM representation and automatically transform retrieved string
  282. // into valid XHTML (It's automatically done by loadHTML() method)
  283. $highlightedWordNodeSetDomDocument = new DOMDocument('1.0', 'UTF-8');
  284. $success = @$highlightedWordNodeSetDomDocument->
  285. loadHTML('<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
  286. . $highlightedWordNodeSetHtml
  287. . '</body></html>');
  288. if (!$success) {
  289. require_once 'Zend/Search/Lucene/Exception.php';
  290. throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHtml'.");
  291. }
  292. $highlightedWordNodeSetXpath = new DOMXPath($highlightedWordNodeSetDomDocument);
  293. $highlightedWordNodeSet = $highlightedWordNodeSetXpath->query('/html/body')->item(0)->childNodes;
  294. for ($count = 0; $count < $highlightedWordNodeSet->length; $count++) {
  295. $nodeToImport = $highlightedWordNodeSet->item($count);
  296. $node->parentNode->insertBefore($this->_doc->importNode($nodeToImport, true /* deep copy */),
  297. $matchedWordNode);
  298. }
  299. $node->parentNode->removeChild($matchedWordNode);
  300. }
  301. }
  302. /**
  303. * highlight words in content of the specified node
  304. *
  305. * @param DOMNode $contextNode
  306. * @param array $wordsToHighlight
  307. * @param callback $callback Callback method, used to transform (highlighting) text.
  308. * @param array $params Array of additionall callback parameters (first non-optional parameter is a text to transform)
  309. */
  310. protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params)
  311. {
  312. $textNodes = array();
  313. if (!$contextNode->hasChildNodes()) {
  314. return;
  315. }
  316. foreach ($contextNode->childNodes as $childNode) {
  317. if ($childNode->nodeType == XML_TEXT_NODE) {
  318. // process node later to leave childNodes structure untouched
  319. $textNodes[] = $childNode;
  320. } else {
  321. // Process node if it's not a script node
  322. if ($childNode->nodeName != 'script') {
  323. $this->_highlightNodeRecursive($childNode, $wordsToHighlight, $callback, $params);
  324. }
  325. }
  326. }
  327. foreach ($textNodes as $textNode) {
  328. $this->_highlightTextNode($textNode, $wordsToHighlight, $callback, $params);
  329. }
  330. }
  331. /**
  332. * Standard callback method used to highlight words.
  333. *
  334. * @param string $stringToHighlight
  335. * @return string
  336. * @internal
  337. */
  338. public function applyColour($stringToHighlight, $colour)
  339. {
  340. return '<b style="color:black;background-color:' . $colour . '">' . $stringToHighlight . '</b>';
  341. }
  342. /**
  343. * Highlight text with specified color
  344. *
  345. * @param string|array $words
  346. * @param string $colour
  347. * @return string
  348. */
  349. public function highlight($words, $colour = '#66ffff')
  350. {
  351. return $this->highlightExtended($words, array($this, 'applyColour'), array($colour));
  352. }
  353. /**
  354. * Highlight text using specified View helper or callback function.
  355. *
  356. * @param string|array $words Words to highlight. Words could be organized using the array or string.
  357. * @param callback $callback Callback method, used to transform (highlighting) text.
  358. * @param array $params Array of additionall callback parameters passed through into it
  359. * (first non-optional parameter is an HTML fragment for highlighting)
  360. * @return string
  361. * @throws Zend_Search_Lucene_Exception
  362. */
  363. public function highlightExtended($words, $callback, $params = array())
  364. {
  365. /** Zend_Search_Lucene_Analysis_Analyzer */
  366. require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
  367. if (!is_array($words)) {
  368. $words = array($words);
  369. }
  370. $wordsToHighlightList = array();
  371. $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
  372. foreach ($words as $wordString) {
  373. $wordsToHighlightList[] = $analyzer->tokenize($wordString);
  374. }
  375. $wordsToHighlight = call_user_func_array('array_merge', $wordsToHighlightList);
  376. if (count($wordsToHighlight) == 0) {
  377. return $this->_doc->saveHTML();
  378. }
  379. $wordsToHighlightFlipped = array();
  380. foreach ($wordsToHighlight as $id => $token) {
  381. $wordsToHighlightFlipped[$token->getTermText()] = $id;
  382. }
  383. if (!is_callable($callback)) {
  384. require_once 'Zend/Search/Lucene/Exception.php';
  385. throw new Zend_Search_Lucene_Exception('$viewHelper parameter mast be a View Helper name, View Helper object or callback.');
  386. }
  387. $xpath = new DOMXPath($this->_doc);
  388. $matchedNodes = $xpath->query("/html/body");
  389. foreach ($matchedNodes as $matchedNode) {
  390. $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
  391. }
  392. }
  393. /**
  394. * Get HTML
  395. *
  396. * @return string
  397. */
  398. public function getHTML()
  399. {
  400. return $this->_doc->saveHTML();
  401. }
  402. /**
  403. * Get HTML body
  404. *
  405. * @return string
  406. */
  407. public function getHtmlBody()
  408. {
  409. $xpath = new DOMXPath($this->_doc);
  410. $bodyNodes = $xpath->query('/html/body')->item(0)->childNodes;
  411. $outputFragments = array();
  412. for ($count = 0; $count < $bodyNodes->length; $count++) {
  413. $outputFragments[] = $this->_doc->saveXML($bodyNodes->item($count));
  414. }
  415. return implode($outputFragments);
  416. }
  417. }