PageRenderTime 65ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/Readability.php

https://github.com/fansidele/orange
PHP | 1174 lines | 661 code | 136 blank | 377 comment | 205 complexity | f87045633763b56f66a7ca92a2f040ae MD5 | raw file
  1. <?php
  2. /**
  3. * Arc90's Readability ported to PHP for FiveFilters.org
  4. * Based on readability.js version 1.7.1 (without multi-page support)
  5. * ------------------------------------------------------
  6. * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
  7. * Arc90's project URL: http://lab.arc90.com/experiments/readability/
  8. * JS Source: http://code.google.com/p/arc90labs-readability
  9. * Ported by: Keyvan Minoukadeh, http://www.keyvan.net
  10. * More information: http://fivefilters.org/content-only/
  11. * License: Apache License, Version 2.0
  12. * Requires: PHP5
  13. * Date: 2011-07-22
  14. *
  15. * Differences between the PHP port and the original
  16. * ------------------------------------------------------
  17. * Arc90's Readability is designed to run in the browser. It works on the DOM
  18. * tree (the parsed HTML) after the page's CSS styles have been applied and
  19. * Javascript code executed. This PHP port does not run inside a browser.
  20. * We use PHP's ability to parse HTML to build our DOM tree, but we cannot
  21. * rely on CSS or Javascript support. As such, the results will not always
  22. * match Arc90's Readability. (For example, if a web page contains CSS style
  23. * rules or Javascript code which hide certain HTML elements from display,
  24. * Arc90's Readability will dismiss those from consideration but our PHP port,
  25. * unable to understand CSS or Javascript, will not know any better.)
  26. *
  27. * Another significant difference is that the aim of Arc90's Readability is
  28. * to re-present the main content block of a given web page so users can
  29. * read it more easily in their browsers. Correct identification, clean up,
  30. * and separation of the content block is only a part of this process.
  31. * This PHP port is only concerned with this part, it does not include code
  32. * that relates to presentation in the browser - Arc90 already do
  33. * that extremely well, and for PDF output there's FiveFilters.org's
  34. * PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
  35. *
  36. * Finally, this class contains methods that might be useful for developers
  37. * working on HTML document fragments. So without deviating too much from
  38. * the original code (which I don't want to do because it makes debugging
  39. * and updating more difficult), I've tried to make it a little more
  40. * developer friendly. You should be able to use the methods here on
  41. * existing DOMElement objects without passing an entire HTML document to
  42. * be parsed.
  43. */
  44. class Readability
  45. {
  /** Version string of this port (readability.js 1.7.1, multi-page support not included). */
  public $version = '1.7.1-without-multi-page';
  /** When true, postProcessContent() calls addFootnotes() (except for wikipedia.org URLs). */
  public $convertLinksToFootnotes = false;
  /** When true, prepArticle() calls revertReadabilityStyledElements(). */
  public $revertForcedParagraphElements = true;
  // public $articleTitle;
  /** Extracted article element; assigned by init() and returned by getContent(). */
  public $articleContent;
  /** DOMDocument built from the HTML passed to the constructor. */
  public $dom;
  public $url = null; // optional - URL where HTML was retrieved
  /** Third constructor argument, stored as given. */
  public $domain;
  /** When true, dbg() echoes its messages. */
  public $debug = false;
  protected $body = null; // the document's <body> element, set by init() / prepDocument()
  protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
  protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
  protected $success = false; // indicates whether we were able to extract or not

  /**
   * All of the regular expressions in use within readability.
   * Defined up here so we don't instantiate them repeatedly in loops.
   *
   * 'video' deliberately accepts http://, https:// and protocol-relative (//host)
   * URLs: grabArticle() removes every iframe whose src does not match it, and an
   * http-only pattern would drop all embeds served over https.
   **/
  public $regexps = array(
      'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|like|share/i',
      'okMaybeItsACandidate' => '/and|article|section|body|column|main|shadow/i',
      'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story|interactiveFreeFormMain|wikistyle|wp-post-image|featured|contentPane|cnnVPLeftCol/i',
      'negative' => '/combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|navigation|outbrain|promo|related|scroll|share|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|cnn_strylftcexpbx/i',
      'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|code|table|ul|iframe|object|embed)/i',
      'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
      'replaceFonts' => '/<(\/?)font[^>]*>/i',
      // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
      'normalize' => '/\s{2,}/',
      'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
      'video' => '/(?:https?:)?\/\/(www\.)?(youtube|vimeo|kickstarter|youtube-nocookie|brightcove)\.com/i',
      'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
  );

  /* Bit flags combined in $flags; grabArticle() queries them through flagIsActive(). */
  const FLAG_STRIP_UNLIKELYS = 1;
  const FLAG_WEIGHT_CLASSES = 2;
  const FLAG_CLEAN_CONDITIONALLY = 4;
  /**
   * Create instance of Readability.
   *
   * The raw HTML is pre-processed before parsing: runs of two or more <br> tags
   * become paragraph breaks and <font> tags become <span> tags. This is done here,
   * on the string, because it cannot be done through innerHTML once the DOM exists.
   *
   * @param string      $html   UTF-8 encoded HTML string
   * @param string|null $url    (optional) URL associated with the HTML (used for
   *                            footnotes and for resolving relative image paths)
   * @param string|null $domain (optional) stored on $this->domain as given
   */
  public function __construct($html, $url = null, $domain = null)
  {
      $html = (string) $html;

      /* Turn all double br's into p's, and font tags into spans.
         preg_replace() returns null on a PCRE failure; keep the previous
         markup in that case instead of silently discarding the document. */
      $replaced = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
      if ($replaced !== null) {
          $html = $replaced;
      }
      $replaced = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
      if ($replaced !== null) {
          $html = $replaced;
      }

      /* Encode every non-ASCII character as a numeric entity so that loadHTML()
         (which does not assume UTF-8) reads the text correctly. Equivalent for the
         parser to the 'HTML-ENTITIES' pseudo-encoding, which is deprecated as of PHP 8.2. */
      $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

      $this->dom = new DOMDocument();
      $this->dom->preserveWhiteSpace = false;
      $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
      if (trim($html) == '') {
          $html = '<html></html>';
      }

      /* Real-world markup is rarely valid: collect libxml parse errors internally
         rather than emitting warnings, then restore the caller's setting. */
      $previousSetting = libxml_use_internal_errors(true);
      $this->dom->loadHTML($html);
      libxml_clear_errors();
      libxml_use_internal_errors($previousSetting);

      $this->url = $url;
      $this->domain = $domain;
  }
  100. /**
  101. * Get article title element
  102. * @return DOMElement
  103. */
  104. // public function getTitle() {
  105. // return $this->articleTitle;
  106. // }
  /**
   * Accessor for the extracted article.
   *
   * @return DOMElement|null whatever is currently stored in $this->articleContent
   *                         (init() assigns it; unset before that)
   */
  public function getContent()
  {
      $content = $this->articleContent;

      return $content;
  }
  /**
   * Runs readability.
   *
   * Workflow:
   *  1. Prep the document (removeScripts(), prepDocument()).
   *  2. Grab the article content from the current DOM tree (grabArticle()).
   *  3. Replace the body's content with
   *     <div id="readOverlay"><div id="readInner">ARTICLE</div></div>.
   *  4. Post-process the article: filter images, straighten smart quotes.
   *
   * When grabArticle() finds nothing, a placeholder element with an apology
   * message is used as the article and false is returned.
   *
   * @return boolean true if we found content, false otherwise
   **/
  public function init()
  {
      if (!isset($this->dom->documentElement)) {
          return false;
      }
      $this->removeScripts($this->dom);

      // Assume successful outcome
      $this->success = true;

      $bodyElems = $this->dom->getElementsByTagName('body');
      if ($bodyElems->length > 0) {
          if ($this->bodyCache == null) {
              $this->bodyCache = $bodyElems->item(0)->innerHTML;
          }
          if ($this->body == null) {
              $this->body = $bodyElems->item(0);
          }
      }

      $this->prepDocument();

      /* Build readability's DOM tree (article title extraction is currently disabled) */
      $overlay = $this->dom->createElement('div');
      $innerDiv = $this->dom->createElement('div');
      $articleContent = $this->grabArticle();

      if (!$articleContent) {
          $this->success = false;
          $articleContent = $this->dom->createElement('div');
          $articleContent->setAttribute('id', 'readability-content');
          $articleContent->innerHTML = '<p>Sorry, Orange was unable to parse this page for content.</p>';
      }

      $overlay->setAttribute('id', 'readOverlay');
      $innerDiv->setAttribute('id', 'readInner');

      /* Glue the structure of our document together. */
      $innerDiv->appendChild($articleContent);
      $overlay->appendChild($innerDiv);

      /* Clear the old HTML, insert the new content. */
      $this->body->innerHTML = '';
      $this->body->appendChild($overlay);
      $this->body->removeAttribute('style');

      $this->parseImages($articleContent);
      /* The string-based convertSmartQuotes() cannot be handed a DOM element
         (str_replace() on an object throws on PHP 8, and its return value was
         discarded anyway), so the conversion is applied per text node here. */
      $this->straightenQuotesInTextNodes($articleContent);

      // Set content instance variable
      $this->articleContent = $articleContent;

      return $this->success;
  }

  /**
   * Replace typographic quotes and em dashes with plain ASCII equivalents
   * in every text node below (and including descendants of) $root.
   *
   * Works on the UTF-8 strings the DOM extension exposes, so multi-byte
   * characters other than the five mapped ones are left untouched.
   *
   * @param DOMNode $root element whose descendant text nodes are rewritten in place
   * @return void
   */
  private function straightenQuotesInTextNodes($root)
  {
      $map = array(
          "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
          "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
          "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
          "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
          "\xE2\x80\x94" => '-', // U+2014 em dash
      );
      $xpath = new DOMXPath($root->ownerDocument);
      foreach ($xpath->query('.//text()', $root) as $textNode) {
          $converted = strtr($textNode->data, $map);
          if ($converted !== $textNode->data) {
              $textNode->data = $converted;
          }
      }
  }
  /**
   * Emit a debug line.
   *
   * Does nothing unless $this->debug is truthy; otherwise echoes the message
   * prefixed with "* " and followed by an HTML line break and a newline.
   *
   * @param string $msg text to print
   * @return void
   */
  protected function dbg($msg)
  {
      if (!$this->debug) {
          return;
      }

      echo '* ', $msg, '<br />', "\n";
  }
  /**
   * Run any post-process modifications to article content as necessary.
   *
   * Currently this only converts links to footnotes (addFootnotes()), and only
   * when $this->convertLinksToFootnotes is enabled and the page URL does not
   * point at wikipedia.org.
   *
   * @param DOMElement $articleContent article element to modify in place
   * @return void
   */
  public function postProcessContent($articleContent)
  {
      if (!$this->convertLinksToFootnotes) {
          return;
      }

      /* $url is optional and may be null; preg_match() needs a string subject.
         Host names are case-insensitive, hence the /i modifier. */
      $pageUrl = is_string($this->url) ? $this->url : '';
      if (preg_match('/wikipedia\.org/i', $pageUrl)) {
          return;
      }

      $this->addFootnotes($articleContent);
  }
  191. /**
  192. * Get the article title as an H1.
  193. *
  194. * @return DOMElement
  195. */
  196. // protected function getArticleTitle() {
  197. // $curTitle = '';
  198. // $origTitle = '';
  199. //
  200. // try {
  201. // $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
  202. // } catch(Exception $e) {}
  203. //
  204. // if (preg_match('/ [\|\-] /', $curTitle))
  205. // {
  206. // $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
  207. //
  208. // if (count(explode(' ', $curTitle)) < 3) {
  209. // $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
  210. // }
  211. // }
  212. // else if (strpos($curTitle, ': ') !== false)
  213. // {
  214. // $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
  215. //
  216. // if (count(explode(' ', $curTitle)) < 3) {
  217. // $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
  218. // }
  219. // }
  220. // else if(strlen($curTitle) > 150 || strlen($curTitle) < 15)
  221. // {
  222. // $hOnes = $this->dom->getElementsByTagName('h1');
  223. // if($hOnes->length == 1)
  224. // {
  225. // $curTitle = $this->getInnerText($hOnes->item(0));
  226. // }
  227. // }
  228. //
  229. // $curTitle = trim($curTitle);
  230. //
  231. // if (count(explode(' ', $curTitle)) <= 4) {
  232. // $curTitle = $origTitle;
  233. // }
  234. //
  235. // $articleTitle = $this->dom->createElement('h1');
  236. // $articleTitle->innerHTML = $curTitle;
  237. //
  238. // return $articleTitle;
  239. // }
  /**
   * Prepare the HTML document for readability to scrape it.
   *
   * Ensures $this->body exists, tags it with id="readabilityBody" and removes
   * every <style> element found anywhere in the document.
   *
   * The "double <br> to <p>" and "<font> to <span>" rewrites of the original
   * JavaScript are not done here: they are applied to the raw HTML string in the
   * constructor, before it is parsed into a DOM tree.
   *
   * @return void
   **/
  protected function prepDocument()
  {
      /**
       * In some cases a body element can't be found (if the HTML is totally hosed for example)
       * so we create a new body node. It is attached under the root element when there is
       * one, so the document keeps a single root instead of gaining a sibling of <html>.
       */
      if ($this->body == null) {
          $this->body = $this->dom->createElement('body');
          $root = $this->dom->documentElement;
          if ($root !== null) {
              $root->appendChild($this->body);
          } else {
              $this->dom->appendChild($this->body);
          }
      }
      $this->body->setAttribute('id', 'readabilityBody');

      /* Remove all style tags. Iterate backwards: the node list is live and
         shrinks as elements are detached. */
      $styleTags = $this->dom->getElementsByTagName('style');
      for ($i = $styleTags->length - 1; $i >= 0; $i--) {
          $styleTag = $styleTags->item($i);
          $styleTag->parentNode->removeChild($styleTag);
      }
  }
  /**
   * Replace smart quotes and em dashes with plain apostrophes, quotes and hyphens.
   *
   * Valid UTF-8 input has the UTF-8 encoded characters (U+2018, U+2019, U+201C,
   * U+201D, U+2014) replaced. Input that is not valid UTF-8 is treated as
   * Windows-1252 and the single bytes 145-148 and 151 are replaced. The two cases
   * must be kept apart: those byte values also occur as continuation bytes inside
   * ordinary UTF-8 characters, so a blind byte replacement would corrupt the text.
   *
   * @param string $string text to convert
   * @return string converted text; a non-string argument is returned unchanged
   */
  protected function convertSmartQuotes($string)
  {
      if (!is_string($string)) {
          return $string;
      }

      if (mb_check_encoding($string, 'UTF-8')) {
          return strtr($string, array(
              "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
              "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
              "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
              "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
              "\xE2\x80\x94" => '-', // U+2014 em dash
          ));
      }

      /* Legacy single-byte (Windows-1252) input. */
      $search = array(chr(145), chr(146), chr(147), chr(148), chr(151));
      $replace = array("'", "'", '"', '"', '-');

      return str_replace($search, $replace, $string);
  }
  /**
   * Convert a relative URL reference (e.g. an image path) to an absolute URL.
   *
   * Handled forms of $rel:
   *  - ''                      -> $base itself
   *  - '#frag' / '?query'      -> appended to $base
   *  - 'scheme:...'            -> returned unchanged (already absolute)
   *  - '//host/path'           -> prefixed with the scheme of $base
   *  - '/path' and 'dir/file'  -> resolved against the host (and port) of $base,
   *                               collapsing '//', '/./' and '/dir/../' segments
   *
   * The query string / fragment of $rel is excluded from path normalisation so
   * that a '//' inside a query value is preserved.
   *
   * @param string $rel  URL reference to resolve
   * @param string $base absolute URL of the document the reference appeared in
   * @return string absolute URL, or $rel unchanged when $base has no host to resolve against
   */
  protected function rel2abs($rel, $base)
  {
      $rel = (string) $rel;
      $base = (string) $base;

      /* An empty reference refers to the base document itself. */
      if ($rel === '') {
          return $base;
      }
      /* queries and anchors */
      if ($rel[0] == '#' || $rel[0] == '?') {
          return $base . $rel;
      }
      /* already absolute: the reference carries its own scheme */
      if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $rel)) {
          return $rel;
      }

      $parts = parse_url($base);
      if (!is_array($parts) || empty($parts['host'])) {
          return $rel;
      }
      $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

      /* protocol-relative reference: only the scheme is inherited */
      if (substr($rel, 0, 2) === '//') {
          return $scheme . ':' . $rel;
      }

      $authority = $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '');

      /* remove non-directory element from path */
      $path = isset($parts['path']) ? preg_replace('#/[^/]*$#', '', $parts['path']) : '';
      /* destroy path if relative url points to root */
      if ($rel[0] == '/') {
          $path = '';
      }

      /* keep query/fragment out of the path clean-up below */
      $cut = strcspn($rel, '?#');
      $relPath = (string) substr($rel, 0, $cut);
      $suffix = (string) substr($rel, $cut);

      /* dirty absolute URL */
      $abs = $authority . $path . '/' . $relPath;

      /* replace '//' or '/./' or '/foo/../' with '/' until nothing changes */
      $re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
      do {
          $abs = preg_replace($re, '/', $abs, -1, $n);
      } while ($n > 0);

      /* absolute URL is ready! */
      return $scheme . '://' . $abs . $suffix;
  }
  /**
   * Filter article content for valuable images.
   *
   * - 30 or more images: all of them are removed.
   * - Otherwise each image is dropped when its width/height attribute, or its
   *   real size as reported by getimagesize(), is below 150x120 pixels.
   * - Images that pass the size probe get their src rewritten to an absolute
   *   URL; the one with the largest width + height receives the class
   *   "orange-best-image".
   *
   * Images whose size cannot be determined (no src, non-http(s) scheme, no base
   * URL to resolve against, download or decode failure) are left as they are.
   *
   * NOTE(review): getimagesize() fetches every remote image URL found in the
   * document, which is slow and lets the page content choose the hosts this
   * server contacts. Consider restricting or caching these lookups.
   *
   * @param DOMElement $articleContent element whose <img> descendants are filtered in place
   * @return void
   */
  protected function parseImages($articleContent)
  {
      $articleImages = $articleContent->getElementsByTagName('img');
      $imagesCount = $articleImages->length;
      if ($imagesCount === 0) {
          return;
      }

      /* Walk backwards throughout: the node list is live and shrinks on removal. */
      if ($imagesCount >= 30) {
          for ($i = $imagesCount - 1; $i >= 0; $i--) {
              $img = $articleImages->item($i);
              $img->parentNode->removeChild($img);
          }
          return;
      }

      $bestImage = null;
      $maxSize = -1;
      for ($i = $imagesCount - 1; $i >= 0; $i--) {
          $img = $articleImages->item($i);

          /* Declared dimensions; 0 means "not declared / not numeric". */
          $attrWidth = (int) $img->getAttribute('width');
          $attrHeight = (int) $img->getAttribute('height');
          if (($attrWidth > 0 && $attrWidth < 150) || ($attrHeight > 0 && $attrHeight < 120)) {
              $img->parentNode->removeChild($img);
              continue;
          }

          $src = trim($img->getAttribute('src'));
          if ($src === '') {
              continue;
          }
          if (!preg_match('#^https?://#i', $src)) {
              $hasOtherScheme = preg_match('#^[a-z][a-z0-9+.\-]*:#i', $src);
              if ($hasOtherScheme || !is_string($this->url) || $this->url === '') {
                  /* data:/file:/... URI, or nothing to resolve against: do not probe. */
                  continue;
              }
              $src = $this->rel2abs($src, $this->url);
          }

          /* getimagesize() raises a warning and returns false when the image
             cannot be fetched or decoded; that is an expected outcome here. */
          $size = @getimagesize($src);
          if ($size === false) {
              continue;
          }
          list($width, $height) = $size;
          if (!$width || !$height) {
              continue;
          }

          if ($width < 150 || $height < 120) {
              $img->parentNode->removeChild($img);
              continue;
          }

          $img->setAttribute('src', $src);
          if (($width + $height) > $maxSize) {
              $maxSize = $width + $height;
              $bestImage = $img;
          }
      }

      if ($bestImage !== null) {
          $bestImage->setAttribute('class', 'orange-best-image');
      }
  }
  /**
   * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
   * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
   *
   * Every link in $articleContent gets a numbered superscript reference inserted
   * right after it, and a matching entry (link title or text, plus host name) in
   * an ordered list. Links carrying the class "readability-DoNotFootnote" and
   * links whose text matches the 'skipFootnoteLink' pattern are skipped. The
   * list is appended to $articleContent only when at least one footnote was made.
   *
   * @param DOMElement $articleContent article element, modified in place
   * @return void
   **/
  public function addFootnotes($articleContent)
  {
      $footnotesWrapper = $this->dom->createElement('div');
      $footnotesWrapper->setAttribute('id', 'readability-footnotes');
      $footnotesWrapper->innerHTML = '<h3>References</h3>';

      $articleFootnotes = $this->dom->createElement('ol');
      $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
      $footnotesWrapper->appendChild($articleFootnotes);

      /* Live list: the reference links inserted below show up in it as well and
         are skipped through their "readability-DoNotFootnote" class. */
      $articleLinks = $articleContent->getElementsByTagName('a');
      $linkCount = 0;
      for ($i = 0; $i < $articleLinks->length; $i++) {
          $articleLink = $articleLinks->item($i);
          $linkText = $this->getInnerText($articleLink);

          if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
              continue;
          }

          $linkCount++;

          /* Clone before the original link's attributes are changed below. */
          $footnoteLink = $articleLink->cloneNode(true);
          $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
          if (!$linkDomain && isset($this->url)) {
              $linkDomain = @parse_url($this->url, PHP_URL_HOST);
          }

          /** Add a superscript reference after the article link */
          $refLink = $this->dom->createElement('a');
          $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
          $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
          $refLink->setAttribute('class', 'readability-DoNotFootnote');
          $refLink->setAttribute('style', 'color: inherit;');
          /* insertBefore() with a null reference node appends, which covers the
             case where the link is its parent's last child. */
          $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

          $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
          $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

          $footnote = $this->dom->createElement('li');
          $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

          /* Title and link text are plain text taken from the page; escape them
             before they are parsed as markup by the innerHTML setter. */
          $footnoteTitle = $footnoteLink->getAttribute('title');
          $footnoteLabel = ($footnoteTitle != '') ? $footnoteTitle : $linkText;
          $footnoteLink->innerHTML = htmlspecialchars($footnoteLabel, ENT_QUOTES, 'UTF-8');
          $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
          $footnote->appendChild($footnoteLink);

          if ($linkDomain) {
              $footnote->innerHTML = $footnote->innerHTML . '<small> (' . htmlspecialchars($linkDomain, ENT_QUOTES, 'UTF-8') . ')</small>';
          }
          $articleFootnotes->appendChild($footnote);
      }

      if ($linkCount > 0) {
          $articleContent->appendChild($footnotesWrapper);
      }
  }
  /**
   * Turn every <p class="readability-styled"> below $articleContent back into
   * a plain text node holding the paragraph's text content.
   *
   * The matches come from an XPath query and are processed from last to first.
   *
   * @param DOMElement $articleContent element to search in; modified in place
   * @return void
   */
  function revertReadabilityStyledElements($articleContent)
  {
      $document = $articleContent->ownerDocument;
      $finder = new DOMXPath($document);
      $styledParagraphs = $finder->query('.//p[@class="readability-styled"]', $articleContent);

      $position = $styledParagraphs->length;
      while ($position-- > 0) {
          $paragraph = $styledParagraphs->item($position);
          $plainText = $document->createTextNode($paragraph->textContent);
          $paragraph->parentNode->replaceChild($plainText, $paragraph);
      }
  }
  /**
   * Clean up the extracted article node before it is presented.
   *
   * Delegates to the class's cleaning helpers in a fixed order (styles, line
   * breaks, optional revert of forced paragraphs, forms/inputs/buttons/h1, a
   * lone h2, headers, then tables/lists/divs) and finally drops paragraphs that
   * contain neither text nor any img/embed/iframe/object element.
   *
   * @param DOMElement $articleContent article element, modified in place
   * @return void
   */
  function prepArticle($articleContent)
  {
      $this->cleanStyles($articleContent);
      $this->killBreaks($articleContent);

      if ($this->revertForcedParagraphElements) {
          $this->revertReadabilityStyledElements($articleContent);
      }

      /* Clean out junk from the article content */
      $this->cleanConditionally($articleContent, 'form');
      foreach (array('input', 'button', 'h1') as $junkTag) {
          $this->clean($articleContent, $junkTag);
      }

      /* A single h2 is most likely the page header rather than a subheader;
         a header already exists, so it goes too. */
      $secondLevelHeadings = $articleContent->getElementsByTagName('h2');
      if ($secondLevelHeadings->length == 1) {
          $this->clean($articleContent, 'h2');
      }

      $this->cleanHeaders($articleContent);

      /* Do these last as the previous stuff may have removed junk that will affect these */
      foreach (array('table', 'ul', 'div') as $containerTag) {
          $this->cleanConditionally($articleContent, $containerTag);
      }

      /* Remove extra paragraphs (back to front, the list is live) */
      $paragraphs = $articleContent->getElementsByTagName('p');
      $position = $paragraphs->length;
      while (--$position >= 0) {
          $paragraph = $paragraphs->item($position);

          $mediaTotal = 0;
          foreach (array('img', 'embed', 'iframe', 'object') as $mediaTag) {
              $mediaTotal += $paragraph->getElementsByTagName($mediaTag)->length;
          }

          if ($mediaTotal === 0 && $this->getInnerText($paragraph, false) == '') {
              $paragraph->parentNode->removeChild($paragraph);
          }
      }
  }
  /**
   * Give a node its initial content score.
   *
   * The score lives in a "readability" attribute on the node. It starts at 0,
   * is adjusted by a fixed amount that depends on the tag name, and then has the
   * result of getClassWeight() for the node added to it.
   *
   * @param DOMElement $node element to initialise
   * @return void
   **/
  protected function initializeNode($node)
  {
      /* Tag name (upper case) => score adjustment; tags not listed get none. */
      $tagAdjustments = array(
          'DIV' => 5,
          'PRE' => 3,
          'TD' => 3,
          'BLOCKQUOTE' => 3,
          'ADDRESS' => -3,
          'OL' => -3,
          'UL' => -3,
          'DL' => -3,
          'DD' => -3,
          'DT' => -3,
          'LI' => -3,
          'FORM' => -3,
          'H1' => -5,
          'H2' => -5,
          'H3' => -5,
          'H4' => -5,
          'H5' => -5,
          'H6' => -5,
          'TH' => -5,
      );

      $scoreAttr = $this->dom->createAttribute('readability');
      $scoreAttr->value = 0; // this is our contentScore
      $node->setAttributeNode($scoreAttr);

      $tag = strtoupper($node->tagName);
      if (isset($tagAdjustments[$tag])) {
          $scoreAttr->value += $tagAdjustments[$tag];
      }

      $scoreAttr->value += $this->getClassWeight($node);
  }
  512. /***
  513. * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
  514. * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
  515. *
  516. * @return DOMElement
  517. **/
  518. protected function grabArticle($page=null) {
  519. $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
  520. if (!$page) $page = $this->dom;
  521. $allElements = $page->getElementsByTagName('*');
  522. /**
  523. * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
  524. * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
  525. *
  526. * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
  527. * TODO: Shouldn't this be a reverse traversal?
  528. **/
  529. $node = null;
  530. $nodesToScore = array();
  531. for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
  532. //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
  533. //$node = $targetList->item($nodeIndex);
  534. $tagName = strtoupper($node->tagName);
  535. /* Remove unlikely candidates */
  536. if ($stripUnlikelyCandidates) {
  537. $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
  538. if (
  539. preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
  540. !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
  541. $tagName != 'BODY'
  542. )
  543. {
  544. $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
  545. //$nodesToRemove[] = $node;
  546. $node->parentNode->removeChild($node);
  547. $nodeIndex--;
  548. continue;
  549. }
  550. }
  551. if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
  552. $nodesToScore[] = $node;
  553. }
  554. if ($tagName == 'IFRAME') {
  555. if (preg_match($this->regexps['video'], $node->getAttribute('src'))) {
  556. $nodesToScore[] = $node;
  557. } else {
  558. $node->parentNode->removeChild($node);
  559. $nodeIndex--;
  560. continue;
  561. }
  562. }
  563. /* Turn all divs that don't have children block level elements into p's */
  564. if ($tagName == 'DIV') {
  565. if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
  566. $this->dbg('Altering div to p');
  567. $newNode = $this->dom->createElement('p');
  568. try {
  569. $newNode->innerHTML = $node->innerHTML;
  570. //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
  571. $node->parentNode->replaceChild($newNode, $node);
  572. $nodeIndex--;
  573. $nodesToScore[] = $node; // or $newNode?
  574. }
  575. catch(Exception $e) {
  576. $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
  577. }
  578. }
  579. else
  580. {
  581. /* EXPERIMENTAL */
  582. // TODO: change these p elements back to text nodes after processing
  583. for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
  584. $childNode = $node->childNodes->item($i);
  585. if ($childNode->nodeType == 3) { // XML_TEXT_NODE
  586. $this->dbg('replacing text node with a p tag with the same content.');
  587. $p = $this->dom->createElement('p');
  588. $p->innerHTML = $childNode->nodeValue;
  589. $p->setAttribute('style', 'display: inline;');
  590. $p->setAttribute('class', 'readability-styled');
  591. $childNode->parentNode->replaceChild($p, $childNode);
  592. }
  593. }
  594. }
  595. }
  596. }
  597. /**
  598. * Loop through all paragraphs, and assign a score to them based on how content-y they look.
  599. * Then add their score to their parent node.
  600. *
  601. * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
  602. **/
  603. $candidates = array();
  604. for ($pt=0; $pt < count($nodesToScore); $pt++) {
  605. $parentNode = $nodesToScore[$pt]->parentNode;
  606. // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
  607. $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
  608. $innerText = $this->getInnerText($nodesToScore[$pt]);
  609. if (!$parentNode || !isset($parentNode->tagName)) {
  610. continue;
  611. }
  612. /* If this paragraph is less than 25 characters, don't even count it. */
  613. if(strlen($innerText) < 25) {
  614. continue;
  615. }
  616. /* Initialize readability data for the parent. */
  617. if (!$parentNode->hasAttribute('readability'))
  618. {
  619. $this->initializeNode($parentNode);
  620. $candidates[] = $parentNode;
  621. }
  622. /* Initialize readability data for the grandparent. */
  623. if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
  624. {
  625. $this->initializeNode($grandParentNode);
  626. $candidates[] = $grandParentNode;
  627. }
  628. $contentScore = 0;
  629. /* Add a point for the paragraph itself as a base. */
  630. $contentScore++;
  631. /* Add points for any commas within this paragraph */
  632. $contentScore += count(explode(',', $innerText));
  633. /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
  634. $contentScore += min(floor(strlen($innerText) / 100), 3);
  635. /* Add the score to the parent. The grandparent gets half. */
  636. $parentNode->getAttributeNode('readability')->value += $contentScore;
  637. if ($grandParentNode) {
  638. $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
  639. }
  640. }
  641. /**
  642. * After we've calculated scores, loop through all of the possible candidate nodes we found
  643. * and find the one with the highest score.
  644. **/
  645. $topCandidate = null;
  646. for ($c=0, $cl=count($candidates); $c < $cl; $c++)
  647. {
  648. /**
  649. * Scale the final candidates score based on link density. Good content should have a
  650. * relatively small link density (5% or less) and be mostly unaffected by this operation.
  651. **/
  652. $readability = $candidates[$c]->getAttributeNode('readability');
  653. $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
  654. $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
  655. if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
  656. $topCandidate = $candidates[$c];
  657. }
  658. }
  659. /**
  660. * If we still have no top candidate, just use the body as a last resort.
  661. * We also have to copy the body node so it is something we can modify.
  662. **/
  663. if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
  664. {
  665. $topCandidate = $this->dom->createElement('div');
  666. if ($page instanceof DOMDocument) {
  667. if (!isset($page->documentElement)) {
  668. // we don't have a body either? what a mess! :)
  669. } else {
  670. $topCandidate->innerHTML = $page->documentElement->innerHTML;
  671. $page->documentElement->innerHTML = '';
  672. $page->documentElement->appendChild($topCandidate);
  673. }
  674. } else {
  675. $topCandidate->innerHTML = $page->innerHTML;
  676. $page->innerHTML = '';
  677. $page->appendChild($topCandidate);
  678. }
  679. $this->initializeNode($topCandidate);
  680. }
  681. /**
  682. * Now that we have the top candidate, look through its siblings for content that might also be related.
  683. * Things like preambles, content split by ads that we removed, etc.
  684. **/
  685. $articleContent = $this->dom->createElement('div');
  686. $articleContent->setAttribute('id', 'readability-content');
  687. $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
  688. $siblingNodes = $topCandidate->parentNode->childNodes;
  689. if (!isset($siblingNodes)) {
  690. $siblingNodes = new stdClass;
  691. $siblingNodes->length = 0;
  692. }
  693. for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
  694. {
  695. $siblingNode = $siblingNodes->item($s);
  696. $append = false;
  697. $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
  698. //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
  699. if ($siblingNode === $topCandidate)
  700. // or if ($siblingNode->isSameNode($topCandidate))
  701. {
  702. $append = true;
  703. }
  704. $contentBonus = 0;
  705. /* Give a bonus if sibling nodes and top candidates have the example same classname */
  706. if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
  707. $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
  708. }
  709. if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
  710. {
  711. $append = true;
  712. }
  713. if (strtoupper($siblingNode->nodeName) == 'P') {
  714. $linkDensity = $this->getLinkDensity($siblingNode);
  715. $nodeContent = $this->getInnerText($siblingNode);
  716. $nodeLength = strlen($nodeContent);
  717. if ($nodeLength > 80 && $linkDensity < 0.25)
  718. {
  719. $append = true;
  720. }
  721. else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
  722. {
  723. $append = true;
  724. }
  725. }
  726. if ($append)
  727. {
  728. $this->dbg('Appending node: ' . $siblingNode->nodeName);
  729. $nodeToAppend = null;
  730. $sibNodeName = strtoupper($siblingNode->nodeName);
  731. if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
  732. /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
  733. $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
  734. $nodeToAppend = $this->dom->createElement('div');
  735. try {
  736. $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
  737. $nodeToAppend->innerHTML = $siblingNode->innerHTML;
  738. }
  739. catch(Exception $e)
  740. {
  741. $this->dbg('Could not alter siblingNode to div, reverting back to original.');
  742. $nodeToAppend = $siblingNode;
  743. $s--;
  744. $sl--;
  745. }
  746. } else {
  747. $nodeToAppend = $siblingNode;
  748. $s--;
  749. $sl--;
  750. }
  751. /* To ensure a node does not interfere with readability styles, remove its classnames */
  752. $nodeToAppend->removeAttribute('class');
  753. /* Append sibling and subtract from our list because it removes the node when you append to another node */
  754. $articleContent->appendChild($nodeToAppend);
  755. }
  756. }
  757. /**
  758. * So we have all of the content that we need. Now we clean it up for presentation.
  759. **/
  760. $this->prepArticle($articleContent);
  761. /**
  762. * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
  763. * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
  764. * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
  765. * finding the -right- content.
  766. **/
  767. if (strlen($this->getInnerText($articleContent, false)) < 250)
  768. {
  769. if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
  770. $this->body->innerHTML = $this->bodyCache;
  771. if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
  772. $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
  773. return $this->grabArticle($this->body);
  774. }
  775. else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
  776. $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
  777. return $this->grabArticle($this->body);
  778. }
  779. else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
  780. $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
  781. return $this->grabArticle($this->body);
  782. }
  783. else {
  784. return false;
  785. }
  786. }
  787. return $articleContent;
  788. }
  789. /**
  790. * Remove script and link tags from document
  791. *
  792. * @param DOMElement $doc
  793. * @return void
  794. */
  public function removeScripts($doc) {
      // Both tag names are looked up on $doc, so the clean-up applies to the
      // node that was actually passed in. (The <link> pass previously ignored
      // $doc and always searched $this->dom.)
      foreach (array('script', 'link') as $tagName) {
          $nodes = $doc->getElementsByTagName($tagName);
          // Walk the list backwards so removing a node never shifts the index
          // of a node that has not been visited yet.
          for ($i = $nodes->length - 1; $i >= 0; $i--) {
              $node = $nodes->item($i);
              $node->parentNode->removeChild($node);
          }
      }
  }
  807. /**
  808. * Get the inner text of a node.
  809. * This also strips out any excess whitespace to be found.
  810. *
  811. * @param DOMElement $e
  812. * @param boolean $normalizeSpaces (default: true)
  813. * @return string
  814. **/
  public function getInnerText($e, $normalizeSpaces=true) {
      // Anything that exposes no textContent, or an empty one, has no inner text.
      $hasText = isset($e->textContent) && $e->textContent != '';
      if (!$hasText) {
          return '';
      }

      $trimmed = trim($e->textContent);

      // When requested, every match of the 'normalize' pattern is replaced by
      // a single space; otherwise the trimmed text is returned untouched.
      return $normalizeSpaces
          ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
          : $trimmed;
  }
  827. /**
  828. * Get the number of times a string $s appears in the node $e.
  829. *
  830. * @param DOMElement $e
  831. * @param string - what to count. Default is ","
  832. * @return number (integer)
  833. **/
  public function getCharCount($e, $s=',') {
      // Count occurrences of $s in the (space-normalized) inner text of $e.
      $text = $this->getInnerText($e);
      return substr_count($text, $s);
  }
  837. /**
  838. * Remove the style attribute on every $e and under.
  839. *
  840. * @param DOMElement $e
  841. * @return void
  842. */
  public function cleanStyles($e) {
      if (!is_object($e)) return;

      // This method is documented as clearing "$e and under", but
      // getElementsByTagName('*') only yields descendants. Strip the root's
      // own style attribute explicitly when $e is an element (a DOMDocument
      // has no attributes of its own to clear).
      if ($e instanceof DOMElement) {
          $e->removeAttribute('style');
      }

      foreach ($e->getElementsByTagName('*') as $descendant) {
          $descendant->removeAttribute('style');
      }
  }
  850. /**
  851. * Get the density of links as a percentage of the content
  852. * This is the amount of text that is inside a link divided by the total text in the node.
  853. *
  854. * @param DOMElement $e
  855. * @return number (float)
  856. */
  public function getLinkDensity($e) {
      $anchors = $e->getElementsByTagName('a');
      $totalLength = strlen($this->getInnerText($e));

      // Sum the text length of every <a> found under $e.
      $anchorLength = 0;
      foreach ($anchors as $anchor) {
          $anchorLength += strlen($this->getInnerText($anchor));
      }

      // No text at all: report 0 instead of dividing by zero.
      return ($totalLength > 0) ? $anchorLength / $totalLength : 0;
  }
  871. /**
  872. * Get an element's class/id weight. Uses regular expressions to tell if this
  873. * element looks good or bad.
  874. *
  875. * @param DOMElement $e
  876. * @return number (Integer)
  877. */
  public function getClassWeight($e) {
      // Class/id weighting can be switched off through the flags.
      if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
          return 0;
      }

      $score = 0;

      // The class name and the id are judged the same way: -25 when the value
      // matches the 'negative' pattern, +25 when it matches the 'positive' one.
      foreach (array('class', 'id') as $attrName) {
          if (!$e->hasAttribute($attrName)) {
              continue;
          }
          $attrValue = $e->getAttribute($attrName);
          if ($attrValue == '') {
              continue;
          }
          if (preg_match($this->regexps['negative'], $attrValue)) {
              $score -= 25;
          }
          if (preg_match($this->regexps['positive'], $attrValue)) {
              $score += 25;
          }
      }

      return $score;
  }
  905. /**
  906. * Remove extraneous break tags from a node.
  907. *
  908. * @param DOMElement $node
  909. * @return void
  910. */
  public function killBreaks($node) {
      // Rewrite the node's markup with every match of the 'killBreaks'
      // pattern replaced by a single <br />.
      $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
  }
  916. /**
  917. * Clean a node of all elements of type "tag".
  918. * (Unless it's a youtube/vimeo video. People love movies.)
  919. *
  920. * @param DOMElement $e
  921. * @param string $tag
  922. * @return void
  923. */
  public function clean($e, $tag) {
      $matches = $e->getElementsByTagName($tag);
      // Only these container tags get the video exemption below.
      $mayHoldVideo = in_array($tag, array('object', 'embed', 'iframe'));

      // Walk backwards so removals do not disturb the nodes still to visit.
      for ($idx = $matches->length - 1; $idx >= 0; $idx--) {
          $node = $matches->item($idx);

          if ($mayHoldVideo) {
              // Join all attribute values so one regex test covers them all.
              $joinedValues = '';
              foreach ($node->attributes as $attr) {
                  $joinedValues .= $attr->value . '|';
              }

              // Keep the node when either its attributes or its inner markup
              // match the 'video' pattern.
              if (preg_match($this->regexps['video'], $joinedValues)
                  || preg_match($this->regexps['video'], $node->innerHTML)) {
                  continue;
              }
          }

          $node->parentNode->removeChild($node);
      }
  }
  946. /**
  947. * Clean an element of all tags of type "tag" if they look fishy.
  948. * "Fishy" is an algorithm based on content length, classnames,
  949. * link density, number of images & embeds, etc.
  950. *
  951. * @param DOMElement $e
  952. * @param string $tag
  953. * @return void
  954. */
  public function cleanConditionally($e, $tag) {
      if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
          return;
      }

      $candidates = $e->getElementsByTagName($tag);

      /**
       * Traverse backwards so nodes can be removed along the way without
       * affecting the traversal.
       *
       * TODO: Consider taking into account original contentScore here.
       */
      for ($idx = $candidates->length - 1; $idx >= 0; $idx--) {
          $node = $candidates->item($idx);

          $weight = $this->getClassWeight($node);
          $hasScore = $node->hasAttribute('readability');
          $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

          $scoreNote = $hasScore ? (' with score ' . $node->getAttribute('readability')) : '';
          $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . $scoreNote);

          // A negative combined score is enough to drop the node outright.
          if ($weight + $contentScore < 0) {
              $node->parentNode->removeChild($node);
              continue;
          }

          // Ten or more commas: leave the node alone.
          if ($this->getCharCount($node, ',') >= 10) {
              continue;
          }

          /**
           * Few commas: gather counts of typical embedded elements and remove
           * the node if non-paragraph content dominates or other ominous
           * signs show up.
           **/
          $p = $node->getElementsByTagName('p')->length;
          $img = $node->getElementsByTagName('img')->length;
          // The list-item count is offset by -100 before it is compared with $p.
          $li = $node->getElementsByTagName('li')->length - 100;
          $input = $node->getElementsByTagName('input')->length;

          // Only embeds whose src matches the 'video' pattern are counted.
          $embedCount = 0;
          foreach ($node->getElementsByTagName('embed') as $embed) {
              if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                  $embedCount++;
              }
          }

          $linkDensity = $this->getLinkDensity($node);
          $contentLength = strlen($this->getInnerText($node));

          $toRemove =
              ($li > $p && $tag != 'ul' && $tag != 'ol')
              || ($input > floor($p / 3))
              || ($contentLength < 25 && ($img === 0 || $img > 2))
              || ($weight < 25 && $linkDensity > 0.2)
              || ($weight >= 25 && $linkDensity > 0.5)
              || ($embedCount == 1 && $contentLength < 75)
              || ($embedCount > 1);

          if ($toRemove) {
              $node->parentNode->removeChild($node);
          }
      }
  }
  1012. /**
  1013. * Clean out spurious headers from an Element. Checks things like classnames and link density.
  1014. *
  1015. * @param DOMElement $e
  1016. * @return void
  1017. */
  public function cleanHeaders($e) {
      // Only <h1> and <h2> elements are inspected.
      foreach (array('h1', 'h2') as $headerTag) {
          $headers = $e->getElementsByTagName($headerTag);

          // Backwards, so removals leave the not-yet-visited entries in place.
          for ($i = $headers->length - 1; $i >= 0; $i--) {
              $header = $headers->item($i);

              // A header is dropped for a negative class/id weight or when
              // more than a third of its text sits inside links.
              $looksSpurious = $this->getClassWeight($header) < 0
                  || $this->getLinkDensity($header) > 0.33;

              if ($looksSpurious) {
                  $header->parentNode->removeChild($header);
              }
          }
      }
  }
  public function flagIsActive($flag) {
      // True when the bitwise AND of the flag set and $flag is greater than zero.
      $masked = $this->flags & $flag;
      return $masked > 0;
  }
  public function addFlag($flag) {
      // Switch the given bit(s) on in the flag set.
      $this->flags |= $flag;
  }
  public function removeFlag($flag) {
      // Switch the given bit(s) off in the flag set.
      $this->flags &= ~$flag;
  }
  1037. }
  1038. ?>