PageRenderTime 53ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/makefulltextfeed.php

https://bitbucket.org/timgws/full-text-rss
PHP | 1021 lines | 730 code | 44 blank | 247 comment | 220 complexity | 2c1f4b9d7160b1e9bdfbaff6b0352aa8 MD5 | raw file
  1. <?php
  2. // Create Full-Text Feeds
  3. // Author: Keyvan Minoukadeh
  4. // Copyright (c) 2012 Keyvan Minoukadeh
  5. // License: AGPLv3
  6. // Version: 2.9.5
  7. // Date: 2012-04-29
  8. // More info: http://fivefilters.org/content-only/
  9. // Help: http://help.fivefilters.org
  10. /*
  11. This program is free software: you can redistribute it and/or modify
  12. it under the terms of the GNU Affero General Public License as published by
  13. the Free Software Foundation, either version 3 of the License, or
  14. (at your option) any later version.
  15. This program is distributed in the hope that it will be useful,
  16. but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. GNU Affero General Public License for more details.
  19. You should have received a copy of the GNU Affero General Public License
  20. along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. */
  22. // Usage
  23. // -----
  24. // Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org
  25. // The following options can be passed in the querystring:
  26. // * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url))
  27. // * URL points to HTML (not feed): html=true (optional, by default it's automatically detected)
  28. // * API key: key=[api key] (optional, refer to config.php)
  29. // * Max entries to process: max=[max number of items] (optional)
  // Runtime setup: report every error class except notices and print errors
  // in the response.
  error_reporting(E_ALL & ~E_NOTICE);
  ini_set('display_errors', 1);
  // Ask for a two minute execution window; @ silences the call in case the
  // host refuses to change the limit.
  @set_time_limit(120);
  // Search the bundled libraries directory before the default include path.
  set_include_path(implode(PATH_SEPARATOR, array(
      realpath(dirname(__FILE__).'/libraries'),
      get_include_path()
  )));
  35. // Autoloading of classes allows us to include files only when they're
  36. // needed. If we've got a cached copy, for example, only Zend_Cache is loaded.
  /**
   * Class autoloader for the bundled libraries.
   *
   * Looks the class name up in a fixed class => file table and requires the
   * file (resolved through the include path) on a hit, so a library is only
   * loaded when one of its classes is first used.
   *
   * @param string $class_name name of the class PHP is trying to load
   * @return bool true when a mapped file was required, false for unknown classes
   */
  function autoload($class_name) {
      static $mapping = array(
          // FeedWriter: RSS/Atom creation
          'FeedWriter' => 'feedwriter/FeedWriter.php',
          'FeedItem' => 'feedwriter/FeedItem.php',
          // ContentExtractor and Readability: identify and extract content from URLs
          'ContentExtractor' => 'content-extractor/ContentExtractor.php',
          'SiteConfig' => 'content-extractor/SiteConfig.php',
          'Readability' => 'readability/Readability.php',
          // Humble HTTP Agent: parallel requests and response caching
          'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php',
          'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
          'CookieJar' => 'humble-http-agent/CookieJar.php',
          // Zend Cache: cache results to improve performance
          'Zend_Cache' => 'Zend/Cache.php',
          // Zend CSS to XPath: used for custom extraction patterns
          'Zend_Dom_Query_Css2Xpath' => 'Zend/Dom/Query/Css2Xpath.php',
          // Language detection
          'Text_LanguageDetect' => 'language-detect/LanguageDetect.php'
      );
      // Unknown class: report a miss so any other registered autoloader can try.
      if (!isset($mapping[$class_name])) {
          return false;
      }
      require_once $mapping[$class_name];
      return true;
  }
  // Register the lazy loader for the bundled libraries.
  spl_autoload_register('autoload');
  // SimplePie ships its own autoloader. Core.php is always loaded up front
  // because it defines constants that other SimplePie components assume exist.
  require_once 'libraries/simplepie/SimplePieAutoloader.php';
  require_once 'libraries/simplepie/SimplePie/Core.php';

  ////////////////////////////////
  // Load config file
  ////////////////////////////////
  require_once dirname(__FILE__).'/config.php';

  ////////////////////////////////
  // Ask search engines not to index or follow this output:
  // 1. The content is already public and presumably indexed (no point in duplicates).
  // 2. Crawler traffic would only add to the server load.
  // Feed readers and services such as Yahoo Pipes are not affected by this header.
  // A Disallow rule in robots.txt is more effective still, since crawlers check
  // it before they ever request makefulltextfeed.php.
  ////////////////////////////////
  header('X-Robots-Tag: noindex, nofollow');

  ////////////////////////////////
  // Stop here when the service has been switched off in the config
  ////////////////////////////////
  if (!$options->enabled) {
      exit('The full-text RSS service is currently disabled');
  }
  96. ////////////////////////////////
  97. // Check for feed URL
  98. ////////////////////////////////
  // Read and validate the requested URL; the request is aborted with a plain
  // text message when it is missing or not an http(s) URL.
  if (!isset($_GET['url'])) {
      die('No URL supplied');
  }
  // An array-style parameter (url[]=...) is not a URL; reject it before it
  // reaches the string functions below.
  if (!is_string($_GET['url'])) {
      die('Invalid URL supplied');
  }
  $url = trim($_GET['url']);
  // feed:// is treated as an alias for http://
  if (strtolower(substr($url, 0, 7)) == 'feed://') {
      $url = 'http://'.substr($url, 7);
  }
  // no scheme given: assume http
  if (!preg_match('!^https?://.+!i', $url)) {
      $url = 'http://'.$url;
  }
  $url = filter_var($url, FILTER_SANITIZE_URL);
  // FILTER_FLAG_SCHEME_REQUIRED is intentionally not passed: it has always been
  // implied by FILTER_VALIDATE_URL and the constant no longer exists in PHP 8.
  $test = filter_var($url, FILTER_VALIDATE_URL);
  // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
  if ($test === false) {
      $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
  }
  // Note: this final scheme check is case-sensitive, so only lower-case
  // http:// and https:// are accepted.
  if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
      // all okay
      unset($test);
  } else {
      die('Invalid URL supplied');
  }
  121. ////////////////////////////////
  122. // Redirect to alternative URL?
  123. ////////////////////////////////
  // When an alternative installation is configured, send roughly half of the
  // requests there. The redir flag marks a request that has already been
  // redirected so it is not bounced again.
  if ($options->alternative_url != '' && !isset($_GET['redir']) && mt_rand(0, 100) > 50) {
      $redirect = $options->alternative_url.'?redir=true&url='.urlencode($url);
      // Forward the recognised options under their own names. Every value is
      // URL-encoded (max is forced to an integer) so characters such as & or #
      // inside a value cannot break or extend the query string.
      foreach (array('html', 'key', 'max', 'links', 'exc', 'what', 'format', 'l') as $_param) {
          if (!isset($_GET[$_param]) || !is_string($_GET[$_param])) {
              continue; // absent or array-valued parameter: nothing to forward
          }
          if ($_param === 'max') {
              $redirect .= '&max='.(int)$_GET['max'];
          } else {
              $redirect .= '&'.$_param.'='.urlencode($_GET[$_param]);
          }
      }
      header("Location: $redirect");
      exit;
  }
  137. /////////////////////////////////
  138. // Redirect to hide API key
  139. /////////////////////////////////
  // If the request carries one of the configured API keys in clear text,
  // redirect to an equivalent URL that carries the key's index plus a
  // sha1(key . url) hash instead, so the key itself is not exposed.
  //
  // The key is matched with a strict string comparison. A loose array_search()
  // would treat numeric-looking strings such as '1e3' and '1000' as equal and
  // pick an index whose key does not produce the hash built below.
  if (isset($_GET['key']) && is_string($_GET['key'])
      && ($key_index = array_search($_GET['key'], array_map('strval', $options->api_keys), true)) !== false) {
      $host = $_SERVER['HTTP_HOST'];
      $path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\');
      $redirect = 'http://'.htmlspecialchars($host.$path).'/makefulltextfeed.php?url='.urlencode($url);
      $redirect .= '&key='.$key_index;
      $redirect .= '&hash='.urlencode(sha1($_GET['key'].$url));
      // carry the remaining options over unchanged
      if (isset($_GET['html'])) $redirect .= '&html='.urlencode($_GET['html']);
      if (isset($_GET['max'])) $redirect .= '&max='.(int)$_GET['max'];
      if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
      if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
      if (isset($_GET['what'])) $redirect .= '&what='.urlencode($_GET['what']);
      if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
      if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
      header("Location: $redirect");
      exit;
  }
  156. ///////////////////////////////////////////////
  157. // Set timezone.
  158. // Prevents warnings, but needs more testing -
  159. // perhaps if timezone is set in php.ini we
  160. // don't need to set it at all...
  161. ///////////////////////////////////////////////
  // Use the timezone from php.ini when one is configured and accepted by PHP;
  // in every other case fall back to UTC.
  if (!(ini_get('date.timezone') && @date_default_timezone_set(ini_get('date.timezone')))) {
      date_default_timezone_set('UTC');
  }
  ///////////////////////////////////////////////
  // Was the URL explicitly declared to be an HTML page (html=1 or html=true)?
  ///////////////////////////////////////////////
  $html_only = false;
  if (isset($_GET['html'])) {
      $html_only = ($_GET['html'] == '1' || $_GET['html'] == 'true');
  }
  169. ///////////////////////////////////////////////
  170. // Check if valid key supplied
  171. ///////////////////////////////////////////////
  // A key is valid when the request carries the index of a configured API key
  // together with the matching sha1(key . url) hash.
  $valid_key = false;
  if (isset($_GET['key']) && isset($_GET['hash']) && is_string($_GET['hash'])
      && isset($options->api_keys[(int)$_GET['key']])) {
      $_expected_hash = sha1($options->api_keys[(int)$_GET['key']].$url);
      // Compare as exact strings. A loose == comparison treats numeric-looking
      // strings as numbers, so two different hashes could compare equal.
      // hash_equals() is preferred where the running PHP provides it.
      if (function_exists('hash_equals')) {
          $valid_key = hash_equals($_expected_hash, $_GET['hash']);
      } else {
          $valid_key = ($_expected_hash === $_GET['hash']);
      }
      unset($_expected_hash);
  }
  $key_index = ($valid_key) ? (int)$_GET['key'] : 0;
  if (!$valid_key && $options->key_required) {
      die('A valid key must be supplied');
  }
  // a key was supplied but did not validate
  if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') {
      die('The entered key is invalid');
  }
  // optional site-specific initialisation, looked up relative to the working directory
  if (file_exists('custom_init.php')) require 'custom_init.php';
  ///////////////////////////////////////////////
  // Check URL against list of blacklisted URLs
  ///////////////////////////////////////////////
  if (!url_allowed($url)) die('URL blocked');
  188. ///////////////////////////////////////////////
  189. // Max entries
  190. // see config.php to find these values
  191. ///////////////////////////////////////////////
  // Number of feed items to process: the requested max capped by the
  // configured limit, or the configured default when no usable max is given
  // (see config.php for the values).
  //
  // A zero or negative max is treated like a missing one. $max is later passed
  // as the end argument of $feed->get_items(0, $max); min() would let such a
  // value through unchanged and the configured cap would not apply.
  // NOTE(review): SimplePie documents an end of 0 as "all items" - confirm
  // against the bundled SimplePie version.
  $max = isset($_GET['max']) ? (int)$_GET['max'] : 0;
  if ($max > 0) {
      if ($valid_key) {
          $max = min($max, $options->max_entries_with_key);
      } else {
          $max = min($max, $options->max_entries);
      }
  } else {
      if ($valid_key) {
          $max = $options->default_entries_with_key;
      } else {
          $max = $options->default_entries;
      }
  }
  206. ///////////////////////////////////////////////
  207. // Link handling
  208. ///////////////////////////////////////////////
  // Links are preserved unless the caller is allowed to choose (valid key or
  // unrestricted service) and asked for one of the recognised modes.
  $links = 'preserve';
  if ($valid_key || !$options->restrict) {
      if (isset($_GET['links']) && in_array($_GET['links'], array('preserve', 'footnotes', 'remove'))) {
          $links = $_GET['links'];
      }
  }
  214. ///////////////////////////////////////////////
  215. // Exclude items if extraction fails
  216. ///////////////////////////////////////////////
  // Decide whether items whose content could not be extracted are dropped.
  // The setting is either the string 'user' (the caller decides with exc=1)
  // or a fixed value that is used as-is.
  // The comparison is strict because in PHP true == 'user' evaluates to true,
  // so a loose check would treat a boolean true setting as 'user'.
  if ($options->exclude_items_on_fail === 'user') {
      $exclude_on_fail = (isset($_GET['exc']) && ($_GET['exc'] == '1'));
  } else {
      $exclude_on_fail = $options->exclude_items_on_fail;
  }
  222. ///////////////////////////////////////////////
  223. // Detect language
  224. ///////////////////////////////////////////////
  // Language detection level: fixed by the config unless the config says
  // 'user', in which case the l parameter decides (1 when it is absent).
  if ((string)$options->detect_language != 'user') {
      $detect_language = $options->detect_language;
  } elseif (isset($_GET['l'])) {
      $detect_language = (int)$_GET['l'];
  } else {
      $detect_language = 1;
  }
  // Levels 2 and above may detect the language from the text, which needs the
  // language name => language code table below.
  if ($detect_language >= 2) {
      $language_codes = array(
          'albanian' => 'sq', 'arabic' => 'ar', 'azeri' => 'az',
          'bengali' => 'bn', 'bulgarian' => 'bg',
          'cebuano' => 'ceb', // ISO 639-2
          'croatian' => 'hr', 'czech' => 'cs',
          'danish' => 'da', 'dutch' => 'nl',
          'english' => 'en', 'estonian' => 'et',
          'farsi' => 'fa', 'finnish' => 'fi', 'french' => 'fr',
          'german' => 'de',
          'hausa' => 'ha',
          'hawaiian' => 'haw', // ISO 639-2
          'hindi' => 'hi', 'hungarian' => 'hu',
          'icelandic' => 'is', 'indonesian' => 'id', 'italian' => 'it',
          'kazakh' => 'kk', 'kyrgyz' => 'ky',
          'latin' => 'la', 'latvian' => 'lv', 'lithuanian' => 'lt',
          'macedonian' => 'mk', 'mongolian' => 'mn',
          'nepali' => 'ne', 'norwegian' => 'no',
          'pashto' => 'ps',
          'pidgin' => 'cpe', // ISO 639-2
          'polish' => 'pl', 'portuguese' => 'pt',
          'romanian' => 'ro', 'russian' => 'ru',
          'serbian' => 'sr', 'slovak' => 'sk', 'slovene' => 'sl', 'somali' => 'so',
          'spanish' => 'es', 'swahili' => 'sw', 'swedish' => 'sv',
          'tagalog' => 'tl', 'turkish' => 'tr',
          'ukrainian' => 'uk', 'urdu' => 'ur', 'uzbek' => 'uz',
          'vietnamese' => 'vi',
          'welsh' => 'cy'
      );
  }
  // The CLD extension is only used when it is loaded and PHP is at least 5.3.
  $use_cld = extension_loaded('cld') && version_compare(PHP_VERSION, '5.3.0', '>=');
  244. ///////////////////////////////////////////////
  245. // Extraction pattern
  246. ///////////////////////////////////////////////
  // Work out the extraction pattern. The result is either false (plain
  // automatic extraction) or an XPath expression; $auto_extract tells whether
  // automatic extraction runs before the pattern is applied.
  $auto_extract = true;
  $extract_pattern = ($options->extraction_pattern == 'user')
      ? (isset($_GET['what']) ? trim($_GET['what']) : 'auto')
      : trim($options->extraction_pattern);
  if ($extract_pattern == '' || $extract_pattern == 'auto') {
      $extract_pattern = false;
  } else {
      // split off the first word (currently only descendants of 'auto' are recognised)
      $extract_pattern = preg_split('/\s+/', $extract_pattern, 2);
      if ($extract_pattern[0] == 'auto') {
          // parent selector is 'auto': keep only the descendant part
          $extract_pattern = $extract_pattern[1];
      } else {
          // plain selector: automatic extraction is switched off
          $extract_pattern = implode(' ', $extract_pattern);
          $auto_extract = false;
      }
      // Convert CSS to XPath. The three simple forms (tag, tag#id, tag.class)
      // are handled here, borrowed from Symfony's cssToXpath() function:
      // https://github.com/fabpot/symfony/blob/master/src/Symfony/Component/CssSelector/Parser.php
      // (itself based on Python's lxml library).
      if (preg_match('#^\w+\s*$#u', $extract_pattern, $match)) {
          $extract_pattern = '//'.trim($match[0]);
      } elseif (preg_match('~^(\w*)#(\w+)\s*$~u', $extract_pattern, $match)) {
          $extract_pattern = sprintf("%s%s[@id = '%s']", '//', $match[1] ? $match[1] : '*', $match[2]);
      } elseif (preg_match('#^(\w*)\.(\w+)\s*$#u', $extract_pattern, $match)) {
          $extract_pattern = sprintf("%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]", '//', $match[1] ? $match[1] : '*', $match[2]);
      } else {
          // anything more complex goes through Zend's CSS to XPath converter
          $extract_pattern = Zend_Dom_Query_Css2Xpath::transform($extract_pattern);
      }
  }
  278. /////////////////////////////////////
  279. // Check for valid format
  280. // (stick to RSS (or RSS as JSON) for the time being)
  281. /////////////////////////////////////
  // Only 'json' is recognised as an alternative format; everything else is RSS.
  $format = (isset($_GET['format']) && $_GET['format'] == 'json') ? 'json' : 'rss';
  287. //////////////////////////////////
  288. // Check for cached copy
  289. //////////////////////////////////
  // When caching is enabled, build the Zend_Cache object and answer straight
  // from the cache if a copy exists for this combination of request options.
  if ($options->caching) {
      $frontendOptions = array(
          'lifetime' => ($valid_key || !$options->restrict) ? 10*60 : 20*60, // cache lifetime of 10 or 20 minutes
          'automatic_serialization' => false,
          'write_control' => false,
          'automatic_cleaning_factor' => $options->cache_cleanup,
          'ignore_user_abort' => false
      );
      $backendOptions = array(
          'cache_dir' => ($valid_key) ? $options->cache_dir.'/rss-with-key/' : $options->cache_dir.'/rss/', // directory where to put the cache files
          'file_locking' => false,
          'read_control' => true,
          'read_control_type' => 'strlen',
          'hashed_directory_level' => $options->cache_directory_level,
          'hashed_directory_umask' => 0777,
          'cache_file_umask' => 0664,
          'file_name_prefix' => 'ff'
      );
      // getting a Zend_Cache_Core object
      $cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
      // The id covers every option that changes the output. The effective
      // language detection level is part of it (rather than just whether l
      // was passed), so requests with different levels do not share an entry.
      $cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.$format.$detect_language.(int)isset($_GET['pubsub']));
      if ($data = $cache->load($cache_id)) {
          // check before calling header(), which can no longer work once output has started
          if (headers_sent()) die('Some data has already been output, can\'t send RSS file');
          if ($format == 'json') {
              header("Content-type: application/json; charset=UTF-8");
          } else {
              header("Content-type: text/xml; charset=UTF-8");
          }
          echo $data;
          exit;
      }
  }
  322. //////////////////////////////////
  323. // Set Expires header
  324. //////////////////////////////////
  // Expires header: 10 minutes ahead for requests with a valid key, 20 otherwise.
  header('Expires: ' . gmdate('D, d M Y H:i:s', time()+($valid_key ? (60*10) : (60*20))) . ' GMT');

  //////////////////////////////////
  // HTTP agent, configured from the options
  //////////////////////////////////
  $http = new HumbleHttpAgent();
  $http->userAgentMap = $options->user_agents;
  $http->headerOnlyTypes = array_keys($options->content_type_exc);
  $http->rewriteUrls = $options->rewrite_url;

  //////////////////////////////////
  // Content extractor: given the custom site config directory first, then the standard one
  //////////////////////////////////
  $extractor = new ContentExtractor(
      dirname(__FILE__).'/site_config/custom',
      dirname(__FILE__).'/site_config/standard'
  );
  $extractor->fingerprints = $options->fingerprints;
  342. /*
  343. if ($options->caching) {
  344. $frontendOptions = array(
  345. 'lifetime' => 30*60, // cache lifetime of 30 minutes
  346. 'automatic_serialization' => true,
  347. 'write_control' => false,
  348. 'automatic_cleaning_factor' => $options->cache_cleanup,
  349. 'ignore_user_abort' => false
  350. );
  351. $backendOptions = array(
  352. 'cache_dir' => $options->cache_dir.'/http-responses/', // directory where to put the cache files
  353. 'file_locking' => false,
  354. 'read_control' => true,
  355. 'read_control_type' => 'strlen',
  356. 'hashed_directory_level' => $options->cache_directory_level,
  357. 'hashed_directory_umask' => 0777,
  358. 'cache_file_umask' => 0664,
  359. 'file_name_prefix' => 'ff'
  360. );
  361. $httpCache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
  362. $http->useCache($httpCache);
  363. }
  364. */
  365. ////////////////////////////////
  366. // Get RSS/Atom feed
  367. ////////////////////////////////
  // Unless the caller declared the URL to be an HTML page, try to load it as
  // an RSS/Atom feed through SimplePie. $result holds the outcome of init().
  if (!$html_only) {
      // Identify as PHP while fetching the feed (prevents a HTML response from feedburner)
      $http->userAgentDefault = HumbleHttpAgent::UA_PHP;
      // route SimplePie's HTTP requests through our HumbleHttpAgent instance
      SimplePie_HumbleHttpAgent::set_agent($http);

      $feed = new SimplePie();
      // some feeds are served as text/html - force_feed makes SimplePie parse them anyway
      $feed->force_feed(true);
      $feed->set_file_class('SimplePie_HumbleHttpAgent');
      // the URL is assigned directly because set_feed_url() encodes colons in the URL's path
      $feed->feed_url = $url;
      $feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
      $feed->set_timeout(20);
      $feed->enable_cache(false);
      $feed->set_stupidly_fast(true);
      // leave the feed untouched: no date ordering, no URL replacements
      $feed->enable_order_by_date(false);
      $feed->set_url_replacements(array());

      // initialise the feed
      // the @ suppresses notices which on some servers causes a 500 internal server error
      $result = @$feed->init();
      if ($result) {
          // parsed as a feed, but nothing usable came out of it
          if (!(is_array($feed->data) && count($feed->data) != 0)) {
              exit('Sorry, no feed items found');
          }
      }
      // from now on, we'll identify ourselves as a browser
      $http->userAgentDefault = HumbleHttpAgent::UA_BROWSER;
  }
  396. ////////////////////////////////////////////////////////////////////////////////
  397. // Our given URL is not a feed, so let's create our own feed with a single item:
  398. // the given URL. This basically treats all non-feed URLs as if they were
  399. // single-item feeds.
  400. ////////////////////////////////////////////////////////////////////////////////
  401. $isDummyFeed = false;
  402. if ($html_only || !$result) {
  403. $isDummyFeed = true;
  404. unset($feed, $result);
  405. // create single item dummy feed object
  /**
   * Stand-in feed holding exactly one item, used when the requested URL is
   * treated as a single page rather than a feed. It offers the feed getters
   * this script calls; title, language and image are reported as absent.
   */
  class DummySingleItemFeed {
      /** @var DummySingleItem the only item of this feed */
      public $item;

      public function __construct($url) {
          $this->item = new DummySingleItem($url);
      }
      public function get_title() {
          return '';
      }
      public function get_description() {
          return 'Content extracted from '.$this->item->url;
      }
      public function get_link() {
          return $this->item->url;
      }
      public function get_language() {
          return false;
      }
      public function get_image_url() {
          return false;
      }
      // $start and $max are accepted for call compatibility but not used:
      // the single item is always returned.
      public function get_items($start=0, $max=1) {
          return array($this->item);
      }
  }

  /**
   * The single item of a DummySingleItemFeed. Only the permalink carries
   * information; every other getter reports "nothing available".
   */
  class DummySingleItem {
      /** @var string URL this item points to */
      public $url;

      public function __construct($url) {
          $this->url = $url;
      }
      public function get_permalink() {
          return $this->url;
      }
      public function get_title() {
          return '';
      }
      public function get_date($format='') {
          return false;
      }
      public function get_author($key=0) {
          return null;
      }
      public function get_authors() {
          return null;
      }
      public function get_description() {
          return '';
      }
      public function get_enclosure($key=0, $prefer=null) {
          return null;
      }
      public function get_enclosures() {
          return null;
      }
  }
  428. $feed = new DummySingleItemFeed($url);
  429. }
  430. ////////////////////////////////////////////
  431. // Create full-text feed
  432. ////////////////////////////////////////////
  // Start the output feed and copy the channel-level details from the source feed.
  $output = new FeedWriter();
  $output->setTitle($feed->get_title());
  $output->setDescription($feed->get_description());
  $output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it
  // PubSubHubbub hubs and self link (used only on fivefilters.org at the moment)
  if ($valid_key) {
      if (isset($_GET['pubsub'])) {
          $output->addHub('http://fivefilters.superfeedr.com/');
          $output->addHub('http://pubsubhubbub.appspot.com/');
          $output->setSelf('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']);
      }
  }
  $output->setLink($feed->get_link()); // Google Reader uses this for pulling in favicons
  // channel image, when the source feed has one
  $img_url = $feed->get_image_url();
  if ($img_url) {
      $output->setImage($feed->get_title(), $feed->get_link(), $img_url);
  }
  446. /*
  447. if ($format == 'atom') {
  448. $output->setChannelElement('updated', date(DATE_ATOM));
  449. $output->setChannelElement('author', array('name'=>'Five Filters', 'uri'=>'http://fivefilters.org'));
  450. }
  451. */
  452. ////////////////////////////////////////////
  453. // Loop through feed items
  454. ////////////////////////////////////////////
  // Take the first $max items and request all of their pages in parallel (if supported).
  $items = $feed->get_items(0, $max);
  $urls = array();           // item key => permalink (may be empty)
  $urls_sanitized = array(); // non-empty permalinks only, handed to the fetcher
  foreach ($items as $key => $item) {
      // Decode HTML entities, then restore colons: SimplePie encodes colons in
      // URL path segments, yet some sites expect them unencoded.
      // validateUrl() is deliberately not applied here: it strips non-ascii
      // characters and SimplePie has already sanitized the URL.
      $permalink = str_replace('%3A', ':', htmlspecialchars_decode($item->get_permalink()));
      $urls[$key] = $permalink;
      if ($permalink) {
          $urls_sanitized[] = $permalink;
      }
  }
  $http->fetchAll($urls_sanitized);
  472. //$http->cacheAll();
  473. // count number of items added to full feed
  474. $item_count = 0;
  475. foreach ($items as $key => $item) {
  476. $do_content_extraction = true;
  477. $extract_result = false;
  478. $text_sample = null;
  479. $permalink = $urls[$key];
  480. $newitem = $output->createNewItem();
  481. $newitem->setTitle(htmlspecialchars_decode($item->get_title()));
  482. if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
  483. if ($permalink !== false) {
  484. $newitem->setLink('http://fivefilters.org/content-only/redirect.php?url='.urlencode($permalink));
  485. } else {
  486. $newitem->setLink('http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()));
  487. }
  488. } else {
  489. if ($permalink !== false) {
  490. $newitem->setLink($permalink);
  491. } else {
  492. $newitem->setLink($item->get_permalink());
  493. }
  494. }
  495. // TODO: Allow error codes - some sites return correct content with error status
  496. // e.g. prospectmagazine.co.uk returns 403
  497. //if ($permalink && ($response = $http->get($permalink, true)) && $response['status_code'] < 300) {
  498. if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
  499. $effective_url = $response['effective_url'];
  500. if (!url_allowed($effective_url)) continue;
  501. // check if action defined for returned Content-Type
  502. $type = null;
  503. if (preg_match('!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im', $response['headers'], $match)) {
  504. // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
  505. $match[1] = strtolower(trim($match[1]));
  506. $match[2] = strtolower(trim($match[2]));
  507. foreach (array($match[1], $match[2]) as $_mime) {
  508. if (isset($options->content_type_exc[$_mime])) {
  509. $type = $match[1];
  510. $_act = $options->content_type_exc[$_mime]['action'];
  511. $_name = $options->content_type_exc[$_mime]['name'];
  512. if ($_act == 'exclude') {
  513. continue 2; // skip this feed item entry
  514. } elseif ($_act == 'link') {
  515. if ($match[2] == 'image') {
  516. $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"$_name\" /></a>";
  517. } else {
  518. $html = "<a href=\"$effective_url\">Download $_name</a>";
  519. }
  520. $title = $_name;
  521. $do_content_extraction = false;
  522. break;
  523. }
  524. }
  525. }
  526. unset($_mime, $_act, $_name, $match);
  527. }
  528. if ($do_content_extraction) {
  529. $html = $response['body'];
  530. // remove strange things
  531. $html = str_replace('</[>', '', $html);
  532. $html = convert_to_utf8($html, $response['headers']);
  533. if ($auto_extract) {
  534. // check site config for single page URL - fetch it if found
  535. if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
  536. $html = $single_page_response['body'];
  537. // remove strange things
  538. $html = str_replace('</[>', '', $html);
  539. $html = convert_to_utf8($html, $single_page_response['headers']);
  540. $effective_url = $single_page_response['effective_url'];
  541. unset($single_page_response);
  542. }
  543. $extract_result = $extractor->process($html, $effective_url);
  544. $readability = $extractor->readability;
  545. $content_block = ($extract_result) ? $extractor->getContent() : null;
  546. $title = ($extract_result) ? $extractor->getTitle() : '';
  547. } else {
  548. $readability = new Readability($html, $effective_url);
  549. // content block is entire document (for now...)
  550. $content_block = $readability->dom;
  551. //TODO: get title
  552. $title = '';
  553. }
  554. }
  555. // use extracted title for both feed and item title if we're using single-item dummy feed
  556. if ($isDummyFeed) {
  557. $output->setTitle($title);
  558. $newitem->setTitle($title);
  559. }
  560. if ($do_content_extraction) {
  561. if ($extract_pattern && isset($content_block)) {
  562. $xpath = new DOMXPath($readability->dom);
  563. $elems = @$xpath->query($extract_pattern, $content_block);
  564. // check if our custom extraction pattern matched
  565. if ($elems && $elems->length > 0) {
  566. $extract_result = true;
  567. // get the first matched element
  568. $content_block = $elems->item(0);
  569. // clean it up
  570. $readability->removeScripts($content_block);
  571. $readability->prepArticle($content_block);
  572. }
  573. }
  574. }
  575. }
  576. if ($do_content_extraction) {
  577. // if we failed to extract content...
  578. if (!$extract_result) {
  579. if ($exclude_on_fail) continue; // skip this and move to next item
  580. //TODO: get text sample for language detection
  581. if (!$valid_key) {
  582. $html = $options->error_message;
  583. } else {
  584. $html = $options->error_message_with_key;
  585. }
  586. // keep the original item description
  587. $html .= $item->get_description();
  588. } else {
  589. $readability->clean($content_block, 'select');
  590. if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block);
  591. // footnotes
  592. if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
  593. $readability->addFootnotes($content_block);
  594. }
  595. if ($extract_pattern) {
  596. // get outerHTML
  597. $html = $content_block->ownerDocument->saveXML($content_block);
  598. } else {
  599. if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
  600. $html = $content_block->firstChild->innerHTML;
  601. } else {
  602. $html = $content_block->innerHTML;
  603. }
  604. }
  605. // post-processing cleanup
  606. $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
  607. if ($links == 'remove') {
  608. $html = preg_replace('!</?a[^>]*>!', '', $html);
  609. }
  610. // get text sample for language detection
  611. $text_sample = strip_tags(substr($html, 0, 500));
  612. if (!$valid_key) {
  613. $html = make_substitutions($options->message_to_prepend).$html;
  614. $html .= make_substitutions($options->message_to_append);
  615. } else {
  616. $html = make_substitutions($options->message_to_prepend_with_key).$html;
  617. $html .= make_substitutions($options->message_to_append_with_key);
  618. }
  619. }
  620. }
  621. /*
  622. if ($format == 'atom') {
  623. $newitem->addElement('content', $html);
  624. $newitem->setDate((int)$item->get_date('U'));
  625. if ($author = $item->get_author()) {
  626. $newitem->addElement('author', array('name'=>$author->get_name()));
  627. }
  628. } else {
  629. */
  630. if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
  631. $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
  632. } else {
  633. $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
  634. }
  635. $newitem->setDescription($html);
  636. // set date
  637. if ((int)$item->get_date('U') > 0) {
  638. $newitem->setDate((int)$item->get_date('U'));
  639. } elseif ($extractor->getDate()) {
  640. $newitem->setDate($extractor->getDate());
  641. }
  642. // add authors
  643. if ($authors = $item->get_authors()) {
  644. foreach ($authors as $author) {
  645. $newitem->addElement('dc:creator', $author->get_name());
  646. }
  647. } elseif ($authors = $extractor->getAuthors()) {
  648. //TODO: make sure the list size is reasonable
  649. foreach ($authors as $author) {
  650. //TODO: addElement replaces this element each time
  651. $newitem->addElement('dc:creator', $author);
  652. }
  653. }
  654. // add language
  655. if ($detect_language) {
  656. $language = $extractor->getLanguage();
  657. if (!$language) $language = $feed->get_language();
  658. if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {
  659. try {
  660. if ($use_cld) {
  661. // Use PHP-CLD extension
  662. $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error
  663. $res = $php_cld($text_sample);
  664. if (is_array($res) && count($res) > 0) {
  665. $language = $res[0]['code'];
  666. }
  667. } else {
  668. //die('what');
  669. // Use PEAR's Text_LanguageDetect
  670. if (!isset($l)) {
  671. $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat');
  672. }
  673. $l_result = $l->detect($text_sample, 1);
  674. if (count($l_result) > 0) {
  675. $language = $language_codes[key($l_result)];
  676. }
  677. }
  678. } catch (Exception $e) {
  679. //die('error: '.$e);
  680. // do nothing
  681. }
  682. }
  683. if ($language && (strlen($language) < 7)) {
  684. $newitem->addElement('dc:language', $language);
  685. }
  686. }
  687. // add MIME type (if it appeared in our exclusions lists)
  688. if (isset($type)) $newitem->addElement('dc:format', $type);
  689. // add effective URL (URL after redirects)
  690. if (isset($effective_url)) {
  691. //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
  692. //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-–-25th-March-2012-Special-Program-from-Liari-(Karachi)
  693. //temporary measure: use utf8_encode()
  694. $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
  695. } else {
  696. $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
  697. }
  698. // check for enclosures
  699. if ($options->keep_enclosures) {
  700. if ($enclosures = $item->get_enclosures()) {
  701. foreach ($enclosures as $enclosure) {
  702. if (!$enclosure->get_link()) continue;
  703. $enc = array();
  704. // Media RSS spec ($enc): http://search.yahoo.com/mrss
  705. // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
  706. $enc['url'] = $enclosure->get_link();
  707. if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
  708. if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
  709. if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
  710. if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
  711. if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
  712. if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
  713. if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
  714. if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
  715. if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
  716. if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
  717. if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
  718. if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
  719. $newitem->addElement('media:content', '', $enc);
  720. }
  721. }
  722. }
  723. /* } */
  724. $output->addItem($newitem);
  725. unset($html);
  726. $item_count++;
  727. }
  // output feed
  if ($format == 'json') {
      $output->setFormat(JSON);
  }
  if (!$options->caching) {
      // caching disabled: let the feed writer send its output directly
      $output->genarateFeed();
  } else {
      // capture the generated feed so the same bytes can be cached and sent
      ob_start();
      $output->genarateFeed();
      $output = ob_get_contents();
      ob_end_clean();
      // an HTML-only request that yielded no items is not cached -
      // in case of temporary server glitch at source URL
      $skip_cache = ($html_only && $item_count == 0);
      if (!$skip_cache) {
          $cache->save($output, $cache_id);
      }
      echo $output;
  }
  744. ///////////////////////////////
  745. // HELPER FUNCTIONS
  746. ///////////////////////////////
  // Decides whether $url may be processed, based on the global $options.
  // When $options->allowed_urls is non-empty it acts as a whitelist and
  // $options->blocked_urls is ignored; otherwise $options->blocked_urls acts
  // as a blacklist. Matching is a case-insensitive substring test.
  // Returns true if the URL is permitted, false otherwise.
  function url_allowed($url) {
      global $options;
      if (empty($options->allowed_urls)) {
          // blacklist mode: any matching fragment rejects the URL
          foreach ($options->blocked_urls as $blocked_fragment) {
              if (stristr($url, $blocked_fragment) !== false) {
                  return false;
              }
          }
          return true;
      }
      // whitelist mode: at least one fragment must match
      foreach ($options->allowed_urls as $allowed_fragment) {
          if (stristr($url, $allowed_fragment) !== false) {
              return true;
          }
      }
      return false;
  }
  //////////////////////////////////////////////
  // Convert $html to UTF8
  // (uses HTTP headers and HTML to find encoding)
  // adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
  //
  // $html   - document body to convert
  // $header - HTTP response header(s), as one string or an array of lines (optional)
  //
  // The encoding is taken from the last Content-Type header (the last one wins
  // in case of redirects), falling back to the XML declaration or <meta> tags
  // found in the first 40000 bytes of $html. If nothing is found the content
  // is assumed to be UTF-8 already.
  // Returns the converted string. When both arguments are empty $html is
  // returned untouched.
  //////////////////////////////////////////////
  function convert_to_utf8($html, $header=null)
  {
      $encoding = null;
      if ($html || $header) {
          if (is_array($header)) $header = implode("\n", $header);
          if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
              // error parsing the response
          } else {
              $match = end($match); // get last matched element (in case of redirects)
              if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
          }
          // TODO: check to see if encoding is supported (can we convert it?)
          // If it's not, result will be empty string.
          // For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
          // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
          if ($encoding && strtolower($encoding) == 'none') $encoding = null;
          if (!$encoding) {
              // search for encoding in HTML - only look at the first 40000 bytes
              $html_head = substr((string)$html, 0, 40000);
              if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
                  $encoding = trim($match[1], '"\'');
              } elseif (preg_match('/<meta\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
                  $encoding = trim($match[1]);
              } elseif (preg_match_all('/<meta\s+([^>]+)>/i', $html_head, $match)) {
                  foreach ($match[1] as $_test) {
                      if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
                          $encoding = trim($_m[1]);
                          break;
                      }
                  }
              }
          }
          // trim is important here!
          if (isset($encoding)) $encoding = trim($encoding);
          // a bogus 'none' found in the HTML is as useless as one found in the header
          if ($encoding && strtolower($encoding) == 'none') $encoding = null;

          // The table below works on single bytes (CP1252 code points 130-159).
          // Those byte values also occur as continuation bytes inside valid UTF-8
          // sequences, so it must only be applied when the document is declared
          // ISO-8859-1, or when nothing is declared AND the bytes are not valid
          // UTF-8. preg_match() with the 'u' modifier fails on malformed UTF-8.
          $declared_latin1 = ($encoding && strtolower($encoding) == 'iso-8859-1');
          $undeclared_non_utf8 = (!$encoding && !preg_match('//u', (string)$html));
          if ($declared_latin1 || $undeclared_non_utf8) {
              // replace MS Word smart quotes
              $trans = array();
              $trans[chr(130)] = '&sbquo;'; // Single Low-9 Quotation Mark
              $trans[chr(131)] = '&fnof;'; // Latin Small Letter F With Hook
              $trans[chr(132)] = '&bdquo;'; // Double Low-9 Quotation Mark
              $trans[chr(133)] = '&hellip;'; // Horizontal Ellipsis
              $trans[chr(134)] = '&dagger;'; // Dagger
              $trans[chr(135)] = '&Dagger;'; // Double Dagger
              $trans[chr(136)] = '&circ;'; // Modifier Letter Circumflex Accent
              $trans[chr(137)] = '&permil;'; // Per Mille Sign
              $trans[chr(138)] = '&Scaron;'; // Latin Capital Letter S With Caron
              $trans[chr(139)] = '&lsaquo;'; // Single Left-Pointing Angle Quotation Mark
              $trans[chr(140)] = '&OElig;'; // Latin Capital Ligature OE
              $trans[chr(145)] = '&lsquo;'; // Left Single Quotation Mark
              $trans[chr(146)] = '&rsquo;'; // Right Single Quotation Mark
              $trans[chr(147)] = '&ldquo;'; // Left Double Quotation Mark
              $trans[chr(148)] = '&rdquo;'; // Right Double Quotation Mark
              $trans[chr(149)] = '&bull;'; // Bullet
              $trans[chr(150)] = '&ndash;'; // En Dash
              $trans[chr(151)] = '&mdash;'; // Em Dash
              $trans[chr(152)] = '&tilde;'; // Small Tilde
              $trans[chr(153)] = '&trade;'; // Trade Mark Sign
              $trans[chr(154)] = '&scaron;'; // Latin Small Letter S With Caron
              $trans[chr(155)] = '&rsaquo;'; // Single Right-Pointing Angle Quotation Mark
              $trans[chr(156)] = '&oelig;'; // Latin Small Ligature OE
              $trans[chr(159)] = '&Yuml;'; // Latin Capital Letter Y With Diaeresis
              $html = strtr((string)$html, $trans);
          }
          if (!$encoding) {
              // nothing declared: treat the content as UTF-8 and leave it alone
              $encoding = 'utf-8';
          } elseif (strtolower($encoding) != 'utf-8') {
              $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
          }
      }
      return $html;
  }
  // Rewrites the URL attributes (href on <a>, src on <img>) found in $elem and
  // its descendants via makeAbsoluteAttr(), using $base as the base URL.
  // $base - base URL, passed to the SimplePie_IRI constructor
  // $elem - DOM element to process (modified in place)
  function makeAbsolute($base, $elem) {
      $base = new SimplePie_IRI($base);
      // remove '//' in URL path (used to prevent URLs from resolving properly)
      // TODO: check if this is still the case
      if (isset($base->path)) {
          $base->path = preg_replace('!//+!', '/', $base->path);
      }
      $url_attributes = array('a' => 'href', 'img' => 'src');
      foreach ($url_attributes as $tag => $attr) {
          // walk the matching descendants from last to first
          $nodes = $elem->getElementsByTagName($tag);
          $index = $nodes->length;
          while (--$index >= 0) {
              makeAbsoluteAttr($base, $nodes->item($index), $attr);
          }
          // getElementsByTagName() only returns descendants, so handle $elem itself
          if (strtolower($elem->tagName) == $tag) {
              makeAbsoluteAttr($base, $elem, $attr);
          }
      }
  }
  // Resolves the URL held in attribute $attr of element $e against $base and
  // writes the absolute form back to the attribute.
  // $base - base URL handed to SimplePie_IRI::absolutize()
  // $e    - DOM element (modified in place)
  // $attr - attribute name, e.g. 'href' or 'src'
  // The attribute is left untouched when it is missing, when the value already
  // starts with http:// or https://, or when absolutize() returns a falsy value.
  function makeAbsoluteAttr($base, $e, $attr) {
      if (!$e->hasAttribute($attr)) {
          return;
      }
      // Trim leading and trailing white space. I don't really like this but
      // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
      $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
      $url = str_replace(' ', '%20', $url);
      // Anchor the scheme test at the start (as makeAbsoluteStr() does): a relative
      // URL that merely contains an absolute one, e.g. "/go?u=http://example.com/",
      // still needs resolving.
      if (preg_match('!^https?://!i', $url)) {
          return;
      }
      if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
          $e->setAttribute($attr, $absolute);
      }
  }
  // Resolves $url against $base.
  // Returns $url unchanged when it already starts with http:// or https://,
  // the value produced by SimplePie_IRI::absolutize() otherwise, or false when
  // that value is falsy.
  function makeAbsoluteStr($base, $url) {
      $base = new SimplePie_IRI($base);
      // remove '//' in URL path (causes URLs not to resolve properly)
      if (isset($base->path)) {
          $base->path = preg_replace('!//+!', '/', $base->path);
      }
      $already_absolute = preg_match('!^https?://!i', $url);
      if ($already_absolute) {
          return $url;
      }
      $resolved = SimplePie_IRI::absolutize($base, $url);
      return $resolved ? $resolved : false;
  }
  // Tries to fetch the "single page" version of an article.
  // $item - feed item; its get_description() is used as the search document
  //         when the site config only defines single_page_link_in_feed
  // $html - HTML of the page at $url
  // $url  - URL the HTML was fetched from (relative links are resolved against it)
  // Uses the globals $http (HTTP client) and $extractor (fingerprint lookup).
  // Returns the response from $http->get() when a different URL was found and
  // fetched with a status code below 300, or false if not found.
  function getSinglePage($item, $html, $url) {
      global $http, $extractor;
      $host = @parse_url($url, PHP_URL_HOST);
      $site_config = SiteConfig::build($host);
      if ($site_config === false) {
          // check for fingerprints
          if (!empty($extractor->fingerprints) && ($_fphost = $extractor->findHostUsingFingerprints($html))) {
              $site_config = SiteConfig::build($_fphost);
          }
          if ($site_config === false) $site_config = new SiteConfig();
          SiteConfig::add_to_cache($host, $site_config);
          // NOTE(review): returns here even when a fingerprint config was just
          // built - kept as in the original, confirm this is intended
          return false;
      } else {
          SiteConfig::add_to_cache($host, $site_config);
      }
      $splink = null;
      if (!empty($site_config->single_page_link)) {
          $splink = $site_config->single_page_link;
      } elseif (!empty($site_config->single_page_link_in_feed)) {
          // single page link xpath is targeted at feed
          $splink = $site_config->single_page_link_in_feed;
          // so let's replace HTML with feed item description
          $html = $item->get_description();
      }
      if (isset($splink)) {
          // Build DOM tree from HTML
          $readability = new Readability($html, $url);
          $xpath = new DOMXPath($readability->dom);
          // Loop through single_page_link xpath expressions
          $single_page_url = null;
          foreach ($splink as $pattern) {
              $elems = @$xpath->evaluate($pattern, $readability->dom);
              if (is_string($elems)) {
                  $single_page_url = trim($elems);
                  break;
              } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
                  // $node (not $item) so the $item parameter is not clobbered
                  foreach ($elems as $node) {
                      if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                          $single_page_url = $node->getAttribute('href');
                          // first hit wins: leave both loops so that a later
                          // pattern cannot overwrite it (same as the string case)
                          break 2;
                      } elseif ($node instanceof DOMAttr && $node->value) {
                          $single_page_url = $node->value;
                          break 2;
                      }
                  }
              }
          }
          // If we've got URL, resolve against $url
          if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
              // check it's not what we have already!
              if ($single_page_url != $url) {
                  // it's not, so let's try to fetch it...
                  // the referer is set to the target URL for this request only
                  $_prev_ref = $http->referer;
                  $http->referer = $single_page_url;
                  $response = $http->get($single_page_url, true);
                  $http->referer = $_prev_ref;
                  if ($response && $response['status_code'] < 300) {
                      return $response;
                  }
              }
          }
      }
      return false;
  }
  // Strips Google Analytics campaign parameters (utm_*) from a URL.
  // $url - URL string
  // Returns the URL without utm_* pairs. The query string stays well formed:
  // the '?' is kept when other parameters follow a removed leading pair and
  // dropped when nothing is left, and a trailing #fragment is preserved.
  // Idea adapted from http://navitronic.co.uk/2010/12/removing-google-analytics-cruft-from-urls/
  // https://gist.github.com/758177
  function remove_url_cruft($url) {
      // patterns are applied in order, each one on the result of the previous
      $patterns = array(
          // pair in second or later position: drop it together with its '&'
          '/&utm_[a-z]+=[^&#]*/',
          // pair in first position followed by other parameters: keep the '?'
          '/\?utm_[a-z]+=[^&#]*&/',
          // pair in first position with nothing after it: drop the '?' as well
          '/\?utm_[a-z]+=[^&#]*(?=#|$)/',
      );
      $replacements = array('', '?', '');
      return preg_replace($patterns, $replacements, $url);
  }
  // Fills the placeholders of a message template using the globals $item and
  // $effective_url:
  //   {url}           - $item->get_permalink(), HTML-escaped
  //   {effective-url} - $effective_url, HTML-escaped
  // An empty template is returned as is, without touching the globals.
  function make_substitutions($string) {
      if ($string == '') return $string;
      global $item, $effective_url;
      // str_replace() handles the pairs in order, each on the previous result
      $placeholders = array('{url}', '{effective-url}');
      $values = array(
          htmlspecialchars($item->get_permalink()),
          htmlspecialchars($effective_url),
      );
      return str_replace($placeholders, $values, $string);
  }
  972. ?>