PageRenderTime 44ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/feeds/feeds.php

https://github.com/agnesrambaud/yacs
PHP | 428 lines | 195 code | 67 blank | 166 comment | 47 complexity | 68e65b0a3d848b65b3fbff2011fa981e MD5 | raw file
  1. <?php
  2. /**
  3. * get news
  4. *
  5. * This data abstraction for feeds provides two main functions, plus several utility functions:
  6. * - [code]get_local_news()[/code] - retrieve local news
  7. * - [code]get_remote_news()[/code] - retrieve news collected from remote sites
  8. * - [code]get_remote_news_from()[/code] - actual news fetching from one feeding site
  9. * - [code]tick_hook()[/code] - trigger feeding in the background
  10. *
  11. * @author Bernard Paques
  12. * @author GnapZ
  13. * @tester Dobliu
  14. * @tester NickR
  15. * @reference
  16. * @license http://www.gnu.org/copyleft/lesser.txt GNU Lesser General Public License
  17. */
  18. class Feeds {
  /**
   * decode a date
   *
   * Dates that follow the W3C date and time format, e.g. '2004-01-31T12:00:00+02:00',
   * are parsed here; any other string is handed over to strtotime().
   *
   * Seconds (and an optional fraction, which is ignored) may be omitted, as well as
   * the time zone designator. A date without a designator is taken as UTC.
   *
   * A designator such as '+02:00' means that the provided time is two hours ahead
   * of UTC, so the offset is subtracted to get the actual time stamp; a negative
   * offset is added.
   *
   * @link http://www.w3.org/TR/NOTE-datetime Date and Time Formats, a profile of ISO 8601
   *
   * @param string some date
   * @return int a valid time stamp, or whatever strtotime() returns for other formats (FALSE on failure, -1 before PHP 5.1)
   */
  function decode_date($date) {

      // match W3CDTF; groups: 1-3 date, 4-6 time, 7 sign, 8-9 zone hours and minutes, 10 'Z'
      $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(?::(\d{2})(?:\.\d+)?)?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';
      if(preg_match($pattern, $date, $matches)) {

          // seconds are optional; trailing groups that did not match are absent from $matches
          $seconds = (isset($matches[6]) && ($matches[6] !== '')) ? (int)$matches[6] : 0;

          // compute the epoch as if the provided time were expressed in UTC
          $stamp = gmmktime((int)$matches[4], (int)$matches[5], $seconds, (int)$matches[2], (int)$matches[3], (int)$matches[1]);

          // no designator, or 'Z' for zulu time: nothing to adjust
          $offset = 0;

          // an explicit offset from UTC has been provided
          if(isset($matches[7]) && ($matches[7] !== '')) {
              $offset = (((int)$matches[8] * 60) + (int)$matches[9]) * 60;

              // local time is ahead of UTC, so go back in time
              if($matches[7] == '+')
                  $offset = -$offset;
          }

          return ($stamp + $offset);
      }

      // everything else
      return strtotime($date);
  }
  /**
   * suppress invisible HTML tags
   *
   * Following blocks are removed with their content: comment, embed, link, listing,
   * meta, noscript, object, plaintext, script and xmp.
   *
   * Filtering is attempted in UTF-8 mode first. Since preg_replace() returns NULL
   * when the subject is not valid UTF-8, which is common in text received from
   * remote sites, a second attempt is made byte-wise instead of losing the text.
   *
   * @param string submitted text
   * @return string filtered text, or an empty string if filtering has failed
   */
  function encode_text($text) {

      // blocks to remove, without the UTF-8 modifier
      $blocks = array('@<comment[^>]*?>.*?</comment>@si',
          '@<embed[^>]*?.*?</embed>@si',
          '@<link[^>]*?>.*?</link>@si',
          '@<listing[^>]*?>.*?</listing>@si',
          '@<meta[^>]*?>.*?</meta>@si',
          '@<noscript[^>]*?.*?</noscript>@si',
          '@<object[^>]*?.*?</object>@si',
          '@<plaintext[^>]*?.*?</plaintext>@si',
          '@<script[^>]*?.*?</script>@si',
          '@<xmp[^>]*?.*?</xmp>@si');

      // first attempt: process text as UTF-8
      $unicode = array();
      foreach($blocks as $pattern)
          $unicode[] = $pattern.'u';
      $filtered = preg_replace($unicode, '', $text);
      if($filtered !== NULL)
          return $filtered;

      // text is not valid UTF-8: tag names are plain ASCII, so bytes can be matched safely
      $filtered = preg_replace($blocks, '', $text);
      if($filtered !== NULL)
          return $filtered;

      // never return unfiltered text
      return '';
  }
  /**
   * get current news from this server
   *
   * This is a shortcut to Articles::list_by(), asking for articles ordered by
   * publication, starting at the first one.
   *
   * @param int the number of items to list
   * @param string 'feed' to get a regular feed, or 'contents' to get everything
   * @return mixed the outcome of Articles::list_by() -- presumably an array of items, to be confirmed in articles/articles.php
   */
  function get_local_news($count=20, $variant='feed') {
      global $context;

      // the newest published articles come first
      $newest = Articles::list_by('publication', 0, $count, $variant);
      return $newest;
  }
  /**
   * get news from remote servers
   *
   * News are not fetched from the network here; this function only reads links
   * that have been saved in the database, through Links::list_news().
   *
   * By default, up to 20 items are requested. A count below 3 is replaced by 10,
   * and a count above 50 is reduced to 50.
   *
   * @param int the maximum number of news to fetch
   * @param string the expected variant to use
   * @return mixed the outcome of Links::list_news() -- an array to use with [code]Skin::build_list()[/code], or NULL, according to the original documentation
   *
   * @see feeds/index.php
   */
  function get_remote_news($count=20, $variant='compact') {
      global $context;

      // keep the number of items within reasonable bounds
      if($count < 3) {
          $count = 10;
      } elseif($count > 50) {
          $count = 50;
      }

      // links are managed in a separate library
      include_once $context['path_to_root'].'links/links.php';

      // read news from the database
      $news = Links::list_news(0, $count, $variant);
      return $news;
  }
  /**
   * get news from a remote server
   *
   * The feed is fetched and parsed by SimplePie, then every item is turned into
   * an array with keys 'author', 'category', 'description', 'link', 'pubDate'
   * and 'title'. Labels of all categories of an item are joined with ', '.
   *
   * This function is aiming to run silently, therefore a missing host is logged
   * through Logger::remember() instead of being displayed.
   * To troubleshoot feeders you can configure the debugging facility in the
   * configuration panel for feeds (parameter [code]debug_feeds[/code], at [script]feeds/configure.php[/script]).
   *
   * @see links/link.php
   *
   * @param string the URL to use to fetch news
   * @return mixed an array of items, possibly empty, or NULL if the URL has no host
   *
   * @see feeds/feeds.php
   * @see servers/test.php
   */
  function get_remote_news_from($feed_url) {
      global $context;

      // ensure we are using adequate feeding parameters
      Safe::load('parameters/feeds.include.php');

      // a feed without a host cannot be fetched
      $parts = @parse_url($feed_url);
      if(empty($parts['host'])) {
          Logger::remember('feeds/feeds.php', 'No valid host at '.$feed_url);
          return NULL;
      }

      // delegate fetching and parsing to simplepie, with a cache in the temporary directory
      include_once $context['path_to_root'].'included/simplepie.inc';
      $feed = new SimplePie($feed_url, $context['path_to_root'].'temporary');
      $feed->init();

      // one entry per item of the feed
      $news = array();
      foreach($feed->get_items() as $item) {

          // list all labels, separated by commas
          $labels = '';
          $categories = $item->get_categories();
          if($categories && is_array($categories)) {
              foreach($categories as $one) {
                  $labels .= $one->get_label().', ';
              }
          }

          // drop the trailing separator
          if($labels) {
              $labels = rtrim($labels, ', ');
          }

          // attributes of this item
          $entry = array();
          $entry['author'] = $item->get_author();
          $entry['category'] = $labels;
          $entry['description'] = $item->get_content();
          $entry['link'] = $item->get_permalink();
          $entry['pubDate'] = $item->get_date('r');
          $entry['title'] = $item->get_title();
          $news[] = $entry;
      }

      // job done
      return $news;
  }
  /**
   * build a reference to a feed
   *
   * Depending on parameter '[code]with_friendly_urls[/code]' and on action,
   * following results can be observed:
   *
   * - atom - feeds/atom.php or feeds/atom
   * - articles - articles/feed.php or feeds/articles
   * - comments - comments/feed.php or feeds/comments
   * - files - files/feed.php or feeds/files
   * - opml - feeds/describe.php or feeds/opml
   * - rdf - feeds/rdf.php or feeds/rdf
   * - rss - feeds/rss.php or feeds/rss
   * - foo_bar - feeds/foo_bar.php or feeds/foo_bar
   *
   * The short form is used only if the parameter has the value 'R'. If the
   * parameter has not been set at all, references to actual scripts are provided.
   *
   * @param string the expected feed ('atom', 'articles', 'comments', 'files', ...)
   * @return string a normalized reference
   *
   * @see control/configure.php
   */
  function get_url($id='rss') {
      global $context;

      // use rewriting engine to achieve pretty references -- the parameter may be missing
      if(isset($context['with_friendly_urls']) && ($context['with_friendly_urls'] == 'R'))
          return 'feeds/'.$id;

      // the default is to trigger actual PHP scripts
      switch($id) {

      case 'articles':
          return 'articles/feed.php';

      case 'atom':
          return 'feeds/atom.php';

      case 'comments':
          return 'comments/feed.php';

      case 'files':
          return 'files/feed.php';

      case 'opml':
          return 'feeds/describe.php';

      case 'rdf':
          return 'feeds/rdf.php';

      case 'rss':
          return 'feeds/rss.php';

      default:
          return 'feeds/'.$id.'.php';

      }
  }
  /**
   * get news from remote servers
   *
   * This function queries remote sources and populates the table of links based on fetched news.
   *
   * On tick, the including hook calls [code]Feeds::tick_hook()[/code].
   * See [script]control/scan.php[/script] for a more complete description of hooks.
   *
   * Feeders are provided by Servers::list_for_feed(), with the URL to use, a title,
   * an anchor and the date of last poll. A feeder is stamped before it is polled,
   * to enable round-robin throughout successive ticks even on error. One feeder is
   * requested on each tick, to limit impact when the "poor-man" cron mechanism is used.
   * A feeder polled less than [code]minutes_between_feeds[/code] minutes ago (5 at least)
   * is skipped, except if the call is forced.
   *
   * Fetching and parsing are delegated to [code]Feeds::get_remote_news_from()[/code].
   * Link records are created or updated in the database for each item that has a
   * link, and a title or a description, and that does not match the pattern of banned servers.
   * Item data is reflected in Link, Title, and Description fields; the category,
   * if any, is appended to the description.
   * Stamping information is based on the title and the URL of the feed, and on
   * the publication date of the item, when this one can be parsed.
   * Also, the edit action 'link:feed' marks links that are collected from feeders.
   * The anchor field is set to the anchor of the feeder. Where there is none, the
   * section 'external_news' is used, and is created if necessary.
   *
   * At the end of the feeding process, the database is purged from oldest links according to the limit
   * defined in parameters/feeds.include.php, set through feeds/configure.php
   * (1000 by default; no purge if the limit is 10 or less).
   * See Links::purge_old_news().
   *
   * @param boolean if set to true, fetch news on each call; else use normal period of time
   * @return a string to be displayed in resulting page, or nothing if the default section cannot be created
   *
   * @see control/scan.php
   * @see feeds/configure.php
   */
  function tick_hook($forced=FALSE) {
      global $context;

      // load libraries only once
      include_once $context['path_to_root'].'links/links.php';
      include_once $context['path_to_root'].'servers/servers.php';
      include_once $context['path_to_root'].'shared/values.php'; // feeds.tick

      // get feeding parameters
      Safe::load('parameters/feeds.include.php');

      // delay between feeds - minimum is 5 minutes
      if(!isset($context['minutes_between_feeds']) || ($context['minutes_between_feeds'] < 5))
          $context['minutes_between_feeds'] = 5;

      // do not wait for the end of a feeding cycle
      if($forced)
          $threshold = gmdate('Y-m-d H:i:s');

      // do not process servers that have been polled recently
      else
          $threshold = gmdate('Y-m-d H:i:s', time() - ($context['minutes_between_feeds'] * 60));

      // get a batch of feeders
      if(!$feeders = Servers::list_for_feed(0, 1, 'feed'))
          return 'feeds/feeds.php: no feed has been defined'.BR;

      // remember start time
      $start_time = get_micro_time();

      // list banned tokens
      $banned_pattern = Servers::get_banned_pattern();

      // browse each feed
      $count = 0;
      foreach($feeders as $server_id => $attributes) {

          // get specific feed parameters
          list($feed_url, $feed_title, $anchor, $stamp) = $attributes;

          // skip servers processed recently -- both dates are 'YYYY-MM-DD HH:MM:SS' strings
          if($stamp > $threshold)
              continue;

          // flag this record to enable round-robin even on error
          Servers::stamp($server_id);

          // fetch news from the provided link
          if((!$news = Feeds::get_remote_news_from($feed_url)) || !is_array($news))
              continue;

          // no anchor has been defined for this feed
          if(!$anchor) {

              // create a default section if necessary
              if(!($anchor = Sections::lookup('external_news'))) {
                  $now = gmdate('Y-m-d H:i:s', time());

                  $fields = array();
                  $fields['nick_name'] = 'external_news';
                  $fields['create_date'] = $now;
                  $fields['edit_date'] = $now;
                  $fields['locked'] = 'Y'; // no direct contributions
                  $fields['home_panel'] = 'extra'; // in a side box at the front page
                  $fields['rank'] = 40000; // at the end of the list
                  $fields['title'] = i18n::c('External News');
                  $fields['description'] = i18n::c('Received from feeding servers');
                  if(!$fields['id'] = Sections::post($fields)) {
                      Logger::remember('feeds/feeds.php', 'Impossible to add a section.');
                      return;
                  }
                  $anchor = 'section:'.$fields['id'];
              }
          }

          // process retrieved links
          $links = 0;
          foreach($news as $item) {

              // link has to be valid
              if(!isset($item['link']) || !($item['title'].$item['description'])) {
                  if(isset($context['debug_feeds']) && ($context['debug_feeds'] == 'Y'))
                      Logger::remember('feeds/feeds.php', 'feed item is invalid', $item, 'debug');
                  continue;
              }

              // skip banned servers
              if($banned_pattern && preg_match($banned_pattern, $item['link'])) {
                  if(isset($context['debug_feeds']) && ($context['debug_feeds'] == 'Y'))
                      Logger::remember('feeds/feeds.php', 'feed host has been banned', $item['link'], 'debug');
                  continue;
              }

              // one link processed
              $links++;

              // link description
              $fields = array();
              $fields['anchor'] = $anchor;
              $fields['link_url'] = $item['link'];
              $fields['title'] = $item['title'];
              $fields['description'] = $item['description'];
              if($item['category'])
                  $fields['description'] .= ' ('.$item['category'].')';
              $fields['edit_name'] = $feed_title;
              $fields['edit_address'] = $feed_url;
              $fields['edit_action'] = 'link:feed';

              // use the publication date only if it can be parsed, else the link would be stamped in 1970
              if($item['pubDate']) {
                  $published = strtotime($item['pubDate']);
                  if(($published !== FALSE) && ($published > 0))
                      $fields['edit_date'] = gmdate('Y-m-d H:i:s', $published);
              }

              // update links that already exist in the database
              if(Links::have($item['link'], $anchor, $fields))
                  continue;

              // save link in the database
              if(!Links::post($fields))
                  Logger::remember('feeds/feeds.php', 'Impossible to save feed link: '.Logger::error_pop());
          }

          // one feed has been processed
          $count += 1;

          // remember the number of links processed for this feed
          Values::set('feeds.tick.'.$feed_url, $links);
      }

      // cap the number of links used for news -- links/links.php has been loaded above
      if(!isset($context['maximum_news']) || !$context['maximum_news'])
          $context['maximum_news'] = 1000;
      if($context['maximum_news'] > 10)
          Links::purge_old_news($context['maximum_news']);

      // compute execution time
      $time = round(get_micro_time() - $start_time, 2);

      // report on work achieved
      if($count > 1)
          return 'feeds/feeds.php: '.$count.' feeds have been processed ('.$time.' seconds)'.BR;
      elseif($count == 1)
          return 'feeds/feeds.php: 1 feed has been processed ('.$time.' seconds)'.BR;
      else
          return 'feeds/feeds.php: nothing to do ('.$time.' seconds)'.BR;
  }
  357. }
  // bind localized strings of this module, if the i18n library can be called
  if(is_callable(array('i18n', 'bind'))) {
      i18n::bind('feeds');
  }
  361. ?>