PageRenderTime 47ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/baser/plugins/feed/models/rss.php

https://github.com/hashing/basercms
PHP | 321 lines | 144 code | 54 blank | 123 comment | 24 complexity | f1f318dc63211d84eb053a62109ecc3c MD5 | raw file
Possible License(s): MIT
  1. <?php
  2. /**
  3. * Created: Wed Sep 06 18:03:26 CEST 2006
  4. *
  5. * This Model allows you to parse a given RSS 2.0 feed and have it returned in a big
  6. * array.
  7. *
  8. * PHP versions 5
  9. *
  10. * Copyright (c) Felix Geisendörfer <info@fg-webdesign.de>
  11. *
  12. * Licensed under The MIT License
  13. * Redistributions of files must retain the above copyright notice.
  14. *
  15. * @copyright Copyright (c) 2006, Felix Geisendörfer.
  16. * @link http://www.fg-webdesign.de/
  17. * @link http://www.thinkingphp.org/
  18. * @license http://www.opensource.org/licenses/mit-license.php The MIT License
  19. */
  20. /**
  21. * Include files
  22. */
  23. App::import("Model","Feed.WebModel",true,array(BASER_PLUGINS));
  /**
   * Rss
   *
   * Model that downloads an RSS feed, caches the raw XML and converts it into a
   * nested array using regular expressions. This class only ships a parser for
   * RSS 2.0; any other version is reported through an 'Error' key in the result.
   *
   * httpGet() and __createCacheHash() are not defined in this class and are
   * presumably inherited from WebModel.
   *
   * @package baser.plugins.feed.models
   */
  class Rss extends WebModel {

      /**
       * Model name.
       *
       * @var string
       * @access public
       */
      var $name = 'Rss';

      /**
       * Default cache lifetime, used whenever a caller does not supply its own.
       *
       * @var string
       * @access public
       */
      var $cacheExpires = '+2 hours';

      /**
       * Prefix of the cache path; the cache hash is appended to it directly.
       *
       * @var string
       * @access public
       */
      var $cacheFolder = 'web/rss';

      /**
       * Database configuration name. Set to null here; presumably because the
       * data comes from HTTP rather than a database (confirm against WebModel).
       *
       * @var string
       * @access public
       */
      var $useDbConfig = null;

      /**
       * Fetches and parses a feed, returning at most $limit items.
       *
       * @param string $feedUrl      URL of the feed; an empty value yields array().
       * @param int    $limit        Maximum number of entries kept in 'Items'.
       * @param string $cacheExpires Cache lifetime; falls back to $this->cacheExpires.
       * @return array array('Channel' => ..., 'Items' => ...) on success,
       *               array('Error' => message) when the feed cannot be parsed,
       *               or array() when there is no URL or no data.
       * @access public
       */
      function findAll($feedUrl, $limit = 10, $cacheExpires = null)
      {
          if (empty($feedUrl)) {
              return array();
          }

          $feed = $this->__parseRSS($this->__getRawRSS($feedUrl, null, $cacheExpires));

          // Parse errors are handed back untouched so the caller can read 'Error'.
          if (isset($feed['Error'])) {
              return $feed;
          }

          // count() has to wrap the item list only. Comparing inside the call
          // would count a boolean instead of the items. The isset/is_array guard
          // covers the empty array returned by __parseRSS() when there is no data.
          if (isset($feed['Items']) && is_array($feed['Items']) && count($feed['Items']) > $limit) {
              $feed['Items'] = array_slice($feed['Items'], 0, $limit);
          }

          return $feed;
      }

      /**
       * Returns the raw feed XML, reading it from the cache when possible and
       * downloading (then caching) it otherwise.
       *
       * @param string $feedUrl      URL of the feed.
       * @param array  $vars         Request variables handed to httpGet(); null is treated as array().
       * @param string $cacheExpires Cache lifetime; falls back to $this->cacheExpires.
       * @return string Raw feed data as returned by the global cache() helper.
       * @access private
       */
      function __getRawRSS($feedUrl, $vars = array(), $cacheExpires = null)
      {
          // The hash is built from $vars exactly as passed in, i.e. before the
          // normalisation below, so the generated cache path stays the same as before.
          $cachePath = $this->cacheFolder . $this->__createCacheHash('.rss', $feedUrl, $vars);

          if (empty($cacheExpires)) {
              $cacheExpires = $this->cacheExpires;
          }
          if (empty($vars)) {
              $vars = array();
          }

          // Passing null as data asks cache() for the stored copy.
          $rssData = cache($cachePath, null, $cacheExpires);
          if (empty($rssData)) {
              // Nothing usable in the cache: download the feed and store it.
              $rssData = cache($cachePath, $this->httpGet($feedUrl, $vars));
          }

          return $rssData;
      }

      /**
       * Detects the RSS version of the given document and dispatches to the
       * matching __parseRSS_<version>() method (dots replaced by underscores).
       *
       * @param string $data Raw feed XML.
       * @return array Parsed feed, array() for empty input, or array('Error' => message).
       * @access private
       */
      function __parseRSS($data)
      {
          if (empty($data)) {
              return array();
          }

          preg_match('/\<rss.+version="(.+)".*\>/iUs', $data, $match);
          if (empty($match)) {
              return array('Error' => 'No valid feed (no feed version found).');
          }

          $version = $match[1];
          if (empty($version)) {
              $version = '2.0';
          }

          // Only digits and dots are accepted, which also keeps the method name
          // built below free of unexpected characters.
          if (!preg_match('/^[0-9.]+$/iUs', $version)) {
              return array('Error' => '"'.$version.'" is no valid RSS version.');
          }

          $rssFunction = '__parseRSS_' . str_replace('.', '_', $version);
          if (!method_exists($this, $rssFunction)) {
              return array('Error' => 'No function for parsing RSS feeds of version "'.$version.'" available.');
          }

          return $this->$rssFunction($data);
      }

      /**
       * Parses an RSS 2.0 document.
       *
       * CDATA sections are swapped for placeholders before the regex based
       * parsing and restored afterwards inside __getNodeFields().
       *
       * @param string $data Raw feed XML.
       * @return array array('Channel' => channel fields, 'Items' => list of item fields);
       *               'Items' is an empty array when the feed contains no items.
       * @access private
       */
      function __parseRSS_2_0($data)
      {
          // Collect every CDATA section; $cdata[n][0] is the full section and
          // $cdata[n][1] its inner content.
          preg_match_all('/\<\!\[CDATA\[(.+)\]\]\>/iUs', $data, $cdata, PREG_SET_ORDER);

          // The hash of the untouched document makes the placeholders unlikely
          // to collide with text that is really part of the feed.
          $dataHash = md5($data);

          // Replace each section with [[CDATA:<hash>:<index>]] so its markup
          // cannot confuse the element regexes below.
          foreach ($cdata as $cdataNum => $cdataItem) {
              $data = str_replace($cdataItem[0], '[[CDATA:'.$dataHash.':'.$cdataNum.']]', $data);
          }

          // Channel information is everything between <channel> and the first
          // <item>, or up to </channel> when the feed has no items at all.
          $channel = array();
          if (preg_match('/\<channel\>(.+)(?:\<item\>|\<\/channel\>)/iUs', $data, $match)) {
              $channel = $this->__getNodeFields($match[1], $cdata, $dataHash, 'channel');
          }

          // Parse every <item>...</item> into its fields.
          $items = array();
          preg_match_all('/\<item\>(.+)\<\/item\>/iUs', $data, $matches, PREG_SET_ORDER);
          foreach ($matches as $itemNr => $item) {
              $items[$itemNr] = $this->__getNodeFields($item[1], $cdata, $dataHash);
          }

          return array(
              'Channel' => $channel,
              'Items' => $items
          );
      }

      /**
       * Splits the direct child elements of a node into an array keyed by
       * element name.
       *
       * When the same element name occurs more than once, the last occurrence
       * overwrites the earlier ones.
       *
       * @param string $rawFields Inner XML of the node, with CDATA placeholders in place.
       * @param array  $cdata     CDATA matches as collected by __parseRSS_2_0().
       * @param string $dataHash  Hash used inside the CDATA placeholders.
       * @param string $type      'channel' enables recursive parsing of the <image> child.
       * @return mixed array(name => array('value' => ..., 'attributes' => ...)),
       *               or $rawFields unchanged when it contains no elements.
       * @access private
       */
      function __getNodeFields($rawFields, $cdata = null, $dataHash = null, $type = null)
      {
          // First alternative: <name attrs>value</name> (groups 1-3).
          // Second alternative: <name attrs/> (groups 4-5).
          // [^\x00]* is used as a "match anything including new lines" class so
          // the /s modifier is not needed.
          $fieldRegex = '/\<(.+)( [^\x00]*)?\>([^\x00]*)\<\/\\1\>|\<(.+)( [^\x00]*)?\/\>/U';
          preg_match_all($fieldRegex, $rawFields, $fieldMatches, PREG_SET_ORDER);

          // The placeholders present in $rawFields do not change per field, so
          // they are looked up once instead of once per loop iteration.
          preg_match_all('/\[\[CDATA:'.$dataHash.':([0-9]+)\]\]/iUs', $rawFields, $cdataDummies, PREG_SET_ORDER);

          $fields = null;
          foreach ($fieldMatches as $fieldMatch) {
              if (count($fieldMatch) == 4) {
                  $field = $fieldMatch[1];
                  $attributes = $fieldMatch[2];
                  $value = $fieldMatch[3];
              } else {
                  // Self-closing node without enclosed content. Group 5 is
                  // missing from the match when the node has no attributes.
                  $field = $fieldMatch[4];
                  $attributes = isset($fieldMatch[5]) ? $fieldMatch[5] : null;
                  $value = null;
              }

              if ($type == 'channel' && $field == 'image') {
                  // The channel image has child elements of its own.
                  $value = $this->__getNodeFields($value, $cdata, $dataHash);
              } else {
                  // Put the original CDATA contents back in place of the placeholders.
                  foreach ($cdataDummies as $cdataDummy) {
                      if (isset($cdata[$cdataDummy[1]][1])) {
                          $value = str_replace($cdataDummy[0], $cdata[$cdataDummy[1]][1], $value);
                      }
                  }
              }

              $fields[$field] = array(
                  'value' => $value,
                  'attributes' => $this->__getXMLNodeAttributes($attributes)
              );
          }

          // No child elements found: hand the raw content back as it came in.
          if ($fields === null) {
              $fields = $rawFields;
          }

          return $fields;
      }

      /**
       * Parses the attribute part of an XML tag.
       *
       * @param string $attributesData Attribute string including its leading
       *                               space, e.g. ' url="..." length="..."'.
       * @return array List of single-pair arrays, one array(name => value) per attribute.
       * @access private
       */
      function __getXMLNodeAttributes($attributesData)
      {
          if (empty($attributesData)) {
              return array();
          }

          // Group 1: attribute name, group 2: quote character, group 3: value.
          preg_match_all('/ ([^ \r\n]+)=(["\'])(.+)\\2/iUs', $attributesData, $attributeMatches, PREG_SET_ORDER);

          $attributes = array();
          foreach ($attributeMatches as $attribute) {
              $attributes[] = array($attribute[1] => $attribute[3]);
          }

          return $attributes;
      }
  }
  267. ?>