PageRenderTime 51ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/include/magpierss/rss_parse.inc

https://bitbucket.org/yousef_fadila/vtiger
PHP | 605 lines | 406 code | 81 blank | 118 comment | 104 complexity | aab0643525dab46e7ddae1fba8f3f223 MD5 | raw file
Possible License(s): LGPL-2.1, GPL-2.0
  1. <?php
  2. /**
  3. * Project: MagpieRSS: a simple RSS integration tool
  4. * File: rss_parse.inc - parse an RSS or Atom feed
  5. * return as a simple object.
  6. *
  7. * Handles RSS 0.9x, RSS 2.0, RSS 1.0, and Atom 0.3
  8. *
  9. * The latest version of MagpieRSS can be obtained from:
  10. * http://magpierss.sourceforge.net
  11. *
  12. * For questions, help, comments, discussion, etc., please join the
  13. * Magpie mailing list:
  14. * magpierss-general@lists.sourceforge.net
  15. *
  16. * @author Kellan Elliott-McCrea <kellan@protest.net>
  17. * @version 0.7a
  18. * @license GPL
  19. *
  20. */
// Feed-type identifiers. MagpieRSS::$feed_type is set to one of these once the
// root element of the feed has been seen, and is_rss()/is_atom() compare
// against them.
define('RSS', 'RSS');
define('ATOM', 'Atom');
  23. require_once (MAGPIE_DIR . 'rss_utils.inc');
  24. /**
  25. * Hybrid parser, and object, takes RSS as a string and returns a simple object.
  26. *
  27. * see: rss_fetch.inc for a simpler interface with integrated caching support
  28. *
  29. */
  30. class MagpieRSS {
  31. var $parser;
  32. var $current_item = array(); // item currently being parsed
  33. var $items = array(); // collection of parsed items
  34. var $channel = array(); // hash of channel fields
  35. var $textinput = array();
  36. var $image = array();
  37. var $feed_type;
  38. var $feed_version;
  39. var $encoding = ''; // output encoding of parsed rss
  40. var $_source_encoding = ''; // only set if we have to parse xml prolog
  41. var $ERROR = "";
  42. var $WARNING = "";
  43. // define some constants
  44. var $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
  45. var $_KNOWN_ENCODINGS = array('UTF-8', 'US-ASCII', 'ISO-8859-1');
  46. // parser variables, useless if you're not a parser, treat as private
  47. var $stack = array(); // parser stack
  48. var $inchannel = false;
  49. var $initem = false;
  50. var $incontent = false; // if in Atom <content mode="xml"> field
  51. var $intextinput = false;
  52. var $inimage = false;
  53. var $current_namespace = false;
  54. /**
  55. * Set up XML parser, parse source, and return populated RSS object.
  56. *
  57. * @param string $source string containing the RSS to be parsed
  58. *
  59. * NOTE: Probably a good idea to leave the encoding options alone unless
  60. * you know what you're doing as PHP's character set support is
  61. * a little weird.
  62. *
  63. * NOTE: A lot of this is unnecessary but harmless with PHP5
  64. *
  65. *
  66. * @param string $output_encoding output the parsed RSS in this character
  67. * set defaults to ISO-8859-1 as this is PHP's
  68. * default.
  69. *
  70. * NOTE: might be changed to UTF-8 in future
  71. * versions.
  72. *
  73. * @param string $input_encoding the character set of the incoming RSS source.
  74. * Leave blank and Magpie will try to figure it
  75. * out.
  76. *
  77. *
  78. * @param bool $detect_encoding if false Magpie won't attempt to detect
  79. * source encoding. (caveat emptor)
  80. *
  81. */
  82. function MagpieRSS ($source, $output_encoding='UTF-8',
  83. $input_encoding=null, $detect_encoding=true)
  84. {
  85. # if PHP xml isn't compiled in, die
  86. #
  87. if (!function_exists('xml_parser_create')) {
  88. $this->error( "Failed to load PHP's XML Extension. " .
  89. "http://www.php.net/manual/en/ref.xml.php",
  90. E_USER_ERROR );
  91. }
  92. list($parser, $source) = $this->create_parser($source,
  93. $output_encoding, $input_encoding, $detect_encoding);
  94. if (!is_resource($parser)) {
  95. $this->error( "Failed to create an instance of PHP's XML parser. " .
  96. "http://www.php.net/manual/en/ref.xml.php",
  97. E_USER_ERROR );
  98. }
  99. $this->parser = $parser;
  100. # pass in parser, and a reference to this object
  101. # setup handlers
  102. #
  103. xml_set_object( $this->parser, $this );
  104. xml_set_element_handler($this->parser,
  105. 'feed_start_element', 'feed_end_element' );
  106. xml_set_character_data_handler( $this->parser, 'feed_cdata' );
  107. $status = xml_parse( $this->parser, $source );
  108. if (! $status ) {
  109. $errorcode = xml_get_error_code( $this->parser );
  110. if ( $errorcode != XML_ERROR_NONE ) {
  111. $xml_error = xml_error_string( $errorcode );
  112. $error_line = xml_get_current_line_number($this->parser);
  113. $error_col = xml_get_current_column_number($this->parser);
  114. $errormsg = "$xml_error at line $error_line, column $error_col";
  115. $this->error( $errormsg );
  116. }
  117. }
  118. xml_parser_free( $this->parser );
  119. $this->normalize();
  120. }
  121. function feed_start_element($p, $element, &$attrs) {
  122. $el = $element = strtolower($element);
  123. $attrs = array_change_key_case($attrs, CASE_LOWER);
  124. // check for a namespace, and split if found
  125. $ns = false;
  126. if ( strpos( $element, ':' ) ) {
  127. list($ns, $el) = split( ':', $element, 2);
  128. }
  129. if ( $ns and $ns != 'rdf' ) {
  130. $this->current_namespace = $ns;
  131. }
  132. # if feed type isn't set, then this is first element of feed
  133. # identify feed from root element
  134. #
  135. if (!isset($this->feed_type) ) {
  136. if ( $el == 'rdf' ) {
  137. $this->feed_type = RSS;
  138. $this->feed_version = '1.0';
  139. }
  140. elseif ( $el == 'rss' ) {
  141. $this->feed_type = RSS;
  142. $this->feed_version = $attrs['version'];
  143. }
  144. elseif ( $el == 'feed' ) {
  145. $this->feed_type = ATOM;
  146. $this->feed_version = $attrs['version'];
  147. $this->inchannel = true;
  148. }
  149. return;
  150. }
  151. if ( $el == 'channel' )
  152. {
  153. $this->inchannel = true;
  154. }
  155. elseif ($el == 'item' or $el == 'entry' )
  156. {
  157. $this->initem = true;
  158. if ( isset($attrs['rdf:about']) ) {
  159. $this->current_item['about'] = $attrs['rdf:about'];
  160. }
  161. }
  162. // if we're in the default namespace of an RSS feed,
  163. // record textinput or image fields
  164. elseif (
  165. $this->feed_type == RSS and
  166. $this->current_namespace == '' and
  167. $el == 'textinput' )
  168. {
  169. $this->intextinput = true;
  170. }
  171. elseif (
  172. $this->feed_type == RSS and
  173. $this->current_namespace == '' and
  174. $el == 'image' )
  175. {
  176. $this->inimage = true;
  177. }
  178. # handle atom content constructs
  179. elseif ( $this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS) )
  180. {
  181. // avoid clashing w/ RSS mod_content
  182. if ($el == 'content' ) {
  183. $el = 'atom_content';
  184. }
  185. $this->incontent = $el;
  186. }
  187. // if inside an Atom content construct (e.g. content or summary) field treat tags as text
  188. elseif ($this->feed_type == ATOM and $this->incontent )
  189. {
  190. // if tags are inlined, then flatten
  191. $attrs_str = join(' ',
  192. array_map('map_attrs',
  193. array_keys($attrs),
  194. array_values($attrs) ) );
  195. $this->append_content( "<$element $attrs_str>" );
  196. array_unshift( $this->stack, $el );
  197. }
  198. // Atom support many links per containging element.
  199. // Magpie treats link elements of type rel='alternate'
  200. // as being equivalent to RSS's simple link element.
  201. //
  202. elseif ($this->feed_type == ATOM and $el == 'link' )
  203. {
  204. if ( isset($attrs['rel']) and $attrs['rel'] == 'alternate' )
  205. {
  206. $link_el = 'link';
  207. }
  208. else {
  209. $link_el = 'link_' . $attrs['rel'];
  210. }
  211. $this->append($link_el, $attrs['href']);
  212. }
  213. // set stack[0] to current element
  214. else {
  215. array_unshift($this->stack, $el);
  216. }
  217. }
  218. function feed_cdata ($p, $text) {
  219. if ($this->feed_type == ATOM and $this->incontent)
  220. {
  221. $this->append_content( $text );
  222. }
  223. else {
  224. $current_el = join('_', array_reverse($this->stack));
  225. $this->append($current_el, $text);
  226. }
  227. }
  228. function feed_end_element ($p, $el) {
  229. $el = strtolower($el);
  230. if ( $el == 'item' or $el == 'entry' )
  231. {
  232. $this->items[] = $this->current_item;
  233. $this->current_item = array();
  234. $this->initem = false;
  235. }
  236. elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'textinput' )
  237. {
  238. $this->intextinput = false;
  239. }
  240. elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'image' )
  241. {
  242. $this->inimage = false;
  243. }
  244. elseif ($this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS) )
  245. {
  246. $this->incontent = false;
  247. }
  248. elseif ($el == 'channel' or $el == 'feed' )
  249. {
  250. $this->inchannel = false;
  251. }
  252. elseif ($this->feed_type == ATOM and $this->incontent ) {
  253. // balance tags properly
  254. // note: i don't think this is actually neccessary
  255. if ( $this->stack[0] == $el )
  256. {
  257. $this->append_content("</$el>");
  258. }
  259. else {
  260. $this->append_content("<$el />");
  261. }
  262. array_shift( $this->stack );
  263. }
  264. else {
  265. array_shift( $this->stack );
  266. }
  267. $this->current_namespace = false;
  268. }
  269. function concat (&$str1, $str2="") {
  270. if (!isset($str1) ) {
  271. $str1="";
  272. }
  273. $str1 .= $str2;
  274. }
  275. function append_content($text) {
  276. if ( $this->initem ) {
  277. $this->concat( $this->current_item[ $this->incontent ], $text );
  278. }
  279. elseif ( $this->inchannel ) {
  280. $this->concat( $this->channel[ $this->incontent ], $text );
  281. }
  282. }
  283. // smart append - field and namespace aware
  284. function append($el, $text) {
  285. if (!$el) {
  286. return;
  287. }
  288. if ( $this->current_namespace )
  289. {
  290. if ( $this->initem ) {
  291. $this->concat(
  292. $this->current_item[ $this->current_namespace ][ $el ], $text);
  293. }
  294. elseif ($this->inchannel) {
  295. $this->concat(
  296. $this->channel[ $this->current_namespace][ $el ], $text );
  297. }
  298. elseif ($this->intextinput) {
  299. $this->concat(
  300. $this->textinput[ $this->current_namespace][ $el ], $text );
  301. }
  302. elseif ($this->inimage) {
  303. $this->concat(
  304. $this->image[ $this->current_namespace ][ $el ], $text );
  305. }
  306. }
  307. else {
  308. if ( $this->initem ) {
  309. $this->concat(
  310. $this->current_item[ $el ], $text);
  311. }
  312. elseif ($this->intextinput) {
  313. $this->concat(
  314. $this->textinput[ $el ], $text );
  315. }
  316. elseif ($this->inimage) {
  317. $this->concat(
  318. $this->image[ $el ], $text );
  319. }
  320. elseif ($this->inchannel) {
  321. $this->concat(
  322. $this->channel[ $el ], $text );
  323. }
  324. }
  325. }
  326. function normalize () {
  327. // if atom populate rss fields
  328. if ( $this->is_atom() ) {
  329. $this->channel['description'] = $this->channel['tagline'];
  330. for ( $i = 0; $i < count($this->items); $i++) {
  331. $item = $this->items[$i];
  332. if ( isset($item['summary']) )
  333. $item['description'] = $item['summary'];
  334. if ( isset($item['atom_content']))
  335. $item['content']['encoded'] = $item['atom_content'];
  336. $atom_date = (isset($item['issued']) ) ? $item['issued'] : $item['modified'];
  337. if ( $atom_date ) {
  338. $epoch = @parse_w3cdtf($atom_date);
  339. if ($epoch and $epoch > 0) {
  340. $item['date_timestamp'] = $epoch;
  341. }
  342. }
  343. $this->items[$i] = $item;
  344. }
  345. }
  346. elseif ( $this->is_rss() ) {
  347. $this->channel['tagline'] = $this->channel['description'];
  348. for ( $i = 0; $i < count($this->items); $i++) {
  349. $item = $this->items[$i];
  350. if ( isset($item['description']))
  351. $item['summary'] = $item['description'];
  352. if ( isset($item['content']['encoded'] ) )
  353. $item['atom_content'] = $item['content']['encoded'];
  354. if ( $this->is_rss() == '1.0' and isset($item['dc']['date']) ) {
  355. $epoch = @parse_w3cdtf($item['dc']['date']);
  356. if ($epoch and $epoch > 0) {
  357. $item['date_timestamp'] = $epoch;
  358. }
  359. }
  360. elseif ( isset($item['pubdate']) ) {
  361. $epoch = @strtotime($item['pubdate']);
  362. if ($epoch > 0) {
  363. $item['date_timestamp'] = $epoch;
  364. }
  365. }
  366. $this->items[$i] = $item;
  367. }
  368. }
  369. }
  370. function is_rss () {
  371. if ( $this->feed_type == RSS ) {
  372. return $this->feed_version;
  373. }
  374. else {
  375. return false;
  376. }
  377. }
  378. function is_atom() {
  379. if ( $this->feed_type == ATOM ) {
  380. return $this->feed_version;
  381. }
  382. else {
  383. return false;
  384. }
  385. }
  386. /**
  387. * return XML parser, and possibly re-encoded source
  388. *
  389. */
  390. function create_parser($source, $out_enc, $in_enc, $detect) {
  391. if ( substr(phpversion(),0,1) == 5) {
  392. $parser = $this->php5_create_parser($in_enc, $detect);
  393. }
  394. else {
  395. list($parser, $source) = $this->php4_create_parser($source, $in_enc, $detect);
  396. }
  397. if ($out_enc) {
  398. $this->encoding = $out_enc;
  399. xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $out_enc);
  400. }
  401. return array($parser, $source);
  402. }
  403. /**
  404. * Instantiate an XML parser under PHP5
  405. *
  406. * PHP5 will do a fine job of detecting input encoding
  407. * if passed an empty string as the encoding.
  408. *
  409. * All hail libxml2!
  410. *
  411. */
  412. function php5_create_parser($in_enc, $detect) {
  413. // by default php5 does a fine job of detecting input encodings
  414. if(!$detect && $in_enc) {
  415. return xml_parser_create($in_enc);
  416. }
  417. else {
  418. return xml_parser_create('');
  419. }
  420. }
  421. /**
  422. * Instantiate an XML parser under PHP4
  423. *
  424. * Unfortunately PHP4's support for character encodings
  425. * and especially XML and character encodings sucks. As
  426. * long as the documents you parse only contain characters
  427. * from the ISO-8859-1 character set (a superset of ASCII,
  428. * and a subset of UTF-8) you're fine. However once you
  429. * step out of that comfy little world things get mad, bad,
  430. * and dangerous to know.
  431. *
  432. * The following code is based on SJM's work with FoF
  433. * @see http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss
  434. *
  435. */
  436. function php4_create_parser($source, $in_enc, $detect) {
  437. if ( !$detect ) {
  438. return array(xml_parser_create($in_enc), $source);
  439. }
  440. if (!$in_enc) {
  441. if (preg_match('/<?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $source, $m)) {
  442. $in_enc = strtoupper($m[1]);
  443. $this->source_encoding = $in_enc;
  444. }
  445. else {
  446. $in_enc = 'UTF-8';
  447. }
  448. }
  449. if ($this->known_encoding($in_enc)) {
  450. return array(xml_parser_create($in_enc), $source);
  451. }
  452. // the dectected encoding is not one of the simple encodings PHP knows
  453. // attempt to use the iconv extension to
  454. // cast the XML to a known encoding
  455. // @see http://php.net/iconv
  456. if (function_exists('iconv')) {
  457. $encoded_source = iconv($in_enc,'UTF-8', $source);
  458. if ($encoded_source) {
  459. return array(xml_parser_create('UTF-8'), $encoded_source);
  460. }
  461. }
  462. // iconv didn't work, try mb_convert_encoding
  463. // @see http://php.net/mbstring
  464. if(function_exists('mb_convert_encoding')) {
  465. $encoded_source = mb_convert_encoding($source, 'UTF-8', $in_enc );
  466. if ($encoded_source) {
  467. return array(xml_parser_create('UTF-8'), $encoded_source);
  468. }
  469. }
  470. // else
  471. $this->error("Feed is in an unsupported character encoding. ($in_enc) " .
  472. "You may see strange artifacts, and mangled characters.",
  473. E_USER_NOTICE);
  474. return array(xml_parser_create(), $source);
  475. }
  476. function known_encoding($enc) {
  477. $enc = strtoupper($enc);
  478. if ( in_array($enc, $this->_KNOWN_ENCODINGS) ) {
  479. return $enc;
  480. }
  481. else {
  482. return false;
  483. }
  484. }
  485. function error ($errormsg, $lvl=E_USER_WARNING) {
  486. // append PHP's error message if track_errors enabled
  487. if ( isset($php_errormsg) ) {
  488. $errormsg .= " ($php_errormsg)";
  489. }
  490. if ( MAGPIE_DEBUG ) {
  491. trigger_error( $errormsg, $lvl);
  492. }
  493. else {
  494. error_log( $errormsg, 0);
  495. }
  496. $notices = E_USER_NOTICE|E_NOTICE;
  497. if ( $lvl&$notices ) {
  498. $this->WARNING = $errormsg;
  499. } else {
  500. $this->ERROR = $errormsg;
  501. }
  502. }
  503. } // end class RSS
  504. function map_attrs($k, $v) {
  505. return "$k=\"$v\"";
  506. }
  507. // patch to support medieval versions of PHP4.1.x,
  508. // courtesy, Ryan Currie, ryan@digibliss.com
  509. if (!function_exists('array_change_key_case')) {
  510. define("CASE_UPPER",1);
  511. define("CASE_LOWER",0);
  512. function array_change_key_case($array,$case=CASE_LOWER) {
  513. if ($case=CASE_LOWER) $cmd=strtolower;
  514. elseif ($case=CASE_UPPER) $cmd=strtoupper;
  515. foreach($array as $key=>$value) {
  516. $output[$cmd($key)]=$value;
  517. }
  518. return $output;
  519. }
  520. }
  521. ?>