PageRenderTime 148ms CodeModel.GetById 51ms app.highlight 76ms RepoModel.GetById 14ms app.codeStats 1ms

/inc/magpie/rss_parse.inc

https://github.com/chregu/fluxcms
PHP | 567 lines | 347 code | 69 blank | 151 comment | 83 complexity | 0d08bbf57a4299dd7c24e7586cba1597 MD5 | raw file
  1<?php
  2/*
  3 * Project:     MagpieRSS: a simple RSS integration tool
  4 * File:        rss_parse.inc  - parse an RSS or Atom feed
  5 *              return as a simple object.
  6 *
  7 * Handles RSS 0.9x, RSS 2.0, RSS 1.0, and Atom 0.3
  8 *
  9 * The latest version of MagpieRSS can be obtained from:
 10 * http://magpierss.sourceforge.net
 11 *
 12 * For questions, help, comments, discussion, etc., please join the
 13 * Magpie mailing list:
 14 * magpierss-general@lists.sourceforge.net
 15 *
 16 * Author:      Kellan Elliott-McCrea <kellan@protest.net>
 17 * Version:     0.6a
 18 * License:     GPL
 19 *
 20 *
 21 *  ABOUT MAGPIE's APPROACH TO PARSING:
 22 *   - Magpie is based on expat, an XML parser, and therefore will only parse
 23 *     valid XML files.  This includes all properly constructed RSS or Atom.
 24 *
 25 *   - Magpie is an inclusive parser.  It will include any elements that 
 26 *     it can turn into a key value pair in the parsed feed object it returns. 
 27 *      
 28 *   - Magpie supports namespaces, and will return any elements found in a 
 29 *     namespace in a sub-array, with the key point to that array being the 
 30 *     namespace prefix.  
 31 *     (e.g. if an item contains a <dc:date> element, then that date can 
 32 *     be accessed at $item['dc']['date']
 33 *      
 34 *   - Magpie supports nested elements by combining the names.  If an item 
 35 *     includes XML like:
 36 *      <author>
 37 *        <name>Kellan</name>
 38 *      </author>
 39 *      
 40 *    The name field is accessible at $item['author_name']
 41 *  
 42 *   - Magpie makes no attempt to validate a feed beyond ensuring that it
 43 *     is valid XML.   
 44 *     RSS validators are readily available on the web at:
 45 *       http://feeds.archive.org/validator/
 46 *       http://www.ldodds.com/rss_validator/1.0/validator.html
 47 *
 48 *
 49 * EXAMPLE PARSED RSS ITEM:
 50 *
 51 * Magpie tries to parse RSS into easy to use PHP datastructures.
 52 *
 53 * For example, Magpie on encountering (a rather complex) RSS 1.0 item entry:
 54 *
 55 * <item rdf:about="http://protest.net/NorthEast/calendrome.cgi?span=event&#38;ID=210257">
 56 *   <title>Weekly Peace Vigil</title>
 57 *   <link>http://protest.net/NorthEast/calendrome.cgi?span=event&#38;ID=210257</link>
 58 *   <description>Wear a white ribbon</description>
 59 *   <dc:subject>Peace</dc:subject>
 60 *   <ev:startdate>2002-06-01T11:00:00</ev:startdate>
 61 *   <ev:location>Northampton, MA</ev:location>
 62 *   <ev:type>Protest</ev:type>
 63 * </item>
 64 * 
 65 * Would transform it into the following associative array, and push it
 66 * onto the array $rss->items
 67 *
 68 * array(
 69 *	title => 'Weekly Peace Vigil',
 70 *	link => 'http://protest.net/NorthEast/calendrome.cgi?span=event&#38;ID=210257',
 71 *	description => 'Wear a white ribbon',
 72 *	dc => array (
 73 *			subject => 'Peace'
 74 *		),
 75 *	ev => array (
 76 *		startdate => '2002-06-01T11:00:00',
 77 *		enddate => '2002-06-01T12:00:00',
 78 *		type => 'Protest',
 79 *		location => 'Northampton, MA'
 80 *	)
 81 * )
 82 *
 83 *
 84 *
 85 *  A FEW NOTES ON PARSING Atom FEEDS
 86 *
 87 *  Atom support is considered alpha.  Atom elements will often be available
 88 *  as their RSS equivalent, summary is available as description for example.
 89 *
 90 *  Elements of mode=xml are flattened into a single string, just as if they
 91 *  had been wrapped in a CDATA container.
 92 *
 93 *  See:  http://laughingmeme.org/archives/001676.html
 94 *
 95 */
 96
 97define('RSS', 'RSS');
 98define('ATOM', 'Atom');
 99define('RDFNS', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#');
100
101
class MagpieRSS {
	/*
	 * Hybrid parser, and object.  (probably a bad idea! :)
	 *
	 * The constructor parses the given feed source immediately; afterwards
	 * the parsed data is available through the public properties below.
	 *
	 * Usage Example:
	 *
	 * $some_rss = '<?xml version="1.0"......
	 *
	 * $rss = new MagpieRSS( $some_rss );
	 *
	 * // print rss channel title
	 * echo $rss->channel['title'];
	 *
	 * // print the title of each item
	 * foreach ($rss->items as $item ) {
	 *	  echo $item['title'];
	 * }
	 *
	 * see: rss_fetch.inc for a simpler interface
	 */

	public $parser;					// XML parser handle; null once parsing is done

	public $current_item	= array();	// item currently being parsed
	public $items			= array();	// collection of parsed items
	public $channel		= array();	// hash of channel fields
	public $textinput		= array();
	public $image			= array();
	public $feed_type;					// RSS or ATOM, set from the root element
	public $feed_version;				// value of the root element's version attribute

	// parser variables
	public $stack				= array(); // parser stack, innermost element first
	public $inchannel			= false;
	public $initem 			= false;
	public $incontent			= false; // key of the Atom content construct being read, or false
	public $intextinput		= false;
	public $inimage 			= false;
	public $current_field		= '';
	public $current_namespace	= false; // prefix of the element being parsed, or false


	// lower-cased namespace URI => key under which its elements are stored
	public $namespaces = array(
		'http://purl.org/rss/1.0/' => '',
		RDFNS => 'rdf',
		'http://purl.org/dc/elements/1.1/' => 'dc',
		'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
		'http://webns.net/mvcb/' => 'admin',
		'http://purl.org/rss/1.0/modules/content/' => 'content'
	);


	public $ERROR = "";	// last error message, empty string when parsing succeeded

	public $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');

/*======================================================================*\
	Function: __construct
	Purpose:  Constructor, sets up XML parser, parses source,
			  and populates object.
	Input:	  String containing the RSS to be parsed
	Note:	  On failure $this->ERROR holds the message; the object is
			  still returned so callers can inspect it.
\*======================================================================*/
	function __construct ($source) {

		# if PHP xml isn't compiled in, report it and stop here instead of
		# running into a fatal "undefined function" a few lines further down
		#
		if (!function_exists('xml_parser_create_ns')) {
			$this->error( "Failed to load PHP's XML Extension. " . 
						  "http://www.php.net/manual/en/ref.xml.php",
						   E_USER_ERROR );
			return;
		}

		# PHP < 8 returns a resource, PHP 8 an XMLParser object, so only a
		# falsy return value can be treated as failure
		#
		$parser = @xml_parser_create_ns("UTF-8","@");
		if (!$parser)
		{
			$this->error( "Failed to create an instance of PHP's XML parser. " .
						  "http://www.php.net/manual/en/ref.xml.php",
						  E_USER_ERROR );
			return;
		}
		xml_parser_set_option($parser,XML_OPTION_CASE_FOLDING,false);

		$this->parser = $parser;

		# setup handlers as callables bound to this object
		#
		xml_set_element_handler($this->parser, 
				array($this, 'feed_start_element'),
				array($this, 'feed_end_element') );

		xml_set_character_data_handler( $this->parser, array($this, 'feed_cdata') ); 

		$status = xml_parse( $this->parser, $source );

		if (! $status ) {
			$errorcode = xml_get_error_code( $this->parser );
			if ( $errorcode != XML_ERROR_NONE ) {
				$xml_error = xml_error_string( $errorcode );
				$error_line = xml_get_current_line_number($this->parser);
				$error_col = xml_get_current_column_number($this->parser);
				$errormsg = "$xml_error at line $error_line, column $error_col";

				$this->error( $errormsg );
			}
		}

		# only resource handles (PHP < 8) need to be freed explicitly
		if ( is_resource($this->parser) ) {
			xml_parser_free( $this->parser );
		}
		# drop the handle: it references this object through the handlers
		# and, as an object on PHP 8, would break serialize() of the result
		$this->parser = null;

		$this->normalize();
	}

	/*
	 * PHP 4 style constructor name.  PHP 8 no longer treats a method named
	 * after the class as constructor, so the real work lives in
	 * __construct(); this stays as a plain method for code that calls it
	 * explicitly (e.g. from a subclass).
	 */
	function MagpieRSS ($source) {
		$this->__construct($source);
	}

	/*
	 * Split a name reported by the namespace aware parser
	 * ("namespace-uri@localname") into array(namespace, localname).
	 * The namespace is false when the name carries none.  The split is done
	 * at the last '@' because a local name can never contain one.
	 */
	function _split_name ($name) {
		$sep = strrpos($name, '@');
		if ( $sep === false ) {
			return array(false, $name);
		}
		return array( substr($name, 0, $sep), (string) substr($name, $sep + 1) );
	}

	/*
	 * Start tag handler.  Detects the feed type from the root element,
	 * switches the in-channel / in-item / ... flags and pushes ordinary
	 * elements onto the parser stack.
	 *
	 * $p        parser handle (unused)
	 * $element  element name, optionally prefixed with "namespace-uri@"
	 * $attrs    hash of the element's attributes
	 */
	function feed_start_element($p, $element, $attrs) {
		$element = strtolower($element);
		$attrs = array_change_key_case($attrs, CASE_LOWER);

		// check for a namespace, and split if found
		list($ns, $el) = $this->_split_name($element);

		if ( $ns and $ns != RDFNS) {
			// namespaces Magpie doesn't know are treated like "no namespace"
			$this->current_namespace =
				isset($this->namespaces[$ns]) ? $this->namespaces[$ns] : false;
		}

		# if feed type isn't set, then this is first element of feed
		# identify feed from root element
		#
		if (!isset($this->feed_type) ) {
			$version = isset($attrs['version']) ? $attrs['version'] : null;
			if ( $el == 'rdf' ) {
				$this->feed_type = RSS;
				$this->feed_version = '1.0';
			}
			elseif ( $el == 'rss' ) {
				$this->feed_type = RSS;
				$this->feed_version = $version;
			}
			elseif ( $el == 'feed' ) {
				$this->feed_type = ATOM;
				$this->feed_version = $version;
				// an Atom feed has no channel element, the feed is the channel
				$this->inchannel = true;
			}
			return;
		}

		if ( $el == 'channel' ) 
		{
			$this->inchannel = true;
		}
		elseif ($el == 'item' or $el == 'entry' ) 
		{
			$this->initem = true;
			if ( isset($attrs[RDFNS.'@about']) ) {
				$this->current_item['about'] = $attrs[RDFNS.'@about'];	
			}
		}

		// if we're in the default namespace of an RSS feed,
		//  record textinput or image fields
		elseif ( 
			$this->feed_type == RSS and 
			$this->current_namespace == '' and 
			$el == 'textinput' ) 
		{
			$this->intextinput = true;
		}

		elseif (
			$this->feed_type == RSS and 
			$this->current_namespace == '' and 
			$el == 'image' ) 
		{
			$this->inimage = true;
		}

		# handle atom content constructs
		elseif ( $this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS, true) )
		{
			// avoid clashing w/ RSS mod_content
			if ($el == 'content' ) {
				$el = 'atom_content';
			}

			$this->incontent = $el;
		}

		// if inside an Atom content construct (e.g. content or summary) field treat tags as text
		elseif ($this->feed_type == ATOM and $this->incontent ) 
		{
			// if tags are inlined, then flatten
			$attrs_str = join(' ', 
					array_map('map_attrs', 
					array_keys($attrs), 
					array_values($attrs) ) );

			// use the local name so the tag matches the "</$el>" written
			// by feed_end_element(), and add no stray blank without attributes
			$this->append_content(
				'<' . $el . ($attrs_str === '' ? '' : ' ' . $attrs_str) . '>' );

			array_unshift( $this->stack, $el );
		}

		// Atom supports many links per containing element.
		// Magpie treats link elements of type rel='alternate'
		// as being equivalent to RSS's simple link element.
		// A link without rel attribute counts as 'alternate' as well.
		//
		elseif ($this->feed_type == ATOM and $el == 'link' ) 
		{
			if ( !isset($attrs['rel']) or $attrs['rel'] == 'alternate' ) 
			{
				$link_el = 'link';
			}
			else {
				$link_el = 'link_' . $attrs['rel'];
			}

			$this->append($link_el, isset($attrs['href']) ? $attrs['href'] : '');
		}
		// set stack[0] to current element
		else {
			array_unshift($this->stack, $el);
		}
	}

	/*
	 * Character data handler.  Inside an Atom content construct the text is
	 * added to that construct, otherwise to the field named after the
	 * current element stack (outermost first, joined with '_').
	 */
	function feed_cdata ($p, $text) {

		if ($this->feed_type == ATOM and $this->incontent) 
		{
			$this->append_content( $text );
		}
		else {
			$current_el = join('_', array_reverse($this->stack));
			$this->append($current_el, $text);
		}
	}

	/*
	 * End tag handler.  Stores a finished item, resets the flag belonging
	 * to the closed element or pops it from the parser stack.
	 */
	function feed_end_element ($p, $el) {
		// only the local name matters when closing an element
		list(, $el) = $this->_split_name( strtolower($el) );

		if ( $el == 'item' or $el == 'entry' ) 
		{
			$this->items[] = $this->current_item;
			$this->current_item = array();
			$this->initem = false;
		}
		elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'textinput' ) 
		{
			$this->intextinput = false;
		}
		elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'image' ) 
		{
			$this->inimage = false;
		}
		elseif ($this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS, true) )
		{	
			$this->incontent = false;
		}
		elseif ($el == 'channel' or $el == 'feed' ) 
		{
			$this->inchannel = false;
		}
		elseif ($this->feed_type == ATOM and $this->incontent  ) {
			// balance tags properly
			// note:  i don't think this is actually necessary
			if ( isset($this->stack[0]) and $this->stack[0] == $el ) 
			{
				$this->append_content("</$el>");
			}
			else {
				$this->append_content("<$el />");
			}

			array_shift( $this->stack );
		}
		else {
			array_shift( $this->stack );
		}

		$this->current_namespace = false;
	}

	/*
	 * Append $str2 to $str1, creating $str1 as empty string when the
	 * referenced variable / array element does not exist yet.
	 */
	function concat (&$str1, $str2="") {
		if (!isset($str1) ) {
			$str1="";
		}
		$str1 .= $str2;
	}

	/*
	 * Append text to the Atom content construct currently being read
	 * ($this->incontent), on the current item or else on the channel.
	 */
	function append_content($text) {
		if ( $this->initem ) {
			$this->concat( $this->current_item[ $this->incontent ], $text );
		}
		elseif ( $this->inchannel ) {
			$this->concat( $this->channel[ $this->incontent ], $text );
		}
	}

	// smart append - field and namespace aware
	// text with an empty field name (character data outside any tracked
	// element) is dropped
	function append($el, $text) {
		if (!$el) {
			return;
		}
		if ( $this->current_namespace ) 
		{
			if ( $this->initem ) {
				$this->concat(
					$this->current_item[ $this->current_namespace ][ $el ], $text);
			}
			elseif ($this->inchannel) {
				$this->concat(
					$this->channel[ $this->current_namespace][ $el ], $text );
			}
			elseif ($this->intextinput) {
				$this->concat(
					$this->textinput[ $this->current_namespace][ $el ], $text );
			}
			elseif ($this->inimage) {
				$this->concat(
					$this->image[ $this->current_namespace ][ $el ], $text );
			}
		}
		else {
			if ( $this->initem ) {
				$this->concat(
					$this->current_item[ $el ], $text);
			}
			elseif ($this->intextinput) {
				$this->concat(
					$this->textinput[ $el ], $text );
			}
			elseif ($this->inimage) {
				$this->concat(
					$this->image[ $el ], $text );
			}
			elseif ($this->inchannel) {
				$this->concat(
					$this->channel[ $el ], $text );
			}
		}
	}

	/*
	 * Make Atom and RSS results look alike: every field that exists under
	 * its Atom name is copied to the RSS name and vice versa
	 * (tagline <-> description, summary <-> description,
	 * atom_content <-> content/encoded).
	 */
	function normalize () {
		// if atom populate rss fields
		if ( $this->is_atom() ) {
			if ( isset($this->channel['tagline']) ) {
				$this->channel['description'] = $this->channel['tagline'];
			}
			foreach ( $this->items as $i => $item ) {
				if ( isset($item['summary']) )
					$item['description'] = $item['summary'];
				if ( isset($item['atom_content']))
					$item['content']['encoded'] = $item['atom_content'];

				$this->items[$i] = $item;
			}
		}
		elseif ( $this->is_rss() ) {
			if ( isset($this->channel['description']) ) {
				$this->channel['tagline'] = $this->channel['description'];
			}
			foreach ( $this->items as $i => $item ) {
				if ( isset($item['description']))
					$item['summary'] = $item['description'];
				if ( isset($item['content']['encoded'] ) )
					$item['atom_content'] = $item['content']['encoded'];

				$this->items[$i] = $item;
			}
		}
	}

	/*
	 * Remember the message in $this->ERROR and report it: via
	 * trigger_error() with level $lvl when the MAGPIE_DEBUG constant is
	 * defined and true, via error_log() otherwise.
	 */
	function error ($errormsg, $lvl=E_USER_WARNING) {
		// the former "$php_errormsg" suffix is gone: that variable was never
		// set in this scope and no longer exists in PHP 8
		$this->ERROR = $errormsg;
		if ( defined('MAGPIE_DEBUG') and MAGPIE_DEBUG ) {
			trigger_error( $errormsg, $lvl);		
		}
		else {
			error_log( $errormsg, 0);
		}
	}

	// returns the feed version if the parsed feed is RSS, false otherwise
	function is_rss () {
		if ( $this->feed_type == RSS ) {
			return $this->feed_version;	
		}
		else {
			return false;
		}
	}

	// returns the feed version if the parsed feed is Atom, false otherwise
	function is_atom() {
		if ( $this->feed_type == ATOM ) {
			return $this->feed_version;
		}
		else {
			return false;
		}
	}

/*======================================================================*\
	EVERYTHING BELOW HERE IS FOR DEBUGGING PURPOSES
	(values are printed as they are, without HTML escaping)
\*======================================================================*/
	function show_list () {
		echo "<ol>\n";
		foreach ($this->items as $item) {
			echo "<li>";
			$this->show_item( $item );
		}
		echo "</ol>";
	}

	function show_channel () {
		echo "channel:<br>";
		$this->_show_fields( $this->channel );
	}

	function show_item ($item) {
		echo "item: " . ( isset($item['title']) ? $item['title'] : '' );
		$this->_show_fields( $item );
	}

	// print a hash of fields as HTML list; namespace sub-arrays become
	// a nested list below the namespace prefix
	function _show_fields ($fields) {
		echo "<ul>";
		foreach ( $fields as $key => $value ) {
			if ( is_array($value) ) {
				echo "<br><b>$key</b>";
				echo "<ul>";
				foreach ( $value as $ns_key => $ns_value ) {
					echo "<li>$ns_key: $ns_value";
				}
				echo "</ul>";
			}
			else {
				echo "<li> $key: $value";
			}
		}
		echo "</ul>";
	}

/*======================================================================*\
	END DEBUGGING FUNCTIONS	
\*======================================================================*/

} # end class RSS
561
/*
 * Render one attribute as name="value"; used when inline markup of an
 * Atom content construct is turned back into a string.
 *
 * The value arrives entity-decoded from the XML parser, so it is escaped
 * again here; otherwise a '"' or '&' inside it would break the markup.
 *
 * $k  attribute name
 * $v  attribute value
 */
function map_attrs($k, $v) {
	return $k . '="' . htmlspecialchars((string) $v, ENT_COMPAT, 'UTF-8') . '"';
}
565
566
567?>