PageRenderTime 53ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/htdocs/core/class/rssparser.class.php

https://github.com/asterix14/dolibarr
PHP | 642 lines | 508 code | 50 blank | 84 comment | 61 complexity | 72f7d651072eb6070183314a0ef82b7c MD5 | raw file
Possible License(s): LGPL-2.0
  1. <?php
  2. /* Copyright (C) 2011 Laurent Destailleur <eldy@users.sourceforge.net>
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 2 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. */
  17. /**
  18. * \file htdocs/core/class/rssparser.class.php
  19. * \ingroup core
  20. * \brief File of class to parse rss feeds
  21. */
  /**
   * Class to download, cache and parse an RSS or Atom feed.
   *
   * Two parsing back-ends are supported by parser():
   * - the event-based xml parser (default), driven by the feed_* handlers of this class;
   * - SimpleXML, used when $conf->global->EXTERNALRSS_USE_SIMPLEXML is set.
   *
   * After a successful call to parser(), feed information is available through the accessors
   * and the parsed records through getItems().
   */
  class RssParser
  {
      /** @var DoliDB Database handler */
      public $db;
      /** @var string Error code of last failure */
      public $error;

      protected $_format='';
      protected $_urlRSS;
      protected $_language;
      protected $_generator;
      protected $_copyright;
      protected $_lastbuilddate;
      protected $_imageurl;
      protected $_link;
      protected $_title;
      protected $_description;
      protected $_lastfetchdate;      // Last successful fetch
      protected $_rssarray=array();

      // Accessors
      public function getFormat() { return $this->_format; }
      public function getUrlRss() { return $this->_urlRSS; }
      public function getLanguage() { return $this->_language; }
      public function getGenerator() { return $this->_generator; }
      public function getCopyright() { return $this->_copyright; }
      public function getLastBuildDate() { return $this->_lastbuilddate; }
      public function getImageUrl() { return $this->_imageurl; }
      public function getLink() { return $this->_link; }
      public function getTitle() { return $this->_title; }
      public function getDescription() { return $this->_description; }
      public function getLastFetchDate() { return $this->_lastfetchdate; }
      public function getItems() { return $this->_rssarray; }

      // For parsing with xmlparser.
      // These properties used to be created on the fly by the handlers; they are declared here
      // (and kept public) so that they always exist and can be reset between two parsings.
      public $stack = array();                // parser stack
      public $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
      public $current_namespace = false;      // namespace prefix of the element being read
      public $feed_version = '';              // value of the "version" attribute of the root element
      public $inchannel = false;
      public $initem = false;
      public $intextinput = false;
      public $inimage = false;
      public $incontent = false;              // name of the Atom content construct being read, or false
      public $current_item = array();         // fields of the item/entry being read
      public $items = array();                // items/entries collected by the handlers
      public $channel = array();              // fields collected at channel/feed level
      public $textinput = array();
      public $image = array();


      /**
       * Constructor
       *
       * @param   DoliDB  $db     Database handler
       */
      public function __construct($db)
      {
          $this->db=$db;
      }

      /**
       * Old style constructor, kept so that explicit calls (for example parent::RssParser($db)) still work.
       * PHP 8 no longer treats a method named like its class as a constructor, hence __construct() above.
       *
       * @param   DoliDB  $db     Database handler
       * @return  void
       */
      public function RssParser($db)
      {
          $this->__construct($db);
      }

      /**
       * Parse rss URL
       *
       * @param   string  $urlRSS         Url to parse
       * @param   int     $maxNb          Max nb of records to get (0 for no limit)
       * @param   int     $cachedelay     0=No cache, nb of seconds we accept cache files (cachedir must also be defined)
       * @param   string  $cachedir       Directory where to save cache file
       * @return  int                     <0 if KO, >0 if OK
       */
      public function parser($urlRSS, $maxNb=0, $cachedelay=60, $cachedir='')
      {
          // Without this import, every test on $conf->global below is silently false
          global $conf;

          include_once(DOL_DOCUMENT_ROOT.'/core/lib/files.lib.php');

          // Check parameters
          if (! dol_is_url($urlRSS))
          {
              $this->error="ErrorBadUrl";
              return -1;
          }

          // Forget everything read by a previous call on the same object
          $this->resetState();

          $this->_urlRSS = $urlRSS;
          $usesimplexml = ! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML);
          $newpathofdestfile=$cachedir.'/'.dol_hash($this->_urlRSS);
          $newmask='0644';
          $nowgmt = dol_now();

          // Search into cache
          $foundintocache=0;
          if ($cachedelay > 0 && $cachedir)
          {
              $filedate=dol_filemtime($newpathofdestfile);
              if ($filedate >= ($nowgmt - $cachedelay))
              {
                  $foundintocache=1;
                  $this->_lastfetchdate=$filedate;
              }
              else
              {
                  dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is not found or older than now - cachedelay (".$nowgmt." - ".$cachedelay.") so we can't use it.");
              }
          }

          // Load file into $str
          if ($foundintocache)    // Cache file found and is not too old
          {
              $str = file_get_contents($newpathofdestfile);
          }
          else
          {
              try {
                  // Note: ini_set() changes these settings for the rest of the running script
                  ini_set("user_agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)");
                  ini_set("max_execution_time", 10);
                  $str = file_get_contents($this->_urlRSS);
              }
              catch (Exception $e) {
                  // Only reached if an error handler converts warnings into exceptions
                  dol_syslog("RssParser::parser error retrieving URL ".$this->_urlRSS." - ".$e->getMessage());
                  $str = false;
              }
          }
          if ($str === false || $str === '')
          {
              // Nothing to parse: do not overwrite the cache with an empty file
              $this->error='ErrorFailedToLoadRSSFile';
              return -1;
          }

          // Convert $str into xml
          $rss = false;
          $format = '';
          if ($usesimplexml)
          {
              libxml_use_internal_errors(false);
              $rss = simplexml_load_string($str, "SimpleXMLElement", LIBXML_NOCDATA);
              if ($rss) $format = empty($rss->channel) ? 'atom' : 'rss';
          }
          else
          {
              $xmlparser=xml_parser_create('');
              // xml_parser_create() returns a resource before PHP 8 and an object since PHP 8
              if (! is_resource($xmlparser) && ! is_object($xmlparser)) { $this->error="ErrorFailedToCreateParser"; return -1; }
              xml_set_object($xmlparser, $this);
              xml_set_element_handler($xmlparser, 'feed_start_element', 'feed_end_element');
              xml_set_character_data_handler($xmlparser, 'feed_cdata');
              $status = xml_parse($xmlparser, $str);
              xml_parser_free($xmlparser);

              // A feed that is only partly readable is still used; content where the parser
              // failed and no feed root element was recognised is rejected.
              if ($status || ! empty($this->_format))
              {
                  $rss = $this;
                  $format = $this->_format;
                  if (empty($format)) $format = empty($this->channel) ? 'atom' : 'rss';    // If format not detected automatically
              }
          }

          if (! $rss)
          {
              $this->error='ErrorFailedToLoadRSSFile';
              return -1;
          }

          $this->_format = $format;

          if (empty($foundintocache))
          {
              $this->_lastfetchdate=$nowgmt;

              // Save file into cache
              if ($cachedir)
              {
                  dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is saved onto disk.");
                  if (! dol_is_dir($cachedir)) dol_mkdir($cachedir);
                  $fp = fopen($newpathofdestfile, 'w');
                  if ($fp)
                  {
                      fwrite($fp, $str);
                      fclose($fp);
                      if (! empty($conf->global->MAIN_UMASK)) $newmask=$conf->global->MAIN_UMASK;
                      @chmod($newpathofdestfile, octdec($newmask));
                  }
                  else
                  {
                      // A cache that can't be written must not prevent using the feed
                      dol_syslog("RssParser::parser failed to open cache file ".$newpathofdestfile." for writing.");
                  }
              }
          }
          unset($str);    // Free memory

          // Save description entries
          $this->loadChannelInfo($rss, $format, $usesimplexml);

          // Get list of records
          if (! $usesimplexml) $items = $this->items;                 // With xmlparse
          elseif ($format == 'rss') $items = $rss->channel->item;     // With simplexml
          else $items = $this->getAtomEntries($rss);                  // With simplexml

          // Loop on each record
          $i = 0;
          foreach ($items as $item)
          {
              $this->_rssarray[$i] = $this->buildRecord($item, $format, $usesimplexml);
              $i++;
              if ($maxNb > 0 && $i >= $maxNb) break;    // We get all records we want (0 means no limit)
          }

          return 1;
      }

      /**
       * Reset feed information and xml parser state so the object can parse another feed
       *
       * @return  void
       */
      private function resetState()
      {
          $this->error = '';
          $this->_format = '';
          $this->_language = null;
          $this->_generator = null;
          $this->_copyright = null;
          $this->_lastbuilddate = null;
          $this->_imageurl = null;
          $this->_link = null;
          $this->_title = null;
          $this->_description = null;
          $this->_lastfetchdate = null;
          $this->_rssarray = array();

          $this->stack = array();
          $this->current_namespace = false;
          $this->feed_version = '';
          $this->inchannel = false;
          $this->initem = false;
          $this->intextinput = false;
          $this->inimage = false;
          $this->incontent = false;
          $this->current_item = array();
          $this->items = array();
          $this->channel = array();
          $this->textinput = array();
          $this->image = array();
      }

      /**
       * Return a value of an array as a string, without warning when the key is missing
       *
       * @param   mixed   $data   Array to read (anything else gives '')
       * @param   string  $key    Key to read
       * @return  string          Value as string, '' if key is not set or value is not a scalar
       */
      private function arrayString($data, $key)
      {
          if (is_array($data) && isset($data[$key]) && is_scalar($data[$key])) return (string) $data[$key];
          return '';
      }

      /**
       * Copy feed level information (title, link, ...) into the properties read by the accessors
       *
       * @param   mixed   $rss            SimpleXMLElement of the feed (simplexml mode), unused otherwise
       * @param   string  $format         'rss' or 'atom'
       * @param   bool    $usesimplexml   true if feed was loaded with simplexml, false if loaded with the xml parser handlers
       * @return  void
       */
      private function loadChannelInfo($rss, $format, $usesimplexml)
      {
          if ($usesimplexml)
          {
              if ($format == 'rss')
              {
                  if (!empty($rss->channel->language))        $this->_language = (string) $rss->channel->language;
                  if (!empty($rss->channel->generator))       $this->_generator = (string) $rss->channel->generator;
                  if (!empty($rss->channel->copyright))       $this->_copyright = (string) $rss->channel->copyright;
                  // SimpleXML names are case sensitive: the element is lastBuildDate
                  if (!empty($rss->channel->lastBuildDate))   $this->_lastbuilddate = (string) $rss->channel->lastBuildDate;
                  if (!empty($rss->channel->image->url[0]))   $this->_imageurl = (string) $rss->channel->image->url[0];
                  if (!empty($rss->channel->link))            $this->_link = (string) $rss->channel->link;
                  if (!empty($rss->channel->title))           $this->_title = (string) $rss->channel->title;
                  if (!empty($rss->channel->description))     $this->_description = (string) $rss->channel->description;
              }
              else
              {
                  if (!empty($rss->generator))    $this->_generator = (string) $rss->generator;
                  if (!empty($rss->modified))     $this->_lastbuilddate = (string) $rss->modified;
                  // href is an attribute of <link>, not a child element
                  if (isset($rss->link['href']) && (string) $rss->link['href'] != '') $this->_link = (string) $rss->link['href'];
                  if (!empty($rss->title))        $this->_title = (string) $rss->title;
                  if (!empty($rss->description))  $this->_description = (string) $rss->description;
              }
              return;
          }

          // With xmlparse, handlers stored everything with lowercase keys into $this->channel / $this->image
          if ($format == 'rss')
          {
              $fields = array(
                  'language'=>'_language',
                  'generator'=>'_generator',
                  'copyright'=>'_copyright',
                  'lastbuilddate'=>'_lastbuilddate',
                  'link'=>'_link',
                  'title'=>'_title',
                  'description'=>'_description');
              $imageurl = $this->arrayString($this->image, 'url');
              if (!empty($imageurl)) $this->_imageurl = $imageurl;
          }
          else
          {
              $fields = array(
                  'generator'=>'_generator',
                  'modified'=>'_lastbuilddate',
                  'link'=>'_link',
                  'title'=>'_title');
          }
          foreach ($fields as $key => $property)
          {
              $value = $this->arrayString($this->channel, $key);
              if (!empty($value)) $this->$property = $value;
          }
      }

      /**
       * Return the entries of an Atom feed loaded with simplexml, always as a list
       *
       * @param   SimpleXMLElement    $rss    Root element of feed
       * @return  array                       List of entries, each one as an array
       */
      private function getAtomEntries($rss)
      {
          $tmprss = xml2php($rss);
          if (! is_array($tmprss) || empty($tmprss['entry']) || ! is_array($tmprss['entry'])) return array();

          $entries = $tmprss['entry'];
          // xml2php() may return a lone <entry> as one associative array and several entries as a numeric list
          return isset($entries[0]) ? $entries : array($entries);
      }

      /**
       * Return the url held by the 'link' field of an Atom entry
       *
       * @param   mixed   $link   String (xmlparse stores the href as text), array with key 'href', or list of such arrays
       * @return  string          Url, '' if not found
       */
      private function getAtomLinkHref($link)
      {
          if (is_string($link)) return $link;
          if (is_array($link))
          {
              if (isset($link['href'])) return (string) $link['href'];
              if (isset($link[0]) && is_array($link[0]) && isset($link[0]['href'])) return (string) $link[0]['href'];
          }
          return '';
      }

      /**
       * Build the record stored into the result array for one item/entry of the feed
       *
       * @param   mixed   $item           SimpleXMLElement (rss with simplexml) or array (all other cases)
       * @param   string  $format         'rss' or 'atom'
       * @param   bool    $usesimplexml   true if feed was loaded with simplexml
       * @return  array                   Array with keys link, title, description, pubDate, category, id, author
       */
      private function buildRecord($item, $format, $usesimplexml)
      {
          $itemCategory = array();

          if ($format == 'rss' && $usesimplexml)
          {
              $itemLink        = (string) $item->link;
              $itemTitle       = (string) $item->title;
              $itemDescription = (string) $item->description;
              $itemPubDate     = (string) $item->pubDate;
              $itemId          = (string) $item->guid;
              $itemAuthor      = (string) $item->author;

              // Loop on each category
              foreach ($item->category as $cat)
              {
                  $itemCategory[] = (string) $cat;
              }
          }
          elseif ($format == 'rss')
          {
              $itemLink        = $this->arrayString($item, 'link');
              $itemTitle       = $this->arrayString($item, 'title');
              $itemDescription = $this->arrayString($item, 'description');
              $itemPubDate     = $this->arrayString($item, 'pubdate');
              $itemId          = $this->arrayString($item, 'guid');
              $itemAuthor      = $this->arrayString($item, 'author');
          }
          else    // atom: entries are arrays with both back-ends
          {
              $itemLink        = $this->getAtomLinkHref((is_array($item) && isset($item['link'])) ? $item['link'] : '');
              $itemTitle       = $this->arrayString($item, 'title');
              $itemDescription = $this->arrayString($item, 'summary');
              $itemPubDate     = $this->arrayString($item, 'created');
              $itemId          = $this->arrayString($item, 'id');
              $itemAuthor      = $this->arrayString($item, 'author');
              if (! $itemAuthor) $itemAuthor = $this->arrayString($item, 'author_name');
          }

          return array(
              'link'=>$itemLink,
              'title'=>$itemTitle,
              'description'=>$itemDescription,
              'pubDate'=>$itemPubDate,
              'category'=>$itemCategory,
              'id'=>$itemId,
              'author'=>$itemAuthor);
      }


      /**
       * Triggered when opened tag is found
       *
       * @param   mixed   $p          Xml parser (not used)
       * @param   string  $element    Tag
       * @param   array   $attrs      Attributes of tags (keys are converted to lowercase)
       * @return  void
       */
      public function feed_start_element($p, $element, &$attrs)
      {
          $el = $element = strtolower($element);
          $attrs = array_change_key_case($attrs, CASE_LOWER);

          // check for a namespace, and split if found
          $ns = false;
          if ( strpos( $element, ':' ) ) {
              list($ns, $el) = explode(':', $element, 2);
          }
          if ( $ns and $ns != 'rdf' ) {
              $this->current_namespace = $ns;
          }

          // if feed type isn't set, then this is first element of feed identify feed from root element
          if (empty($this->_format))
          {
              $version = isset($attrs['version']) ? $attrs['version'] : '';
              if ( $el == 'rdf' ) {
                  $this->_format = 'rss';
                  $this->feed_version = '1.0';
              }
              elseif ( $el == 'rss' ) {
                  $this->_format = 'rss';
                  $this->feed_version = $version;
              }
              elseif ( $el == 'feed' ) {
                  $this->_format = 'atom';
                  $this->feed_version = $version;
                  $this->inchannel = true;
              }
              return;
          }

          if ( $el == 'channel' )
          {
              $this->inchannel = true;
          }
          elseif ($el == 'item' or $el == 'entry' )
          {
              $this->initem = true;
              if ( isset($attrs['rdf:about']) ) {
                  $this->current_item['about'] = $attrs['rdf:about'];
              }
          }
          // if we're in the default namespace of an RSS feed,
          // record textinput or image fields
          elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'textinput' )
          {
              $this->intextinput = true;
          }
          elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'image' )
          {
              $this->inimage = true;
          }
          // handle atom content constructs
          elseif ( $this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
          {
              // avoid clashing w/ RSS mod_content
              if ($el == 'content' ) {
                  $el = 'atom_content';
              }
              $this->incontent = $el;
          }
          // if inside an Atom content construct (e.g. content or summary) field treat tags as text
          elseif ($this->_format == 'atom' and $this->incontent )
          {
              // if tags are inlined, then flatten: rebuild the attribute list as key="value" pairs
              $pairs = array();
              foreach ($attrs as $attrname => $attrvalue)
              {
                  $pairs[] = "$attrname=\"$attrvalue\"";
              }
              $attrs_str = join(' ', $pairs);
              $this->append_content("<$element $attrs_str>" );
              array_unshift($this->stack, $el);
          }
          // Atom support many links per containing element.
          // A link with rel='alternate' is treated as being equivalent to RSS's simple link element.
          // A link with no rel attribute is handled the same way (RFC 4287 defines it as 'alternate').
          elseif ($this->_format == 'atom' and $el == 'link' )
          {
              if ( ! isset($attrs['rel']) or $attrs['rel'] == 'alternate' )
              {
                  $link_el = 'link';
              }
              else {
                  $link_el = 'link_' . $attrs['rel'];
              }
              $this->append($link_el, isset($attrs['href']) ? $attrs['href'] : '');
          }
          // set stack[0] to current element
          else {
              array_unshift($this->stack, $el);
          }
      }


      /**
       * Triggered when CDATA is found
       *
       * @param   mixed   $p      Xml parser (not used)
       * @param   string  $text   Text found
       * @return  void
       */
      public function feed_cdata($p, $text)
      {
          if ($this->_format == 'atom' and $this->incontent)
          {
              $this->append_content($text);
          }
          else {
              // field name is the path of opened tags, outermost first (for example author_name)
              $current_el = join('_', array_reverse($this->stack));
              $this->append($current_el, $text);
          }
      }

      /**
       * Triggered when closed tag is found
       *
       * @param   mixed   $p      Xml parser (not used)
       * @param   string  $el     Tag
       * @return  void
       */
      public function feed_end_element($p, $el)
      {
          $el = strtolower($el);

          if ( $el == 'item' or $el == 'entry' )
          {
              $this->items[] = $this->current_item;
              $this->current_item = array();
              $this->initem = false;
          }
          elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'textinput' )
          {
              $this->intextinput = false;
          }
          elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'image' )
          {
              $this->inimage = false;
          }
          elseif ($this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
          {
              $this->incontent = false;
          }
          elseif ($el == 'channel' or $el == 'feed' )
          {
              $this->inchannel = false;
          }
          elseif ($this->_format == 'atom' and $this->incontent ) {
              // balance tags properly
              if ( isset($this->stack[0]) and $this->stack[0] == $el )
              {
                  $this->append_content("</$el>");
              }
              else {
                  $this->append_content("<$el />");
              }
              array_shift($this->stack);
          }
          elseif ($this->_format == 'atom' and $el == 'link' )
          {
              // feed_start_element() does not push an Atom link on the stack, so there is nothing to remove
          }
          else {
              array_shift($this->stack);
          }

          $this->current_namespace = false;
      }


      /**
       * To concat 2 string with no warning if an operand is not defined
       *
       * @param   string  $str1   String to complete (created if not defined)
       * @param   string  $str2   String to add at end of $str1
       * @return  void
       */
      public function concat(&$str1, $str2="")
      {
          if (!isset($str1) ) {
              $str1="";
          }
          $str1 .= $str2;
      }

      /**
       * Add text to the Atom content construct being read, in current item if we are inside one, else in channel
       *
       * @param   string  $text   Text to add
       * @return  void
       */
      public function append_content($text)
      {
          if ( $this->initem ) {
              $this->concat($this->current_item[ $this->incontent ], $text);
          }
          elseif ( $this->inchannel ) {
              $this->concat($this->channel[ $this->incontent ], $text);
          }
      }

      /**
       * smart append - field and namespace aware
       *
       * @param   string  $el     Name of field to complete (nothing is done if empty)
       * @param   string  $text   Text to add
       * @return  void
       */
      public function append($el, $text)
      {
          if (!$el) {
              return;
          }
          if ( $this->current_namespace )
          {
              if ( $this->initem ) {
                  $this->concat( $this->current_item[ $this->current_namespace ][ $el ], $text);
              }
              elseif ($this->inchannel) {
                  $this->concat( $this->channel[ $this->current_namespace][ $el ], $text);
              }
              elseif ($this->intextinput) {
                  $this->concat( $this->textinput[ $this->current_namespace][ $el ], $text);
              }
              elseif ($this->inimage) {
                  $this->concat( $this->image[ $this->current_namespace ][ $el ], $text);
              }
          }
          else {
              if ( $this->initem ) {
                  $this->concat( $this->current_item[ $el ], $text);
              }
              elseif ($this->intextinput) {
                  $this->concat( $this->textinput[ $el ], $text);
              }
              elseif ($this->inimage) {
                  $this->concat( $this->image[ $el ], $text);
              }
              elseif ($this->inchannel) {
                  $this->concat( $this->channel[ $el ], $text);
              }
          }
      }
  }
  /**
   * Function to convert an XML object into an array
   *
   * Each child element becomes an entry of the returned array, indexed by its tag name:
   * - a tag found once gives directly the converted child;
   * - a tag found several times gives a numerically indexed list of converted children
   *   (this is tracked tag by tag, so a repeated tag has no effect on the other tags).
   * Attributes of a child are added as keys of its array when the child is itself converted into an array.
   * An empty child that has attributes gives the array of its attributes. A child that only contains
   * text stays a string and its attributes are ignored (a string has no place to store them).
   *
   * @param   SimpleXMLElement    $xml    Element to convert
   * @return  array|string                Array of children, or text of element if it has no child element
   */
  function xml2php($xml)
  {
      $nbchildren = 0;
      $islist = array();      // tags already converted into an indexed list
      $array = array();

      foreach ($xml->children() as $key => $value)
      {
          $child = xml2php($value);

          // To deal with the attributes
          $attributes = array();
          foreach ($value->attributes() as $ak => $av)
          {
              $attributes[$ak] = (string) $av;
          }
          if (count($attributes) > 0)
          {
              if (is_array($child))
              {
                  foreach ($attributes as $ak => $av)
                  {
                      $child[$ak] = $av;
                  }
              }
              elseif ($child === '')
              {
                  $child = $attributes;
              }
              // else $child is a non empty text: writing $child[$ak] would alter or break the string
          }

          if (! array_key_exists($key, $array))
          {
              // Add a simple element
              $array[$key] = $child;
          }
          else
          {
              if (empty($islist[$key]))
              {
                  // This tag was already found once: move the first value into an indexed list
                  $array[$key] = array($array[$key]);
                  $islist[$key] = true;
              }
              // Add an element in an existing list
              $array[$key][] = $child;
          }

          $nbchildren++;
      }

      if ($nbchildren == 0)
      {
          return (string) $xml;
      }

      return $array;
  }
  571. ?>