PageRenderTime 25ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/parsers/ARC2_RSSParser.php

http://github.com/semsol/arc2
PHP | 200 lines | 169 code | 17 blank | 14 comment | 27 complexity | a9ceb78514bc6fbb1d1f6efefd8cca31 MD5 | raw file
Possible License(s): GPL-3.0
  1. <?php
  2. /*
  3. @homepage <https://github.com/semsol/arc2>
  4. @license W3C Software License and GPL
  5. class: ARC2 RSS Parser
  6. author: Benjamin Nowack
  7. version: 2010-11-16
  8. */
  9. ARC2::inc('LegacyXMLParser');
  /**
   * RSS parser that turns channel and item elements into triples.
   *
   * The XML itself is read by the parent class; once parsing is done() this
   * class walks the node index supplied by getNodeIndex(), maps plain RSS tag
   * names to RSS 1.0 / Dublin Core / enclosure property URIs and collects the
   * resulting triples, which are available through getTriples().
   */
  class ARC2_RSSParser extends ARC2_LegacyXMLParser
  {
      /**
       * Forwards the configuration and the calling component to the parent parser.
       *
       * @param mixed $a      Configuration; __init() looks up 'bnode_prefix' in $this->a.
       * @param mixed $caller Calling component, handed to the parent by reference.
       */
      public function __construct($a, &$caller)
      {
          parent::__construct($a, $caller);
      }

      /**
       * Resets the per-parse state (triple buffer, duplicate filter, bnode counter).
       */
      public function __init()
      {/* reader */
          parent::__init();
          $this->triples = [];
          $this->target_encoding = '';
          $this->t_count = 0;
          $this->added_triples = [];
          $this->skip_dupes = false;
          $this->bnode_prefix = $this->v('bnode_prefix', 'arc'.substr(md5(uniqid(rand())), 0, 4).'b', $this->a);
          $this->bnode_id = 0;
          $this->cache = [];
          $this->allowCDataNodes = 0;
      }

      /**
       * Runs the RDF extraction over the parsed node index.
       * NOTE(review): presumably invoked by the parent once parsing has finished - confirm.
       */
      public function done()
      {
          $this->extractRDF();
      }

      /**
       * Stores the reader object on this parser.
       *
       * @param mixed $reader
       */
      public function setReader(&$reader)
      {
          $this->reader = $reader;
      }

      /**
       * Returns a new blank node label: "_:" + bnode prefix + incremented counter.
       *
       * @return string
       */
      public function createBnodeID()
      {
          ++$this->bnode_id;

          return '_:'.$this->bnode_prefix.$this->bnode_id;
      }

      /**
       * Appends a triple to the buffer.
       *
       * When $this->skip_dupes is set, a triple whose serialized form has
       * already been seen is silently ignored.
       *
       * @param array $t Triple structure.
       */
      public function addT($t)
      {
          if ($this->skip_dupes) {
              $h = md5(serialize($t));
              if (isset($this->added_triples[$h])) {
                  return;
              }
              $this->added_triples[$h] = true;
          }
          $this->triples[$this->t_count] = $t;
          ++$this->t_count;
      }

      /**
       * @return array The collected triples, or an empty array when none are set.
       */
      public function getTriples()
      {
          return $this->v('triples', []);
      }

      /**
       * @return int Number of triples added through addT().
       */
      public function countTriples()
      {
          return $this->t_count;
      }

      /**
       * Delegates to ARC2::getSimpleIndex() with the collected triples.
       *
       * @param int    $flatten_objects Passed through unchanged.
       * @param string $vals            Passed through unchanged.
       *
       * @return mixed Whatever ARC2::getSimpleIndex() returns.
       */
      public function getSimpleIndex($flatten_objects = 1, $vals = '')
      {
          return ARC2::getSimpleIndex($this->getTriples(), $flatten_objects, $vals);
      }

      /**
       * Sets up the vocabulary namespaces and tag mappings, then converts every
       * "channel" and "item" node found in the node index into triples.
       */
      public function extractRDF()
      {
          /* presumably maps a parent node id to the list of its child nodes - inferred
             from the $index[<node id>] lookups below; confirm against the parent class */
          $index = $this->getNodeIndex();
          $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
          $this->rss = 'http://purl.org/rss/1.0/';
          $this->dc = 'http://purl.org/dc/elements/1.1/';
          $this->dct = 'http://purl.org/dc/terms/';
          $this->content = 'http://purl.org/rss/1.0/modules/content/';
          $this->enc = 'http://purl.oclc.org/net/rss_2.0/enc#';
          /* un-prefixed tag name => property URI */
          $this->mappings = [
              'channel' => $this->rss.'channel',
              'item' => $this->rss.'item',
              'title' => $this->rss.'title',
              'link' => $this->rss.'link',
              'description' => $this->rss.'description',
              'guid' => $this->dc.'identifier',
              'author' => $this->dc.'creator',
              'category' => $this->dc.'subject',
              'pubDate' => $this->dc.'date',
              'pubdate' => $this->dc.'date',
              'source' => $this->dc.'source',
              'enclosure' => $this->enc.'enclosure',
          ];
          /* properties whose values are always emitted as literals, even if URI-shaped */
          $this->dt_props = [
              $this->dc.'identifier',
              $this->rss.'link',
          ];
          foreach ($index as $nodes) {
              foreach ($nodes as $node) {
                  $tag = $this->v('tag', '', $node);
                  if ('channel' !== $tag && 'item' !== $tag) {
                      continue;
                  }
                  /* a node without children has no index entry of its own */
                  $id = $this->v('id', '', $node);
                  $els = isset($index[$id]) ? $index[$id] : [];
                  $struct = ('channel' === $tag) ? $this->extractChannel($els) : $this->extractItem($els);
                  $this->addRssIndexTriples($struct);
              }
          }
      }

      /**
       * Builds the resource index for a channel.
       *
       * @param array $els Child nodes of the channel element.
       *
       * @return array [subject => [property => values]], or [] when the channel
       *               has neither a link nor an identifier to act as subject.
       */
      public function extractChannel($els)
      {
          $res = [$this->rdf.'type' => [['value' => $this->rss.'channel', 'type' => 'uri']]];
          $res = array_merge($res, $this->extractProps($els, 'channel'));

          return $this->rssSubjectIndex($res);
      }

      /**
       * Builds the resource index for an item.
       *
       * @param array $els Child nodes of the item element.
       *
       * @return array [subject => [property => values]], or [] when the item
       *               has neither a link nor an identifier to act as subject.
       */
      public function extractItem($els)
      {
          $res = [$this->rdf.'type' => [['value' => $this->rss.'item', 'type' => 'uri']]];
          $res = array_merge($res, $this->extractProps($els, 'item'));

          return $this->rssSubjectIndex($res);
      }

      /**
       * Maps the child elements of a channel/item to property => values lists.
       *
       * Elements whose tag is neither prefixed ("xyz:...") nor listed in
       * $this->mappings are skipped, as are item elements nested in a channel.
       * The value is the element's character data, falling back to its "url"
       * and then "href" attribute. Non-ISO dates on dc:date / dct:modified are
       * rewritten as UTC "Y-m-dTH:i:sZ". For enclosures, the "length" and "type"
       * attributes are added (via addT()) as triples about the enclosure URL.
       *
       * @param array  $els       Child nodes, each with 'tag' and optionally 'cdata' and 'a' (attributes).
       * @param string $container 'channel' or 'item'.
       *
       * @return array property URI => list of ['value' => ..., 'type' => 'literal'|'uri']
       */
      public function extractProps($els, $container)
      {
          $res = [];
          if (!is_array($els)) {
              return $res;
          }
          $date_props = [$this->dc.'date', $this->dct.'modified'];
          foreach ($els as $info) {
              /* key */
              $tag = $this->v('tag', '', $info);
              if (preg_match('/^[a-z0-9]+\:/i', $tag)) {
                  $k = $tag;
              } else {
                  $k = isset($this->mappings[$tag]) ? $this->mappings[$tag] : '';
              }
              if (!$k) {
                  continue;
              }
              if (('channel' === $container) && ($k === $this->rss.'item')) {
                  continue;
              }
              /* val: compare against '' so that a literal "0" is kept as character data */
              $attrs = $this->v('a', [], $info);
              $v = $this->v('cdata', '', $info);
              if ('' === (string) $v) {
                  $v = $this->v('url', '', $attrs);
              }
              if ('' === (string) $v) {
                  $v = $this->v('href', '', $attrs);
              }
              /* enclosure handling: describe the enclosure URL with its length/type */
              if ($k === $this->enc.'enclosure') {
                  $sub_res = [];
                  foreach (['length', 'type'] as $attr) {
                      if ($attr_v = $this->v($attr, 0, $attrs)) {
                          $sub_res[$this->enc.$attr] = [['value' => $attr_v, 'type' => 'literal']];
                      }
                  }
                  if ($sub_res && ('' !== (string) $v)) {
                      $this->addRssIndexTriples([$v => $sub_res]);
                  }
              }
              /* date handling: gmdate() formats the timestamp in UTC directly, which
                 avoids shifting by the local offset (wrong around DST changes) */
              if (in_array($k, $date_props, true) && !preg_match('/^[0-9]{4}/', $v)) {
                  $ts = strtotime($v);
                  if (false !== $ts) {
                      $v = gmdate('Y-m-d\TH:i:s\Z', $ts);
                  }
              }
              /* prop: forced-literal properties and anything not shaped like "scheme:rest" are literals */
              $is_literal = in_array($k, $this->dt_props, true) || !preg_match('/^[a-z0-9]+\:[^\s]+$/is', $v);
              if (!isset($res[$k])) {
                  $res[$k] = [];
              }
              $res[$k][] = ['value' => $v, 'type' => $is_literal ? 'literal' : 'uri'];
          }

          return $res;
      }

      /**
       * Wraps a property map into a one-subject index, using the first rss:link
       * value as subject and falling back to the first dc:identifier value.
       *
       * @param array $res property URI => values
       *
       * @return array [subject => $res], or [] when no non-empty subject is available.
       */
      private function rssSubjectIndex(array $res)
      {
          foreach ([$this->rss.'link', $this->dc.'identifier'] as $p) {
              if (isset($res[$p][0]['value']) && ('' !== (string) $res[$p][0]['value'])) {
                  return [$res[$p][0]['value'] => $res];
              }
          }

          return [];
      }

      /**
       * Converts a resource index to triples with ARC2::getTriplesFromIndex()
       * and adds each of them through addT().
       *
       * @param array $index [subject => [property => values]]
       */
      private function addRssIndexTriples(array $index)
      {
          if (!$index) {
              return;
          }
          foreach (ARC2::getTriplesFromIndex($index) as $t) {
              $this->addT($t);
          }
      }
  }