PageRenderTime 46ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/feed.php

https://bitbucket.org/paulsnar/river
PHP | 438 lines | 403 code | 35 blank | 0 comment | 27 complexity | 2a26650a21bcbad3837c21903f22855d MD5 | raw file
  1. <?php declare(strict_types=1);
  2. namespace PN\River;
  3. require_lib('db');
  4. require_lib('httpc');
  5. require_lib('xml_utils');
  6. require_src('feedfixr');
  // Number of seconds (15 minutes) added to the current time when scheduling
  // the next sync of a feed whose ttl is not set.
  const FEED_DEFAULT_SYNC_INTERVAL = 15 * 60;
  // Bound, in seconds, of the random offset (negative or positive) added to
  // every scheduled sync time.
  const FEED_JITTER = 200;
  // Namespace URI that resolved element namespaces are compared against to
  // recognise Atom elements.
  const XMLNS_ATOM = 'http://www.w3.org/2005/Atom';
  // Namespace URI that resolved element namespaces are compared against to
  // recognise embedded XHTML elements.
  const XMLNS_XHTML = 'http://www.w3.org/1999/xhtml';
  /**
   * A single feed entry as produced by the feed parsers and persisted to the
   * `entries` table.
   */
  class Item
  {
    public $_id;
    /** Value written to the `of_feed` column (set by Feed::sync()). */
    public $of_feed;
    /** Identifier of the entry within its feed (`feedwide_id` column). */
    public $feedwide_id;
    /** Value written to the `published_at` column. */
    public $timestamp;
    public $title;
    public $content;
    public $link;

    /**
     * Inserts this item into `entries` unless a row with the same
     * (of_feed, feedwide_id) pair is already stored.
     *
     * @return void
     */
    public function save()
    {
      $row = DB::selectOne(
        'select count(1) as feed_exists from entries ' .
        'where of_feed = ? and feedwide_id = ?',
        [ $this->of_feed, $this->feedwide_id ]);

      // Any positive count means the entry is already stored. Comparing
      // against exactly 1 would keep inserting further copies as soon as two
      // duplicates exist.
      if ((int) $row['feed_exists'] > 0) {
        return;
      }

      DB::query(
        'insert into entries (of_feed, feedwide_id, created_at, ' .
        'published_at, title, content, link) values (?, ?, ?, ?, ?, ?, ?)',
        [ $this->of_feed, $this->feedwide_id, time(), $this->timestamp,
          $this->title, $this->content, $this->link ]);
    }
  }
  /**
   * A subscribed feed: a row of the `feeds` table plus the logic to fetch,
   * parse and store its entries and to schedule the next poll.
   */
  class Feed
  {
    public $_id;
    public $name;
    public $url;
    /** Feed format; sync() supports 'rss' and 'atom'. */
    public $type;
    /** Poll interval in minutes taken from the feed, or null when unknown. */
    public $ttl;
    protected $entries;

    /**
     * @param array $self  Row data with the keys `_id`, `name`, `url`, `type`.
     */
    public function __construct($self)
    {
      $this->_id = $self['_id'];
      $this->name = $self['name'];
      $this->url = $self['url'];
      $this->type = $self['type'];
    }

    /**
     * Loads the feed with the given id.
     *
     * @return Feed|null  null when no matching row is returned.
     */
    public static function lookup($id)
    {
      $data = DB::selectOne('select * from feeds where _id = ?', [ $id ]);
      return $data === null ? null : new Feed($data);
    }

    /**
     * Fetches the feed over HTTP, parses it and saves every entry.
     * Updates $this->ttl when the feed advertises a usable ttl.
     *
     * @throws \Exception  on an unsupported feed type or a non-2xx response.
     */
    public function sync()
    {
      // Strict comparison so that non-string values never match loosely.
      if ( ! in_array($this->type, [ 'rss', 'atom' ], true)) {
        throw new \Exception("Don't know how to parse feed type {$this->type}");
      }

      [ $status, , $resp ] = httpc_request('GET', $this->url);
      if ($status < 200 || $status >= 300) {
        throw new \Exception("Feed fetch failed with status {$status}");
      }

      $p = $this->type === 'rss' ? new RSSParser() : new AtomParser();
      $p->ingest($resp);
      $entries = $p->feedItems();
      $meta = $p->feedMeta();

      // A non-null key means feedfixr wants to post-process this feed's
      // entries before they are saved.
      $key = feedfixr_has($this->url);
      foreach ($entries as $entry) {
        $entry->of_feed = $this->_id;
        if ($key !== null) {
          $entry = feedfixr_entry($key, $this, $entry);
        }
        $entry->save();
      }

      // The parser reports ttl as raw element text. Only accept a positive
      // number: an empty or garbage value would break the arithmetic in
      // rescheduleSync(), and 0 would reschedule the feed immediately.
      $ttl = $meta['ttl'] ?? null;
      if (is_numeric($ttl) && (int) $ttl > 0) {
        $this->ttl = (int) $ttl;
      }
    }

    /**
     * Queues the next poll of this feed in `feed_poll_schedule`: now plus the
     * feed's ttl (minutes) or the default interval, with random jitter.
     *
     * @param mixed $last_sync  Unused; kept for interface compatibility.
     */
    public function rescheduleSync($last_sync)
    {
      if (is_numeric($this->ttl) && (int) $this->ttl > 0) {
        $interval = (int) $this->ttl * 60;
      } else {
        $interval = FEED_DEFAULT_SYNC_INTERVAL;
      }

      $next = time() + $interval + mt_rand(-FEED_JITTER, FEED_JITTER);

      DB::query('insert into feed_poll_schedule ' .
        '(of_feed, at, done) values (?, ?, 0)',
        [ $this->_id, $next ]);
    }
  }
  /**
   * Base class for the event-driven feed parsers.
   *
   * Wraps an ext/xml parser, tracks namespace declarations itself and hands
   * subclasses every element as a [ namespace, local name ] pair together
   * with the character data buffered for it in $_charbuf.
   */
  abstract class XMLParser
  {
    protected $p;
    /** Stack of resolved [ namespace, name ] pairs of the open elements. */
    protected $_els = [ ];
    /** Map of prefix => namespace URI; key '' holds the default namespace. */
    protected $_ns;
    /**
     * Stack of default-namespace scopes. Each frame is
     * [ declaring element name, namespace URI, number of open elements ].
     */
    protected $_nsHistory = [ ];
    /** Character data collected since the last element end, or null. */
    protected $_charbuf;

    public function __construct()
    {
      $this->_ns = [ ];
      $this->p = xml_parser_create();
      xml_parser_set_option($this->p, XML_OPTION_CASE_FOLDING, 0);
      xml_set_start_namespace_decl_handler($this->p,
        [ $this, '_xmlStartNamespace' ]);
      xml_set_element_handler($this->p,
        [ $this, '_xmlStartElementNSWrapper' ],
        [ $this, '_xmlEndElementNSWrapper' ]);
      xml_set_character_data_handler($this->p,
        [ $this, '_xmlCdata' ]);
    }

    public function __destruct()
    {
      xml_parser_free($this->p);
    }

    /**
     * Splits a raw element name into [ namespace, local name ].
     *
     * A prefixed name resolves to the URI registered for its prefix, or to
     * the prefix itself when it is unknown. An unprefixed name gets the
     * current default namespace, or null when none is in effect.
     *
     * @param string $name
     * @param array $attrs  Unused.
     * @return array
     */
    public function nsResolve($name, $attrs)
    {
      if (strpos($name, ':') !== false) {
        [ $prefix, $local ] = explode(':', $name, 2);
        return [ $this->_ns[$prefix] ?? $prefix, $local ];
      }
      if (array_key_exists('', $this->_ns)) {
        return [ $this->_ns[''], $name ];
      }
      return [ null, $name ];
    }

    public function _xmlStartNamespace($p, $prefix, $uri)
    {
      $this->_ns[$prefix] = $uri;
    }

    /** Buffers character data; whitespace-only chunks are ignored. */
    public function _xmlCdata($p, $data)
    {
      if (trim($data) === '') {
        return;
      }
      $this->_charbuf = ($this->_charbuf ?? '') . $data;
    }

    /** Feeds a chunk of XML text to the underlying parser. */
    public function ingest(string $xml)
    {
      xml_parse($this->p, $xml);
    }

    public function _xmlStartElementNSWrapper($p, $name, $attrs)
    {
      // The parser is created without namespace support, so namespace
      // declarations arrive as ordinary attributes and the namespace
      // declaration handler is not invoked. Register prefixes here, before
      // resolving the element's own name.
      foreach ($attrs as $attr => $value) {
        $attr = (string) $attr;
        if (strncmp($attr, 'xmlns:', 6) === 0) {
          $this->_ns[substr($attr, 6)] = $value;
        }
      }

      if (array_key_exists('xmlns', $attrs)) {
        // New default-namespace scope, opened by this element.
        $this->_nsHistory[] = [ $name, $attrs['xmlns'], 1 ];
        $this->_ns[''] = $attrs['xmlns'];
      } else if (count($this->_nsHistory) > 0) {
        $this->_nsHistory[count($this->_nsHistory) - 1][2] += 1;
      }

      $el = $this->_els[] = $this->nsResolve($name, $attrs);
      $this->_xmlStartElement($p, $el, $attrs);
    }

    public function _xmlEndElementNSWrapper($p, $name)
    {
      $depth = count($this->_nsHistory);
      if ($depth > 0) {
        $top = $depth - 1;
        $this->_nsHistory[$top][2] -= 1;
        [ $el, , $open ] = $this->_nsHistory[$top];

        if ($open === 0 && $name === $el) {
          // The element that declared the current default namespace closed:
          // fall back to the enclosing scope. The enclosing frame's counter
          // is left alone, because the element that just closed was counted
          // in its own frame and never in the parent's.
          array_pop($this->_nsHistory);
          if ($top > 0) {
            $this->_ns[''] = $this->_nsHistory[$top - 1][1];
          } else {
            unset($this->_ns['']);
          }
        }
      }

      $el = array_pop($this->_els);
      $this->_xmlEndElement($p, $el);
      // Character data belongs to the element that just ended.
      $this->_charbuf = null;
    }

    abstract public function feedItems();
    abstract public function feedMeta();
    abstract public function _xmlStartElement($p, array $ns_name, $attrs);
    abstract public function _xmlEndElement($p, array $ns_name);
  }
  /**
   * Parser for RSS feeds. Collects one Item per `item` element and exposes
   * the channel's `ttl` through feedMeta(). Elements are matched by local
   * name only.
   */
  class RSSParser extends XMLParser
  {
    protected $_meta = [ ];
    protected $_items = [ ];
    /** Item currently being filled, or null outside of an `item` element. */
    protected $_item;

    public function feedItems()
    {
      return $this->_items;
    }

    public function feedMeta()
    {
      return $this->_meta;
    }

    public function _xmlStartElement($p, array $ns_name, $attrs)
    {
      $name = $ns_name[1];
      if ($name === 'item') {
        if ($this->_item === null) {
          $this->_item = new Item();
        }
      } else if ($name === 'guid') {
        // Remembered until the matching end tag, where the text is known.
        $this->_meta['guid_isPermalink'] = $attrs['isPermaLink'] ?? null;
      }
    }

    public function _xmlEndElement($p, array $ns_name)
    {
      $name = $ns_name[1];
      // null when the element carried no non-whitespace character data.
      $text = $this->_charbuf;

      if ($this->_item === null) {
        // An empty <ttl/> carries no information; skipping it also avoids
        // passing null to trim(), which throws under strict_types.
        if ($name === 'ttl' && $text !== null) {
          $this->_meta['ttl'] = trim($text);
        }
        return;
      }

      $item = $this->_item;
      switch ($name) {
      case 'title':
        $item->title = $text;
        break;

      case 'description':
        // we just assume it's serialized HTML
        // An empty description is treated as an empty string so that the
        // string functions below never receive null.
        $descr = html_entity_decode($text ?? '');
        $descr = strip_tags($descr);
        $item->content = ws_normalize($descr);
        break;

      case 'guid':
        if (array_key_exists('guid_isPermalink', $this->_meta)) {
          $isPermalink = $this->_meta['guid_isPermalink'];
          if ($item->link === null && $isPermalink === 'true') {
            $item->link = $text;
          }
          unset($this->_meta['guid_isPermalink']);
        }
        $item->feedwide_id = $text;
        break;

      case 'pubDate':
        // Leave the timestamp unset for an empty <pubDate/>.
        if ($text !== null) {
          $item->timestamp = time_rfc2822(trim($text));
        }
        break;

      case 'link':
        $item->link = $text;
        break;

      case 'item':
        $this->_items[] = $item;
        $this->_item = null;
        break;
      }
    }
  }
  /**
   * Parser for Atom feeds. Collects one Item per Atom `entry` element.
   * Inline XHTML is re-serialized into a string fragment and handed to the
   * element that encloses it (e.g. `content type="xhtml"`).
   */
  class AtomParser extends XMLParser
  {
    protected $_nsHistory = [ ];
    protected $_items = [ ];
    /** Item currently being filled, or null outside of an `entry`. */
    protected $_item;
    /** Stack of the attributes of the open elements. */
    protected $_attrs = [ ];
    /** XHTML fragment being serialized, or null when not inside XHTML. */
    protected $_xhtml;
    /** Number of open elements inside the fragment being serialized. */
    protected $_xhtmlLvl = 0;
    /**
     * Finished XHTML fragment waiting for its enclosing element to end.
     * It is kept separately because the base class clears $_charbuf after
     * every element end, which would discard the fragment before the
     * enclosing element can read it.
     */
    protected $_xhtmlDone;

    public function feedItems()
    {
      return $this->_items;
    }

    public function feedMeta()
    {
      return [ ];
    }

    public function _xmlStartElement($p, $ns_name, $attrs)
    {
      [ $ns, $name ] = $ns_name;
      if ($ns === XMLNS_ATOM && $name === 'entry') {
        if ($this->_item === null) {
          $this->_item = new Item();
        }
      } else if ($ns === XMLNS_XHTML || $this->_xhtml !== null) {
        // Elements from other namespaces nested inside XHTML are serialized
        // and counted as well, so that the level stays balanced with the
        // decrement performed for every element end inside the fragment.
        if ($ns === XMLNS_XHTML && ! array_key_exists('xmlns', $attrs)) {
          $attrs['xmlns'] = XMLNS_XHTML;
        }
        $this->_xhtml = ($this->_xhtml ?? '') . "<{$name}";
        if (count($attrs) > 0) {
          $this->_xhtml .= ' ' . xe_attr_serialize($attrs);
        }
        $this->_xhtml .= '>';
        $this->_xhtmlLvl += 1;
      }
      $this->_attrs[] = $attrs;
    }

    public function _xmlEndElement($p, $ns_name)
    {
      [ $ns, $name ] = $ns_name;
      $attrs = array_pop($this->_attrs);

      if ($this->_xhtml !== null) {
        $this->_xhtml .= $this->_charbuf;
        $this->_xhtml .= "</{$name}>";
        $this->_xhtmlLvl -= 1;
        if ($this->_xhtmlLvl === 0) {
          $this->_xhtmlDone = ($this->_xhtmlDone ?? '') . $this->_xhtml;
          $this->_xhtml = null;
        }
        return;
      }

      // Text of this element: a finished XHTML fragment (if any) followed by
      // the buffered character data. Stays null when there is neither.
      $text = $this->_charbuf;
      if ($this->_xhtmlDone !== null) {
        $text = $this->_xhtmlDone . $text;
        $this->_xhtmlDone = null;
      }

      if ($this->_item === null) {
        return;
      }

      $item = $this->_item;
      switch ($name) {
      case 'id':
        $item->feedwide_id = $text;
        break;

      case 'title':
        $item->title = $this->_atomSanitizeString($attrs, $text ?? '');
        break;

      case 'content':
        if ($item->content === null) {
          $item->content = $this->_atomSanitizeString($attrs, $text ?? '');
        }
        break;

      case 'summary':
        $item->content = $this->_atomSanitizeString($attrs, $text ?? '');
        break;

      case 'updated':
        if ($item->timestamp === null) {
          $item->timestamp = $this->_atomParseTime($text);
        }
        break;

      case 'published':
        // Takes precedence over `updated`, but only when it parses.
        $published = $this->_atomParseTime($text);
        if ($published !== null) {
          $item->timestamp = $published;
        }
        break;

      case 'link':
        // RFC 4287: a link without `rel` is an "alternate" link.
        $rel = $attrs['rel'] ?? 'alternate';
        $href = $attrs['href'] ?? null;
        if ($rel === 'alternate' && $href !== null) {
          $isHtml = ($attrs['type'] ?? null) === 'text/html';
          if ($item->link === null || $isHtml) {
            $item->link = $href;
          }
        }
        break;

      case 'entry':
        $this->_items[] = $item;
        $this->_item = null;
        break;
      }
    }

    /**
     * Converts the text of an Atom text construct to whitespace-normalized
     * plain text, stripping tags for the `html` and `xhtml` types.
     *
     * @param array|null $attrs  Attributes of the element (`type` is read).
     * @param string|null $text  Text to sanitize; defaults to the buffered
     *                           character data, or '' when there is none.
     * @return mixed  Result of ws_normalize().
     */
    protected function _atomSanitizeString($attrs, $text = null)
    {
      $t = $attrs['type'] ?? 'text';
      $text = $text ?? $this->_charbuf ?? '';
      if ($t === 'html' || $t === 'xhtml') {
        $text = strip_tags($text);
      }
      return ws_normalize(trim($text));
    }

    /**
     * Parses a date string with strtotime().
     *
     * @param string|null $text
     * @return int|null  Unix timestamp, or null for empty or unparsable text.
     */
    protected function _atomParseTime($text)
    {
      if ($text === null || trim($text) === '') {
        return null;
      }
      $ts = strtotime(trim($text));
      return $ts === false ? null : $ts;
    }
  }
  371. }