PageRenderTime 50ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/model/feed.php

https://github.com/NeoRazorX/feedstorm
PHP | 653 lines | 600 code | 30 blank | 23 comment | 37 complexity | 991fad5510d565b16f5f22be8261b0aa MD5 | raw file
Possible License(s): LGPL-3.0
  1. <?php
  2. /*
  3. * This file is part of FeedStorm
  4. * Copyright (C) 2014 Carlos Garcia Gomez neorazorx@gmail.com
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU Affero General Public License as
  8. * published by the Free Software Foundation, either version 3 of the
  9. * License, or (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU Affero General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Affero General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. require_once 'base/fs_model.php';
  20. require_once 'model/feed_story.php';
  21. require_once 'model/suscription.php';
  22. require_once 'model/story.php';
  23. class feed extends fs_model
  24. {
  /// Remote address of the feed: read() downloads it and get_by_url() looks feeds up by it.
  25. public $url;
  /// Display name: a random string until read() replaces it with the title found in the feed.
  26. public $name;
  /// Free-text description: a fixed placeholder until read() finds one in the feed.
  27. public $description;
  /// Unix timestamp of the last read() run (0 for a feed that was never read).
  28. public $last_check_date;
  /// Greatest story date seen so far by new_story() (0 when no story has been seen).
  29. public $last_update;
  /// Cached subscription count, refreshed from suscriptors() on every read().
  30. public $suscriptors;
  /// Consecutive failure counter: read() increments it on errors, new_story() resets it,
  /// and mini_cron_job() deletes the feed once it goes above 72.
  31. public $strikes;
  /// Cached story count, refreshed from num_stories() on every read().
  32. public $num_stories;
  /// Flag copied onto the stories this feed creates; also lets a feed overwrite the
  /// title and description of a story that is not flagged as native (see new_story()).
  33. public $native_lang;
  /// Default parody flag for stories coming from this feed.
  34. public $parody;
  /// Penalization flag copied onto the stories this feed creates.
  35. public $penalize;
  /**
   * Builds a feed with default values, optionally overwritten from a stored document.
   *
   * @param array|false $f Document read from the 'feeds' collection, or FALSE for a
   *                       brand-new feed. Every key except 'parody' and 'penalize'
   *                       is read unconditionally.
   */
  public function __construct($f=FALSE)
  {
    parent::__construct('feeds');

    /// defaults for a feed that has not been stored yet
    $this->id = NULL;
    $this->url = NULL;
    $this->name = $this->random_string(15);
    $this->description = 'Sin descripción.';
    $this->last_check_date = 0;
    $this->last_update = 0;
    $this->suscriptors = 0;
    $this->strikes = 0;
    $this->num_stories = 0;
    $this->native_lang = TRUE;
    $this->parody = FALSE;
    $this->penalize = FALSE;

    if( !$f )
      return;

    $this->id = $f['_id'];

    /// fields every stored document is expected to carry
    $required = array(
      'url', 'name', 'description', 'last_check_date', 'last_update',
      'suscriptors', 'strikes', 'num_stories', 'native_lang'
    );
    foreach($required as $field)
      $this->{$field} = $f[$field];

    /// fields that may be missing from the document; the defaults above are kept then
    foreach(array('parody', 'penalize') as $field)
    {
      if( isset($f[$field]) )
        $this->{$field} = $f[$field];
    }
  }
  /**
   * Makes sure the collection has an index on each field used for lookups and sorting.
   */
  public function install_indexes()
  {
    foreach(array('url', 'name') as $field)
      $this->collection->ensureIndex($field);
  }
  /**
   * Returns the internal address of this feed's page.
   *
   * @param bool $w3c Not used by this implementation.
   * @return string The site index when the feed has no id, its explore page otherwise.
   */
  public function url($w3c = TRUE)
  {
    $page = is_null($this->id) ? 'index.php' : 'explore_feed/'.$this->id;
    return FS_PATH.$page;
  }
  /**
   * Returns the feed address shortened for display.
   *
   * @param int $size Maximum number of characters kept before the ellipsis.
   * @return string The address itself when shorter than $size, otherwise its
   *                first $size characters followed by '...'.
   */
  public function show_url($size=60)
  {
    if( mb_strlen($this->url) >= $size )
      return mb_substr($this->url, 0, $size).'...';

    return $this->url;
  }
  /**
   * Returns the moment of the last check formatted as 'Y-m-d H:i'.
   *
   * @return string '-' when no timestamp is stored, the formatted date otherwise.
   */
  public function last_check_date()
  {
    if( is_null($this->last_check_date) )
      return '-';

    /// 'i' is minutes; the former 'H:m' printed the month number after the hour
    return date('Y-m-d H:i', $this->last_check_date);
  }
  /**
   * Returns the last check expressed through time2timesince().
   *
   * @return string '-' when no timestamp is stored, otherwise whatever
   *                time2timesince() returns for it.
   */
  public function last_check_timesince()
  {
    return is_null($this->last_check_date)
      ? '-'
      : $this->time2timesince($this->last_check_date);
  }
  /**
   * Returns the date of the newest story seen, formatted as 'Y-m-d H:i'.
   *
   * @return string '-' when no timestamp is stored, the formatted date otherwise.
   */
  public function last_update()
  {
    if( is_null($this->last_update) )
      return '-';

    /// 'i' is minutes; the former 'H:m' printed the month number after the hour
    return date('Y-m-d H:i', $this->last_update);
  }
  /**
   * Returns the date of the newest story expressed through time2timesince().
   *
   * @return string '-' when no timestamp is stored, otherwise whatever
   *                time2timesince() returns for it.
   */
  public function last_update_timesince()
  {
    return is_null($this->last_update)
      ? '-'
      : $this->time2timesince($this->last_update);
  }
  /**
   * Tells whether this feed is served from www.meneame.net over plain http.
   *
   * @return bool TRUE when the feed address starts with that prefix.
   */
  public function meneame()
  {
    $prefix = 'http://www.meneame.net/';
    $head = mb_substr($this->url, 0, mb_strlen($prefix));
    return ( $head == $prefix );
  }
  /**
   * Tells whether this feed is served from www.reddit.com over plain http.
   *
   * @return bool TRUE when the feed address starts with that prefix.
   */
  public function reddit()
  {
    $prefix = 'http://www.reddit.com/';
    $head = mb_substr($this->url, 0, mb_strlen($prefix));
    return ( $head == $prefix );
  }
  /**
   * Returns the stories behind the latest feed_story entries of this feed.
   *
   * Entries whose story cannot be resolved are skipped.
   *
   * @return array List of whatever feed_story::story() returns (truthy values only).
   */
  public function stories()
  {
    $feed_story = new feed_story();
    $stories = array();

    foreach($feed_story->last4feed($this->id) as $fs)
    {
      /// resolve the story once; it used to be looked up twice per entry
      $story = $fs->story();
      if($story)
        $stories[] = $story;
    }

    return $stories;
  }
  /**
   * Asks the suscription model how many subscriptions point at this feed.
   *
   * @return mixed The value returned by suscription::count4feed() for this feed id.
   */
  public function suscriptors()
  {
    $counter = new suscription();
    $total = $counter->count4feed($this->id);
    return $total;
  }
  /**
   * Asks the feed_story model how many stories are linked to this feed.
   *
   * @return mixed The value returned by feed_story::count4feed() for this feed id.
   */
  public function num_stories()
  {
    $counter = new feed_story();
    $total = $counter->count4feed($this->id);
    return $total;
  }
  /**
   * Downloads the feed, turns each of its items into a story and saves the feed.
   *
   * The XML is stored in tmp/<id>.xml and parsed with SimpleXML. Items are
   * looked for, in this order, at channel/item, item, feed/entry and entry.
   * The feed title and description are refreshed when present. Any failure
   * (download, parse, unknown structure, exception) records an error and adds
   * one strike. Whatever happens, the check date and the cached counters are
   * updated and the feed is saved.
   */
  public function read()
  {
    try
    {
      /// every step below works on the same temporary file
      $tmp_file = 'tmp/'.$this->get_id().'.xml';

      /// NOTE(review): the two extra TRUE flags are skipped for reddit feeds;
      /// their meaning is defined by curl_save(), which is not visible here.
      if( $this->reddit() )
        $this->curl_save($this->url, $tmp_file);
      else
        $this->curl_save($this->url, $tmp_file, TRUE, TRUE);

      if( file_exists($tmp_file) )
      {
        libxml_use_internal_errors(TRUE);
        $xml = simplexml_load_file($tmp_file);

        /// internal errors pile up in libxml's buffer until cleared; without
        /// this a cron run over many feeds keeps every parse error in memory
        libxml_clear_errors();

        if($xml)
        {
          /// try to locate the stories
          $items = array();
          if( $xml->channel->item )
            $items = $xml->channel->item;
          else if( $xml->item )
            $items = $xml->item;
          else if( $xml->feed->entry )
            $items = $xml->feed->entry;
          else if( $xml->entry )
            $items = $xml->entry;
          else
          {
            $this->new_error("Estructura irreconocible en el feed: ".$this->name);
            $this->strikes++;
          }

          foreach($items as $item)
            $this->new_story($item);

          /// read the feed title (first one found)
          if( $xml->channel->title )
            $this->name = $this->remove_bad_utf8( (string)$xml->channel->title );
          else if( $xml->title )
          {
            foreach($xml->title as $item)
            {
              $this->name = $this->remove_bad_utf8( (string)$item );
              break;
            }
          }

          /// read the feed description (first one found)
          if( $xml->channel->description )
            $this->description = $this->remove_bad_utf8( (string)$xml->channel->description );
          else if( $xml->description )
          {
            foreach($xml->description as $item)
            {
              $this->description = $this->remove_bad_utf8( (string)$item );
              break;
            }
          }
        }
        else
        {
          $this->new_error("Imposible leer el xml.");
          $this->strikes++;
        }
      }
      else
      {
        $this->new_error("Imposible leer el archivo: ".$tmp_file);
        $this->strikes++;
      }
    }
    catch(Exception $e)
    {
      $this->new_error("Error al leer el feed: ".$this->url.'. '.$e);
      $this->strikes++;
    }

    $this->last_check_date = time();
    $this->suscriptors = $this->suscriptors();
    $this->num_stories = $this->num_stories();
    $this->save();
  }
  /**
   * Turns one feed item into a story linked to this feed.
   *
   * Steps: reset the strike counter, work out the story link (meneame
   * namespace, reddit "[link]" anchor, feedburner origLink, then plain link
   * elements), date and description, and then either attach the item to an
   * already known story or store a new one if it is recent enough.
   *
   * Changes with respect to the previous implementation:
   *  - the related-story pattern no longer uses a greedy '(.+)', which could
   *    swallow several links sitting on the same line into one capture;
   *  - the text before the first '</p>' is cut with mb_strpos() instead of a
   *    character-by-character loop;
   *  - the duplicated rel/type test on link elements is reduced to the one
   *    condition that actually decided the outcome.
   *
   * @param SimpleXMLElement $item An item/entry node of the parsed feed.
   */
  private function new_story(&$item)
  {
    /// the feed delivered at least one item, so it is not failing
    $this->strikes = 0;

    $title = $this->remove_bad_utf8( (string)$item->title );

    $feed_story = new feed_story();
    $feed_story->feed_id = $this->id;
    $feed_story->title = $title;

    $story = new story();
    $story->title = $title;

    /// try to get the original link and the votes from the meneame namespace
    $meneos = 0;
    foreach($item->children('meneame', TRUE) as $element)
    {
      if($element->getName() == 'url')
      {
        $story->link = (string)$element;
        $feed_story->link = (string)$item->link;
      }
      else if($element->getName() == 'votes')
      {
        $meneos = intval( (string)$element );
      }
    }

    /// is it a humor story? the feed flag is the default, a category can force it
    $story->parody = $this->parody;
    if($item->category)
    {
      foreach($item->category as $catg)
      {
        if( strpos((string)$catg, 'humor') !== FALSE )
        {
          $story->parody = TRUE;
          break;
        }
      }
    }

    /// reddit: the original link is the "[link]" anchor inside the description
    if( $this->reddit() )
    {
      $links = array();
      if( preg_match_all("/<a href=\"([^\"]*)\">\[link/", (string)$item->description, $links) )
      {
        $story->link = $links[1][0];
        $feed_story->link = (string)$item->link;
      }
    }

    if( is_null($story->link) )
    {
      /// try to get the original link from feedburner
      foreach($item->children('feedburner', TRUE) as $element)
      {
        if($element->getName() == 'origLink')
        {
          $story->link = (string)$element;
          break;
        }
      }

      /// try the link element(s); there is no break, so the last usable one wins
      if( is_null($story->link) AND $item->link)
      {
        foreach($item->link as $l)
        {
          if( mb_substr((string)$l, 0, 4) == 'http' )
            $story->link = (string)$l;
          else if( $l->attributes()->type == 'text/html' )
            $story->link = (string)$l->attributes()->href;
        }
      }

      $feed_story->link = $story->link;
    }

    /// replace the &amp; entities left in the link
    $story->link = str_replace('&amp;', '&', $story->link);

    /// dates in the future are clamped to now
    if( $item->pubDate )
      $story->date = min( array( strtotime( (string)$item->pubDate ), time() ) );
    else if( $item->published )
      $story->date = min( array( strtotime( (string)$item->published ), time() ) );

    $feed_story->date = $story->date;
    if($feed_story->date > $this->last_update)
      $this->last_update = $feed_story->date;

    if( $item->description )
      $description = (string)$item->description;
    else if( $item->content )
      $description = (string)$item->content;
    else if( $item->summary )
      $description = (string)$item->summary;
    else
    {
      $description = '';

      /// try the atom namespace
      foreach($item->children('atom', TRUE) as $element)
      {
        if($element->getName() == 'summary')
        {
          $description = (string)$element;
          break;
        }
      }

      /// content:encoded, when present, takes precedence over atom:summary
      foreach($item->children('content', TRUE) as $element)
      {
        if($element->getName() == 'encoded')
        {
          $description = (string)$element;
          break;
        }
      }
    }

    if( $this->meneame() )
    {
      /// look for related stories; [^"]+ keeps each capture inside its own href
      $urls = array();
      if( preg_match_all('#<a href="http://www.meneame.net/story/([^"]+)" title=#', $description, $urls) )
      {
        foreach($urls[1] as $url)
        {
          $fs0 = $feed_story->get_by_link('http://www.meneame.net/story/'.$url);
          if($fs0)
          {
            $story3 = $fs0->story();
            if($story3)
              $story->related_id = $story3->get_id();
          }
        }
      }

      /// drop the meneame boilerplate: keep only the text before the first </p>
      $end = mb_strpos($description, '</p>');
      if($end !== FALSE)
        $description = mb_substr($description, 0, $end);
    }
    else if( $this->reddit() )
    {
      $description = $story->title;
    }

    /// remove the html: style blocks first, then every remaining tag
    $description = preg_replace("/<\s*style.+?<\s*\/\s*style.*?>/si", '', html_entity_decode($description, ENT_QUOTES, 'UTF-8') );
    $story->description = $this->remove_bad_utf8( strip_tags($description) );

    /// does the story already exist? first by link...
    $story2 = $story->get_by_link($story->link);

    /// ...and then by the name generated for it
    if(!$story2)
    {
      $story->new_name();
      $story2 = $story->get($story->name);
    }

    if($story2)
    {
      /// is the story already linked to this feed?
      $encontrada = FALSE;
      foreach($feed_story->all4story($story2->get_id()) as $fs)
      {
        if($fs->feed_id == $this->id)
        {
          $encontrada = TRUE;
          break;
        }
      }

      if( !$encontrada )
      {
        $feed_story->story_id = $story2->get_id();
        $feed_story->save();

        /// a native feed supplies the text for a story that was not native
        if( !$story2->native_lang AND $this->native_lang )
        {
          $story2->native_lang = TRUE;
          $story2->title = $story->title;
          $story2->description = $story->description;
        }

        /// a non-penalized feed lifts the penalization of the story
        if($story2->penalize AND !$this->penalize)
          $story2->penalize = FALSE;

        /// update the story
        if($meneos > $story2->meneos)
          $story2->meneos = $meneos;

        $story2->random_count( !$this->meneame() );
        $story2->num_feeds++;
        $story2->save();
      }
      else if( mt_rand(0, 4) == 0 )
      {
        /// already linked: refresh the story only one time out of five
        if($meneos > $story2->meneos)
          $story2->meneos = $meneos;

        $story2->random_count( !$this->meneame() );
        $story2->save();
      }
    }
    else if( $story->date > time() - FS_MAX_AGE ) /// old stories are not stored
    {
      $story->meneos = $meneos;
      $story->random_count( $meneos == 0 );
      $story->native_lang = $this->native_lang;
      $story->penalize = $this->penalize;
      $story->num_feeds = 1;
      $story->save(); /// must be saved first so that it gets an id

      $feed_story->story_id = $story->get_id();
      $feed_story->save();
    }
  }
  /**
   * Loads a feed by its identifier.
   *
   * @param string $id Identifier, converted to a MongoId for the query.
   * @return feed|false The feed, or FALSE when nothing matches or an exception
   *                    is raised (the exception is recorded through new_error()).
   */
  public function get($id)
  {
    $this->add2history(__CLASS__.'::'.__FUNCTION__);

    try
    {
      $filter = array('_id' => new MongoId($id));
      $data = $this->collection->findone($filter);
      return $data ? new feed($data) : FALSE;
    }
    catch(Exception $e)
    {
      $this->new_error($e);
      return FALSE;
    }
  }
  /**
   * Loads a feed by its remote address.
   *
   * @param string $url Address to look for; passed through var2str() before querying.
   * @return feed|false The feed, or FALSE when no document matches.
   */
  public function get_by_url($url)
  {
    $this->add2history(__CLASS__.'::'.__FUNCTION__);

    $filter = array('url' => $this->var2str($url) );
    $data = $this->collection->findone($filter);

    return $data ? new feed($data) : FALSE;
  }
  /**
   * Tells whether this feed is already stored in the collection.
   *
   * @return bool FALSE when the feed has no id or no document carries that id.
   */
  public function exists()
  {
    /// without an id there is nothing to look for, so no query is issued
    if( is_null($this->id) )
      return FALSE;

    $this->add2history(__CLASS__.'::'.__FUNCTION__);
    $data = $this->collection->findone( array('_id' => $this->id) );

    return $data ? TRUE : FALSE;
  }
  /**
   * Normalizes the feed's fields and validates its address before saving.
   *
   * A negative subscriber count becomes 0 and an empty name becomes
   * 'sin nombre'. The name default used to sit after both return statements
   * and therefore never ran; it is now applied before the address is checked.
   *
   * @return bool TRUE when the address passes FILTER_VALIDATE_URL; otherwise
   *              an error is recorded and FALSE is returned.
   */
  public function test()
  {
    if( $this->suscriptors < 0 )
      $this->suscriptors = 0;

    if($this->name == '')
      $this->name = 'sin nombre';

    if( filter_var($this->url, FILTER_VALIDATE_URL) )
      return TRUE;

    $this->new_error('URL no válida.');
    return FALSE;
  }
  /**
   * Stores the feed: updates its document when it exists, inserts it otherwise.
   *
   * The name and description go through true_text_break() with limits of 30
   * and 200 before being stored. After an insert the feed id is taken from
   * the '_id' key found in the inserted array.
   *
   * @return bool FALSE when test() rejects the feed, TRUE otherwise.
   */
  public function save()
  {
    if( !$this->test() )
      return FALSE;

    $data = array(
      'url' => $this->url,
      'name' => $this->ucfirst( $this->true_text_break($this->name, 30) ),
      'description' => $this->true_text_break($this->description, 200),
      'last_check_date' => $this->last_check_date,
      'last_update' => $this->last_update,
      'suscriptors' => $this->suscriptors,
      'strikes' => $this->strikes,
      'num_stories' => $this->num_stories,
      'native_lang' => $this->native_lang,
      'parody' => $this->parody,
      'penalize' => $this->penalize
    );

    $action = __CLASS__.'::'.__FUNCTION__;
    if( $this->exists() )
    {
      $this->add2history($action.'@update');
      $this->collection->update( array('_id' => $this->id), $data );
    }
    else
    {
      $this->add2history($action.'@insert');
      $this->collection->insert($data);
      $this->id = $data['_id'];
    }

    return TRUE;
  }
  /**
   * Removes the feed document and then everything that hangs from it:
   * first its subscriptions, then its feed_story links.
   */
  public function delete()
  {
    $this->add2history(__CLASS__.'::'.__FUNCTION__);

    $own_document = array('_id' => $this->id);
    $this->collection->remove($own_document);

    $subscriptions = new suscription();
    $subscriptions->delete4feed($this->id);

    $links = new feed_story();
    $links->delete4feed($this->id);
  }
  /**
   * Returns every stored feed, sorted by name in ascending order.
   *
   * @return array List of feed objects.
   */
  public function all()
  {
    $this->add2history(__CLASS__.'::'.__FUNCTION__);

    $cursor = $this->collection->find()->sort( array('name'=>1) );

    $list = array();
    foreach($cursor as $document)
      $list[] = new feed($document);

    return $list;
  }
  /**
   * Picks one of the stored feeds at random.
   *
   * mt_rand() includes both bounds, so the upper bound has to be count - 1;
   * the previous upper bound of count could select a position that does not
   * exist and return FALSE even though feeds were available. A collection
   * holding a single feed now yields that feed instead of FALSE.
   *
   * @return feed|false A feed, or FALSE when there are none.
   */
  public function random()
  {
    $all_feeds = $this->all();
    $total = count($all_feeds);

    if($total < 1)
      return FALSE;

    /// all() builds a plain 0-based list, so the position can be used directly
    return $all_feeds[ mt_rand(0, $total - 1) ];
  }
  /**
   * Announces the run and then processes every stored feed, one by one,
   * through mini_cron_job().
   */
  public function cron_job()
  {
    echo "\nProcesamos las fuentes...";

    $feeds = $this->all();
    foreach($feeds as $one_feed)
    {
      $one_feed->mini_cron_job();
    }
  }
  /**
   * Processes this feed during a cron run.
   *
   * A feed with more than 72 strikes is deleted. Any other feed is read,
   * after which its pending errors and messages are printed and cleared.
   */
  public function mini_cron_job()
  {
    if($this->strikes > 72)
    {
      $this->delete();
      echo "\n * Eliminada la fuente ".$this->url.".\n";
      return;
    }

    echo "\n * Procesando: ".$this->url."\n ** Archivo: tmp/".$this->get_id().".xml ...\n";
    $this->read();

    /// print what the read left behind: errors first, then messages
    foreach($this->get_errors() as $error)
      echo $error."\n";
    $this->clean_errors();

    foreach($this->get_messages() as $message)
      echo $message."\n";
    $this->clean_messages();
  }
  587. }