PageRenderTime 48ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/webapp/lib/Crawler.php

https://github.com/komagata/plnet
PHP | 436 lines | 372 code | 53 blank | 11 comment | 59 complexity | 099d1c5c02d77620ced1f33ce5b3a816 MD5 | raw file
Possible License(s): LGPL-2.1
  1. <?php
  2. require_once 'FeedParser.php';
  3. require_once 'DB/DataObject.php';
  4. require_once 'Utils.php';
  5. require_once 'LogUtils.php';
  6. require_once 'DBUtils.php';
  7. class Crawler
  8. {
  9. var $db;
  10. function Crawler()
  11. {
  12. $this->db = DBUtils::connect(false);
  13. }
  14. function crawlAll()
  15. {
  16. LogUtils::debug('[Crawling start]');
  17. $feeds = $this->getFeeds();
  18. $cnt = count($feeds);
  19. $success = 0;
  20. foreach ($feeds as $uri) {
  21. $result = $this->crawl($uri);
  22. if ($result === false) {
  23. LogUtils::debug("Crawl failed: $uri");
  24. } else {
  25. $success++;
  26. LogUtils::debug("[Crawling: {$success}/{$cnt}]");
  27. LogUtils::debug("Memory usage: ".
  28. number_format(memory_get_usage()));
  29. }
  30. }
  31. LogUtils::debug("[Crawling finished: {$success}/{$cnt}]");
  32. }
  33. function crawl($uri)
  34. {
  35. $this->db->query('BEGIN');
  36. LogUtils::debug("Feed: $uri");
  37. $parser =& new FeedParser($uri);
  38. if ($parser->parse($caching) === false) {
  39. trigger_error("Crawler::crawl(): Failed to parse feed: $uri", E_USER_NOTICE);
  40. return false;
  41. }
  42. $channel = $parser->getChannel();
  43. // feed
  44. if ($this->feedIsExists($channel['uri'])) {
  45. LogUtils::debug("Feed is exsists");
  46. if ($this->feedIsUpdated($channel)) {
  47. LogUtils::debug("Feed is updated");
  48. $feedId = $this->feedUpdate($channel);
  49. if ($feedId === false) return false;
  50. } else {
  51. LogUtils::debug("Feed is not updated");
  52. $feedId = $this->getFeedId($channel['uri']);
  53. }
  54. } else {
  55. LogUtils::debug("Feed is not exsists");
  56. $feedId = $this->feedInsert($channel);
  57. if ($feedId === false) return false;
  58. }
  59. // items
  60. foreach ($parser->getItems() as $item) {
  61. LogUtils::debug("Entry: {$item['uri']}");
  62. if ($this->entryIsExists($feedId, $item['uri'])) {
  63. LogUtils::debug("Entry is exsists");
  64. if ($this->entryIsUpdated($feedId, $item)) {
  65. LogUtils::debug("Entry is updated");
  66. $entryId = $this->entryUpdate($feedId, $item);
  67. if ($entryId === false) return false;
  68. } else {
  69. LogUtils::debug("Entry is not updated");
  70. $entryId = $this->getEntryId($feedId, $item['uri']);
  71. }
  72. } else {
  73. LogUtils::debug("Entry is not exsists");
  74. $entryId = $this->entryInsert($feedId, $item);
  75. if ($entryId === false) return false;
  76. }
  77. // tags
  78. if (is_array($item['category'])) {
  79. $item['category'] = $this->array_trim_lower_uniq($item['category']);
  80. LogUtils::debug("Tags is exsists");
  81. // tag
  82. foreach ($item['category'] as $tag) {
  83. LogUtils::debug("Tag: $tag");
  84. if ($this->tagIsExists($tag)) {
  85. LogUtils::debug("Tag is exsists");
  86. $tagId = $this->getTagId($tag);
  87. } else {
  88. LogUtils::debug("Tag is not exsists");
  89. $tagId = $this->tagInsert($tag);
  90. if ($tagId === false) return false;
  91. }
  92. }
  93. // entry to tag
  94. if ($this->tagsIsUpdated($entryId, $item['category'])) {
  95. // delete and insert
  96. LogUtils::debug("Tags is updated");
  97. $res = $this->tagsReplace($entryId, $item['category']);
  98. if ($res === false) return false;
  99. } else {
  100. LogUtils::debug("Tags is not updated");
  101. }
  102. } else {
  103. LogUtils::debug("Tags is not exsists");
  104. }
  105. }
  106. $this->db->commit();
  107. return true;
  108. }
  109. function feedIsExists($uri)
  110. {
  111. $cnt = $this->db->getOne('SELECT COUNT(id) FROM feed WHERE uri = ?', array($uri));
  112. if ($cnt > 0) {
  113. return true;
  114. } else {
  115. return false;
  116. }
  117. }
  118. function feedIsUpdated($feed)
  119. {
  120. $sql = 'SELECT * FROM feed WHERE uri = ?';
  121. $storedFeed = $this->db->getRow($sql, array($feed['uri']));
  122. // print_r($storedFeed); print_r($feed);
  123. if ($storedFeed['link'] == $feed['link'] and
  124. $storedFeed['title'] == $feed['title'] and
  125. $storedFeed['description'] == $feed['description'] and
  126. $storedFeed['favicon'] == $feed['favicon']) {
  127. return false;
  128. } else {
  129. return true;
  130. }
  131. }
  132. function getFeedId($uri)
  133. {
  134. $sql = 'SELECT id FROM feed WHERE uri = ?';
  135. $id = $this->db->getOne($sql, array($uri));
  136. return $id;
  137. }
  138. function feedInsert($feed)
  139. {
  140. $fields = array(
  141. 'uri' => $feed['uri'],
  142. 'link' => $feed['link'],
  143. 'title' => $feed['title'],
  144. 'description' => $feed['description'],
  145. 'favicon' => $feed['favicon'],
  146. 'lastupdatedtime' => $feed['last_modified']
  147. );
  148. $res = $this->db->autoExecute('feed', $fields, DB_AUTOQUERY_INSERT);
  149. if (DB::isError($res)) {
  150. $this->db->rollback();
  151. trigger_error('Crawler::feedInsert(): Failed to insert. '.$res->toString(), E_USER_WARNING);
  152. return false;
  153. } else {
  154. // specific mysql
  155. $id = $this->db->getOne('SELECT LAST_INSERT_ID()');
  156. LogUtils::debug("Feed insert: $id");
  157. return $id;
  158. }
  159. }
  160. function feedUpdate($feed)
  161. {
  162. $sql = 'SELECT id FROM feed WHERE uri = ?';
  163. $id = $this->db->getOne($sql, array($feed['uri']));
  164. $fields = array(
  165. 'uri' => $feed['uri'],
  166. 'link' => $feed['link'],
  167. 'title' => $feed['title'],
  168. 'description' => $feed['description'],
  169. 'favicon' => $feed['favicon'],
  170. 'lastupdatedtime' => $feed['last_modified']
  171. );
  172. $res = $this->db->autoExecute('feed', $fields, DB_AUTOQUERY_UPDATE, "id = $id");
  173. if (DB::isError($res)) {
  174. $this->db->rollback();
  175. trigger_error('Crawler::feedUpdate(): Failed to update. '.$res->toString(), E_USER_WARNING);
  176. return false;
  177. } else {
  178. LogUtils::debug("Feed update: $id");
  179. return $id;
  180. }
  181. }
  182. function entryIsExists($feedId, $entryUri)
  183. {
  184. $sql = 'SELECT COUNT(id) FROM entry WHERE feed_id = ? AND uri_md5 = ?';
  185. $cnt = $this->db->getOne($sql, array($feedId, md5($entryUri)));
  186. if ($cnt > 0) {
  187. return true;
  188. } else {
  189. return false;
  190. }
  191. }
  192. function entryIsUpdated($feedId, $entry)
  193. {
  194. $sql = 'SELECT id, uri, title, author,
  195. description, UNIX_TIMESTAMP(date) AS date
  196. FROM entry WHERE feed_id = ? AND uri_md5 = ?';
  197. $storedEntry = $this->db->getRow($sql, array($feedId, md5($entry['uri'])));
  198. // print_r($storedEntry); print_r($entry);
  199. if ($storedEntry['title'] == $entry['title'] and
  200. $storedEntry['description'] == $entry['description'] and
  201. $storedEntry['author'] == $entry['author'] and
  202. $storedEntry['date'] == $entry['date']) {
  203. return false;
  204. } else {
  205. return true;
  206. }
  207. }
  208. function getEntryId($feedId, $entryUri)
  209. {
  210. $sql = 'SELECT id FROM entry WHERE feed_id = ? AND uri_md5 = ?';
  211. $id = $this->db->getOne($sql, array($feedId, md5($entryUri)));
  212. return $id;
  213. }
  214. function entryInsert($feedId, $entry)
  215. {
  216. $fields = array(
  217. 'feed_id' => $feedId,
  218. 'uri' => $entry['uri'],
  219. 'uri_md5' => md5($entry['uri']),
  220. 'title' => $entry['title'],
  221. 'description' => $entry['description'],
  222. 'author' => $entry['author'],
  223. 'date' => date("Y-m-d H:i:s", $entry['date']),
  224. 'lastupdatedtime' => date("Y-m-d H:i:s")
  225. );
  226. $res = $this->db->autoExecute('entry', $fields, DB_AUTOQUERY_INSERT);
  227. if (DB::isError($res)) {
  228. $this->db->rollback();
  229. trigger_error('Crawler::entryInsert(): Failed to insert. '.$res->toString(), E_USER_WARNING);
  230. return false;
  231. } else {
  232. // specific mysql
  233. $id = $this->db->getOne('SELECT LAST_INSERT_ID()');
  234. LogUtils::debug("Entry insert: $id");
  235. return $id;
  236. }
  237. }
  238. function entryUpdate($feedId, $entry)
  239. {
  240. $sql = 'SELECT id FROM entry WHERE feed_id = ? AND uri_md5 = ?';
  241. $id = $this->db->getOne($sql, array($feedId, md5($entry['uri'])));
  242. $fields = array(
  243. 'feed_id' => $feedId,
  244. 'uri' => $entry['uri'],
  245. 'uri_md5' => md5($entry['uri']),
  246. 'title' => $entry['title'],
  247. 'description' => $entry['description'],
  248. 'author' => $entry['author'],
  249. 'date' => date("Y-m-d H:i:s", $entry['date']),
  250. 'lastupdatedtime' => date("Y-m-d H:i:s")
  251. );
  252. $res = $this->db->autoExecute('entry', $fields, DB_AUTOQUERY_UPDATE, "id = $id");
  253. if (DB::isError($res)) {
  254. $this->db->rollback();
  255. trigger_error('Crawler::entryUpdate(): Failed to update. '.$res->toString(), E_USER_WARNING);
  256. return false;
  257. } else {
  258. LogUtils::debug("Entry update: $id");
  259. return $id;
  260. }
  261. }
  262. function tagIsExists($tag)
  263. {
  264. $sql = 'SELECT COUNT(id) FROM tag WHERE name = ?';
  265. $cnt = $this->db->getOne($sql, array($tag));
  266. if ($cnt > 0) {
  267. return true;
  268. } else {
  269. return false;
  270. }
  271. }
  272. function getTagId($tag)
  273. {
  274. $sql = 'SELECT id FROM tag WHERE name = ?';
  275. $id = $this->db->getOne($sql, array($tag));
  276. return $id;
  277. }
  278. function tagInsert($tag)
  279. {
  280. $fields = array(
  281. 'name' => $tag,
  282. 'updatedtime' => date("Y-m-d H:i:s")
  283. );
  284. $res = $this->db->autoExecute('tag', $fields, DB_AUTOQUERY_INSERT);
  285. if (DB::isError($res)) {
  286. $this->db->rollback();
  287. trigger_error('Crawler::tagInsert(): Failed to insert. '
  288. .$res->toString(), E_USER_WARNING);
  289. return false;
  290. } else {
  291. // specific mysql
  292. $id = $this->db->getOne('SELECT LAST_INSERT_ID()');
  293. LogUtils::debug("Tag insert: $id");
  294. return $id;
  295. }
  296. }
  297. function tagsIsUpdated($entryId, $tags)
  298. {
  299. $sql = 'SELECT name FROM tag t
  300. JOIN entry_to_tag e2t ON t.id = e2t.tag_id
  301. WHERE entry_id = ?';
  302. $stored = $this->db->getAll($sql, array($entryId));
  303. if (DB::isError($stored)) {
  304. $this->db->rollback();
  305. trigger_error('Crawler::tagsIsUpdated(): Failed to select. '
  306. .$stored->toString(), E_USER_WARNING);
  307. }
  308. $storedTags = array();
  309. foreach ($stored as $row) {
  310. $storedTags[] = $row['name'];
  311. }
  312. LogUtils::debug('Compare: '.join(" ", $storedTags).' <-> '.
  313. join(" ", $tags));
  314. $res = $this->array_compare($storedTags, $tags);
  315. if ($res) {
  316. return false;
  317. } else {
  318. return true;
  319. }
  320. }
  321. function tagsReplace($entryId, $tags)
  322. {
  323. $sql = 'DELETE FROM entry_to_tag WHERE entry_id = ?';
  324. $res = $this->db->query($sql, array($entryId));
  325. if (DB::isError($res)) {
  326. $this->db->rollback();
  327. trigger_error('Crawler::tagsReplace(): Failed to delete. Feed URL:'
  328. .$this->uri.' '.$res->toString(), E_USER_WARNING);
  329. return false;
  330. }
  331. LogUtils::debug("EntryToTag: Delete at entry_id = $entryId");
  332. $date = date("Y-m-d H:i:s");
  333. foreach ($tags as $tag) {
  334. $tagId = $this->getTagId($tag);
  335. $fields = array(
  336. 'entry_id' => $entryId,
  337. 'tag_id' => $tagId
  338. );
  339. $res = $this->db->autoExecute(
  340. 'entry_to_tag', $fields, DB_AUTOQUERY_INSERT);
  341. if (DB::isError($res)) {
  342. $this->db->rollback();
  343. trigger_error('Crawler::tagsReplace(): Failed to insert. '
  344. .$res->toString(), E_USER_WARNING);
  345. return false;
  346. }
  347. }
  348. }
  349. function getFeeds()
  350. {
  351. $sql = 'SELECT uri FROM feed ORDER BY uri';
  352. $res = $this->db->getAll($sql);
  353. if (DB::isError($res)) {
  354. trigger_error('Crawler::getFeeds(): Failed to select. '
  355. .$res->toString(), E_USER_WARNING);
  356. }
  357. $feeds = array();
  358. foreach ($res as $row) {
  359. $feeds[] = $row['uri'];
  360. }
  361. return $feeds;
  362. }
  363. function array_compare($aru, $imi)
  364. {
  365. sort($aru);
  366. sort($imi);
  367. $max = count($aru) > count($imi) ? count($aru) : count($imi);
  368. $res = true;
  369. for ($i = 0; $i <$max; $i++) {
  370. if (mb_strtolower($aru[$i]) !== mb_strtolower($imi[$i])) $res = false;
  371. }
  372. return $res;
  373. }
  374. function array_trim_lower_uniq($array)
  375. {
  376. $result = array();
  377. foreach ($array as $key => $value) {
  378. $result[$key] = mb_strtolower(trim($value));
  379. }
  380. return array_unique($result);
  381. }
  382. }
  383. ?>