/app/controllers/components/indexer.php

https://github.com/tsep/tsep1 · PHP · 285 lines · 149 code · 95 blank · 41 comment · 18 complexity · f7cfc90bf1220121b60fd0fae4d848ff MD5 · raw file

  1. <?php
  /**
   * Indexer component.
   *
   * Pulls jobs from the 'indexer' queue, crawls the pages of the requested
   * profile and stores the parsed pages through the Index model. Access to
   * processRequest() is guarded by a one-time key stored in TMP/auth.tmp, and
   * a marker file (TMP/indexer_running.tmp) is used so that only one run is
   * started at a time.
   */
  class IndexerComponent extends Object {

      /**
       * The controller this component was initialized with
       * @var AppController
       */
      public $controller;

      /**
       * @var QueueComponent
       */
      public $Queue;

      /**
       * @var Index
       */
      public $Index;

      /**
       * Other components used by this component
       * @var array
       */
      public $components = array('Queue');

      /**
       * True while this instance has created the indexer_running.tmp marker
       * and has not removed it yet. Prevents the shutdown handler from
       * deleting a marker that was created by a different request.
       * @var boolean
       */
      private $_lockHeld = false;

      /**
       * Stores the controller reference and loads the Index model.
       *
       * @param AppController $controller The controller using this component
       * @param array $settings Component settings (not used here)
       */
      function initialize (&$controller, $settings = array()) {
          $this->controller =& $controller;
          $this->Index = ClassRegistry::init('Index');
      }

      /**
       * _import
       * Loads the vendor libraries needed for an indexing run
       */
      private function _import() {
          App::import('Vendor', 'html_to_text');
          App::import('Vendor', 'get_time_left');
          App::import('Vendor', 'resolve_url');
          App::import('Vendor', 'tsep_crawler');
          App::import('Vendor', 'tsep_indexer');
          App::import('Vendor', 'start_script');
          App::import('Vendor', 'random_string');
      }

      /**
       * _begin
       * Creates the indexer_running.tmp file and registers end() to run on
       * PHP shutdown so the file is removed even if the run stops early.
       */
      private function _begin() {
          register_shutdown_function(array($this, 'end'));

          $fp = @fopen(TMP.'indexer_running.tmp', 'w');

          if ($fp === false) {
              // Nothing was created, so there is nothing to close or to
              // clean up later. fclose(false) would be an error on PHP 8.
              $this->log('Unable to create indexer_running.tmp');
              return;
          }

          fclose($fp);
          $this->_lockHeld = true;
      }

      /**
       * _end
       * removes the indexer_running.tmp file, but only when this instance
       * created it and has not already removed it.
       */
      private function _end () {
          if (!$this->_lockHeld) {
              // Either _begin() never succeeded or _end() already ran; the
              // file (if present) belongs to another request.
              return;
          }

          $this->_lockHeld = false;
          @unlink(TMP.'indexer_running.tmp');
      }

      /**
       * Helper function on PHP shutdown
       * (public because it is invoked through register_shutdown_function)
       */
      function end () {
          $this->_end();
      }

      /**
       * _singular
       * @return boolean true when no indexer_running.tmp file exists
       */
      private function _singular () {
          return !file_exists(TMP.'indexer_running.tmp');
      }

      /**
       * _generateAuth
       * Creates a new random key and writes it to auth.tmp, replacing any
       * previous key.
       * @return string the generated key
       */
      private function _generateAuth () {
          App::import('Vendor', 'random_string');

          $randstr = random_string(10);
          file_put_contents(TMP.'auth.tmp', $randstr);

          return $randstr;
      }

      /**
       * _verifyAuth
       * Checks the given key against the one stored in auth.tmp. On success
       * the stored key is deleted, so each key can be used only once.
       *
       * @param string $auth_key The key supplied by the caller
       * @return boolean true when the key matches the stored key
       */
      private function _verifyAuth ($auth_key) {
          $auth = @file_get_contents(TMP.'auth.tmp');

          // No key on disk (missing, unreadable or empty file): nothing can
          // match. A loose comparison here would let '' or '0' pass against
          // the false returned for a missing file.
          if ($auth === false || $auth === '') {
              return false;
          }

          if (!is_scalar($auth_key)) {
              return false;
          }

          $candidate = (string) $auth_key;

          if ($candidate === '') {
              return false;
          }

          // Compare as strings; use the timing-safe comparison where the
          // running PHP version provides it.
          if (function_exists('hash_equals')) {
              $matches = hash_equals($auth, $candidate);
          }
          else {
              $matches = ($auth === $candidate);
          }

          if (!$matches) {
              return false;
          }

          @unlink(TMP.'auth.tmp');
          return true;
      }

      /**
       * _run
       * Processes one job from the 'indexer' queue, if there is one and no
       * other run is marked as in progress. A job that did not finish is
       * put back on the queue.
       *
       * @return mixed string new auth key when a job was processed,
       *               false when a run is already in progress or the queue is empty
       */
      private function _run () {
          if (!$this->_singular()) {
              $this->log('Singular; Indexing aborted');
              return false;
          }

          //Register the indexer is running
          $this->_begin();

          $this->log('Checking for jobs', LOG_INFO);

          if (!$this->Queue->isJob('indexer')) {
              $this->log('No jobs found; aborting', LOG_INFO);
              $this->_end();
              return false;
          }

          $job = $this->Queue->getJob('indexer');

          $this->log('Job found; Processing Job');

          $new_job = $this->_index($job);

          if ($new_job) {
              // The job ran out of time; queue its saved state for the next request
              $this->Queue->addJob($new_job['function_name'], $new_job['params'], 'indexer');
          }

          $this->_end();

          return $this->_generateAuth();
      }

      /**
       * _index
       * Indexes the job given
       *
       * A job without params starts from scratch: the profile is loaded, a
       * new crawler and indexer are created and the profile's existing index
       * rows are deleted. A job with params continues with the crawler and
       * indexer objects stored in it.
       *
       * @param array $job The Job to index ('function_name' holds the profile id)
       * @return mixed false on completion, array job to be requeued on incompletion
       */
      private function _index($job) {
          $this->log('#0003 Initializing', LOG_INFO);

          if (empty($job)) {
              $this->log('Invalid Job given to indexer');
              return false;
          }

          $this->log('Loading framework');

          $id = $job['function_name'];

          if (!empty($job['params'])) {
              //We are resuming
              $this->log('Resuming from previous state');

              $indexer = $job['params']['indexer'];
              $crawler = $job['params']['crawler'];
          }
          else {
              //We are initializing
              $this->log('Loading dependancies', LOG_INFO);

              $profile = $this->Index->Profile->findById($id);

              if (empty($profile)) {
                  $this->log('#0001 Invalid Profile Request; Indexing Failed.', LOG_ERROR);
                  return false;
              }

              $stopwords = $this->Index->Profile->Stopword->find('all');

              //TODO: Specify a different user agent for ea. indexing profile
              $crawler = new TSEPCrawler(
                  $profile['Profile']['url'],
                  $profile['Profile']['regex'],
                  Configure::read('UserAgent')
              );
              $indexer = new TSEPIndexer($stopwords);

              $this->log('Deleting indexes');

              $this->Index->deleteAll(array('Index.profile_id' => $id), false);
          }

          $this->log('Beginning crawl');

          while ($page = $crawler->crawl()) {

              // Only pages accepted by the indexer are stored
              if ($indexer->parse($page)) {
                  $save = $this->Index->create(array(
                      'Index' => array(
                          'profile_id' => $id,
                          'url' => $page->url,
                          'text' => $page->content
                      )
                  ));

                  $this->log('Saving Page', LOG_INFO);

                  $this->Index->save($save);
              }

              // NOTE(review): get_time_left() comes from a vendor file;
              // presumably the remaining execution time in seconds - confirm.
              if (get_time_left() <= 10) {
                  //Return job to be requeued
                  return array(
                      'function_name' => $id,
                      'params' => array(
                          'crawler' => $crawler,
                          'indexer' => $indexer
                      )
                  );
              }
          }

          $this->log('Indexing Complete');

          return false;
      }

      /**
       * Processes the Indexing Queue
       *
       * @param string $auth_key The authorization key provided
       * @return mixed string new auth key when a job was processed (finished or requeued),
       *               false when authentication failed, a run is already in
       *               progress or no job was queued
       */
      function processRequest ($auth_key) {
          $this->log('Processing Request');

          if (!$this->_verifyAuth($auth_key)) {
              $this->log('Authentication Failed', LOG_INFO);
              return false;
          }

          $this->log('Authentication Success', LOG_INFO);

          $this->_import();

          return $this->_run();
      }

      /**
       * Submit an indexing request
       *
       * @param string $id The profile to index
       * @return string the auth key
       */
      function submitRequest ($id) {
          $this->log('#0002 Request submitted', LOG_INFO);

          $this->Queue->addJob($id, array(), 'indexer');

          return $this->_generateAuth();
      }
  }