
/core/php/crawler.php

https://github.com/thiagof/omCrawler
Possible License(s): GPL-2.0, MIT
<?php
//todo: make todo list
//------------------
//todo: optimize response time and overall resource consumption
// proxies seem very slow.. maybe we can measure their speeds (run a speed test during the check) and then rank them
// also test whether phantom without a proxy performs better, then compare with curl!!
// ps: US proxies are MUCH faster.
if ( count($argv) <= 1 ) {
	echo "Usage: $argv[0] suite_name [resume_pid]\n";
	exit;
}
require('crawler.utils.php');
require('rb.php');
global $debuglevel;
$debuglevel = 1;
$suite_file = $argv[1];
$crawler = new Crawler( $suite_file );
$crawler->start();
return $crawler->totalitems;
class Crawler {
	//global configuration (from input file)
	//todo: declare all the possible fields here, as an example, with comments if possible
	private $config = array(
		'databases' => array(
			array('config', 'sqlite:./data/config','crawler','yeswecan',false)
		),
		'crawler' => array(
			'proxy' => false,
			'proxy_limit' => 12,
			//todo: turn sequential into sequential_suite, i.e. keep the sequence for the whole suite and not only the craw
			'proxy_mode' => 'aleatory', //how the proxy list should behave: aleatory, sequential, sequential_suite
			'delay' => 1, //wait time in seconds between the scrapes
			'loop' => 1, //number of times to repeat scrapers
			'debug' => 2, //false, //todo: validate this functionality?!
		),
		'scrapers' => array(),
	);
	//instance config settings
	public $settings = array();
	//total of items crawled in this instance
	public $totalitems = 0;
	//crawler bean instance
	public $dbobj = null;
	//resume process vars (todo)
	public $resume = false, $resumepid = null;
	//array of scrapers which have already been run
	public $scraped = array();
	//internal retry pages scraper
	private $retry = array(
		'pages' => array(
			'error' => array(),
			'success' => array(),
		),
		'query' => array(
			'error' => array(),
			'success' => array(),
		),
		'retries' => 0, //pointer for each scraper
	);
	public $Json;
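	// Constructor: loads the suite configuration file, registers the configured databases,
	// creates the crawler bean in the 'config' database and sets the global debug level.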
	function Crawler( $configfile ) {
		Log::time('crawler');
		//startup steps
		$this->loadConfig($configfile);
		$this->startDatabases();
		//get process id from cmd-line in order to resume its jobs
		//not tested!!
		global $argv;
		if (isset( $argv[2] )) {
			$this->resume = true;
			$this->resumepid = $argv[2];
		}
		//init the database object for the crawler
		$this->saveDbObj('init', $configfile);
		//debuglevel: setting is based on the log entry names (further)
		global $debuglevel;
		$debuglevel = $this->settings['debug'];
		register_shutdown_function(array($this, '__destruct'));
		set_time_limit(0);
		Log::debug('Crawler', 'New instance running: '. $this->dbobj->pid );
		Log::debug('Crawler/config/json', json_encode($this->config) );
	}
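	// Persists the crawler bean in the 'config' database. $state selects what gets recorded:
	// 'init' (new run), 'scraperStart'/'scraperEnd' (per-scraper pointer and result ids),
	// 'crawlerError' (failed pages, status 'e'), 'crawlerEnd' and 'finish' (totals and end time).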
	//crawler object in database
	function saveDbObj( $state, $params=array() ) {
		R::selectDatabase( 'config' );
		switch ($state) {
			case 'init':
				$name = $params;
				//initiate and/or resume the object
				//todo: for the cron-resume-all:
				// figure out whether another craw happened after the resumed one's date
				// which has been finished properly
				// if so set the past one to finished too
				//todo: store json with the configurations (craw and scrap)
				//todo: improve this query!! the pids repeat themselves........ the id must be made available to the resume..!
				/*$this->dbobj = R::findOrDispense('omCrawler', 'pid=? ORDER BY id DESC', array( $this->resumepid ));
				$this->dbobj = reset($this->dbobj);
				$this->dbobj->pid = getmypid();*/
				//todo: review
				/*if ( !$this->resume ) {
					$this->dbobj->status = 'i'; //initiated
					$this->dbobj->try = 0; //first try - hopefully the only one
				} else {
					//todo: maxtries and complete resume
					$this->dbobj->try += 1;
				}
				if ( isset($this->dbobj->totalitems) && $this->dbobj->totalitems > 0 )
					$this->totalitems = $this->dbobj->totalitems;
				*/
				$this->dbobj = R::dispense('omCrawler');
				$this->dbobj->name = $name;
				$this->dbobj->status = 'i'; //initiated
				$this->dbobj->pid = getmypid();
				$this->dbobj->try = 0; //first try - hopefully the only one
				$this->dbobj->time_start = date('Y-m-d H:i:s');
				break;
			case 'finish':
				$this->dbobj->totalitems = $this->totalitems;
				if ( $this->dbobj->status == 'i' )
					$this->dbobj->status = 'f'; //finished
				//$this->dbobj->time_end = date('Y-m-d H:i:s');
				break;
			case 'scraperStart':
				$scrpdb = $params;
				$i = count( $this->scraped ); //next scraped array index
				if ($this->retry['retries'] > 0) {
					//we are on a subscrape | retries is not 0-based
					$pointer = ($i -1) ."-$this->retry[retries]";
				} else {
					//normal scrape - parent
					$pointer = $i;
				}
				$this->dbobj->pointer = $pointer; //scraper index to know where it is
				$this->dbobj->ownOmScraper[] = $scrpdb;
				break;
			case 'scraperEnd':
				$ids = $params;
				//todo: $scrpdb comes from the 'scraperStart' call and is not in scope here - keep a reference on the instance
				$scrpdb->count = count($ids);
				$scrpdb->dataids = json_encode( $ids );
				break;
			case 'crawlerError':
				$this->dbobj->pages = json_encode( $this->retry['pages'] );
				$this->dbobj->status = 'e';
				//no break: fall through to 'crawlerEnd' so the end time is recorded as well
			case 'crawlerEnd':
				$this->dbobj->time_end = date('Y-m-d H:i:s');
				break;
		}
		if ($this->dbobj)
			R::store($this->dbobj);
	}
	//load a configuration file
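	// Illustrative shape of a suite config file (an assumption based on the defaults above and the
	// merge logic below; JS-style comments and inline functions are cleaned up before decoding):
	// {
	//   "crawler":   { "proxy": false, "delay": 1, "loop": 1, "debug": 2 },
	//   "databases": [ ["mydb", "mysql:host=localhost;dbname=app", "user", "pass", false] ],
	//   "scrapers":  [ { "suite": { "url": "http://example.com", "scraper": "function(){ return {}; }" },
	//                    "store": { "database": "mydb", "entity": "item", "unique": "url" } } ]
	// }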
	function loadConfig( $file=false ) {
		if (!$file)
			return $this->config;
		$hdl = fopen( $file, 'r' );
		$json = fread( $hdl, filesize($file) );
		$json = json_clean_functions( $json );
		//echo $json, "\n";
		//exit;
		require 'JSON.php';
		$this->Json = new Services_JSON(SERVICES_JSON_LOOSE_TYPE);
		$usrconfig = $this->Json->decode($json);
		//$usrconfig = json_decode( json_clean($json), true );
		//var_dump($usrconfig); exit;
		//json error catch
		if ( !$usrconfig ) {
			// Define the errors.
			$constants = get_defined_constants(true);
			$json_errors = array(
				JSON_ERROR_NONE => 'No error has occurred',
				JSON_ERROR_DEPTH => 'The maximum stack depth has been exceeded',
				JSON_ERROR_CTRL_CHAR => 'Control character error, possibly incorrectly encoded',
				JSON_ERROR_SYNTAX => 'Syntax error',
			);
			Log::out($json, 'Crawler/config', 2);
			Log::halt( "Error while parsing config file\nLast error: ". $json_errors[ json_last_error() ]);
		}
		//merge configs
		$dbs = $this->config['databases']; //number-indexed arrays get overwritten in the replace below
		$this->config = array_replace_recursive( $this->config, $usrconfig );
		$this->config['databases'] = array_merge($dbs, $this->config['databases']); //merge dbs array
		$this->settings =& $this->config['crawler']; //link config to the crawler settings
		fclose( $hdl );
	}
	//set up the databases
	private function startDatabases() {
		foreach ( $this->config['databases'] as $dbsetup ) {
			if (is_array($dbsetup))
				call_user_func_array( 'R::addDatabase', $dbsetup );
		}
	}
	//fetch the datascrape, call the scraper parser and store data
	function start() {
		//grab scrapers from config input
		$scrps = array_filter( $this->config['scrapers'] );
		//start up the scrapings looking for a process to resume
		//todo: rethink this resume, given the new feature of catching errors and retrying them on the spot
		//yes: it can resume both the errors and an interrupted process (index/pointer)
		//todo: prepareResume() - will pass to scrape() only the scrps that still need to be done
		//maybe resume is not really needed because the api and multi-scraper tools already handle the query batches, without struggling with recoveries (finished scrapes should always return)
		/*$is = 0; //scraper index
		if ($this->resume) {
			if ($this->dbobj->pointer > count($scrps) )
				Log::halt('Cannot resume process: already finished.');
			//continue from the last started scraper
			$is = (int) $this->dbobj->pointer;
		}*/
		$is = (int) $this->dbobj->pointer;
		for ( $p=0; $p < $this->settings['loop']; $p++ ) {
			//Run the scrapers batch
			$this->totalitems += $this->scrape( $scrps, $is );
			//delay the next scrape
			if ( $p < $this->settings['loop'] && (int) $this->settings['delay'] > 0 )
				sleep( $this->settings['delay'] );
		}
		//todo: save the crawler as finished
		$this->saveDbObj('finish');
	}
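	// Runs each configured scraper once: prepares the Scraper instance (query data, suite, proxy),
	// drives PhantomJS/pjscrape, stores the parsed rows and re-queues failed pages via scrapeErrors().
	// Returns the number of items stored across all scrapers in this pass.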
	function scrape( array $scrps, $indexscraper = 0 ) {
		$num_obj = 0; //global (scrps) data count
		//loop through the defined scrapers
		for ( $i = 0; $i < count($scrps); $i++ ) {
			$scrp = $scrps[$i];
			//if there's no data it is not a scraper
			if ( count($scrp) == 0 || ( !isset($scrp['suite']) && !isset($scrp['data']) ) )
				continue;
			//create Scraper class instance and load proxy
			$scraper = $this->prepareScraper( $scrp, $i );
			//if there's no data to query, we should not continue scraping !??
			if ( $scraper->settings['query'] && count($scraper->settings['query']['data']) < 1 ) {
				Log::out("No data to query on $scraper->suite_name", "Crawler/Scraper #$i/Query", 3);
				continue;
			}
			//log starting suite
			Log::out("Starting Suite [{$scraper->dbobj->id}]: $scraper->suite_name", "Crawler/Scraper #$i", 2);
			Log::debug("Crawler/scrape/suitejs", $scraper->suitejs, 2);
			//temporarily store some scrape data
			$numitms = 0; $ids = array();
			if ( isset($scrp['data']) && isset($scrp['store']) )
			{ //user-defined scraper data, no fetching
				$numitms = $scraper->store( $scrp['data'] );
			}
			else
			{ //fetch data from scraper suite
				//TODO: finish/test the error and retry handling. Improve the log and data structures, along with the endless process (repeat until the pages are finished, or give up after a retry limit)
				//todo: improve the method calls.. check whether returning a value is really needed, or whether it would be better to set it on an object inside the method itself (more readable)
				//todo: configure a timeout so calls do not get stuck
				$output = $this->runPhantom( $scraper );
				//echo $output; exit;
				$scraper->load( $output );
				//print_r( $scraper->pages );
				//exit;
				//save retrieved rows
				if ( isset( $scrp['store'] ) )
					$numitms = $scraper->store();
				//store the scrape in the scraped stack
				if ($this->retry['retries'] == 0)
					$this->scraped[] = $scrp;
				//todo: refactor (finishScrape)
				//update the query statuses
				R::selectDatabase( 'config' );
				foreach ($scraper->dbobj->ownOmQuery as $query) {
					$qryerr = array_msearch($scraper->query['error'], 'id', $query->id);
					if ( count($qryerr) > 0 )
						$query->status = 'e';
					else
						$query->status = 'f';
				}
				R::store($scraper->dbobj);
				R::storeAll($scraper->dbobj->ownOmQuery);
				//re-fetch failed pages if any exist
				$numitms += $this->scrapeErrors( $scraper );
			}
			//todo: total result must be the new items (not all items)
			Log::out("Finished {$scraper->suite_name} with $numitms total results", 'Crawler');
			$num_obj += $numitms;
			//associate fetched data with the scrape data object
			$this->saveDbObj('scraperEnd', $scraper->ids);
			//delay the next scrape
			if ( $i < count($scrps) && (int) $this->settings['delay'] > 0 )
				sleep( $this->settings['delay'] );
		}
		return $num_obj;
	}
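	// Retry bookkeeping: $this->retry keeps the pages that failed ('pages'/'error'), the pages that
	// succeeded ('pages'/'success', later passed as ignoreUrls), the failed queries and a retry
	// counter that caps how many times a scraper's failed pages are re-fetched.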
	//RETRY TOOL - tries to re-fetch the failed client items right away.
	//todo: fatal errors also need a catch, maybe via dbobj->status
	private function scrapeErrors( $scraper ) {
		$num_obj = 0;
		//remove the last succeeded pages from the error stack
		$this->retry['pages']['error'] = array_diff( $this->retry['pages']['error'], $scraper->pages['success'] );
		//add failed pages to the error stack
		$this->retry['pages']['error'] = array_unique(array_merge( $this->retry['pages']['error'], $scraper->pages['error'] ));
		//add successful pages to the stack
		$this->retry['pages']['success'] = array_unique(array_merge( $this->retry['pages']['success'], $scraper->pages['success'] ));
		//add failed queries to the error stack
		$this->retry['query']['error'] = array_unique(array_merge( $this->retry['query']['error'], $scraper->query['error'] ));
		//todo: log/debug scrapeErrors
		//todo: retry the queries together with (or separately from) the pages - think this through
		if ( count( $scraper->query['error'] ) > 0 && true==false) { //disabled for now
			//mark the queries' status as error
			//later we will add automatic retries..
			//and finally we need to finalize the ones that are correct (status)
			foreach ( $scraper->query['error'] as $qry ) {
				$qry = json_decode($qry, true);
				$qryrow = array_msearch( $scraper->dbobj->ownOmQuery, 'id', $qry['id'] );
				/*
				todo: come back here
				print_r($qryrow);
				exit();*/
				$qryrow->status = 'e';
				R::store($qryrow);
			}
		}
		//there are errors in the scraped pages: retry them
		if ( count($this->retry['pages']['error']) > 0 ) {
			//todo: delay / sleep
			//max number of retries per scrape
			if ( $this->retry['retries'] > 7 ) {
				//todo: too many retries
				//error in this crawler on pages ZX
				//log the problem
				$this->saveDbObj('crawlerError');
				$this->retry['pages']['error'] = array();
				$this->retry['pages']['success'] = array();
				$this->retry['query']['error'] = array();
				$this->retry['retries'] = 0;
			} else {
				//todo: refactor prepareReScrape, ScrapeError..
				//get the original suite (array or file)
				if ( is_array($scraper->settings['suite']) ) {
					$rescrp = $scraper->settings['suite'];
				} else {
					//todo: fetch the suite file and parse its settings
					// for now the configuration must live in a single file (suite array)
					// but later we can use a parser:
					// 1) a fake js that receives the calls (add and config) converting the args to json
					// 2) parse the file string with regex etc
					$rescrp = array();
				}
				//set up a new suite with the information we need to pass to scrape()/prepareScraper()
				$rescrp['suite']['url'] = null;
				$rescrp['suite']['urls'] = $this->retry['pages']['error'];
				$rescrp['suite']['config']['ignoreUrls'] = $this->retry['pages']['success'];
				//little delay for system/network recovery
				sleep(5);
				//re-scrape error pages
				$this->retry['retries'] += 1;
				$num_obj = $this->scrape( array( $rescrp ) );
				//store the scrape in the scraped stack
				$i = count( $this->scraped ) -1;
				if ($this->retry['retries'] == 1) //if first retry
					$this->scraped[$i] = array( $this->scraped[$i] );
				$this->scraped[$i][] = $rescrp;
			}
		} else {
			//no more errors, clean pointer and stacks
			$this->retry['pages']['error'] = array();
			$this->retry['pages']['success'] = array();
			$this->retry['retries'] = 0;
		}
		return $num_obj;
	}
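	// Builds the Scraper for one config entry: creates its omScraper bean, resolves the query data
	// and the pjscrape suite, and (when enabled) picks a proxy according to proxy_mode.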
	private function prepareScraper( $scrp, $index ) {
		R::selectDatabase( 'config' );
		//initiate the scraper instance
		$scraper = new Scraper( $scrp );
		//TODO: review this, review scraper dbobjs (+resume fields)
		// do it when completing the resume feature
		//create a new scrape db obj
		$scrpdb = R::dispense( 'omScraper' );
		//$scrpdb->suite = ( is_array($scrp['suite']) ) ? json_encode( $scrp['suite'] ) : $scrp['suite'];
		$scrpdb->suite = $scraper->suite_name;
		$scrpdb->entity = @$scrp['store']['entity'];
		$scrpdb->date = date('Y-m-d H:i:s');
		$scrpdb->try = $this->retry['retries']; //retry pointer of the scrape
		R::store($scrpdb);
		//store crawler resume pointer
		$this->saveDbObj('scraperStart', $scrpdb);
		$scraper->dbobj = $scrpdb;
		//prepare the scraper
		$this->prepareQuery( $scraper );
		$this->prepareSuite( $scraper );
		//todo: refactor - suite_name is only generated in prepareSuite.. improve how this object is built and optimize the saves..!
		$scrpdb->suite = $scraper->suite_name;
		R::store($scrpdb);
		//todo: refactor - prepareProxy
		//fetch proxy data
		if ( $this->settings['proxy'] === true ) {
			$limit = $this->settings['proxy_limit'];
			$mode = $this->settings['proxy_mode'];
			$except = array();
			//sequential proxies should not repeat themselves in the same crawler
			if ( $mode == 'sequential' ) {
				//find proxies that were used in this craw
				$except = R::getCol('SELECT proxy FROM omScraper WHERE proxy > 0 AND omCrawler_id = ?', array( $scraper->dbobj->omCrawler_id ));
			}
			//sequential_suite never repeats a proxy for the same suite
			if ( $mode == 'sequential_suite' ) {
				//todo: only scrapes that succeeded (failed ones may repeat). We still need some way to catalog those bad proxies.. leave them for last
				//find proxies that were used for this suite before
				$except = R::getCol('SELECT proxy FROM omScraper WHERE proxy > 0 AND suite = ?', array( $scraper->dbobj->suite ));
			}
			//grab the proxy to be used in this scrape
			$scraper->proxy = Proxy::getProxy( $limit, $mode, $except );
			//associate the proxy with the scrape
			$scrpdb->proxy = $scraper->proxy->id;
			R::store($scrpdb);
			//agent defined by proxy
			if ( $scraper->proxy instanceof RedBean_OODBBean && !empty( $scraper->proxy->agent ) ) {
				//note: $suite is a local variable here, so this override currently has no effect (todo: apply it to the scraper suite)
				$suite['config']['pageSettings']['userAgent'] = $scraper->proxy->agent;
			}
		//user-defined proxy (string)
		} elseif ( !empty($this->settings['proxy']) ) {
			$proxy = $this->settings['proxy'];
			if ( Proxy::checkProxy($proxy) )
				$scraper->proxy = $proxy;
			else
				$scraper->proxy = null; //proxy is dead (timeout)
		}
		return $scraper;
	}
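	// Builds the pjscrape suite javascript. User callbacks arrive as JSON strings, e.g.
	//   "scraper": "function(){ return {}; }"
	// and the regex below strips the surrounding quotes so they become real functions inside
	//   pjs.addSuite({...}); pjs.config({...});
	// which runPhantom() later writes to a temp file and feeds to phantomjs.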
	private function prepareSuite( &$scraper ) {
		//get scraper config array
		$scrp = $scraper->settings;
		//prepare the suite if it is javascript
		if (is_array( $scrp['suite'] )) {
			//basic js suite
			$suite = array(
				'url'=>'',
				'scraper'=>'function(){ return {}; }',
				'config'=> array(
					'pageSettings'=> array(
						'userAgent'=>'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:13.0) Gecko/20100101 Firefox/13.1',
						'loadImages'=>false,
						//'timeoutInterval'=>
					)
				)
			);
			//merge the base suite with the user-defined one
			$suite = array_replace_recursive( $suite, $scrp['suite'] );
			//prepare the api query
			if (isset( $scrp['api']['caller'] )) {
				$suite['url'] = $scrp['api']['caller'];
				//$this->prepareApi($scraper);
			}
			//api/query data
			//todo: rename the variable to api and refactor the tool - merge query and api since they are part of the same scheme
			$suite['query'][] = is_array( $scrp['query']['data'] ) ? $scrp['query']['data'] : array();
			$suite['query'][] = $scrp['api'];
			//pjs config
			$suitejs_config = is_array($suite['config']) ? $suite['config'] : array();
			$suitejs_config = json_encode( $suitejs_config );
			//pjs suite (without config)
			$suitejs = array_diff_assoc( $suite, array( 'config'=>array() ) );
			$suitejs = json_encode( $suitejs );
			//unquote js functions
			$suitejs = preg_replace('/:"(function.*?\})"([\,\}\]])/', ':$1$2', $suitejs);
			$suitejs = stripcslashes($suitejs);
			//parse suite options to javascript notation
			//todo: make multiple suites possible in one scrape (multiple addSuite) (arrayfy the suite var in the json config)
			$suitejs = "pjs.addSuite($suitejs);";
			$suitejs_config = "pjs.config($suitejs_config);";
			//print_r($suite);
			//exit;
			$scraper->suite = json_encode($suite);
			$scraper->suitejs = $suitejs . $suitejs_config;
			if ( !$scraper->suite_name )
				$scraper->suite_name = @(is_array( $scrp['suite']['url'] )) ? implode(',', $scrp['suite']['url']) : $scrp['suite']['url'] ;
		} else {
			//string suite - a file containing the js suite
			$cmd = "phantomjs core/js/pjs-wrapper.js $scrp[suite]";
			//grab the suite for parsing
			$output = shell_exec($cmd);
			$json = json_decode($output, true);
			//var_dump($output); exit;
			//todo: catch parse error on null json
			$suite_bkp = $scraper->settings['suite'];
			$scraper->settings['suite'] = $json[0][0];
			$scraper->settings['suite']['config'] = $json[1];
			$scraper->suite_name = $scrp['suite'];
			$this->prepareSuite($scraper);
			$scraper->settings['suite'] = $suite_bkp;
		}
	}
	//todo: standardize parameter passing (instance or array?)
	// $scrp means it is the array from settings
	// $scraper means it is a Scraper class instance
	private function prepareQuery( &$scraper ) {
		//get scraper config array
		$scrp = $scraper->settings;
		$query = $scrp['query'];
		if ( $query ) {
			if (isset( $query['data'] ))
			{
				//user-defined query data
				$scraper->settings['query']['data'] = $query['data'];
				//todo: save to the database, associate with the crawler, etc - refactor
			}
			else {
				//todo: comment the reason for each field (move them to the settings at the top)
				$database = $query['database'];
				$entity = @$query['entity'];
				$sql = ( isset($query['sql']) ? $query['sql'] : '1 = 1' ) ;
				$values = @(array) $query['values'];
				$limit = ( isset($query['limit']) ) ? $query['limit'] : null;
				$fields = ( isset($query['fields']) ) ? $query['fields'] : array();
				$sanitize = ( isset($query['sanitize']) ) ? $query['sanitize'] : false;
				//todo: review the query pagination scheme OK
				// initialized status is also left out of the count OK
				// just counting and ordering does not guarantee we get the right items. We must fetch directly or apply a rule that works. OK
				// attach the qryobj to the crawler or to the scraper? OK
				//idea: count regardless of status (i.e. run all items to the end, in order) and afterwards (or even during) pick up the items left behind with errors etc.
				//todo: after running through the whole offset, fetch the items whose status is not finished
				//idea: also allow a loop of queries here (more than one dictionary)
				//fetch the proper offset for this instance: the number of queries already run for this
				//crawler name is used as the OFFSET, so each run picks up the next LIMIT-sized batch
				if ( $limit ) {
					$adapter = R::$toolboxes['config']->getDatabaseAdapter();
					try {
						//the offset is the number of queries already run
						$offset = (int) $adapter->getCell("SELECT count(1)
							FROM omQuery q
							INNER JOIN omScraper s
								ON s.id = q.omScraper_id
							INNER JOIN omCrawler c
								ON c.id = s.omCrawler_id
								AND c.name = '{$this->dbobj->name}'
							GROUP BY c.name");
						$sql .= " LIMIT $limit OFFSET $offset";
					} catch ( Exception $e ) {
						//first run?
						$sql .= " LIMIT $limit OFFSET 0";
					}
				}
				if ($entity) {
					//fetch query db objs
					$qrydb = R::$toolboxes[$database]->getRedBean();
					$beans = $qrydb->find( $entity, array(), array($sql, $values) );
				} else {
					//pure sql query
					$qryad = R::$toolboxes[$database]->getDatabaseAdapter();
					$beans = $qryad->get($sql, $values);
				}
				//filter selected fields & transform to array
				//also sanitize query fields
				$queries = array();
				foreach ( $beans as $row ) {
					if ( $row instanceof RedBean_OODBBean )
						$row = $row->export();
					if ($sanitize) {
						foreach ($row as $key=>$val)
							$row[$key] = Convert::sane_text($val);
					}
					if ( count($fields) > 0 ) {
						$fields[] = 'id';
						$fields = array_unique($fields);
						//it is mandatory to have an id field to identify the query obj
						$queries[] = array_intersect_key($row, array_flip($fields));
					} else {
						$queries[] = $row;
					}
				}
				//set the scraper query data array
				$scraper->settings['query']['data'] = $queries;
				//arrange the query array to be saved to the database
				$query_rows = array_map(function($row) {
					return array(
						'data' => json_encode($row),
						'status' => 'i',
					);
				}, $queries);
				//statuses: i: initialized, e: error, f: finished
				R::selectDatabase('config');
				//associate query beans with this scraper
				$query_beans = array();
				foreach ( $query_rows as $row ) {
					$bean = R::dispense('omQuery');
					$bean->import($row);
					$query_beans[] = $bean;
				}
				$scraper->dbobj->ownOmQuery = $query_beans;
				//save query config objs to db
				R::storeAll($query_beans);
				R::store($scraper->dbobj);
			}
		} else {
			$scraper->settings['query'] = null;
		}
		//case in which the user specifies the query data
		//return formatted fields
	}
	//todo: prepare API parameters such as caller, destiny and others via php-array
	private function prepareApi( &$scraper ) {
		//print_r($scraper);
		$scraper->suite['url'] = $scraper->settings['api']['caller'];
	}
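	// Runs phantomjs with core/pjscrape/pjscrape.js against a temporary suite file and returns its
	// raw stdout (parsed later by Scraper::load). Illustrative command line, assuming array_implode()
	// from crawler.utils.php joins the params as key=value pairs separated by ' --':
	//   phantomjs --load-images=no --max-disk-cache-size=100101 --proxy=1.2.3.4:8080 --proxy-type=http core/pjscrape/pjscrape.js /tmp/<md5>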
	private function runPhantom( $scraper ) {
		Log::time('phantom');
		$params = array(
			'load-images' => 'no',
			'max-disk-cache-size' => '100101',
		);
		//Define proxy phantom setup
		$proxy = $scraper->proxy;
		if ( $proxy instanceof RedBean_OODBBean ) {
			$params = array_replace( $params, array('proxy' => $proxy->server,
				'proxy-type' => $proxy->type)
			);
		} elseif ( !empty($proxy) ) {
			$params = array_replace( $params, array('proxy' => $proxy,
				'proxy-type' => 'http')
			);
		}
		//configure the suite to be sent
		//todo: if the string is too long, create a temporary file!! | $ getconf ARG_MAX
		// expr `getconf ARG_MAX` - `env|wc -c` - `env|wc -l` \* 4 - 2048
		// http://www.in-ulm.de/~mascheck/various/argmax/
		if ( isset($scraper->suitejs) )
			$suite = $scraper->suitejs;
			//$suite = escapeshellarg($scraper->suitejs);
		else
			$suite = $scraper->suite;
		//print_r($scraper->suitejs);
		//create a temporary suite file
		$fname = "/tmp/". md5($scraper->suite_name . time());
		$hdl = fopen("$fname", 'w');
		fwrite($hdl, $suite);
		fclose($hdl);
		$pjscrape = 'core/pjscrape/pjscrape.js';
		$cmd = "phantomjs --". array_implode('=', ' --', $params) ." $pjscrape $fname";
		//echo $cmd, "\n\n"; exit;
		//todo: find a way to: 1) get the exec output in real time; 2) associate the phantom pid with the php one; 3) then try to detect stalled processes and provide a fix/debug/retry..
		//ref: http://stackoverflow.com/questions/1281140/run-process-with-realtime-output-in-php
		//exec the phantom command and get outputs
		Log::debug('Crawler/Phantomjs/cmd', $cmd );
		$output = shell_exec($cmd);
		//delete the temporary file
		//unlink($fname);
		Log::timeEnd('phantom');
		//Log::debug('Crawler/Phantom', $output ."\n" );
		return $output;
	}
	function __destruct() {
		//close databases
		foreach ( R::$toolboxes as $toolbox ) {
			$toolbox->getDatabaseAdapter()->close();
		}
		//save end time
		if (isset( R::$toolboxes['config'] ))
			$this->saveDbObj('crawlerEnd');
		Log::timeEnd('crawler');
	}
}
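// Scraper: wraps a single suite run. It parses the pjscrape output, tracks which pages and query
// items succeeded or failed, and stores the scraped rows through RedBean.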
class Scraper {
	//scraper configuration
	public $settings = array(
		'suite' => array(), //suite complete configuration
		'format' => array(), //rows before-store formatting
		'store' => array(), //database store settings
		'query' => array(), //data query settings
		'api' => array(), //api query settings
	);
	//modified data ids, log and data collected
	public $ids = array();
	private $numitems, $log, $data=array();
	//scraped pages stack
	public $pages = array(
		'error'=>array(),
		'success'=>array(),
	);
	public $query = array(
		'error'=>array(),
		'success'=>array(),
	);
	//db obj instance and proxy instance/server
	public $dbobj = null, $proxy;
	//suite string and javascript inject
	public $suite, $suitejs;
	//suite readable name
	public $suite_name='';
	function Scraper( $config, $output=null ) {
		$this->settings = array_replace_recursive( $this->settings, $config );
		//Log::debug('Scraper/settings/json', json_encode($this->settings));
		if ($output)
			$this->load( $output );
	}
	//loads and parses the output
	function load( $outputstr, $merge=false ) {
		$itemscount = 0;
		//loads clean json
		echo $outputstr;
		$output = $this->loadPjscrape( $outputstr );
		//$json = json_clean( $output );
		//$data = json_decode($json, true);
		$data = json_decode($output, true);
		Log::debug('Scraper/load', "Parsed Json Count: ".count($data) );
		//json error catch
		if (is_null( $data )) {
			// Define the errors.
			$constants = get_defined_constants(true);
			$json_errors = array(
				JSON_ERROR_NONE => 'No error has occurred',
				JSON_ERROR_DEPTH => 'The maximum stack depth has been exceeded',
				JSON_ERROR_CTRL_CHAR => 'Control character error, possibly incorrectly encoded',
				JSON_ERROR_SYNTAX => 'Syntax error',
			);
			Log::debug('Scraper/output', $outputstr);
			Log::debug('Scraper/parsejson', $output/*$json*/);
			Log::halt( "Error while parsing scraper data\nLast error: ". $json_errors[ json_last_error() ], 'Scraper');
		}
		//todo: use merge also in the store method (for the class, obviously)?
		// reconcile the use of the two alternatives (validate)
		if ( $merge )
			$this->data = array_merge_recursive( $data );
		else
			$this->data = $data;
	}
	//clean up the pjscrape output, parsing its return
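	// pjscrape writes progress lines to stdout alongside the JSON result: '* ...' lines are log
	// messages ("* Scraping <url>" marks a successful page), 'ERROR'/'!' lines report failed pages,
	// and '* CLIENT: ...' lines carry messages from the injected client script (including
	// "Item Failed: <query>"). The callbacks below strip those lines while collecting them into the
	// pages/query stacks; whatever text remains is treated as the JSON data.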
	function loadPjscrape( $output ) {
		//pj logs
		$logs = array(); $logerr = array();
		//failed pages
		$errpages = array();
		//success pages
		$scspages = array();
		$pages = array(
			'success' => array(),
			'error' => array(),
		);
		$query = array(
			'success' => array(),
			'error' => array(),
		);
		//common log callback
		$logcb = function( $mtch ) use ( &$log, &$pages ) {
			$log[] = $mtch[0];
			//store successful pages
			preg_match( '/Scraping (.+)$/', $mtch[0], $m );
			if ( count($m) == 2 ) {
				$pages['success'][] = $m[1];
			}
			return '';
		};
		//error/alert log callback
		$errcb = function( $mtch ) use ( &$log, &$logerr, &$pages ) {
			$logerr[] = $mtch[0];
			$log[] = $mtch[0];
			//Till now, we have in pjs:
			//3 ERRORS
			//- Page did not load (status)*
			//- Page not found (404) -> sad, but nothing to do
			//- Page error code (status)*
			//2 ALERTS
			//- Timeout after (waitFor)* (todo, check)
			//- phantom->page.onAlert
			//
			//- * we must care
			//TODO: store error codes, log
			preg_match( '/Page did not load \(status=(.+)\): (.+)$/', $mtch[0], $merr );
			if ( count($merr) == 3 ) {
				$pages['error'][] = $merr[2];
			}
			preg_match( '/Page error code (.+) on (.+)$/', $mtch[0], $merr );
			if ( count($merr) == 3 ) {
				$pages['error'][] = $merr[2];
			}
		};
		//todo: callback for FATAL ERROR (fail function)
		//client/console origin messages
		$clicb = function( $mtch ) use ( &$log, &$query ) {
			$log[] = $mtch[0];
			//store failed query items
			preg_match( '/Item Failed: (.+)$/', $mtch[0], $m );
			if ( count($m) == 2 ) {
				$query['error'][] = $m[1];
			}
			return '';
		};
		$output = preg_replace_callback('/^(ERROR|!).*$/im', $errcb, $output); //pjs > log.err && log.alert
		$output = preg_replace_callback('/^(\* CLIENT: ).*$/m', $clicb, $output);
		$output = preg_replace_callback('/^\*.*$/m', $logcb, $output); //pjs > log.msg
		//$output = preg_replace_callback('/^(!|CLIENT|Timeout).*$/im', $errparse, $output); //pjs ERROR: OUTPUT
		//$output = preg_replace_callback('/(\* Saved (\d+) items\n)/', $itemscb, $output);
		$output = preg_replace('/\n/', '', $output); //blank lines
		$this->pages = $pages;
		$this->query = $query;
		return $output;
	}
	//store data into the database
	function store( $data=null, $merge=false ) {
		if ($data)
			if ( $merge )
				$this->data = array_merge_recursive( $data );
			else
				$this->data = $data;
		Log::time('store');
		R::selectDatabase( $this->settings['store']['database'] );
		if ( count((array) $this->data) > 0 ) {
			//todo: transaction
			//R::begin();
			//import the output data, checking for duplicate items
			$bean = $this->settings['store']['entity'];
			$unique = @$this->settings['store']['unique'];
			$rows = array();
			//todo: multi-field unique (array.foreach)
			foreach ( $this->data as $datarow ) {
				$row = null;
				$findp = array( $bean ); //find params
				//search for unique / no duplicate
				if ( $unique && $datarow[$unique] ) {
					$findp[] = "$unique = ? ORDER BY id DESC";
					$findp[] = array( $datarow[$unique] );
				}
				$row = call_user_func_array('R::findOrDispense', $findp);
				$row = reset($row); //get first
				//datarow insert/update
				$row->import( $datarow );
				//format row using user-defined actions
				$this->formatRow( $row );
				$rows[] = $row;
			}
			//store the rows and keep the ids
			$ids = R::storeAll($rows);
			$this->ids = array_merge( $this->ids, $ids );
			//commit changes
			//R::commit();
			Log::timeEnd('store');
			return count($ids);
		} else {
			Log::out( "Error while storing data. No data present" );
		}
	}
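	// Applies each user-defined 'format' snippet to the row via eval(); a snippet can read and
	// modify $row (the RedBean bean) before it is stored.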
	private function formatRow( &$row ) {
		try {
			foreach ( (array) @$this->settings['format'] as $rowfmt ) {
				eval( "$rowfmt;" );
			}
		} catch( Exception $e ) {
			Log::out( "Error formatting scrape data\n". $e->getMessage(), 'Scraper/format', 2 );
		}
	}
}
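// Proxy: picks a proxy bean from the 'config' database. checkProxy() only tests TCP reachability
// with fsockopen; it does not verify that the proxy actually relays HTTP traffic.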
class Proxy {
	static function checkProxy( $host_port, $timeout=10 ) {
		list($host, $port) = explode(':', $host_port);
		$fsock = @fsockopen($host, $port, $errno, $errstr, $timeout);
		if ( ! $fsock ) {
			return FALSE;
		} else
			return TRUE;
		//TODO: also try actually using the proxy: will it open our page (scrape)?
	}
	//todo: associate the proxy with a user agent, so a server always uses the same agent name
	// make an agents list and randomly match it with the proxy
	// on the scrape!!
	static function getProxy( $maxhours, $mode='aleatory', $except=array() ) {
		R::selectDatabase('config');
		$sql = '1';
		$params = array();
		//limit proxies by how long ago they were last seen online
		if ($maxhours > 0) {
			$date_limit = time() - ( 60*60*$maxhours );
			$sql = 'date > ? ';
			$params[] = $date_limit;
		}
		//proxy consumption sequence mode
		if ($mode == 'aleatory') {
			$sql .= ' ORDER BY RANDOM()';
		} else { //sequential and sequential_suite
			//$except = array_filter($except);
			$sql .= ' AND id NOT IN ('. implode(',', $except) .') ORDER BY date DESC';
		}
		//match the proxy
		$proxy = R::findOne('proxy', $sql, $params);
		if (!$proxy)
			throw new Exception('Proxy not found');
		//check whether it is a valid proxy
		if ( ! self::checkProxy($proxy->server) ) {
			$proxy->fail = ((int) $proxy->fail)+1;
			R::store($proxy);
			Log::out('Failed to resolve '. $proxy->server, 'Proxy/check');
			if ($proxy->fail >= 5)
				R::trash( $proxy );
			//search for another proxy (online)
			return self::getProxy( $maxhours );
		}
		//renew proxy date/time
		$proxy->date = time();
		R::store($proxy);
		return $proxy;
	}
}
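// Log: minimal console logger. The depth of a message's '/'-separated hierarchy determines its
// level, and the message is printed only when the caller's write level is at least that deep.
// time()/timeEnd() are console.time-style timers backed by a global array.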
class Log {
	//todo: change the args order to match the debug func - ($hierarchy, $msg, $level)
	static function out( $msg, $hierarchy='Crawler', $writelevel=1 ) {
		$output = array();
		$level=0;
		//work out the message level from the depth of the supplied hierarchy
		foreach ( preg_split('/\//', "$hierarchy\/") as $path ) {
			$level++;
		}
		$level--;
		$output[] = "######################################";
		$output[] = "# $hierarchy <".date('Y-m-d H:i:s').">";
		$output[] = "####";
		//convert msg to an array of lines
		if ( !is_array($msg) )
			$msg = preg_split('/\n/', "$msg\n");
		array_push($output, $msg[0]);
		//tabulate each line
		for ($x=0; $x < count($output); $x++) {
			$line = $output[$x];
			$line = @str_pad( $line, $level*2, null, STR_PAD_LEFT );
			$output[$x] = $line;
		}
		self::store( $output );
		//should we write this log?
		if ( (int) $writelevel >= $level )
			self::write( $output );
	}
	static function write( $output ) {
		//echo the msg
		echo implode( "\n", $output ), PHP_EOL;
	}
	static function halt( $msg ) {
		Log::out( $msg );
		die;
	}
	static function store($lines) {
		//store crawler logs
		global $crawler_logs;
		$crawler_logs = array_merge( (array) $crawler_logs, (array) $lines );
	}
	static function time( $label ) {
		//stores the current time in a global
		global $log_timer;
		$log_timer[$label] = microtime(true);
	}
	static function timeEnd( $label ) {
		//shows the elapsed time since time()
		global $log_timer;
		if (isset( $log_timer[$label] )) {
			$total = microtime(true) - $log_timer[$label];
			self::debug("Timer $label", "$label took $total seconds" );
			unset( $log_timer[$label] );
		}
	}
	static function debug( $section, $msg ) {
		global $debuglevel;
		self::out( $msg, $section, $debuglevel );
	}
}