/core/php/crawler.php
<?php
//todo: make todo list
//------------------
//todo: optimize response time and overall resource usage
//  proxies seem very slow.. maybe we could measure their speeds (run a speedtest during the check) and then rank them
//  also test whether phantom performs better without a proxy, then compare with curl!!
//  ps: American proxies are MUCH faster.
if ( count($argv) <= 1 ) {
    echo "Usage: $argv[0] suite_name [resume_pid]\n";
    exit;
}
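// Example invocation (hypothetical suite file name):
//   php crawler.php suites/products.json
// A second argument resumes a previous run by pid (untested, see below).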
require('crawler.utils.php');
require('rb.php');
global $debuglevel;
$debuglevel = 1;
$suite_file = $argv[1];
$crawler = new Crawler( $suite_file );
$crawler->start();
return $crawler->totalitems;
Class Crawler {
    //global configuration (from the input file)
    //todo: list every possible field here as an example, with comments if possible
    private $config = array(
        'databases' => array(
            array('config', 'sqlite:./data/config','crawler','yeswecan',false)
        ),
        'crawler' => array(
            'proxy' => false,
            'proxy_limit' => 12,
            //todo: turn sequential into sequential_suite, i.e. keep the sequence for the whole suite and not only this crawl
            'proxy_mode' => 'aleatory', //how the proxy list should act: aleatory (random), sequential, sequential_suite
            'delay' => 1, //wait time in seconds between the scrapes
            'loop' => 1, //number of times to repeat scrapers
            'debug' => 2, //false, //todo: validate this feature?!
        ),
        'scrapers' => array(),
    );
    //instance config settings
    public $settings = array();
    //total of items crawled in this instance
    public $totalitems = 0;
    //crawler bean instance
    public $dbobj = null;
    //bean of the scraper currently running (set on scraperStart, read on scraperEnd)
    private $scrpdb = null;
    //resume process vars (todo)
    public $resume = false, $resumepid = null;
    //array of scrapers which have already been run
    public $scraped = array();
    //internal retry stacks for failed/succeeded pages and queries
    private $retry = array(
        'pages' => array(
            'error' => array(),
            'success' => array(),
        ),
        'query' => array(
            'error' => array(),
            'success' => array(),
        ),
        'retries' => 0, //pointer for each scraper
    );
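    // Illustrative only (hypothetical URLs): after a pass where two pages failed
    // and one succeeded, $retry would look roughly like:
    //   array('pages' => array('error'   => array('http://site/p2', 'http://site/p3'),
    //                          'success' => array('http://site/p1')),
    //         'query' => array('error' => array(), 'success' => array()),
    //         'retries' => 1)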
    public $Json;

    function Crawler( $configfile ) {
        Log::time('crawler');

        //startup steps
        $this->loadConfig($configfile);
        $this->startDatabases();

        //get process id from the cmd-line in order to resume its jobs
        //not tested!!
        global $argv;
        if (isset( $argv[2] )) {
            $this->resume = true;
            $this->resumepid = $argv[2];
        }
        //init the database object for the crawler
        $this->saveDbObj('init', $configfile);

        //debuglevel: setting is based on the log entry names (see further down)
        global $debuglevel;
        $debuglevel = $this->settings['debug'];
        register_shutdown_function(array($this, '__destruct'));
        set_time_limit(0);

        Log::debug('Crawler', 'New instance running: '. $this->dbobj->pid );
        Log::debug('Crawler/config/json', json_encode($this->config) );
    }
    //crawler object in database
    function saveDbObj( $state, $params=array() ) {
        R::selectDatabase( 'config' );
        switch ($state) {
            case 'init':
                $name = $params;

                //initiate and/or resume the object
                //todo: for the cron-resume-all:
                //  figure out whether another crawl happened after the resumed one's date
                //  and finished properly
                //  if so, set the past one to finished too
                //todo: store json with the configuration (crawl and scrape)
                //todo: improve this query!! pids repeat........ the id must be made available to the resume logic..!
                /*$this->dbobj = R::findOrDispense('omCrawler', 'pid=? ORDER BY id DESC', array( $this->resumepid ));
                $this->dbobj = reset($this->dbobj);
                $this->dbobj->pid = getmypid();*/
                //todo: review
                /*if ( !$this->resume ) {
                    $this->dbobj->status = 'i'; //initiated
                    $this->dbobj->try = 0; //first try - hopefully the only one
                } else {
                    //todo: maxtries and complete resume
                    $this->dbobj->try += 1;
                }
                if ( isset($this->dbobj->totalitems) && $this->dbobj->totalitems > 0 )
                    $this->totalitems = $this->dbobj->totalitems;
                */
                $this->dbobj = R::dispense('omCrawler');
                $this->dbobj->name = $name;
                $this->dbobj->status = 'i'; //initiated
                $this->dbobj->pid = getmypid();
                $this->dbobj->try = 0; //first try - hopefully the only one
                $this->dbobj->time_start = date('Y-m-d H:i:s');

                break;
            case 'finish':
                $this->dbobj->totalitems = $this->totalitems;

                if ( $this->dbobj->status == 'i' )
                    $this->dbobj->status = 'f'; //finished

                //$this->dbobj->time_end = date('Y-m-d H:i:s');
                break;
            case 'scraperStart':
                $scrpdb = $params;
                $i = count( $this->scraped ); //next scraped array index
                if ($this->retry['retries'] > 0) {
                    //we are on a subscrape | retries is not 0-based
                    $pointer = ($i - 1) .'-'. $this->retry['retries'];
                } else {
                    //normal scrape - parent
                    $pointer = $i;
                }

                $this->dbobj->pointer = $pointer; //scraper index, to know where it is
                $this->dbobj->ownOmScraper[] = $scrpdb;
                $this->scrpdb = $scrpdb; //keep a reference for scraperEnd
                break;
            case 'scraperEnd':
                $ids = $params;
                $this->scrpdb->count = count($ids);
                $this->scrpdb->dataids = json_encode( $ids );
                break;
            case 'crawlerError':
                $this->dbobj->pages = json_encode( $this->retry['pages'] );
                $this->dbobj->status = 'e';
                //intentional fall-through: an errored crawler also gets its end time
            case 'crawlerEnd':
                $this->dbobj->time_end = date('Y-m-d H:i:s');
                break;
        }

        if ($this->dbobj)
            R::store($this->dbobj);
    }
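    // Illustrative config file shape (hypothetical values; scraper functions appear
    // as strings and are unquoted later by json_clean_functions / prepareSuite):
    //   {
    //     "crawler":  { "proxy": false, "delay": 2, "loop": 1, "debug": 2 },
    //     "databases": [ ["mydb", "sqlite:./data/mydb", "user", "pass", false] ],
    //     "scrapers": [ { "suite": { "url": "http://example.com",
    //                                "scraper": "function(){ return {}; }" },
    //                     "store": { "database": "mydb", "entity": "item" } } ]
    //   }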
    //load a configuration file
    function loadConfig( $file=false ) {
        if (!$file)
            return $this->config;
        $hdl = fopen( $file, 'r' );
        $json = fread( $hdl, filesize($file) );
        $json = json_clean_functions( $json );

        //echo $json, "\n";
        //exit;

        require 'JSON.php';
        $this->Json = new Services_JSON(SERVICES_JSON_LOOSE_TYPE);
        $usrconfig = $this->Json->decode($json);

        //$usrconfig = json_decode( json_clean($json), true );

        //var_dump($usrconfig); exit;

        //json error catch
        if ( !$usrconfig ) {
            // Define the errors.
            $constants = get_defined_constants(true);
            $json_errors = array(
                JSON_ERROR_NONE => 'No error has occurred',
                JSON_ERROR_DEPTH => 'The maximum stack depth has been exceeded',
                JSON_ERROR_CTRL_CHAR => 'Control character error, possibly incorrectly encoded',
                JSON_ERROR_SYNTAX => 'Syntax error',
            );
            Log::out($json, 'Crawler/config', 2);
            Log::halt( "Error while parsing config file\nLast error: ". $json_errors[ json_last_error() ]);
        }

        //merge configs
        $dbs = $this->config['databases']; //numerically indexed arrays get overwritten by the replace below
        $this->config = array_replace_recursive( $this->config, $usrconfig );
        $this->config['databases'] = array_merge($dbs, $this->config['databases']); //merge dbs array
        $this->settings =& $this->config['crawler']; //link config to the crawler settings
        fclose( $hdl );
    }

    //set up databases
    private function startDatabases() {
        foreach ( $this->config['databases'] as $dbsetup ) {
            if (is_array($dbsetup))
                call_user_func_array( 'R::addDatabase', $dbsetup );
        }
    }

    //fetch the data to scrape, call the scraper parser and store data
    function start() {
        //grab scrapers from the config input
        $scrps = array_filter( $this->config['scrapers'] );

        //startup scrapings looking for a process to resume
        //todo: rethink this resume, given the new feature of catching errors and retrying on the spot
        //yes: it can resume both the errors and an interrupted process (index/pointer)
        //todo: prepareResume() - will pass to scrape() only the scrps that still need to be done
        //maybe resume isn't really necessary, since the api and multi-scraper tools handle the query batches by themselves, without bothering with recoveries (finished scrapes should always return)
        /*$is = 0; //scraper index
        if ($this->resume) {
            if ($this->dbobj->pointer > count($scrps) )
                Log::halt('Cannot resume process: already finished.');

            //continue from the last started scraper
            $is = (int) $this->dbobj->pointer;
        }*/
        $is = (int) $this->dbobj->pointer;

        for ( $p=0; $p < $this->settings['loop']; $p++ ) {
            //Run the scrapers batch
            $this->totalitems += $this->scrape( $scrps, $is );
            //delay the next scrape (skip the delay after the last pass)
            if ( $p < $this->settings['loop'] - 1 && (int) $this->settings['delay'] > 0 )
                sleep( $this->settings['delay'] );
        }

        //save the crawler as finished
        $this->saveDbObj('finish');
    }

    function scrape( array $scrps, $indexscraper = 0 ) {
        $num_obj = 0; //global (scrps) data count

        //loop through the defined scrapers, starting at the resume index
        for ( $i = $indexscraper; $i < count($scrps); $i++ ) {
            $scrp = $scrps[$i];
            //if there's no data, it is not a scraper
            if ( count($scrp) == 0 || ( !isset($scrp['suite']) && !isset($scrp['data']) ) )
                continue;

            //create Scraper class instance and load proxy
            $scraper = $this->prepareScraper( $scrp, $i );

            //if there's no data to query, scraping should not continue !??
            if ( $scraper->settings['query'] && count($scraper->settings['query']['data']) < 1 ) {
                Log::out("No data to query on $scraper->suite_name", "Crawler/Scraper #$i/Query", 3);
                continue;
            }

            //log starting suite
            Log::out("Starting Suite [{$scraper->dbobj->id}]: $scraper->suite_name", "Crawler/Scraper #$i", 2);
            Log::debug("Crawler/scrape/suitejs", $scraper->suitejs, 2);

            //temp store some scrape data
            $numitms = 0; $ids = array();

            if ( isset($scrp['data']) && isset($scrp['store']) )
            { //user-defined scraper data, no fetching
                $numitms = $scraper->store( $scrp['data'] );
            }
            else
            { //fetch data from the scraper suite
                //TODO: finish/test the error and retrial control. improve the log and data structures, along with the endless process (repeat until the pages are done, or give up at a retry limit)
                //todo: improve the method calls.. check whether returning a value is really needed, or whether setting it on an object inside the method itself would be better (more readable)
                //todo: set a timeout so calls don't get stuck

                $output = $this->runPhantom( $scraper );
                //echo $output; exit;
                $scraper->load( $output );
                //print_r( $scraper->pages );
                //exit;

                //save retrieved rows
                if ( isset( $scrp['store'] ) )
                    $numitms = $scraper->store();


                //store the scrape in the scraped stack
                if ($this->retry['retries'] == 0)
                    $this->scraped[] = $scrp;
                //todo: refactor (finishScrape)
                //update the query statuses
                R::selectDatabase( 'config' );
                foreach ($scraper->dbobj->ownOmQuery as $query) {
                    $qryerr = array_msearch($scraper->query['error'], 'id', $query->id);
                    if ( count($qryerr) > 0 )
                        $query->status = 'e';
                    else
                        $query->status = 'f';
                }
                R::store($scraper->dbobj);
                R::storeAll($scraper->dbobj->ownOmQuery);

                //re-fetch failed pages if any exist
                $numitms += $this->scrapeErrors( $scraper );
            }
            //todo: the total result must be the new items (not all items)
            Log::out("Finished {$scraper->suite_name} with $numitms total results", 'Crawler');
            $num_obj += $numitms;
            //associate the fetched data with the scrape data object
            $this->saveDbObj('scraperEnd', $scraper->ids);

            //delay the next scrape (skip the delay after the last one)
            if ( $i < count($scrps) - 1 && (int) $this->settings['delay'] > 0 )
                sleep( $this->settings['delay'] );
        }

        return $num_obj;
    }
    //RETRY TOOL - re-fetches, on the spot, the items that failed on the client.
    //todo: fatal errors also need a catch, maybe via dbobj->status
    private function scrapeErrors( $scraper ) {
        $num_obj = 0;
        //remove the pages that succeeded last time from the error stack
        $this->retry['pages']['error'] = array_diff( $this->retry['pages']['error'], $scraper->pages['success'] );
        //add failed pages to the error stack
        $this->retry['pages']['error'] = array_unique(array_merge( $this->retry['pages']['error'], $scraper->pages['error'] ));
        //add successful pages to the stack
        $this->retry['pages']['success'] = array_unique(array_merge( $this->retry['pages']['success'], $scraper->pages['success'] ));
        //add failed queries to the error stack
        $this->retry['query']['error'] = array_unique(array_merge( $this->retry['query']['error'], $scraper->query['error'] ));

        //todo: log/debug scrapeErrors
        //todo: retry the queries together with (or separately from) the pages - think this through
        if ( count( $scraper->query['error'] ) > 0 && true==false) { //intentionally disabled
            //set the query statuses to error
            //later we will do automatic repetition..
            //and finally we need to finish the ones that are correct (status)
            foreach ( $scraper->query['error'] as $qry ) {
                $qry = json_decode($qry, true);
                $qryrow = array_msearch( $scraper->dbobj->ownOmQuery, 'id', $qry['id'] );

                /*
                todo: come back here
                print_r($qryrow);
                exit();*/

                $qryrow->status = 'e';
                R::store($qryrow);
            }
        }
        //there are errors in the scraped pages: retry them
        if ( count($this->retry['pages']['error']) > 0 ) {
            //todo: delay / sleep
            //max number of retries per scrape
            if ( $this->retry['retries'] > 7 ) {
                //todo: too many tries
                //error in this crawler on pages ZX
                //log the problem

                $this->saveDbObj('crawlerError');

                $this->retry['pages']['error'] = array();
                $this->retry['pages']['success'] = array();
                $this->retry['query']['error'] = array();
                $this->retry['retries'] = 0;
            } else {
                //todo: refactor prepareReScrape, ScrapeError..

                //grab the original suite (array or file)
                if ( is_array($scraper->settings['suite']) ) {
                    $rescrp = $scraper->settings['suite'];
                } else {
                    //todo: fetch the suite file and parse its settings
                    //  for now the configuration must live in a single file (suite array)
                    //  but later we could use a parser:
                    //  1) fake js that receives the calls (add and config), converting the args to json
                    //  2) parse the file string with regex etc
                    $rescrp = array();
                }

                //set up a new suite with the information we need to pass to scrape()/prepareScraper()
                $rescrp['suite']['url'] = null;
                $rescrp['suite']['urls'] = $this->retry['pages']['error'];
                $rescrp['suite']['config']['ignoreUrls'] = $this->retry['pages']['success'];

                //little delay for system/network recovery
                sleep(5);

                //re-scrape error pages
                $this->retry['retries'] += 1;
                $num_obj = $this->scrape( array( $rescrp ) );

                //store the scrape in the scraped stack
                $i = count( $this->scraped ) -1;
                if ($this->retry['retries'] == 1) //if first retry
                    $this->scraped[$i] = array( $this->scraped[$i] );
                $this->scraped[$i][] = $rescrp;
            }
        } else {
            //no more errors, clean pointer and stacks
            $this->retry['pages']['error'] = array();
            $this->retry['pages']['success'] = array();
            $this->retry['retries'] = 0;
        }

        return $num_obj;
    }

    private function prepareScraper( $scrp, $index ) {
        R::selectDatabase( 'config' );
        //initiate the scraper instance
        $scraper = new Scraper( $scrp );

        //TODO: review this, review scraper dbobjs (+resume fields)
        //  do it when completing the resume feature

        //create a new scrape db obj
        $scrpdb = R::dispense( 'omScraper' );
        //$scrpdb->suite = ( is_array($scrp['suite']) ) ? json_encode( $scrp['suite'] ) : $scrp['suite'];
        $scrpdb->suite = $scraper->suite_name;
        $scrpdb->entity = @$scrp['store']['entity'];
        $scrpdb->date = date('Y-m-d H:i:s');
        $scrpdb->try = $this->retry['retries']; //retry pointer of the scrape
        R::store($scrpdb);

        //store crawler resume pointer
        $this->saveDbObj('scraperStart', $scrpdb);

        $scraper->dbobj = $scrpdb;

        //prepare the scraper
        $this->prepareQuery( $scraper );
        $this->prepareSuite( $scraper );

        //todo: refactor - suite_name is only generated in prepareSuite.. improve the creation of this object and optimize the saves..!
        $scrpdb->suite = $scraper->suite_name;
        R::store($scrpdb);

        //todo: refactor - prepareProxy
        //fetch proxy data
        if ( $this->settings['proxy'] === true ) {

            $limit = $this->settings['proxy_limit'];
            $mode = $this->settings['proxy_mode'];
            $except = array();

            //sequential proxies should not repeat themselves in the same crawler
            if ( $mode == 'sequential' ) {
                //find proxies that were used on this crawl
                $except = R::getCol('SELECT proxy FROM omScraper WHERE proxy > 0 AND omCrawler_id = ?', array( $scraper->dbobj->omCrawler_id ));
            }
            //sequential_suite never repeats the proxy for the suite
            if ( $mode == 'sequential_suite' ) {
                //todo: only scrapings that worked (failed ones may repeat). but we need some way to catalogue those bad proxies.. leave them for last
                //find proxies that were used for this suite before
                $except = R::getCol('SELECT proxy FROM omScraper WHERE proxy > 0 AND suite = ?', array( $scraper->dbobj->suite ));
            }

            //grab the proxy to be used in this scrape
            $scraper->proxy = Proxy::getProxy( $limit, $mode, $except );

            //associate the proxy with the scrape
            $scrpdb->proxy = $scraper->proxy->id;
            R::store($scrpdb);

            //agent defined by proxy
            if ( $scraper->proxy instanceof RedBean_OODBBean && !empty( $scraper->proxy->agent ) ) {
                //note: prepareSuite() has already run at this point, so this override only takes effect on re-scrapes
                $scraper->settings['suite']['config']['pageSettings']['userAgent'] = $scraper->proxy->agent;
            }
        //user-defined proxy (string)
        } elseif ( !empty($this->settings['proxy']) ) {

            $proxy = $this->settings['proxy'];
            if ( Proxy::checkProxy($proxy) )
                $scraper->proxy = $proxy;
            else
                $scraper->proxy = null; //proxy is dead (timeout)

        }
        return $scraper;
    }

    private function prepareSuite( &$scraper ) {
        //get scraper config array
        $scrp = $scraper->settings;
        //prepare the suite if it is a javascript array
        if (is_array( $scrp['suite'] )) {
            //basic js suite
            $suite = array(
                'url'=>'',
                'scraper'=>'function(){ return {}; }',
                'config'=> array(
                    'pageSettings'=> array(
                        'userAgent'=>'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:13.0) Gecko/20100101 Firefox/13.1',
                        'loadImages'=>false,
                        //'timeoutInterval'=>
                    )
                )
            );
            //merge the base suite with the user-defined one
            $suite = array_replace_recursive( $suite, $scrp['suite'] );
            //prepare the api query
            if (isset( $scrp['api']['caller'] )) {
                $suite['url'] = $scrp['api']['caller'];
                //$this->prepareApi($scraper);
            }
            //api/query data
            //todo: rename the variable to api and refactor the tool - unite query and api since they are part of the same scheme
            $suite['query'][] = ( isset($scrp['query']['data']) && is_array($scrp['query']['data']) ) ? $scrp['query']['data'] : array();
            $suite['query'][] = $scrp['api'];
            //pjs config
            $suitejs_config = is_array($suite['config']) ? $suite['config'] : array();
            $suitejs_config = json_encode( $suitejs_config );

            //pjs suite (without config)
            $suitejs = array_diff_assoc( $suite, array( 'config'=>array() ) );
            $suitejs = json_encode( $suitejs );
            //unquote js functions
            $suitejs = preg_replace('/:"(function.*?\})"([\,\}\]])/', ':$1$2', $suitejs);
            $suitejs = stripcslashes($suitejs);

            //parse suite options into javascript notation
            //todo: make multiple suites per scrape possible (multiple addSuite) (arrayfy the suite var in the json config)
            $suitejs = "pjs.addSuite($suitejs);";
            $suitejs_config = "pjs.config($suitejs_config);";
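            // Illustrative result (hypothetical suite): the generated inject looks like
            //   pjs.addSuite({"url":"http://example.com","scraper":function(){ return {}; }});
            //   pjs.config({"pageSettings":{"loadImages":false}});
            // note how the regex above stripped the quotes around the function literal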

            //print_r($suite);
            //exit;
            $scraper->suite = json_encode($suite);
            $scraper->suitejs = $suitejs . $suitejs_config;
            if ( !$scraper->suite_name )
                $scraper->suite_name = @(is_array( $scrp['suite']['url'] )) ? implode(',', $scrp['suite']['url']) : $scrp['suite']['url'] ;
        } else {
            //string suite - a file containing the js suite
            $cmd = "phantomjs core/js/pjs-wrapper.js $scrp[suite]";
            //grab the suite for parsing
            $output = shell_exec($cmd);
            $json = json_decode($output, true);
            //var_dump($output); exit;
            //todo: catch the parse error on null json

            $suite_bkp = $scraper->settings['suite'];
            $scraper->settings['suite'] = $json[0][0];
            $scraper->settings['suite']['config'] = $json[1];
            $scraper->suite_name = $scrp['suite'];
            $this->prepareSuite($scraper);

            $scraper->settings['suite'] = $suite_bkp;
        }
    }
    //todo: standardize parameter passing (instance or array?)
    //  $scrp means it is the array from settings
    //  $scraper means it is a Scraper class instance
    private function prepareQuery( &$scraper ) {
        //get scraper config array
        $scrp = $scraper->settings;
        $query = $scrp['query'];
        if ( $query ) {
            if (isset( $query['data'] ))
            {
                //user-defined query data
                $scraper->settings['query']['data'] = $query['data'];
                //todo: save to the database, associate with the crawler, etc - refactor
            }
            else {
                //todo: comment the reason for each field (move them to the settings up top)

                $database = $query['database'];
                $entity = @$query['entity'];
                $sql = ( isset($query['sql']) ? $query['sql'] : '1 = 1' );
                $values = @(array) $query['values'];
                $limit = ( isset($query['limit']) ) ? $query['limit'] : null;
                $fields = ( isset($query['fields']) ) ? $query['fields'] : array();
                $sanitize = ( isset($query['sanitize']) ) ? $query['sanitize'] : false;
                //todo: review the query pagination scheme OK
                //  initialized status also doesn't enter the count OK
                //  just counting and ordering doesn't guarantee we get the right items. we need to fetch directly or apply a rule that works. OK
                //  qryobj attached to the crawler or to the scraper? OK
                //idea: count regardless of status (i.e. run all items to the end, in order) and afterwards (or even during) pick up the items left behind with errors etc.
                //todo: after going through the whole offset, fetch the items with non-finished status
                //idea: also allow a loop of queries here (more than one dictionary)
                //fetch the proper offset for this instance
                if ( $limit ) {
                    $adapter = R::$toolboxes['config']->getDatabaseAdapter();
                    try {
                        //the offset is the amount of queries already run
                        $offset = (int) $adapter->getCell("SELECT count(1)
                                FROM omQuery q
                                INNER JOIN omScraper s
                                    ON s.id = q.omScraper_id
                                INNER JOIN omCrawler c
                                    ON c.id = s.omCrawler_id
                                    AND c.name = ?
                                GROUP BY c.name", array( $this->dbobj->name ));
                        $sql .= " LIMIT $limit OFFSET $offset";
                    } catch ( Exception $e ) {
                        //first run?
                        $sql .= " LIMIT $limit OFFSET 0";
                    }
                }
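                // Worked example (hypothetical numbers): with 'limit' = 100 and 250
                // omQuery rows already recorded for this crawler name, the count
                // above yields 250, so the query becomes "... LIMIT 100 OFFSET 250"
                // and this run picks up where the previous batches stopped.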

                if ($entity) {
                    //fetch query db objs
                    $qrydb = R::$toolboxes[$database]->getRedBean();
                    $beans = $qrydb->find( $entity, array(), array($sql, $values) );
                } else {
                    //pure sql query
                    $qryad = R::$toolboxes[$database]->getDatabaseAdapter();
                    $beans = $qryad->get($sql, $values);
                }

                //filter selected fields & transform to array
                //also sanitize query fields
                $queries = array();
                foreach ( $beans as $row ) {
                    if ( $row instanceof RedBean_OODBBean )
                        $row = $row->export();
                    if ($sanitize) {
                        foreach ($row as $key=>$val)
                            $row[$key] = Convert::sane_text($val);
                    }
                    if ( count($fields) > 0 ) {
                        $fields[] = 'id';
                        $fields = array_unique($fields);
                        //it's mandatory to have an id field to identify the query obj
                        $queries[] = array_intersect_key($row, array_flip($fields));
                    } else {
                        $queries[] = $row;
                    }
                }
                //set the scraper query data array
                $scraper->settings['query']['data'] = $queries;
                //arrange the query array to be saved to the database
                $query_rows = array_map(function($row) {
                    return array(
                        'data' => json_encode($row),
                        'status' => 'i',
                    );
                }, $queries);
                //statuses: i: initialized, e: error, f: finished

                R::selectDatabase('config');
                //associate query beans with this scraper
                $query_beans = array();
                foreach ( $query_rows as $row ) {
                    $bean = R::dispense('omQuery');
                    $bean->import($row);
                    $query_beans[] = $bean;
                }
                $scraper->dbobj->ownOmQuery = $query_beans;
                //save query config objs to db
                R::storeAll($query_beans);
                R::store($scraper->dbobj);
            }
        } else {
            $scraper->settings['query'] = null;
        }
        //the case in which the user specifies the query data
        //returns formatted fields
    }

    //todo: prepare API parameters such as caller, destiny and others via php-array
    private function prepareApi( &$scraper ) {
        //print_r($scraper);
        $scraper->suite['url'] = $scraper->settings['api']['caller'];
    }
    private function runPhantom( $scraper ) {
        Log::time('phantom');
        $params = array(
            'load-images' => 'no',
            'max-disk-cache-size' => '100101',
        );

        //Define the proxy phantom setup
        $proxy = $scraper->proxy;
        if ( $proxy instanceof RedBean_OODBBean ) {
            $params = array_replace( $params, array('proxy' => $proxy->server,
                                                    'proxy-type' => $proxy->type)
            );
        } elseif ( !empty($proxy) ) {
            $params = array_replace( $params, array('proxy' => $proxy,
                                                    'proxy-type' => 'http')
            );
        }

        //configure the suite to be sent
        //todo: if the string is too long, create a temporary file!! | $ getconf ARG_MAX
        //  expr `getconf ARG_MAX` - `env|wc -c` - `env|wc -l` \* 4 - 2048
        //  http://www.in-ulm.de/~mascheck/various/argmax/
        if ( isset($scraper->suitejs) )
            $suite = $scraper->suitejs;
            //$suite = escapeshellarg($scraper->suitejs);
        else
            $suite = $scraper->suite;
        //print_r($scraper->suitejs);
        //create a temporary suite file
        $fname = "/tmp/". md5($scraper->suite_name . time());
        $hdl = fopen("$fname", 'w');
        fwrite($hdl, $suite);
        fclose($hdl);

        $pjscrape = 'core/pjscrape/pjscrape.js';

        $cmd = "phantomjs --". array_implode('=', ' --', $params) ." $pjscrape $fname";
        //echo $cmd, "\n\n"; exit;
        //todo: find a way to: 1) get the exec result in real time; 2) associate the phantom pid with the php one; 3) and thus try to find stalled processes and provide a solution/debug/retry..
        //ref: http://stackoverflow.com/questions/1281140/run-process-with-realtime-output-in-php
        //exec the phantom command and get outputs
        Log::debug('Crawler/Phantomjs/cmd', $cmd );
        $output = shell_exec($cmd);
        //delete the temporary file
        //unlink($fname);
        Log::timeEnd('phantom');

        //Log::debug('Crawler/Phantom', $output ."\n" );
        return $output;
    }


    function __destruct() {
        //close databases
        foreach ( R::$toolboxes as $toolbox ) {
            $toolbox->getDatabaseAdapter()->close();
        }

        //save end time
        if (isset( R::$toolboxes['config'] ))
            $this->saveDbObj('crawlerEnd');

        Log::timeEnd('crawler');
    }
}
Class Scraper {
    //scraper configuration
    public $settings = array(
        'suite' => array(), //complete suite configuration
        'format' => array(), //row formatting before store
        'store' => array(), //database store settings
        'query' => array(), //data query settings
        'api' => array(), //api query settings
    );

    //modified data ids, log and collected data
    public $ids = array();
    private $numitems, $log, $data=array();

    //scraped pages stack
    public $pages = array(
        'error'=>array(),
        'success'=>array(),
    );
    public $query = array(
        'error'=>array(),
        'success'=>array(),
    );

    //db obj instance and proxy instance/server
    public $dbobj = null, $proxy;
    //suite string and javascript inject
    public $suite, $suitejs;
    //suite readable name
    public $suite_name='';

    function Scraper( $config, $output=null ) {
        $this->settings = array_replace_recursive( $this->settings, $config );
        //Log::debug('Scraper/settings/json', json_encode($this->settings));

        if ($output)
            $this->load( $output );
    }

    //loads and parses the output
    function load( $outputstr, $merge=false ) {
        $itemscount = 0;

        //loads clean json
        $output = $this->loadPjscrape( $outputstr );
        //$json = json_clean( $output );
        //$data = json_decode($json, true);
        $data = json_decode($output, true);

        Log::debug('Scraper/load', "Parsed Json Count: ".count($data) );

        //json error catch
        if (is_null( $data )) {
            // Define the errors.
            $constants = get_defined_constants(true);
            $json_errors = array(
                JSON_ERROR_NONE => 'No error has occurred',
                JSON_ERROR_DEPTH => 'The maximum stack depth has been exceeded',
                JSON_ERROR_CTRL_CHAR => 'Control character error, possibly incorrectly encoded',
                JSON_ERROR_SYNTAX => 'Syntax error',
            );
            Log::debug('Scraper/output', $outputstr);
            Log::debug('Scraper/parsejson', $output/*$json*/);
            Log::halt( "Error while parsing scraper data\nLast error: ". $json_errors[ json_last_error() ], 'Scraper');
        }

        //todo: use merge also in the store method (for the class, obviously)?
        //  reconcile the use of the two alternatives (validate)
        if ( $merge )
            $this->data = array_merge_recursive( $this->data, $data );
        else
            $this->data = $data;
    }

    //clean up pjscrape output by parsing its log lines
    function loadPjscrape( $output ) {
        //pjs log lines (all, and errors only)
        $log = array(); $logerr = array();
        //scraped pages, split into success and failure
        $pages = array(
            'success' => array(),
            'error' => array(),
        );
        //query items, split into success and failure
        $query = array(
            'success' => array(),
            'error' => array(),
        );
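        // Illustrative raw output (hypothetical URLs) that the callbacks below pick apart:
        //   * Scraping http://example.com/page1
        //   ERROR: Page did not load (status=fail): http://example.com/page2
        //   * CLIENT: Item Failed: {"id":7}
        //   [{"title":"..."}]
        // the log lines are stripped, leaving only the JSON payload to return.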

        //common log callback
        $logcb = function( $mtch ) use ( &$log, &$pages ) {
            $log[] = $mtch[0];

            //store successful pages
            preg_match( '/Scraping (.+)$/', $mtch[0], $m );
            if ( count($m) == 2 ) {
                $pages['success'][] = $m[1];
            }

            return '';
        };
        //error/alert log callback
        $errcb = function( $mtch ) use ( &$log, &$logerr, &$pages ) {
            $logerr[] = $mtch[0];
            $log[] = $mtch[0];
            //Until now, we have in pjs:
            //3 ERRORS
            //- Page did not load (status)*
            //- Page not found (404) -> sad, but nothing to do
            //- Page error code (status)*
            //2 ALERTS
            //- Timeout after (waitFor)* (todo, check)
            //- phantom->page.onAlert
            //
            //- * we must care about these
            //TODO: store error codes, log

            preg_match( '/Page did not load \(status=(.+)\): (.+)$/', $mtch[0], $merr );
            if ( count($merr) == 3 ) {
                $pages['error'][] = $merr[2];
            }
            preg_match( '/Page error code (.+) on (.+)$/', $mtch[0], $merr );
            if ( count($merr) == 3 ) {
                $pages['error'][] = $merr[2];
            }
            return '';
        };
        //todo: callback for FATAL ERROR (fail function)
        //client/console origin messages
        $clicb = function( $mtch ) use ( &$log, &$query ) {
            $log[] = $mtch[0];
            //store failed query items
            preg_match( '/Item Failed: (.+)$/', $mtch[0], $m );
            if ( count($m) == 2 ) {
                $query['error'][] = $m[1];
            }
            return '';
        };

        $output = preg_replace_callback('/^(ERROR|!).*$/im', $errcb, $output); //pjs > log.err && log.alert
        $output = preg_replace_callback('/^(\* CLIENT: ).*$/m', $clicb, $output);
        $output = preg_replace_callback('/^\*.*$/m', $logcb, $output); //pjs > log.msg
        //$output = preg_replace_callback('/^(!|CLIENT|Timeout).*$/im', $errparse, $output); //pjs ERROR: OUTPUT
        //$output = preg_replace_callback('/(\* Saved (\d+) items\n)/', $itemscb, $output);
        $output = preg_replace('/\n/', '', $output); //strip newlines

        $this->pages = $pages;
        $this->query = $query;

        return $output;
    }

    //store data into the database
    function store( $data=null, $merge=false ) {
        if ($data) {
            if ( $merge )
                $this->data = array_merge_recursive( $this->data, $data );
            else
                $this->data = $data;
        }

        Log::time('store');

        R::selectDatabase( $this->settings['store']['database'] );

        if ( count((array) $this->data) > 0 ) {
            //todo: transaction
            //R::begin();

            //import the output data, checking for duplicate items
            $bean = $this->settings['store']['entity'];
            $unique = @$this->settings['store']['unique'];
            $rows = array();

            //todo: multi-field unique (array.foreach)

            foreach ( $this->data as $datarow ) {
                $row = null;
                $findp = array( $bean ); //find params

                //search for unique / no duplicate
                if ( $unique && isset($datarow[$unique]) ) {
                    $findp[] = "$unique = ? ORDER BY id DESC";
                    $findp[] = array( $datarow[$unique] );
                }
                $row = call_user_func_array('R::findOrDispense', $findp);
                $row = reset($row); //get first

                //datarow insert/update
                $row->import( $datarow );

                //format the row using user-defined actions
                $this->formatRow( $row );

                $rows[] = $row;
            }

            //store rows and keep the ids
            $ids = R::storeAll($rows);
            $this->ids = array_merge( $this->ids, $ids );

            //commit changes
            //R::commit();

            Log::timeEnd('store');

            return count($ids);

        } else {
            Log::out( "Error while storing data. No data present" );
        }
    }
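    // Illustrative store settings (hypothetical entity/field names) as they would
    // appear in the scraper config:
    //   "store": { "database": "mydb", "entity": "item", "unique": "url" }
    // with "unique" set, an existing row with the same url is updated instead of duplicated.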

    private function formatRow( &$row ) {
        try {
            foreach ( (array) @$this->settings['format'] as $rowfmt ) {
                eval( "$rowfmt;" );
            }
        } catch( Exception $e ) {
            Log::out( "Error formatting scrape data\n". $e->getMessage(), 'Scraper/format', 2 );
        }
    }
}
Class Proxy {

    static function checkProxy( $host_port, $timeout=10 ) {
        list($host, $port) = explode(':', $host_port);
        $fsock = @fsockopen($host, $port, $errno, $errstr, $timeout);

        if ( ! $fsock ) {
            return FALSE;
        } else
            return TRUE;
        //TODO: also trial actually using the proxy: will it open our page (scrape)?
    }
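    // Usage sketch (hypothetical address): returns true only if something accepts
    // a TCP connection on that host:port within the timeout:
    //   Proxy::checkProxy('1.2.3.4:8080', 5);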

    //todo: associate the proxy with a user agent, so a server always shows the same agent name
    //  make an agents list and randomly match it with the proxy
    //  on the scrape!!
    static function getProxy( $maxhours, $mode='aleatory', $except=array() ) {
        R::selectDatabase('config');
        $sql = '1';
        $params = array();
        //limit proxies to those seen online within the last $maxhours hours
        if ($maxhours > 0) {
            $date_limit = time() - ( 60*60*$maxhours );
            $sql = 'date > ? ';
            $params[] = $date_limit;
        }

        //proxy consumption sequence mode
        if ($mode == 'aleatory') {
            $sql .= ' ORDER BY RANDOM()';
        } else { //sequential and sequential_suite
            //$except = array_filter($except);
            if ( count($except) > 0 )
                $sql .= ' AND id NOT IN ('. implode(',', $except) .')';
            $sql .= ' ORDER BY date DESC';
        }
        //match the proxy
        $proxy = R::findOne('proxy', $sql, $params);
        if (!$proxy)
            throw new Exception('Proxy not found');

        //check whether it is a valid proxy
        if ( ! self::checkProxy($proxy->server) ) {
            $proxy->fail = ((int) $proxy->fail)+1;
            R::store($proxy);

            Log::out('Failed to resolve '. $proxy->server, 'Proxy/check');

            if ($proxy->fail >= 5)
                R::trash( $proxy );
            //search for another proxy (online)
            return self::getProxy( $maxhours, $mode, $except );
        }
        //renew proxy date/time
        $proxy->date = time();
        R::store($proxy);
        return $proxy;
    }
}
Class Log {
    //todo: change the args order to match the debug func - ($hierarchy, $msg, $level)
    static function out( $msg, $hierarchy='Crawler', $writelevel=1 ) {
        $output = array();

        //find out the message level from the hierarchy supplied (depth = number of path segments)
        $level = substr_count($hierarchy, '/') + 1;

        $output[] = "######################################";
        $output[] = "# $hierarchy <".date('Y-m-d H:i:s').">";
        $output[] = "####";

        //convert msg to an array of lines
        if ( !is_array($msg) )
            $msg = preg_split('/\n/', "$msg\n");

        $output = array_merge($output, $msg);
        //indent each line according to its level
        for ($x=0; $x < count($output); $x++) {
            $line = $output[$x];
            $line = str_pad( $line, strlen($line) + $level*2, ' ', STR_PAD_LEFT );
            $output[$x] = $line;
        }

        self::store( $output );
        //should this log be written?
        if ( (int) $writelevel >= $level )
            self::write( $output );
    }
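    // Illustrative call: with a writelevel of 2 this prints, because the hierarchy
    // "Crawler/Scraper #0" is two levels deep (level 2 <= writelevel 2):
    //   Log::out('No data to query', 'Crawler/Scraper #0', 2);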

    static function write( $output ) {
        //echo the msg
        echo implode( "\n", $output ), PHP_EOL;
    }

    static function halt( $msg ) {
        Log::out( $msg );
        die;
    }

    static function store($lines) {
        //store crawler logs
        global $crawler_logs;
        $crawler_logs = array_merge( (array) $crawler_logs, (array) $lines );
    }

    static function time( $label ) {
        //stores the current time in a global
        global $log_timer;
        $log_timer[$label] = microtime(true);
    }
    static function timeEnd( $label ) {
        //shows the difference from time()
        global $log_timer;
        if (isset( $log_timer[$label] )) {
            $total = microtime(true) - $log_timer[$label];
            self::debug("Timer $label", "$label lasted $total seconds" );
            unset( $log_timer[$label] );
        }
    }
    static function debug( $section, $msg ) {
        global $debuglevel;
        self::out( $msg, $section, $debuglevel );
    }
}