PageRenderTime 75ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/dev/saitos/magnitola.ru/engine.pl

http://myperlmodules.googlecode.com/
Perl | 568 lines | 407 code | 119 blank | 42 comment | 40 complexity | fc0650dea3512ab89d81829337bda669 MD5 | raw file
  1. use threads;
  2. use threads::shared;
  3. use strict;
  4. use warnings;
  5. use Encode 'decode';
  6. use Error qw(:try);
  7. use LWP::Simple 'get';
  8. use LWP::UserAgent;
  9. use HTML::TreeBuilder::XPath;
  10. use Thread::Queue;
  11. use Thread::Semaphore;
  12. use URI;
  13. use lib ("/work/perl_lib");
  14. use ISoft::Conf;
  15. use ISoft::DB;
  16. use ISoft::Exception;
  17. use ISoft::Exception::DB;
  18. use ISoft::Exception::NetworkError;
  19. use ISoft::Exception::ScriptError;
  20. use DB_Member;
  21. use Parsers;
  # ---------------------------------------------------------------------
  # Configuration.
  # Every setting is copied out of %constants into a ":shared" package
  # variable so that the worker threads can read it.
  # NOTE(review): %constants is not declared in this file; presumably it is
  # exported by ISoft::Conf -- confirm.
  # ---------------------------------------------------------------------

  # database connection settings
  our $db_name:shared = $constants{Database}{DB_Name};
  our $db_user:shared = $constants{Database}{DB_User};
  our $db_pass:shared = $constants{Database}{DB_Pass};

  # crawler behaviour
  our $site_root:shared = $constants{General}{Site_Root};
  our $process_categories_once:shared = $constants{Category}{Process_Once};
  our $product_has_many_pictures:shared = $constants{Product}{Many_Pictures};
  our $product_description_has_pictures:shared = $constants{Product}{Description_With_Pictures};
  our $files_directory:shared = $constants{General}{Files_Directory};

  # debug output switches
  our $echo_url:shared = $constants{Debug}{Echo_Url};
  our $echo_time:shared = $constants{Debug}{Echo_Time};
  our $echo_stat:shared = $constants{Debug}{Echo_Statistic};

  # sum of error weights after which a member is marked as failed
  our $allow_fails:shared = $constants{General}{Allow_Fails};

  # proxy rotation: queue of usable proxies plus a per-proxy failure counter
  our $use_proxy:shared = $constants{Proxy}{Use_Proxy};
  our %proxy_registry:shared;
  our $qProxy:shared;

  # user agent: either one fixed name or a list to pick from at random
  our $useragent:shared = '';
  our @agent_list:shared;

  # auxiliary variables
  # set to 1 to ask the main loop and the workers to stop
  our $break_application:shared = 0;

  local $SIG{INT} = sub{
      # Interrupt handler. The first Ctrl+C only raises the stop flag, so
      # that start() can wait for the busy workers and finalize() can run.
      # The handler used to call exit right away, which made the flag (and
      # the "please wait" message) meaningless and skipped finalization.
      # A second Ctrl+C still terminates the process immediately.
      if ($break_application){
          print "Break signalled again, exiting immediately\n";
          exit;
      }
      print "Break signalled, please wait for finish\n";
      $break_application = 1;
  };

  # begin work

  # the three lines below do not make sense but allows to avoid a strange error in threads
  my $dbhx = get_dbh();
  $dbhx->rollback();
  $dbhx->disconnect();

  # $queue carries members to the workers; $sem counts the busy workers
  my $queue = Thread::Queue->new();
  my $sem = Thread::Semaphore->new(0);

  init();
  make_threads($queue, $sem);
  start($queue, $sem);
  finalize();
  exit;
  61. ############################ File functions ############################
  # Stores the body of a downloaded file on disk and persists the member.
  #   $dbh        - database handle passed on to update()
  #   $member_obj - member describing the file; its ID becomes the file name
  #   $response   - HTTP response whose content is written out
  sub process_file {
      my ($dbh, $member_obj, $response) = @_;

      # files live directly inside $files_directory and are named by member ID
      my $target = join '/', $files_directory, $member_obj->ID;
      save_file($target, $response);

      $member_obj->update($dbh);
  }
  # Writes the raw content of a response into a file, in binary mode.
  #   $name - path of the file to create (an existing file is overwritten)
  #   $resp - object providing content()
  # Throws ISoft::Exception::ScriptError when the file cannot be created,
  # written or closed (buffered write errors only surface at close).
  sub save_file {
      my ($name, $resp) = @_;

      # lexical handle: closed automatically if an exception leaves the sub
      open my $out, '>', $name
          or ISoft::Exception::ScriptError->throw(message=>"Error creating file: $!");
      binmode $out;

      print {$out} $resp->content()
          or ISoft::Exception::ScriptError->throw(message=>"Error writing file $name: $!");

      close $out
          or ISoft::Exception::ScriptError->throw(message=>"Error closing file $name: $!");

      return;
  }
  76. ############################ Category functions ############################
  # Registers the sub categories found on a category page as new members.
  #   $dbh        - database handle used for the inserts
  #   $member_obj - the parent category
  #   $tree       - parsed page of the parent category
  # Every new member is a READY category one level below its parent.
  sub process_subcategories {
      my ($dbh, $member_obj, $tree) = @_;

      my $parent_level = $member_obj->Level();

      # get_categories() is provided by Parsers.pm
      my $found = get_categories($tree, $parent_level);

      my $parent_id = $member_obj->ID;
      my $child_level = $parent_level;
      $child_level += 1;

      for my $fields (@{$found}) {
          my $child = DB_Member->new;
          $child->Member_ID($parent_id);
          $child->Level($child_level);
          $child->Type($DB_Member::TYPE_CATEGORY);
          $child->Status($DB_Member::STATUS_READY);
          # parser supplied fields are applied last
          $child->setByHash($fields);
          $child->insert($dbh);
      }
  }
  # Handles one page of a category.
  #   $dbh        - database handle
  #   $member_obj - the category being processed
  #   $response   - HTTP response holding the page
  # Sub categories are collected from the first page only, products from
  # every page. When the parser reports a next page, the member is put back
  # to READY with its page counter and NextURL advanced.
  # Exceptions raised by the parsers or the database layer are re-thrown
  # unchanged, but only after the parsed tree has been released.
  sub process_category {
      my ($dbh, $member_obj, $response) = @_;

      my $tree = get_tree($response);

      # The caller survives exceptions and goes on with further members, so
      # the tree must be deleted on the failure path as well.
      my $completed = eval {
          my $page = $member_obj->Page();

          if($page==1){
              # only first page might contain sub categories.
              process_subcategories($dbh, $member_obj, $tree);
          }

          process_products($dbh, $member_obj, $tree);

          # call a function from Parsers.pm
          if(my $nextpage = get_next_page($tree)){
              $member_obj->Page($page+1);
              $member_obj->NextURL($nextpage);
              $member_obj->Status($DB_Member::STATUS_READY);
          }

          $member_obj->update($dbh);
          1;
      };
      my $failure = $@;

      $tree->delete();

      # re-throw the very same exception object so that the class based
      # catch clauses of the caller still match
      die $failure unless $completed;

      return;
  }
  112. ############################ Product functions ############################
  # Registers the products listed on a category page as new members.
  #   $dbh        - database handle used for the inserts
  #   $member_obj - the category owning the products
  #   $tree       - parsed category page
  sub process_products {
      my ($dbh, $member_obj, $tree) = @_;

      # get_products() is provided by Parsers.pm
      my $found = get_products($tree);
      my $owner_id = $member_obj->ID;

      for my $fields (@{$found}) {
          my $item = DB_Member->new;
          $item->Member_ID($owner_id);
          $item->Type($DB_Member::TYPE_PRODUCT);
          $item->Status($DB_Member::STATUS_READY);
          # parser supplied fields are applied last
          $item->setByHash($fields);
          $item->insert($dbh);
      }
  }
  # Handles a product page.
  #   $dbh        - database handle
  #   $member_obj - the product being processed
  #   $response   - HTTP response holding the page
  # The product fields returned by the parser are stored in the member, and
  # every product picture is registered as a new READY member.
  # Exceptions raised by the parsers or the database layer are re-thrown
  # unchanged, but only after the parsed tree has been released.
  sub process_product {
      my ($dbh, $member_obj, $response) = @_;

      my $tree = get_tree($response);

      # The caller survives exceptions and goes on with further members, so
      # the tree must be deleted on the failure path as well.
      my $completed = eval {
          # call a function from Parsers.pm
          my $info = get_product_info($tree);

          if($product_description_has_pictures && exists $info->{Description}){
              # pictures embedded in the description become members as well
              $info->{Description} = process_description_pictures($dbh, $member_obj, $info->{Description});
          }

          $member_obj->setByHash($info);
          $member_obj->update($dbh);

          my $member_id = $member_obj->ID;

          # call a function from Parsers.pm
          my @pictures = get_product_picture($tree);
          if($product_has_many_pictures){
              push @pictures, @{ get_product_additional_pictures($tree) };
          }

          foreach my $picture (@pictures){
              my $new_member_obj = DB_Member->new;
              $new_member_obj->Member_ID($member_id);
              $new_member_obj->Type($DB_Member::TYPE_PICTURE);
              $new_member_obj->Status($DB_Member::STATUS_READY);
              $new_member_obj->setByHash($picture);
              $new_member_obj->insert($dbh);
          }
          1;
      };
      my $failure = $@;

      $tree->delete();

      # re-throw the very same exception object so that the class based
      # catch clauses of the caller still match
      die $failure unless $completed;

      return;
  }
  # Registers the pictures embedded in a product description.
  #   $dbh        - database handle used for the inserts
  #   $member_obj - the product owning the description
  #   $text       - HTML fragment of the description
  # Every <img> becomes a READY description-picture member, and the ID of
  # that member is written into a "member" attribute of the tag.
  # Returns the re-serialised description with whitespace collapsed.
  # NOTE(review): only element children of the wrapper are serialised (the
  # XPath step "/*" does not match text nodes), so text placed directly at
  # the top level of the description is not part of the result -- confirm
  # that this is intended.
  sub process_description_pictures {
      my ($dbh, $member_obj, $text) = @_;

      my $tree = HTML::TreeBuilder::XPath->new;
      my $html = '';

      # insert() may throw; the tree has to be released in that case too
      my $completed = eval {
          # wrap the fragment into a complete document (the closing tag used
          # to be written as a malformed "</html")
          $tree->parse_content("<html><head><title>1</title></head><body><span id='contentholder'>$text</span></body></html>");

          my @picnodes = $tree->findnodes( q{//img} );
          my $member_id = $member_obj->ID;
          foreach my $node (@picnodes){
              my $pic_member_obj = DB_Member->new;
              $pic_member_obj->Member_ID($member_id);
              $pic_member_obj->URL($node->attr('src'));
              $pic_member_obj->Type($DB_Member::TYPE_DESCRIPTION_PICTURE);
              $pic_member_obj->Status($DB_Member::STATUS_READY);
              $pic_member_obj->insert($dbh);
              # link the tag to the member that was just created
              $node->attr('member', $pic_member_obj->ID);
          }

          my @cnodes = $tree->findnodes( q{/html/body/span[@id='contentholder']/*} );
          foreach my $cnode (@cnodes){
              $html .= $cnode->as_HTML('<>&', ' ', {});
          }
          1;
      };
      my $failure = $@;

      $tree->delete();

      # re-throw the very same exception object for the caller
      die $failure unless $completed;

      # line breaks and tabs become spaces, then runs of whitespace collapse
      $html =~ s/\r|\n|\t/ /g;
      $html =~ s/\s{2,}/ /g;

      return $html;
  }
  178. ############################ Engine functions ############################
  # Builds an XPath capable HTML tree from the decoded body of a response.
  # The caller owns the returned tree and has to delete() it.
  sub get_tree {
      my ($response) = @_;

      my $markup = $response->decoded_content();

      my $parsed = HTML::TreeBuilder::XPath->new;
      $parsed->parse_content($markup);

      return $parsed;
  }
  # Chooses the User-Agent of the given agent object.
  # A configured fixed name ($useragent) takes precedence; otherwise a random
  # entry of @agent_list is used. With neither configured the agent object
  # is left untouched.
  sub set_agent {
      my ($ua) = @_;

      return $ua->agent($useragent) if $useragent;

      my $available = @agent_list;
      return unless $available;

      # get random agent
      my $index = int rand $available;
      return $ua->agent($agent_list[$index]);
  }
  # we will use a separate function for getting response since it is helpful when we use proxy list
  #
  # Fetches $url with the given agent object.
  #   $ua      - LWP::UserAgent instance (reused by the caller between calls)
  #   $url     - address to request
  #   $noproxy - true to bypass the proxy list for this request
  # When proxies are enabled, one is taken from $qProxy for the request and
  # put back afterwards; %proxy_registry counts consecutive failures, and a
  # proxy is dropped for good after the third one.
  # Returns the successful response.
  # Throws ISoft::Exception::ScriptError when no proxy is available and
  # ISoft::Exception::NetworkError when the request fails.
  sub get_response {
      my ($ua, $url, $noproxy) = @_;

      my $proxy;
      if($use_proxy && !$noproxy){
          $proxy = $qProxy->dequeue_nb();
          ISoft::Exception::ScriptError->throw(message=>"No proxy") unless $proxy;
          $ua->proxy('http', "http://$proxy");
      } else {
          # The agent object keeps the proxy of an earlier request; without
          # this reset a "no proxy" request would silently go through it.
          $ua->proxy('http', undef);
      }

      set_agent($ua);
      my $resp = $ua->get($url);

      if($proxy){
          lock %proxy_registry;
          if($resp->is_success()){
              # mark this proxy as Ok
              $proxy_registry{$proxy} = 0;
          } else {
              # mark this proxy as Bad
              my $count = exists $proxy_registry{$proxy} ? $proxy_registry{$proxy}+1 : 1;
              $proxy_registry{$proxy} = $count;
              if($count==3){
                  delete $proxy_registry{$proxy};
                  print "Proxy $proxy was permanently removed from queue after 3 errors\n";
                  print scalar keys %proxy_registry, " proxies left\n";
                  # an empty value keeps the proxy out of the queue below
                  $proxy = '';
              }
          }
          # return to queue
          $qProxy->enqueue($proxy) if $proxy;
      }

      ISoft::Exception::NetworkError->throw(message=>"Network error")
          unless $resp->is_success();

      return $resp;
  }
  # Thread body: takes members from the queue and processes them until the
  # queue yields undef or the application is asked to stop.
  #   $queue - Thread::Queue delivering the member objects
  #   $sem   - Thread::Semaphore used as a counter of busy workers; start()
  #            waits until it is back to zero
  # A member is committed as DONE unless processing changes its status.
  # On a failure the changes are rolled back and the error weight is added
  # to the stored member, which is marked FAILED once the sum exceeds
  # $allow_fails. Script errors and unknown exceptions stop the application.
  sub worker {
      my ($queue, $sem) = @_;

      # prepare utility objects
      my $ua = LWP::UserAgent->new();
      # set reasonable timeout value
      $ua->timeout(20);

      my $dbh;

      while ( defined( my $member_obj = $queue->dequeue() ) ){
          # this worker is busy now
          $sem->up();

          # this status can be overridden
          $member_obj->Status($DB_Member::STATUS_DONE);

          my $error = 0;
          my $message = '';
          my $url;

          try {
              # get/restore database handler
              $dbh = get_dbh() unless defined $dbh;

              # processing of each member starts with getting its content.
              # if a member have Page=1 then we use URL else NextURL
              $url = $member_obj->Page()==1 ? $member_obj->URL() : $member_obj->NextURL();
              print "$url\n" if $echo_url;
              $url = URI->new($url)->abs($site_root);

              # select action
              if($member_obj->isCategory()){
                  process_category($dbh, $member_obj, get_response($ua, $url));
              } elsif($member_obj->isProduct()){
                  process_product($dbh, $member_obj, get_response($ua, $url));
              } elsif ($member_obj->isPicture() || $member_obj->isDescriptionPicture()){
                  # no proxy for static content
                  process_file($dbh, $member_obj, get_response($ua, $url, 1));
              } else {
                  throw ISoft::Exception::ScriptError(message=>"Unknown member type");
              }
          } catch ISoft::Exception::DB with {
              # not fatal, but the member should be processed again
              $error = 5; # heavy error weight
              $message = $@->longMessage();
          } catch ISoft::Exception::ScriptError with {
              # fatal for application in whole
              $message = $@->longMessage();
              $error = 1;
              $break_application = 1;
          } catch ISoft::Exception::NetworkError with {
              # not fatal, try again
              $error = 1;
              $message = $@->longMessage();
          } otherwise {
              # fatal for application in whole
              $message = $@;
              $error = 1;
              $break_application = 1;
          };

          # restore status after error
          if($error){
              # $url is still undefined when the failure happened before the
              # address was resolved (e.g. get_dbh() threw)
              my $shown_url = defined $url ? $url : '(URL not resolved yet)';
              print "\nError happened during processing of $shown_url: $message\n\n";
              try {
                  # discard changes
                  $dbh->rollback();
                  unless ($break_application){
                      # reload the stored state of the member and account
                      # for the error there
                      my $id = $member_obj->ID;
                      $member_obj = DB_Member->new;
                      $member_obj->set('ID', $id);
                      $member_obj->select($dbh);
                      my $errors = $member_obj->Errors + $error;
                      $member_obj->Errors($errors);
                      if($errors > $allow_fails){
                          $member_obj->Status($DB_Member::STATUS_FAILED);
                      }
                      $member_obj->update($dbh);
                  }
              } otherwise {
                  print "Cannot restore status after error. Going to shutdown\n";
                  $break_application = 1;
              };
          }

          $dbh->commit() unless $break_application;

          # this worker is idle again
          $sem->down();

          last if $break_application;

          threads->yield();
      }

      # Release the connection the same way the other routines of this file
      # do. Best effort only: the thread is about to end anyway.
      if (defined $dbh){
          eval {
              $dbh->rollback();
              $dbh->disconnect();
          };
      }

      return;
  }
  # Prints, for every member type present in the database, the number of
  # members with status 1, 3 and 4 (selected as "ready", "done", "failed").
  # Uses a database connection of its own.
  sub statistic {
      my $dbh = get_dbh();

      # read the existing types
      my @type_rows = ISoft::DB::do_query($dbh, sql=>'select distinct type from member', arr_ref=>1);

      # get statistic for each the type
      for my $type_row (@type_rows) {
          my $type = $type_row->[0];

          # one row with three counters: ready / done / failed
          my $sql = "\n"
              . "select * from (\n"
              . "(select count(*) as ready from member where status=1 and type=$type) as x,\n"
              . "(select count(*) as done from member where status=3 and type=$type) as y,\n"
              . "(select count(*) as failed from member where status=4 and type=$type) as z\n"
              . ")\n";

          my ($counts) = ISoft::DB::do_query($dbh, sql=>$sql, arr_ref=>1);
          print "Type $type: $counts->[0] / $counts->[1] / $counts->[2]\n";
      }

      $dbh->rollback();
      $dbh->disconnect();
  }
  # Main loop of the application: feeds the workers batch by batch.
  #   $queue - Thread::Queue read by the workers
  #   $sem   - Thread::Semaphore counting the busy workers
  # A batch holds up to 50 READY members per configured thread. The loop
  # ends when no READY member is left or when a stop was requested.
  sub start {
      my ($queue, $sem) = @_;

      BATCH: while (1) {
          # fetch the next batch over a short-lived connection
          my $dbh = get_dbh();
          my @batch = get_opened_members($dbh, $constants{General}{Threads}*50);
          $dbh->rollback();
          $dbh->disconnect();

          # nothing left to do
          last BATCH unless @batch > 0;

          if ($echo_time) {
              my $now = localtime(time);
              print "$now\n";
          }
          statistic() if $echo_stat;

          $queue->enqueue($_) for @batch;

          # Poll every five seconds until no worker is busy and the queue
          # is drained (or a stop was requested).
          WAIT: while (1) {
              sleep 5;
              lock $$sem;
              last WAIT if ($queue->pending()==0 || $break_application) && $$sem==0;
          }

          last BATCH if $break_application;
      }
  }
  # Launches the configured number of detached worker threads.
  #   $queue, $sem - handed over to every worker unchanged
  sub make_threads {
      my ($queue, $sem) = @_;

      my $wanted = $constants{General}{Threads};
      for my $number (1 .. $wanted) {
          my $thread = threads->create( 'worker', $queue, $sem );
          # nobody joins the workers
          $thread->detach();
      }
  }
  # Loads the user agent settings from the configuration.
  # $useragent receives the fixed agent name (or ''), and @agent_list the
  # entries of the optional agent list file, one agent per line.
  # Throws ISoft::Exception::ScriptError when the list file cannot be opened.
  sub init_agent {
      $useragent = exists $constants{UserAgent}{Agent_Name} ? $constants{UserAgent}{Agent_Name} : '';

      my $listname = exists $constants{UserAgent}{Agent_List} ? $constants{UserAgent}{Agent_List} : '';
      return unless $listname;

      # three-argument open: the configured name is never taken as a mode
      open my $list_fh, '<', $listname
          or ISoft::Exception::ScriptError->throw(message=>"Cannot open $listname: $!");

      while (my $line = <$list_fh>){
          chomp $line;
          # tolerate files with DOS line endings
          $line =~ s/\r\z//;
          # a blank line would otherwise end up as an empty User-Agent
          next if $line eq '';
          push @agent_list, $line;
      }

      close $list_fh;

      return;
  }
  # Fills the proxy queue from the configured proxy list file, one proxy
  # per line.
  # Throws ISoft::Exception::ScriptError when the file cannot be opened or
  # holds no proxy at all; prints a note when fewer than ten were found.
  sub init_proxy {
      $qProxy = Thread::Queue->new;

      my $listname = $constants{Proxy}{Proxy_List};

      # three-argument open: the configured name is never taken as a mode
      open my $list_fh, '<', $listname
          or ISoft::Exception::ScriptError->throw(message=>"Cannot open $listname: $!");

      while (my $line = <$list_fh>){
          chomp $line;
          # tolerate files with DOS line endings
          $line =~ s/\r\z//;
          # An empty entry must never reach the queue: get_response() treats
          # a false value from the queue as "no proxy left", which is fatal.
          next if $line eq '';
          $qProxy->enqueue($line);
      }

      close $list_fh;

      if($qProxy->pending()==0){
          ISoft::Exception::ScriptError->throw(message=>"No proxies");
      } elsif($qProxy->pending()<10){
          print "Please note that your proxy list is not enough full\n";
      }

      return;
  }
  # performs finalization
  #
  # if we are using proxy list then the %proxy_registry hash contains addresses of alive proxy servers.
  # it makes sense to store the data in order to avoid checking of proxy list after next start.
  #
  # Nothing is written when proxies are disabled or no backup file is
  # configured; the backup used to be truncated in that case. A file that
  # cannot be written is reported with a warning only, since the run itself
  # is already complete at this point.
  sub finalize {
      return unless $use_proxy;

      my $backup_name = $constants{Proxy}{Backup_List};
      return unless defined $backup_name && length $backup_name;

      my @list = keys %proxy_registry;

      my $backup_fh;
      unless (open $backup_fh, '>', $backup_name){
          warn "Cannot write proxy backup list $backup_name: $!\n";
          return;
      }

      foreach my $proxy (@list){
          print {$backup_fh} "$proxy\n";
      }

      close $backup_fh
          or warn "Cannot close proxy backup list $backup_name: $!\n";

      return;
  }
  # One-time preparation before the workers are started: files directory,
  # proxy queue, user agents, random seed and the root category record.
  # Throws ISoft::Exception::ScriptError when the files directory cannot be
  # created.
  sub init {
      # directory for files
      if (!-e $files_directory || !-d $files_directory) {
          mkdir $files_directory
              or ISoft::Exception::ScriptError->throw(message=>"Cannot create files directory: $!");
      }

      # proxy list
      init_proxy() if $use_proxy;

      init_agent();
      srand();

      # the root category is the entry point of the whole crawl
      my $dbh = get_dbh();

      my $root = DB_Member->new;
      $root->Member_ID(undef);
      $root->set('Name', $constants{General}{Root_Category_Name});
      $root->URL($constants{General}{Root_Category_Url});
      $root->Type($DB_Member::TYPE_CATEGORY);

      if ($root->checkExistence($dbh)) {
          print "Continue work\n";
      } else {
          # the root is stored only on the very first run
          print "First start\n";
          $root->Status($DB_Member::STATUS_READY);
          $root->insert($dbh);
          $dbh->commit();
      }

      $dbh->rollback();
      $dbh->disconnect();
  }
  # Opens a new MySQL connection with the configured name, user and password.
  sub get_dbh {
      my @credentials = ($db_name, $db_user, $db_pass);
      return ISoft::DB::get_dbh_mysql(@credentials);
  }
  # Returns the members whose status is READY.
  #   $dbh   - database handle
  #   $count - optional upper bound for the number of members returned
  sub get_opened_members {
      my ($dbh, $count) = @_;

      # a template member carrying the selection criteria
      my $template = DB_Member->new;
      $template->Status($DB_Member::STATUS_READY);
      if ($count) {
          $template->maxReturn($count);
      }

      my @opened = $template->listSelect($dbh);
      return @opened;
  }