/dev/_misc/yarkosvetit/parser.pl

https://github.com/Infarch/MyPerlModules · Perl · 180 lines · 114 code · 47 blank · 19 comment · 12 complexity · 461314768230431a026998c76354437a MD5 · raw file

  1. use strict;
  2. use warnings;
  3. # parse data
  4. use Error ':try';
  5. use LWP::UserAgent;
  6. use lib ("/work/perl_lib", "local_lib");
  7. use ISoft::Conf;
  8. use ISoft::ParseEngine::Agents;
  9. use ISoft::ParseEngine::ThreadProcessor;
  10. use ISoft::DBHelper;
  11. # Members
  12. use Category;
  13. use Product;
  14. use Property;
  15. use ProductPicture;
  16. # of course the classes below might be overridden
  17. use ISoft::ParseEngine::Member::File::CategoryPicture;
  18. use ISoft::ParseEngine::Member::File::ProductDescriptionPicture;
  # Script entry point: run the whole workflow implemented by parse() below.
  19. parse();
  20. # ----------------------------
  # Runs the complete parsing workflow:
  #   1. every member prototype prepares its environment (prepareEnvironment),
  #      with a commit after each one;
  #   2. check_root() makes sure the root category record exists;
  #   3. batches of work items are read from the database and handed to the
  #      ThreadProcessor until no work is left or the processor reports a stop;
  #   4. the number of failed records is printed for every member table.
  # Takes no arguments. Relies on %constants, @agents, get_dbh() and
  # release_dbh(), none of which are defined in this file (presumably they are
  # exported by the ISoft modules loaded at the top - confirm there).
  sub parse {
      # get database handler
      my $dbh = get_dbh();

      # Member prototypes. Their order is also the order in which work items
      # are fetched for each batch further down.
      my @init_list = (
          Category->new,
          Product->new,
          Property->new,
          #ISoft::ParseEngine::Member::File::CategoryPicture->new,
          ProductPicture->new,
          #ISoft::ParseEngine::Member::File::ProductDescriptionPicture->new,
          # ...
      );

      # prepare environment, committing after every member
      foreach my $init_obj (@init_list) {
          $init_obj->prepareEnvironment($dbh);
          $dbh->commit();
      }

      # at least one object should exist
      check_root($dbh);

      # release the handler
      release_dbh($dbh);

      # instantiate the ThreadProcessor
      my $tp = get_tp();

      # The constant conditions below are manual on/off switches for optional
      # features; flip 0/1 to enable or disable a feature.
      if (1) {
          # use agent list
          $tp->addAgent(@agents);
      }

      if (0) {
          # use proxy list
          my @proxylist;
          load_list('proxy.txt', \@proxylist);
          $tp->addProxy(@proxylist);
      }

      if (0) {
          # 'xxx' is a placeholder for a concrete login provider class
          my $login_obj = ISoft::ParseEngine::Login::xxx->new(
              username  => 'somename',
              password  => '******',
              login_url => 'http://www.example.com/login.php'
          );
          $tp->setLoginProvider($login_obj);
      }

      # start parsing: one iteration per batch of at most
      # $constants{Parser}{Queue} work items
      while (1) {
          my $queue_size = $constants{Parser}{Queue};

          print "Start reading DB...\n";
          my $dbhx = get_dbh();
          my @worklist;
          foreach my $workobj (@init_list) {
              # remaining free slots in this batch; '<=' also ends the loop
              # if a member returned more items than it was asked for
              my $limit = $queue_size - @worklist;
              last if $limit <= 0;

              # Members without getWorkList() contribute nothing. This guard
              # replaces 'my @tmp = ... if ...', whose behaviour is undefined.
              next unless $workobj->can('getWorkList');
              push @worklist, $workobj->getWorkList($dbhx, $limit);
          }
          release_dbh($dbhx);

          # nothing left to process
          last unless @worklist;

          print "Enqueue ", scalar @worklist, " items\n";
          $tp->enqueueMember(@worklist);
          $tp->start($constants{Parser}{Threads});

          # a true value from stop() ends the batch loop
          last if $tp->stop();
      }

      if ($tp->stop()) {
          print "Thread processor stopped!!!\n\n";
      } else {
          print "Done\n\n";
      }

      # final report: failed record count per member table
      my $fdbh = get_dbh();
      foreach my $obj (@init_list) {
          my $tbname = $obj->tablename();
          my $failed = $obj->getFailedCount($fdbh);
          print "$tbname: $failed failed records\n";
      }
      release_dbh($fdbh);
  }
  # Makes sure the root category exists in the database.
  #   $dbh - database handle used for the existence check, insert and commit
  # The root is a Category with Level 0 whose URL comes from
  # $constants{Parser}{Root_Category}; it is inserted and committed only when
  # checkExistence() reports that it is not there yet.
  sub check_root {
      my ($dbh) = @_;

      # describe the root category
      my $root_category = Category->new;
      $root_category->set('URL', $constants{Parser}{Root_Category});
      $root_category->set('Level', 0);

      # create it on first run only
      unless ($root_category->checkExistence($dbh)) {
          $root_category->insert($dbh);
          $dbh->commit();
      }
  }
  # Creates an instance of the ThreadProcessor class, passing it the database
  # name, user, password and host taken from $constants{Database}.
  # Returns the new ISoft::ParseEngine::ThreadProcessor object.
  sub get_tp {
      # Direct method call instead of the indirect 'new Class(...)' form, which
      # is parsed heuristically and can resolve to the wrong thing.
      return ISoft::ParseEngine::ThreadProcessor->new(
          dbname     => $constants{Database}{DB_Name},
          dbuser     => $constants{Database}{DB_User},
          dbpassword => $constants{Database}{DB_Password},
          dbhost     => $constants{Database}{DB_Host}
      );
  }
  # Reads a text file into an array, one element per line.
  #   $file     - path of the file to read
  #   $list_ref - reference to the array that receives the lines; new lines are
  #               appended to whatever the array already contains
  # Line endings (LF or CRLF) are removed and a UTF-8 byte order mark at the
  # very beginning of the file is stripped.
  # Returns 1 on success, 0 if the file cannot be opened.
  sub load_list {
      my ($file, $list_ref) = @_;

      # Three-argument open with a lexical handle: the file name can never be
      # interpreted as a mode or pipe, and no global bareword handle is touched.
      open my $src, '<', $file or return 0;

      my $at_file_start = 1;
      # a lexical line variable keeps the caller's $_ intact
      while (my $line = <$src>) {
          # remove LF as well as CRLF endings (chomp alone would keep the CR)
          $line =~ s/\r?\n?\z//;

          if ($at_file_start) {
              # The BOM belongs to the first line of the file, which is not
              # necessarily element 0 of a list that already had content.
              $line =~ s/^\xEF\xBB\xBF//;
              $at_file_start = 0;
          }

          push @$list_ref, $line;
      }
      close $src;

      return 1;
  }