/boots/scrape_boots.pl

http://scrapeshops.googlecode.com/ · Perl · 170 lines · 99 code · 45 blank · 26 comment · 17 complexity · 7d2a28b54d82241dca7d814d815ba40e MD5 · raw file

  1. use strict;
  2. use warnings;
  3. ###############################################################################################
  4. # TO DO:
  5. ###############################################################################################
  6. # - Try to make XPath work
  7. ###############################################################################################
  8. use WWW::Mechanize;
  9. use WWW::Mechanize::FormFiller;
  10. use URI::URL;
  11. use HTML::TreeBuilder;
  12. #use HTML::TreeBuilder::XPath;
  13. use Data::Dumper;
  14. my $dir = 'data';
  15. mkpath($dir) if ! -d $dir;
  16. my $file_base = "$dir/boots_";
  17. my $file_ext = ".html";
  18. # PAGE 1: http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?storeId=10052&catalogId=11051&langId=-1&errorViewName=StoreLookupView&requiredAction=locateNearestStores&displayView=StoreLookupResultsView&startIndex=0&currentPage=0&postcode=&town=london&submitLookupStore.x=38&submitLookupStore.y=7
  19. # PAGE 2: http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?selectedServiceIdsString=&requiredAction=locateNearestStores&catalogId=11051&errorViewName=StoreLookupResultsView&displayView=StoreLookupResultsView&postcode=&currentPage=1&langId=-1&town=london&startIndex=10&storeId=10052&cgiRequestType=placesCGI
  20. #my $form_url = 'http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?requiredAction=displayStoreLookupPage&displayView=StoreLookupView&langId=-1&storeId=10052&catalogId=11051';
  21. my $url = 'http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?storeId=10052&catalogId=11051&langId=-1&errorViewName=StoreLookupView&requiredAction=locateNearestStores&displayView=StoreLookupResultsView&startIndex=0&currentPage=0&postcode=&town=london&submitLookupStore.x=38&submitLookupStore.y=7';
  22. my $startIndex = 0;
  23. my $currentPage = 0;
  24. my @list = ();
  25. my $results_file = "results.tsv.txt";
  26. open(RESULTS,'>',$results_file) || die "cant open $results_file for writing";
  27. select RESULTS; $| = 1;
  28. select STDOUT; $| = 1;
  29. ###########################################
  30. # start
  31. ###########################################
  32. foreach my $c (1..100) {
  33. warn "getting page $c...\n";
  34. my $file = $file_base . $currentPage . $file_ext;
  35. # download page
  36. &getPage( file => $file, url => $url );
  37. sleep 1;
  38. # parse page
  39. my $tree = HTML::TreeBuilder->new;
  40. $tree->parse_file($file);
  41. # <script type="text/javascript">
  42. my @scripts = $tree->look_down('_tag', 'script', sub { defined $_[0] && defined $_[0]->attr('type') && $_[0]->attr('type') eq "text/javascript" });
  43. my $storelist;
  44. foreach my $script (@scripts) {
  45. my $html = $script->as_HTML;
  46. #warn "script: " . $html;
  47. if ($html =~ m{MMIsSupportedBrowser}) {
  48. $storelist = $html;
  49. last;
  50. }
  51. }
  52. my @stores = split(/createMarker/, $storelist);
  53. shift @stores; # header row
  54. foreach my $store (@stores) {
  55. next unless $store =~ m{<div class="adr">};
  56. #my ($postcode) = $address =~ m/\b([A-PR-UWYZ][A-HK-Y0-9][A-HJKSTUW0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2})\b/g;
  57. my ($lat) = $store =~ m{var latitude = "([^"]+?)"};
  58. my ($long) = $store =~ m{var longitude = "([^"]+?)"};
  59. my ($name) = $store =~ m{infoHtml \+=\s+'<h2>([^>]+?)</h2>'};
  60. my ($address1) = $store =~ m{<span class="address-line1">([^>]+?)<};
  61. my ($address2) = $store =~ m{<span class="address-line2">([^>]+?)<};
  62. my ($address3) = $store =~ m{<span class="address-line3">([^>]+?)<};
  63. my ($postcode) = $store =~ m{<span class="postal-code">([^>]+?)<};
  64. my ($phone) = $store =~ m{<div class="tel">[^>]+?<span class="key">Phone:</span>[^>]+?<span class="value">([^>]+?)</span>};
  65. # warn "Checking: $store\n";
  66. my $item = {
  67. name => ($name || "null"),
  68. address1 => ($address1 || "null"),
  69. address2 => ($address2 || "null"),
  70. address3 => ($address3 || "null"),
  71. postcode => ($postcode || "null"),
  72. phone => ($phone || "null"),
  73. latitude => ($lat || "null"),
  74. longitude => ($long || "null"),
  75. };
  76. # warn "found item: " . Dumper($item);
  77. push @list, $item;
  78. print "writing item: " . Dumper($item);
  79. print RESULTS join( "\t",
  80. $item->{name}, $item->{latitude}, $item->{longitude},
  81. $item->{address1}, $item->{address2}, $item->{address3},
  82. $item->{postcode}, $item->{phone}
  83. ) . "\n";
  84. }
  85. # increment page
  86. $currentPage += 1;
  87. $url =~ s{currentPage=\d+}{currentPage=$currentPage};
  88. $startIndex += 10;
  89. $url =~ s{startIndex=\d+}{startIndex=$startIndex};
  90. }
  91. ###########################################
  92. # finish
  93. ###########################################
  94. close(RESULTS);
  95. exit;
  96. ###########################################
  97. # subs
  98. ###########################################
  99. sub clean_up {
  100. my $s = shift;
  101. $s =~ s/^\s+//g;
  102. $s =~ s/\s+$//g;
  103. return $s;
  104. }
  105. sub getPage {
  106. my %p = @_;
  107. my $file = $p{file};
  108. my $url = $p{url};
  109. if (! -f $file) {
  110. warn "getting url\n";
  111. my $agent = WWW::Mechanize->new( autocheck => 1 );
  112. $agent->get($url);
  113. if (! $agent->success() ) {
  114. die "failed to get $url, stopping.\n";
  115. }
  116. warn "saving page\n";
  117. my $first_page = $agent->content;
  118. open(PAGE,'>',$file) || die "cant write to $file: $!";
  119. print PAGE $first_page;
  120. close(PAGE);
  121. }
  122. else {
  123. warn "reading local file $file\n";
  124. }
  125. }