/boots/scrape_boots.pl
http://scrapeshops.googlecode.com/ · Perl · 170 lines · 99 code · 45 blank · 26 comment · 17 complexity · 7d2a28b54d82241dca7d814d815ba40e MD5 · raw file
- use strict;
- use warnings;
-
- ###############################################################################################
- # TO DO:
- ###############################################################################################
- # - Try to make XPath work
- ###############################################################################################
-
- use WWW::Mechanize;
- use WWW::Mechanize::FormFiller;
- use URI::URL;
- use HTML::TreeBuilder;
- #use HTML::TreeBuilder::XPath;
- use Data::Dumper;
-
- my $dir = 'data';
- mkpath($dir) if ! -d $dir;
- my $file_base = "$dir/boots_";
- my $file_ext = ".html";
-
- # PAGE 1: http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?storeId=10052&catalogId=11051&langId=-1&errorViewName=StoreLookupView&requiredAction=locateNearestStores&displayView=StoreLookupResultsView&startIndex=0¤tPage=0&postcode=&town=london&submitLookupStore.x=38&submitLookupStore.y=7
- # PAGE 2: http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?selectedServiceIdsString=&requiredAction=locateNearestStores&catalogId=11051&errorViewName=StoreLookupResultsView&displayView=StoreLookupResultsView&postcode=¤tPage=1&langId=-1&town=london&startIndex=10&storeId=10052&cgiRequestType=placesCGI
-
- #my $form_url = 'http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?requiredAction=displayStoreLookupPage&displayView=StoreLookupView&langId=-1&storeId=10052&catalogId=11051';
- my $url = 'http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?storeId=10052&catalogId=11051&langId=-1&errorViewName=StoreLookupView&requiredAction=locateNearestStores&displayView=StoreLookupResultsView&startIndex=0¤tPage=0&postcode=&town=london&submitLookupStore.x=38&submitLookupStore.y=7';
-
- my $startIndex = 0;
- my $currentPage = 0;
-
- my @list = ();
-
- my $results_file = "results.tsv.txt";
- open(RESULTS,'>',$results_file) || die "cant open $results_file for writing";
- select RESULTS; $| = 1;
- select STDOUT; $| = 1;
-
- ###########################################
- # start
- ###########################################
-
- foreach my $c (1..100) {
- warn "getting page $c...\n";
-
- my $file = $file_base . $currentPage . $file_ext;
-
- # download page
- &getPage( file => $file, url => $url );
- sleep 1;
-
- # parse page
- my $tree = HTML::TreeBuilder->new;
- $tree->parse_file($file);
-
- # <script type="text/javascript">
- my @scripts = $tree->look_down('_tag', 'script', sub { defined $_[0] && defined $_[0]->attr('type') && $_[0]->attr('type') eq "text/javascript" });
-
- my $storelist;
- foreach my $script (@scripts) {
- my $html = $script->as_HTML;
- #warn "script: " . $html;
- if ($html =~ m{MMIsSupportedBrowser}) {
- $storelist = $html;
- last;
- }
- }
- my @stores = split(/createMarker/, $storelist);
-
- shift @stores; # header row
- foreach my $store (@stores) {
-
- next unless $store =~ m{<div class="adr">};
-
- #my ($postcode) = $address =~ m/\b([A-PR-UWYZ][A-HK-Y0-9][A-HJKSTUW0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2})\b/g;
-
- my ($lat) = $store =~ m{var latitude = "([^"]+?)"};
- my ($long) = $store =~ m{var longitude = "([^"]+?)"};
-
- my ($name) = $store =~ m{infoHtml \+=\s+'<h2>([^>]+?)</h2>'};
- my ($address1) = $store =~ m{<span class="address-line1">([^>]+?)<};
- my ($address2) = $store =~ m{<span class="address-line2">([^>]+?)<};
- my ($address3) = $store =~ m{<span class="address-line3">([^>]+?)<};
- my ($postcode) = $store =~ m{<span class="postal-code">([^>]+?)<};
-
- my ($phone) = $store =~ m{<div class="tel">[^>]+?<span class="key">Phone:</span>[^>]+?<span class="value">([^>]+?)</span>};
-
- # warn "Checking: $store\n";
-
- my $item = {
- name => ($name || "null"),
- address1 => ($address1 || "null"),
- address2 => ($address2 || "null"),
- address3 => ($address3 || "null"),
- postcode => ($postcode || "null"),
- phone => ($phone || "null"),
- latitude => ($lat || "null"),
- longitude => ($long || "null"),
- };
-
- # warn "found item: " . Dumper($item);
-
- push @list, $item;
-
- print "writing item: " . Dumper($item);
- print RESULTS join( "\t",
- $item->{name}, $item->{latitude}, $item->{longitude},
- $item->{address1}, $item->{address2}, $item->{address3},
- $item->{postcode}, $item->{phone}
- ) . "\n";
-
- }
-
- # increment page
- $currentPage += 1;
- $url =~ s{currentPage=\d+}{currentPage=$currentPage};
- $startIndex += 10;
- $url =~ s{startIndex=\d+}{startIndex=$startIndex};
-
- }
-
- ###########################################
- # finish
- ###########################################
-
- close(RESULTS);
-
-
- exit;
-
-
-
-
-
-
- ###########################################
- # subs
- ###########################################
-
- sub clean_up {
- my $s = shift;
- $s =~ s/^\s+//g;
- $s =~ s/\s+$//g;
- return $s;
- }
-
-
- sub getPage {
- my %p = @_;
- my $file = $p{file};
- my $url = $p{url};
-
- if (! -f $file) {
- warn "getting url\n";
- my $agent = WWW::Mechanize->new( autocheck => 1 );
- $agent->get($url);
-
- if (! $agent->success() ) {
- die "failed to get $url, stopping.\n";
- }
-
- warn "saving page\n";
- my $first_page = $agent->content;
- open(PAGE,'>',$file) || die "cant write to $file: $!";
- print PAGE $first_page;
- close(PAGE);
- }
- else {
- warn "reading local file $file\n";
- }
- }