PageRenderTime 29ms CodeModel.GetById 14ms app.highlight 13ms RepoModel.GetById 1ms app.codeStats 0ms

/boots/scrape_boots.pl

http://scrapeshops.googlecode.com/
Perl | 170 lines | 99 code | 45 blank | 26 comment | 18 complexity | 7d2a28b54d82241dca7d814d815ba40e MD5 | raw file
  1use strict;
  2use warnings;
  3
  4###############################################################################################
  5# TO DO:
  6###############################################################################################
  7# - Try to make XPath work
  8###############################################################################################
  9
 10use WWW::Mechanize;
 11use WWW::Mechanize::FormFiller;
 12use URI::URL;
 13use HTML::TreeBuilder;
 14#use HTML::TreeBuilder::XPath;
 15use Data::Dumper;
 16
 17my $dir = 'data';
 18mkpath($dir) if ! -d $dir;
 19my $file_base = "$dir/boots_";
 20my $file_ext = ".html";
 21
 22	# PAGE 1: http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?storeId=10052&catalogId=11051&langId=-1&errorViewName=StoreLookupView&requiredAction=locateNearestStores&displayView=StoreLookupResultsView&startIndex=0&currentPage=0&postcode=&town=london&submitLookupStore.x=38&submitLookupStore.y=7
 23	# PAGE 2: http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?selectedServiceIdsString=&requiredAction=locateNearestStores&catalogId=11051&errorViewName=StoreLookupResultsView&displayView=StoreLookupResultsView&postcode=&currentPage=1&langId=-1&town=london&startIndex=10&storeId=10052&cgiRequestType=placesCGI
 24
 25#my $form_url = 'http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?requiredAction=displayStoreLookupPage&displayView=StoreLookupView&langId=-1&storeId=10052&catalogId=11051';
 26my $url = 'http://www.boots.com/webapp/wcs/stores/servlet/StoreLocator?storeId=10052&catalogId=11051&langId=-1&errorViewName=StoreLookupView&requiredAction=locateNearestStores&displayView=StoreLookupResultsView&startIndex=0&currentPage=0&postcode=&town=london&submitLookupStore.x=38&submitLookupStore.y=7';
 27
 28my $startIndex = 0;
 29my $currentPage = 0;
 30
 31my @list = ();
 32
 33my $results_file = "results.tsv.txt"; 
 34open(RESULTS,'>',$results_file) || die "cant open $results_file for writing";
 35select RESULTS; $| = 1;
 36select STDOUT; $| = 1;
 37
 38###########################################
 39# start
 40###########################################
 41
 42foreach my $c (1..100) {
 43	warn "getting page $c...\n";
 44
 45	my $file = $file_base . $currentPage . $file_ext;
 46
 47	# download page
 48	&getPage( file => $file, url => $url );
 49	sleep 1;
 50
 51	# parse page
 52	my $tree = HTML::TreeBuilder->new;
 53	$tree->parse_file($file);	
 54
 55	# <script type="text/javascript">
 56	my @scripts = $tree->look_down('_tag', 'script', sub { defined $_[0] && defined $_[0]->attr('type') && $_[0]->attr('type') eq "text/javascript" });
 57
 58	my $storelist;	
 59	foreach my $script (@scripts) {
 60		my $html = $script->as_HTML;
 61		#warn "script: " . $html;
 62		if ($html =~ m{MMIsSupportedBrowser}) {
 63			$storelist = $html;
 64			last;
 65		}
 66	}
 67	my @stores = split(/createMarker/, $storelist);
 68
 69	shift @stores; # header row
 70	foreach my $store (@stores) {
 71
 72		next unless $store =~ m{<div class="adr">};
 73
 74		#my ($postcode) = $address =~ m/\b([A-PR-UWYZ][A-HK-Y0-9][A-HJKSTUW0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2})\b/g;
 75
 76		my ($lat) = $store =~ m{var latitude = "([^"]+?)"};
 77		my ($long) = $store =~ m{var longitude = "([^"]+?)"};
 78
 79		my ($name) = $store =~ m{infoHtml \+=\s+'<h2>([^>]+?)</h2>'};
 80		my ($address1) = $store =~ m{<span class="address-line1">([^>]+?)<};
 81		my ($address2) = $store =~ m{<span class="address-line2">([^>]+?)<};
 82		my ($address3) = $store =~ m{<span class="address-line3">([^>]+?)<};
 83		my ($postcode) = $store =~ m{<span class="postal-code">([^>]+?)<};
 84
 85		my ($phone) = $store =~ m{<div class="tel">[^>]+?<span class="key">Phone:</span>[^>]+?<span class="value">([^>]+?)</span>};
 86
 87	#		warn "Checking: $store\n";
 88
 89		my $item = {
 90			name => ($name || "null"),
 91			address1 => ($address1 || "null"),
 92			address2 => ($address2 || "null"),
 93			address3 => ($address3 || "null"),
 94			postcode => ($postcode || "null"),
 95			phone => ($phone || "null"),
 96			latitude => ($lat || "null"),
 97			longitude => ($long || "null"),
 98		};
 99
100	#		warn "found item: " . Dumper($item);
101
102		push @list, $item;
103
104		print "writing item: " . Dumper($item);
105		print RESULTS join( "\t",
106			$item->{name}, $item->{latitude}, $item->{longitude},
107			$item->{address1}, $item->{address2}, $item->{address3},
108			$item->{postcode}, $item->{phone}
109		) . "\n";
110
111	}
112
113	# increment page
114	$currentPage += 1;
115	$url =~ s{currentPage=\d+}{currentPage=$currentPage};
116	$startIndex += 10;
117	$url =~ s{startIndex=\d+}{startIndex=$startIndex};
118
119}	
120
121###########################################
122# finish
123###########################################
124
125close(RESULTS);
126
127
128exit;
129
130
131
132
133
134
135###########################################
136# subs
137###########################################
138	
139sub clean_up {
140	my $s = shift;
141	$s =~ s/^\s+//g;
142	$s =~ s/\s+$//g;
143	return $s;
144}
145	
146	
147sub getPage {
148	my %p = @_;
149	my $file = $p{file};
150	my $url = $p{url};
151
152	if (! -f $file) {
153		warn "getting url\n";
154		my $agent = WWW::Mechanize->new( autocheck => 1 );
155		$agent->get($url);
156
157		if (! $agent->success() ) {
158			die "failed to get $url, stopping.\n";
159		}
160
161		warn "saving page\n";
162		my $first_page = $agent->content;
163		open(PAGE,'>',$file) || die "cant write to $file: $!";
164		print PAGE $first_page;
165		close(PAGE);
166	}
167	else {
168		warn "reading local file $file\n";
169	}
170}