# r2/r2/lib/scraper.py
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is Reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of the
# Original Code is CondeNet, Inc.
#
# All portions of the code written by CondeNet are Copyright (c) 2006-2008
# CondeNet, Inc. All Rights Reserved.
################################################################################
from pylons import g
from r2.lib import utils
from r2.lib.memoize import memoize
from urllib2 import Request, HTTPError, URLError, urlopen
from httplib import InvalidURL
import urlparse, re, urllib, logging, StringIO
import Image, ImageFile, math
from BeautifulSoup import BeautifulSoup

log = g.log
useragent = g.useragent

chunk_size = 1024
thumbnail_size = (70, 70)

def image_to_str(image):
    """serialize a PIL image to a string in its original format"""
    s = StringIO.StringIO()
    image.save(s, image.format)
    s.seek(0)
    return s.read()

def str_to_image(s):
    """deserialize a string back into a PIL image"""
    s = StringIO.StringIO(s)
    s.seek(0)
    image = Image.open(s)
    return image

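# Usage sketch: these two helpers round-trip between PIL images and raw
# bytes; Image.open() sets image.format from the detected file type, which
# is what lets image_to_str() pass it back to save() and re-encode the
# image in its original format.
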
def prepare_image(image):
    """square the image off, then scale it down to thumbnail size"""
    image = square_image(image)
    image.thumbnail(thumbnail_size, Image.ANTIALIAS)
    return image

def image_entropy(img):
    """calculate the entropy of an image"""
    hist = img.histogram()
    hist_size = sum(hist)
    hist = [float(h) / hist_size for h in hist]
    return -sum([p * math.log(p, 2) for p in hist if p != 0])

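# Worked example (assumed values, not from the original source): for an
# 8-bit grayscale image whose 256 histogram bins are equally likely, each
# p = 1/256 and the entropy is -256 * (1/256) * log2(1/256) = 8.0 bits,
# the maximum; a solid-color image puts all its mass in one bin and scores
# -1 * log2(1) = 0.0 bits. square_image() below uses this score to decide
# which edge of a tall image is least interesting to crop away.
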
def square_image(img):
    """if the image is taller than it is wide, square it off. determine
    which pieces to cut off based on the entropy of the pieces."""
    x, y = img.size
    while y > x:
        #slice 10px at a time until square
        slice_height = min(y - x, 10)

        bottom = img.crop((0, y - slice_height, x, y))
        top = img.crop((0, 0, x, slice_height))

        #remove the slice with the least entropy
        if image_entropy(bottom) < image_entropy(top):
            img = img.crop((0, 0, x, y - slice_height))
        else:
            img = img.crop((0, slice_height, x, y))

        x, y = img.size
    return img

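# For example (hypothetical dimensions): a 100x160 portrait image is
# trimmed in six 10px slices, each taken from whichever edge currently
# scores lower entropy, until it is 100x100; landscape and square images
# pass through untouched since the loop only runs while y > x.
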
def clean_url(url):
    """percent-quote any non-ASCII bytes so the url is safe to fetch"""
    url = url.encode('utf8')
    url = ''.join([urllib.quote(c) if ord(c) >= 127 else c for c in url])
    return url

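# A sketch of the round trip (example URL assumed): u'http://example.com/caf\xe9'
# encodes to UTF-8 and the two bytes \xc3 \xa9 are percent-quoted, yielding
# 'http://example.com/caf%C3%A9'; all-ASCII urls come back unchanged.
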
def fetch_url(url, referer = None, retries = 1, dimension = False):
    cur_try = 0
    log.debug('fetching: %s' % url)
    nothing = None if dimension else (None, None)
    url = clean_url(url)

    #just basic urls
    if not url.startswith('http://'):
        return nothing

    while True:
        try:
            req = Request(url)
            if useragent:
                req.add_header('User-Agent', useragent)
            if referer:
                req.add_header('Referer', referer)

            open_req = urlopen(req)

            #if we only need the dimension of the image, we may not
            #need to download the entire thing
            if dimension:
                content = open_req.read(chunk_size)
            else:
                content = open_req.read()
            content_type = open_req.headers.get('content-type')

            if not content_type:
                return nothing

            if 'image' in content_type:
                p = ImageFile.Parser()
                new_data = content
                while not p.image and new_data:
                    p.feed(new_data)
                    new_data = open_req.read(chunk_size)
                    content += new_data

                #return the size, or return the data
                if dimension and p.image:
                    return p.image.size
                elif dimension:
                    return nothing
            elif dimension:
                #expected an image, but didn't get one
                return nothing

            return content_type, content

        except (URLError, HTTPError, InvalidURL), e:
            cur_try += 1
            if cur_try >= retries:
                log.debug('error while fetching: %s referer: %s' % (url, referer))
                log.debug(e)
                return nothing
        finally:
            if 'open_req' in locals():
                open_req.close()

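# Usage sketch (the urls are assumed examples): with dimension = True,
# fetch_url reads 1KB chunks only until PIL's incremental parser can report
# a size, so probing 'http://example.com/a.jpg' costs a few KB instead of
# the whole file; the memoized fetch_size() wrapper below builds on this.
# Without the flag it returns the full (content_type, body) pair, and
# (None, None) on any failure.
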
@memoize('media.fetch_size')
def fetch_size(url, referer = None, retries = 1):
    return fetch_url(url, referer, retries, dimension = True)

class Scraper:
    def __init__(self, url):
        self.url = url
        self.content = None
        self.content_type = None
        self.soup = None

    def download(self):
        self.content_type, self.content = fetch_url(self.url)
        if self.content_type and 'html' in self.content_type and self.content:
            self.soup = BeautifulSoup(self.content)

    def image_urls(self):
        #if the original url was an image, use that
        if self.content_type and 'image' in self.content_type:
            yield self.url
        elif self.soup:
            images = self.soup.findAll('img', src = True)
            for i in images:
                image_url = urlparse.urljoin(self.url, i['src'])
                yield image_url

    def largest_image_url(self):
        if not self.content:
            self.download()

        #if download didn't work
        if not self.content or not self.content_type:
            return None

        max_area = 0
        max_url = None
        for image_url in self.image_urls():
            size = fetch_size(image_url, referer = self.url)
            if not size:
                continue

            area = size[0] * size[1]

            #ignore little images
            if area < 5000:
                log.debug('ignore little %s' % image_url)
                continue

            #ignore excessively long/wide images (float division: plain
            #int division would pass anything narrower than 2:1)
            if float(max(size)) / min(size) > 1.5:
                log.debug('ignore dimensions %s' % image_url)
                continue

            if area > max_area:
                max_area = area
                max_url = image_url

        return max_url

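    # The thresholds above are heuristics: 5000 px^2 is roughly the 70x70
    # thumbnail target, so anything smaller would need upscaling, and an
    # aspect ratio beyond 1.5:1 is likely banner or navigation art rather
    # than content (an interpretation; the original comments only say
    # "little" and "excessively long/wide").
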
    def thumbnail(self):
        image_url = self.largest_image_url()
        if image_url:
            content_type, image_str = fetch_url(image_url, referer = self.url)
            if image_str:
                image = str_to_image(image_str)
                try:
                    image = prepare_image(image)
                except IOError, e:
                    #can't read interlaced PNGs, ignore
                    if 'interlaced' in e.message:
                        return
                    raise
                return image

    def media_object(self):
        return None

class MediaScraper(Scraper):
    media_template = ""
    thumbnail_template = ""
    video_id_rx = None

    def __init__(self, url):
        m = self.video_id_rx.match(url)
        if m:
            self.video_id = m.groups()[0]
        else:
            #if we can't find the id just treat it like a normal page
            log.debug('reverting to regular scraper: %s' % url)
            self.__class__ = Scraper
        Scraper.__init__(self, url)

    def largest_image_url(self):
        return self.thumbnail_template.replace('$video_id', self.video_id)

    def media_object(self):
        return self.media_template.replace('$video_id', self.video_id)

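# A minimal sketch of the subclass contract (the class name and urls here
# are hypothetical, not part of the codebase): supply the two templates
# plus a regex whose first group captures the video id, and the base class
# handles thumbnail and embed generation.
#
#   class ExampleTubeScraper(MediaScraper):
#       media_template = '<embed src="http://example.com/v/$video_id"></embed>'
#       thumbnail_template = 'http://example.com/thumbs/$video_id.jpg'
#       video_id_rx = re.compile('.*/v/([^/]+).*')
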
def youtube_in_google(google_url):
    h = Scraper(google_url)
    h.download()
    try:
        youtube_url = h.soup.find('div', 'original-text').findNext('a')['href']
        log.debug('%s is really %s' % (google_url, youtube_url))
        return youtube_url
    except (AttributeError, KeyError):
        #no such div/link, or the page failed to parse at all
        pass

def make_scraper(url):
    domain = utils.domain(url)
    scraper = Scraper
    for suffix, cls in scrapers.iteritems():
        if domain.endswith(suffix):
            scraper = cls
            break

    #sometimes youtube scrapers masquerade as google scrapers
    if scraper == GootubeScraper:
        youtube_url = youtube_in_google(url)
        if youtube_url:
            return make_scraper(youtube_url)

    return scraper(url)

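# Usage sketch (the url is an assumed example): make_scraper picks a class
# by domain suffix from the `scrapers` registry at the bottom of this file,
# so make_scraper('http://www.youtube.com/watch?v=abc123') yields a
# YoutubeScraper, while an unrecognized domain falls back to plain Scraper.
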
########## site-specific video scrapers ##########

#Youtube
class YoutubeScraper(MediaScraper):
    media_template = '<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="425" height="350"></embed></object>'
    thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg'
    video_id_rx = re.compile('.*v=([A-Za-z0-9-_]+).*')

#Metacafe
class MetacafeScraper(MediaScraper):
    media_template = '<embed src="$video_id" width="400" height="345" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
    video_id_rx = re.compile('.*/watch/([^/]+)/.*')

    def media_object(self):
        if not self.soup:
            self.download()
        if self.soup:
            video_link = self.soup.find('link', rel = 'video_src')
            if video_link:
                return self.media_template.replace('$video_id',
                                                   video_link['href'])

    def largest_image_url(self):
        if not self.soup:
            self.download()
        if self.soup:
            image_link = self.soup.find('link', rel = 'image_src')
            if image_link:
                return image_link['href']

#Google Video
gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*",
                              re.IGNORECASE | re.S)

class GootubeScraper(MediaScraper):
    media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=$video_id&hl=en" flashvars=""> </embed>'
    video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')

    def largest_image_url(self):
        #the thumbnail url is embedded in the page's javascript rather
        #than in a <link> tag, so pull it out with a regex
        if not self.content:
            self.download()
        if not self.content:
            return None

        m = gootube_thumb_rx.match(self.content)
        if m:
            image_url = m.groups()[0]
            image_url = utils.safe_eval_str(image_url)
            return image_url

scrapers = {'youtube.com':      YoutubeScraper,
            'video.google.com': GootubeScraper,
            'metacafe.com':     MetacafeScraper}

def test():
    from r2.lib.pool2 import WorkQueue
    jobs = []
    f = open('/tmp/testurls.txt')
    for url in f:
        if url.startswith('#'):
            continue
        if url.startswith('/info'):
            continue

        def make_job(url):
            def fetch(url):
                print 'START', url
                url = url.strip()
                h = make_scraper(url)
                image_url = h.largest_image_url()
                print 'DONE', image_url
            return lambda: fetch(url)
        jobs.append(make_job(url))
    f.close()

    print jobs[0]()
    #wq = WorkQueue(jobs)
    #wq.start()

if __name__ == '__main__':
    test()