
/r2/r2/lib/scraper.py

https://github.com/wangmxf/lesswrong
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is Reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of the
# Original Code is CondeNet, Inc.
#
# All portions of the code written by CondeNet are Copyright (c) 2006-2008
# CondeNet, Inc. All Rights Reserved.
################################################################################
from pylons import g
from r2.lib import utils
from r2.lib.memoize import memoize
from urllib2 import Request, HTTPError, URLError, urlopen
from httplib import InvalidURL
import urlparse, re, urllib, logging, StringIO
import Image, ImageFile, math
from BeautifulSoup import BeautifulSoup

log = g.log
useragent = g.useragent

chunk_size = 1024
thumbnail_size = 70, 70
def image_to_str(image):
    s = StringIO.StringIO()
    image.save(s, image.format)
    s.seek(0)
    return s.read()

def str_to_image(s):
    s = StringIO.StringIO(s)
    s.seek(0)
    image = Image.open(s)
    return image

def prepare_image(image):
    image = square_image(image)
    image.thumbnail(thumbnail_size, Image.ANTIALIAS)
    return image
def image_entropy(img):
    """calculate the entropy of an image"""
    hist = img.histogram()
    hist_size = sum(hist)
    hist = [float(h) / hist_size for h in hist]
    return -sum([p * math.log(p, 2) for p in hist if p != 0])
def square_image(img):
    """if the image is taller than it is wide, square it off. determine
    which pieces to cut off based on the entropy pieces."""
    x,y = img.size
    while y > x:
        #slice 10px at a time until square
        slice_height = min(y - x, 10)
        bottom = img.crop((0, y - slice_height, x, y))
        top = img.crop((0, 0, x, slice_height))

        #remove the slice with the least entropy
        if image_entropy(bottom) < image_entropy(top):
            img = img.crop((0, 0, x, y - slice_height))
        else:
            img = img.crop((0, slice_height, x, y))

        x,y = img.size

    return img
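
# Illustrative sketch, not part of the original module: on a solid-colour
# image both candidate slices have equal entropy, so square_image() trims
# from the top until the image is square; prepare_image() then fits the
# result into thumbnail_size.
#
#   img = Image.new('RGB', (70, 140))            # tall test image
#   assert square_image(img).size == (70, 70)
#   thumb = prepare_image(Image.new('RGB', (70, 140)))
#   assert thumb.size == (70, 70)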
def clean_url(url):
    """url-quote non-ascii characters out of the url"""
    url = url.encode('utf8')
    url = ''.join([urllib.quote(c) if ord(c) >= 127 else c for c in url])
    return url
def fetch_url(url, referer = None, retries = 1, dimension = False):
    cur_try = 0
    log.debug('fetching: %s' % url)
    nothing = None if dimension else (None, None)
    url = clean_url(url)
    #just basic urls
    if not url.startswith('http://'):
        return nothing
    while True:
        try:
            req = Request(url)
            if useragent:
                req.add_header('User-Agent', useragent)
            if referer:
                req.add_header('Referer', referer)

            open_req = urlopen(req)

            #if we only need the dimension of the image, we may not
            #need to download the entire thing
            if dimension:
                content = open_req.read(chunk_size)
            else:
                content = open_req.read()
            content_type = open_req.headers.get('content-type')

            if not content_type:
                return nothing

            if 'image' in content_type:
                p = ImageFile.Parser()
                new_data = content
                while not p.image and new_data:
                    p.feed(new_data)
                    new_data = open_req.read(chunk_size)
                    content += new_data

                #return the size, or return the data
                if dimension and p.image:
                    return p.image.size
                elif dimension:
                    return nothing
            elif dimension:
                #expected an image, but didn't get one
                return nothing

            return content_type, content

        except (URLError, HTTPError, InvalidURL), e:
            cur_try += 1
            if cur_try >= retries:
                log.debug('error while fetching: %s referer: %s' % (url, referer))
                log.debug(e)
                return nothing
        finally:
            if 'open_req' in locals():
                open_req.close()
@memoize('media.fetch_size')
def fetch_size(url, referer = None, retries = 1):
    return fetch_url(url, referer, retries, dimension = True)
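
# Illustrative usage, not part of the original module (URLs are placeholders):
# fetch_url() returns a (content_type, content) pair, or (None, None) on
# failure; fetch_size() returns the remote image's (width, height), or None.
#
#   content_type, content = fetch_url('http://example.com/page.html')
#   size = fetch_size('http://example.com/logo.png',
#                     referer = 'http://example.com/page.html')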
class Scraper:
    def __init__(self, url):
        self.url = url
        self.content = None
        self.content_type = None
        self.soup = None

    def download(self):
        self.content_type, self.content = fetch_url(self.url)
        if self.content_type and 'html' in self.content_type and self.content:
            self.soup = BeautifulSoup(self.content)

    def image_urls(self):
        #if the original url was an image, use that
        if 'image' in self.content_type:
            yield self.url
        elif self.soup:
            images = self.soup.findAll('img', src = True)
            for i in images:
                image_url = urlparse.urljoin(self.url, i['src'])
                yield image_url

    def largest_image_url(self):
        if not self.content:
            self.download()

        #if download didn't work
        if not self.content or not self.content_type:
            return None

        max_area = 0
        max_url = None
        for image_url in self.image_urls():
            size = fetch_size(image_url, referer = self.url)
            if not size:
                continue

            area = size[0] * size[1]

            #ignore little images
            if area < 5000:
                log.debug('ignore little %s' % image_url)
                continue

            #ignore excessively long/wide images
            if max(size) / min(size) > 1.5:
                log.debug('ignore dimensions %s' % image_url)
                continue

            if area > max_area:
                max_area = area
                max_url = image_url

        return max_url

    def thumbnail(self):
        image_url = self.largest_image_url()
        if image_url:
            content_type, image_str = fetch_url(image_url, referer = self.url)
            if image_str:
                image = str_to_image(image_str)
                try:
                    image = prepare_image(image)
                except IOError, e:
                    #can't read interlaced PNGs, ignore
                    if 'interlaced' in e.message:
                        return
                    raise
                return image

    def media_object(self):
        return None
class MediaScraper(Scraper):
    media_template = ""
    thumbnail_template = ""
    video_id_rx = None

    def __init__(self, url):
        m = self.video_id_rx.match(url)
        if m:
            self.video_id = m.groups()[0]
        else:
            #if we can't find the id just treat it like a normal page
            log.debug('reverting to regular scraper: %s' % url)
            self.__class__ = Scraper
        Scraper.__init__(self, url)

    def largest_image_url(self):
        return self.thumbnail_template.replace('$video_id', self.video_id)

    def media_object(self):
        return self.media_template.replace('$video_id', self.video_id)
def youtube_in_google(google_url):
    h = Scraper(google_url)
    h.download()
    try:
        youtube_url = h.soup.find('div', 'original-text').findNext('a')['href']
        log.debug('%s is really %s' % (google_url, youtube_url))
        return youtube_url
    except (AttributeError, KeyError):
        pass
def make_scraper(url):
    domain = utils.domain(url)
    scraper = Scraper
    for suffix, cls in scrapers.iteritems():
        if domain.endswith(suffix):
            scraper = cls
            break

    #sometimes youtube scrapers masquerade as google scrapers
    if scraper == GootubeScraper:
        youtube_url = youtube_in_google(url)
        if youtube_url:
            return make_scraper(youtube_url)
    return scraper(url)
########## site-specific video scrapers ##########

#Youtube
class YoutubeScraper(MediaScraper):
    media_template = '<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="425" height="350"></embed></object>'
    thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg'
    video_id_rx = re.compile('.*v=([A-Za-z0-9-_]+).*')
#Metacafe
class MetacafeScraper(MediaScraper):
    media_template = '<embed src="$video_id" width="400" height="345" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
    video_id_rx = re.compile('.*/watch/([^/]+)/.*')

    def media_object(self):
        if not self.soup:
            self.download()
        if self.soup:
            video_url = self.soup.find('link', rel = 'video_src')['href']
            return self.media_template.replace('$video_id', video_url)

    def largest_image_url(self):
        if not self.soup:
            self.download()
        if self.soup:
            return self.soup.find('link', rel = 'image_src')['href']

#Google Video
gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)

class GootubeScraper(MediaScraper):
    media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=$video_id&hl=en" flashvars=""> </embed>'
    video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')

    def largest_image_url(self):
        if not self.content:
            self.download()
        if not self.content:
            return None

        m = gootube_thumb_rx.match(self.content)
        if m:
            image_url = m.groups()[0]
            image_url = utils.safe_eval_str(image_url)
            return image_url

scrapers = {'youtube.com': YoutubeScraper,
            'video.google.com': GootubeScraper,
            'metacafe.com': MetacafeScraper}
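
# Illustrative sketch, not part of the original module: supporting another
# video site means subclassing MediaScraper with the site's embed markup and
# an id-extracting regex, then registering its domain suffix in `scrapers`.
# ExampleTubeScraper and exampletube.com below are hypothetical.
#
#   class ExampleTubeScraper(MediaScraper):
#       media_template     = '<embed src="http://exampletube.com/v/$video_id"></embed>'
#       thumbnail_template = 'http://exampletube.com/thumb/$video_id.jpg'
#       video_id_rx        = re.compile('.*/watch/([^/]+).*')
#
#   scrapers['exampletube.com'] = ExampleTubeScraper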
def test():
    from r2.lib.pool2 import WorkQueue
    jobs = []
    f = open('/tmp/testurls.txt')
    for url in f:
        if url.startswith('#'):
            continue
        if url.startswith('/info'):
            continue
        def make_job(url):
            def fetch(url):
                print 'START', url
                url = url.strip()
                h = make_scraper(url)
                image_url = h.largest_image_url()
                print 'DONE', image_url
            return lambda: fetch(url)
        jobs.append(make_job(url))

    print jobs[0]()
    #wq = WorkQueue(jobs)
    #wq.start()

if __name__ == '__main__':
    test()