PageRenderTime 54ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/r2/r2/lib/scraper.py

https://github.com/stevewilber/reddit
Python | 1850 lines | 1739 code | 74 blank | 37 comment | 68 complexity | 4c3265578333dbb2d9c7ead0fd61795f MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. # The contents of this file are subject to the Common Public Attribution
  2. # License Version 1.0. (the "License"); you may not use this file except in
  3. # compliance with the License. You may obtain a copy of the License at
  4. # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
  5. # License Version 1.1, but Sections 14 and 15 have been added to cover use of
  6. # software over a computer network and provide for limited attribution for the
  7. # Original Developer. In addition, Exhibit A has been modified to be consistent
  8. # with Exhibit B.
  9. #
  10. # Software distributed under the License is distributed on an "AS IS" basis,
  11. # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
  12. # the specific language governing rights and limitations under the License.
  13. #
  14. # The Original Code is reddit.
  15. #
  16. # The Original Developer is the Initial Developer. The Initial Developer of
  17. # the Original Code is reddit Inc.
  18. #
  19. # All portions of the code written by reddit are Copyright (c) 2006-2012 reddit
  20. # Inc. All Rights Reserved.
  21. ###############################################################################
  22. from pylons import g
  23. from r2.lib import utils
  24. from r2.lib.memoize import memoize
  25. import simplejson as json
  26. from urllib2 import Request, HTTPError, URLError, urlopen
  27. from httplib import InvalidURL
  28. import urlparse, re, urllib, logging, StringIO, logging
  29. import Image, ImageFile, math
  30. from BeautifulSoup import BeautifulSoup
  # The logger and the outgoing User-Agent string are both taken from the
  # pylons application globals object `g`.
  log, useragent = g.log, g.useragent
  # Number of bytes requested per read while an image header is being parsed.
  chunk_size = 1024
  # Size handed to Image.thumbnail() when a thumbnail is produced.
  thumbnail_size = (70, 70)
  def image_to_str(image):
      """Serialize *image* to a string, encoded in the image's own format.

      The image is saved into an in-memory buffer using ``image.format`` and
      the complete buffer contents are returned.
      """
      buf = StringIO.StringIO()
      image.save(buf, image.format)
      # getvalue() returns the whole buffer regardless of the file position
      return buf.getvalue()
  def str_to_image(s):
      """Open the encoded image data held in string *s* and return the image."""
      # a freshly created StringIO is already positioned at offset 0
      return Image.open(StringIO.StringIO(s))
  def prepare_image(image):
      """Turn *image* into a thumbnail.

      The image is first passed through square_image() and then shrunk with
      Image.thumbnail() to ``thumbnail_size``; the resulting image is returned.
      """
      squared = square_image(image)
      # the return value of thumbnail() is not used; `squared` itself is returned
      squared.thumbnail(thumbnail_size, Image.ANTIALIAS)
      return squared
  def image_entropy(img):
      """Return the entropy (base 2) of the histogram of *img*.

      *img* only needs a ``histogram()`` method returning a sequence of counts.
      Empty histogram bins contribute nothing to the sum.
      """
      counts = img.histogram()
      total = sum(counts)
      # share of the total held by each histogram bin
      probabilities = (float(count) / total for count in counts)
      return -sum(p * math.log(p, 2) for p in probabilities if p != 0)
  def square_image(img):
      """Crop a portrait image down to a square and return it.

      While the image is taller than it is wide, a strip is removed from either
      the top or the bottom edge; whichever of the two candidate strips has the
      lower image_entropy() is the one thrown away. Images that are not taller
      than wide are returned untouched.
      """
      width, height = img.size
      while height > width:
          # trim at most 10px per pass, and never more than the excess height
          strip = min(height - width, 10)
          bottom_strip = img.crop((0, height - strip, width, height))
          top_strip = img.crop((0, 0, width, strip))
          if image_entropy(bottom_strip) < image_entropy(top_strip):
              # the bottom strip is the less interesting one: drop it
              keep_box = (0, 0, width, height - strip)
          else:
              keep_box = (0, strip, width, height)
          img = img.crop(keep_box)
          width, height = img.size
      return img
  def clean_url(url):
      """Percent-quote every non-ASCII byte (and DEL) of *url*.

      Text input is encoded to UTF-8 first. Input that is already a byte
      string is used as-is; the previous unconditional ``.encode('utf8')``
      made Python 2 implicitly ASCII-decode byte strings and raise
      UnicodeDecodeError on any non-ASCII byte. Bytes below 127 are passed
      through unchanged, so existing ``%xx`` escapes and reserved characters
      are not touched.

      Returns a native ``str``.
      """
      if not isinstance(url, bytes):
          url = url.encode('utf8')
      # bytearray yields integer byte values on every Python version;
      # '%%%02X' produces the same uppercase escape urllib.quote emits
      return ''.join(['%%%02X' % b if b >= 127 else chr(b)
                      for b in bytearray(url)])
  def fetch_url(url, referer = None, retries = 1, dimension = False):
      """Download *url*, or just enough of it to learn an image's dimensions.

      url       -- address to fetch; only http:// and https:// are accepted
      referer   -- optional value for the Referer request header
      retries   -- number of attempts made before giving up on
                   URLError / HTTPError / InvalidURL
      dimension -- when true, return only the size of the image found at
                   *url* instead of its content

      Returns ``(content_type, content)`` normally, or the image size when
      *dimension* is set. On failure the "nothing" value is returned:
      ``(None, None)`` normally, ``None`` in dimension mode.
      """
      cur_try = 0
      log.debug('fetching: %s' % url)
      nothing = None if dimension else (None, None)
      url = clean_url(url)

      #just basic urls
      if not url.startswith(('http://', 'https://')):
          return nothing

      while True:
          # reset on every attempt so the cleanup below can never close a
          # handle left over from a previous iteration (the old
          # "'open_req' in locals()" probe could)
          open_req = None
          try:
              req = Request(url)
              if useragent:
                  req.add_header('User-Agent', useragent)
              if referer:
                  req.add_header('Referer', referer)

              open_req = urlopen(req)

              #if we only need the dimension of the image, we may not
              #need to download the entire thing
              if dimension:
                  content = open_req.read(chunk_size)
              else:
                  content = open_req.read()
              content_type = open_req.headers.get('content-type')

              if not content_type:
                  return nothing

              if 'image' in content_type:
                  # feed the parser chunk by chunk until it has recognised
                  # an image or the response is exhausted
                  p = ImageFile.Parser()
                  new_data = content
                  while not p.image and new_data:
                      p.feed(new_data)
                      new_data = open_req.read(chunk_size)
                      content += new_data

                  #return the size, or return the data
                  if dimension and p.image:
                      return p.image.size
                  elif dimension:
                      return nothing
              elif dimension:
                  #expected an image, but didn't get one
                  return nothing

              return content_type, content

          except (URLError, HTTPError, InvalidURL) as e:
              cur_try += 1
              if cur_try >= retries:
                  log.debug('error while fetching: %s referer: %s' % (url, referer))
                  log.debug(e)
                  return nothing
          finally:
              if open_req is not None:
                  open_req.close()
  @memoize('media.fetch_size')
  def fetch_size(url, referer = None, retries = 1):
      """Return what fetch_url() reports for *url* in dimension mode.

      The call is wrapped by the ``memoize`` decorator under the key
      'media.fetch_size'.
      """
      return fetch_url(url, referer = referer, retries = retries,
                       dimension = True)
  class MediaEmbed(object):
      """Plain container describing an embeddable piece of media.

      height, width -- dimensions, coerced with int() on construction
      content       -- the embed payload, stored as given
      scrolling     -- stored as given; defaults to False
      """
      # class-level defaults, overridden per instance by __init__
      width = height = content = None
      scrolling = False

      def __init__(self, height, width, content, scrolling = False):
          self.height = int(height)
          self.width = int(width)
          self.content, self.scrolling = content, scrolling
  class Scraper:
      """Generic scraper: downloads a URL and looks for a thumbnail image.

      Attributes set by download():
        content_type -- value of the response's content-type header
        content      -- the response body
        soup         -- BeautifulSoup tree of the body, only for HTML responses

      NOTE: deliberately left as a class without an explicit base, exactly as
      before, because other scrapers in this file reassign ``self.__class__``
      to it.
      """

      def __init__(self, url):
          self.url = url
          self.content = None
          self.content_type = None
          self.soup = None

      def __repr__(self):
          return "%s(%r)" % (self.__class__.__name__, self.url)

      def download(self):
          """Fetch self.url and, when the response is HTML, parse it."""
          self.content_type, self.content = fetch_url(self.url)
          if self.content_type and 'html' in self.content_type and self.content:
              self.soup = BeautifulSoup(self.content)

      def image_urls(self):
          """Yield candidate image URLs for the page.

          Yields the page URL itself when the response was an image, otherwise
          the absolute URL of every <img src=...> in the parsed page. Yields
          nothing when there is no content type (previously a TypeError).
          """
          #if the original url was an image, use that
          if self.content_type and 'image' in self.content_type:
              yield self.url
          elif self.soup:
              for img in self.soup.findAll('img', src = True):
                  yield urlparse.urljoin(self.url, img['src'])

      def largest_image_url(self):
          """Return the URL of the best thumbnail candidate, or None.

          An og:image <meta> or image_src <link> declared by the page wins
          outright. Otherwise every candidate from image_urls() is measured
          with fetch_size() and the one with the largest area is chosen.
          """
          if not self.content:
              self.download()

          #if download didn't work
          if not self.content or not self.content_type:
              return None

          if self.soup:
              # .get() rather than [...]: a tag that lacks the attribute
              # used to raise KeyError here
              og_image = self.soup.find('meta', property='og:image')
              og_url = og_image.get('content') if og_image else None
              if og_url:
                  log.debug("Using og:image")
                  return og_url
              thumbnail_spec = self.soup.find('link', rel = 'image_src')
              spec_url = thumbnail_spec.get('href') if thumbnail_spec else None
              if spec_url:
                  log.debug("Using image_src")
                  return spec_url

          max_area = 0
          max_url = None
          for image_url in self.image_urls():
              size = fetch_size(image_url, referer = self.url)
              if not size:
                  continue

              area = size[0] * size[1]

              #ignore little images
              if area < 5000:
                  log.debug('ignore little %s' % image_url)
                  continue

              #ignore excessively long/wide images
              # float() is required: with two ints Python 2 truncates the
              # quotient, so ratios between 1.5 and 2 were never rejected.
              # min(size) cannot be 0 here because area >= 5000.
              if float(max(size)) / min(size) > 1.5:
                  log.debug('ignore dimensions %s' % image_url)
                  continue

              #penalize images with "sprite" in their name
              if 'sprite' in image_url.lower():
                  log.debug('penalizing sprite %s' % image_url)
                  area //= 10

              if area > max_area:
                  max_area = area
                  max_url = image_url

          return max_url

      def thumbnail(self):
          """Download the chosen image and return it as a prepared thumbnail.

          Returns None when no image URL was found, when the download yields
          no data, or when the image is an interlaced PNG that cannot be
          read. Any other IOError from prepare_image() is re-raised.
          """
          image_url = self.largest_image_url()
          if not image_url:
              return None

          content_type, image_str = fetch_url(image_url, referer = self.url)
          if not image_str:
              return None

          image = str_to_image(image_str)
          try:
              image = prepare_image(image)
          except IOError as e:
              #can't read interlaced PNGs, ignore
              # str(e) instead of the deprecated e.message attribute
              if 'interlaced' in str(e):
                  return None
              raise
          return image

      def media_object(self):
          """Return the first media object found by the deep scrapers, if any.

          Each entry of the module-level ``deepscrapers`` is instantiated and
          asked via find_media_object(); None is returned when none finds one.
          """
          for deepscraper in deepscrapers:
              ds = deepscraper()
              found = ds.find_media_object(self)
              if found:
                  return found

      @classmethod
      def media_embed(cls):
          """Not supported by the generic scraper; always raises."""
          raise NotImplementedError
  class MediaScraper(Scraper):
      """Base class for the site-specific video scrapers.

      Subclasses provide ``domains``, ``height``, ``width`` and the templates
      below; '$video_id' inside a template is replaced by the extracted id.
      """
      media_template = ""
      thumbnail_template = ""
      video_id = None
      video_id_rx = None

      def __init__(self, url):
          Scraper.__init__(self, url)

          # first try the simple regex against the URL. If that fails,
          # see if the MediaScraper subclass has its own extraction
          # function
          rx = self.video_id_rx
          found = rx.match(url) if rx else None
          if found:
              self.video_id = found.group(1)

          if not self.video_id:
              extracted = self.video_id_extract()
              if extracted:
                  self.video_id = extracted

          if not self.video_id:
              #if we still can't find the id just treat it like a normal page
              log.debug('reverting to regular scraper: %s' % url)
              self.__class__ = Scraper

      def video_id_extract(self):
          """Hook for subclasses needing custom id extraction; default: None."""
          return None

      def largest_image_url(self):
          """Build the thumbnail URL from the template, else scrape the page."""
          if not self.thumbnail_template:
              return Scraper.largest_image_url(self)
          return self.thumbnail_template.replace('$video_id', self.video_id)

      def media_object(self):
          """Describe the media as a dict holding the video id and the domain."""
          return {'video_id': self.video_id,
                  'type': self.domains[0]}

      @classmethod
      def media_embed(cls, video_id = None, height = None, width = None, **kw):
          """Return a MediaEmbed built from media_template for *video_id*.

          *height* / *width* fall back to the class attributes when falsy.
          """
          content = cls.media_template.replace('$video_id', video_id)
          return MediaEmbed(height or cls.height,
                            width or cls.width,
                            content)
  def youtube_in_google(google_url):
      """Return the link that follows the 'original-text' div of a page.

      The page at *google_url* is downloaded and the href of the first <a>
      after ``<div class="original-text">`` is returned. Returns None when
      the page could not be parsed or any piece of that structure is absent.
      """
      h = Scraper(google_url)
      h.download()
      try:
          youtube_url = h.soup.find('div', 'original-text').findNext('a')['href']
      # The old clause "except AttributeError, KeyError:" caught only
      # AttributeError and bound the exception to the name KeyError.
      # TypeError is included because findNext() may return None, and
      # subscripting None raises TypeError.
      except (AttributeError, KeyError, TypeError):
          return None
      log.debug('%s is really %s' % (google_url, youtube_url))
      return youtube_url
  def make_scraper(url):
      """Instantiate the scraper class registered for the domain of *url*.

      The module-level ``scrapers`` mapping (domain suffix -> classes) is
      searched; the plain Scraper is used when no suffix matches. A
      GootubeScraper hit whose page turns out to point at another URL is
      resolved by recursing on that URL.
      """
      domain = utils.domain(url)
      scraper = Scraper
      # NOTE(review): every matching suffix is visited (the break only leaves
      # the inner loop), so with several matching suffixes the one that comes
      # last in the mapping's iteration order wins -- confirm this is intended.
      for suffix, clses in scrapers.iteritems():
          if not domain.endswith(suffix):
              continue
          for cls in clses:
              # first class registered for this suffix
              scraper = cls
              break

      #sometimes youtube scrapers masquerade as google scrapers
      if scraper == GootubeScraper:
          youtube_url = youtube_in_google(url)
          if youtube_url:
              return make_scraper(youtube_url)
      return scraper(url)
  283. ########## site-specific video scrapers ##########
  class YoutubeScraper(MediaScraper):
      """Scraper for youtube.com watch URLs (id taken from the v= parameter)."""
      domains = ['youtube.com']
      height = 295
      width = 480
      # NOTE(review): the <object> tag says width="490" while the <embed> tag
      # and the `width` attribute say 480 -- confirm which one is intended.
      media_template = '<object width="490" height="295"><param name="movie" value="http://www.youtube.com/v/$video_id&fs=1"></param><param name="wmode" value="transparent"></param><param name="allowFullScreen" value="true"></param><embed src="http://www.youtube.com/v/$video_id&fs=1" type="application/x-shockwave-flash" wmode="transparent" allowFullScreen="true" width="480" height="295"></embed></object>'
      thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg'
      video_id_rx = re.compile(r'.*v=([A-Za-z0-9-_]+).*')
      # "#t=<minutes>m<seconds>s" deep link inside the URL
      video_deeplink_rx = re.compile(r'.*#t=(\d+)m(\d+)s.*')

      def video_id_extract(self):
          """Return the video id, with "&start=<seconds>" appended for deep links.

          Returns None when the URL carries no v= parameter; bailing out early
          guarantees `video_id` is never referenced before being assigned.
          """
          vid = self.video_id_rx.match(self.url)
          if not vid:
              return None
          video_id = vid.group(1)
          d = self.video_deeplink_rx.match(self.url)
          if d:
              minutes, secs = d.groups()
              video_id += "&start=%d" % (int(minutes) * 60 + int(secs))
          return video_id

      def largest_image_url(self):
          """Return the thumbnail URL for the bare video id."""
          # Remove the deeplink part from the video id
          bare_id = self.video_id.split("&")[0]
          return self.thumbnail_template.replace("$video_id", bare_id)
  class TedScraper(MediaScraper):
      """Scraper for ted.com talks; the "video id" is the player's flashvars."""
      domains = ['ted.com']
      height = 326
      width = 446
      media_template = '<object width="446" height="326"><param name="movie" value="http://video.ted.com/assets/player/swf/EmbedPlayer.swf"></param><param name="allowFullScreen" value="true" /><param name="wmode" value="transparent"></param><param name="bgColor" value="#ffffff"></param> <param name="flashvars" value="$video_id" /><embed src="http://video.ted.com/assets/player/swf/EmbedPlayer.swf" pluginspace="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash" wmode="transparent" bgColor="#ffffff" width="446" height="326" allowFullScreen="true" flashvars="$video_id"></embed></object>'
      flashvars_rx = re.compile(r'.*flashvars="(.*)".*')

      def video_id_extract(self):
          """Fetch the talk's /talks/embed/ page and return its flashvars value.

          Returns None for URLs without "/talks/", when the embed page cannot
          be fetched, or when the pattern does not match.
          """
          if "/talks/" not in self.url:
              return None
          embed_url = self.url.replace("/talks/","/talks/embed/")
          content_type, content = fetch_url(embed_url)
          if content:
              m = self.flashvars_rx.match(content)
              if m:
                  return m.group(1)
          return None

      def largest_image_url(self):
          """Return the href of the page's image_src <link>, or None."""
          if not self.soup:
              self.download()

          if self.soup:
              # find() returns None when the page has no such link;
              # subscripting that result used to raise TypeError
              link = self.soup.find('link', rel = 'image_src')
              if link:
                  return link.get('href')
          return None
  class MetacafeScraper(MediaScraper):
      """Scraper for metacafe.com /watch/ pages."""
      domains = ['metacafe.com']
      height = 345
      width  = 400
      media_template = '<embed src="$video_id" width="400" height="345" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
      video_id_rx = re.compile(r'.*/watch/([^/]+)/.*')

      def media_object(self):
          """Return a dict with the page's video_src link as the video id.

          Returns None when the page cannot be downloaded/parsed or has no
          usable video_src <link> (a missing link used to raise TypeError).
          """
          if not self.soup:
              self.download()
          if not self.soup:
              return None

          link = self.soup.find('link', rel = 'video_src')
          video_url = link.get('href') if link else None
          if not video_url:
              return None
          return dict(video_id = video_url,
                      type = self.domains[0])
  class GootubeScraper(MediaScraper):
      """Scraper for video.google.com videoplay pages."""
      domains = ['video.google.com']
      width = 400
      height = 326
      media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=$video_id&hl=en" flashvars=""> </embed>'
      video_id_rx = re.compile(r'.*videoplay\?docid=([A-Za-z0-9-_]+).*')
      # pulls the quoted ThumbnailServer2 URL that follows "thumbnail:" out of
      # the raw page text
      gootube_thumb_rx = re.compile(r".*thumbnail:\s*'(http://[^/]+/ThumbnailServer2[^']+)'.*", re.IGNORECASE | re.S)

      def largest_image_url(self):
          """Return the thumbnail URL found in the page text, or None."""
          if not self.content:
              self.download()
          if not self.content:
              return None

          found = self.gootube_thumb_rx.match(self.content)
          if not found:
              return None
          # the captured text is passed through utils.safe_eval_str
          return utils.safe_eval_str(found.group(1))
  class VimeoScraper(MediaScraper):
      """Scraper for vimeo.com pages."""
      domains = ['vimeo.com']
      height = 448
      width = 520
      media_template = '<embed src="$video_id" width="520" height="448" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
      # everything after the last "/" of the URL
      video_id_rx = re.compile('.*/(.*)')

      def media_object(self):
          """Return a dict with the page's video_src link as the video id.

          Returns None when the page cannot be downloaded/parsed or has no
          usable video_src <link> (a missing link used to raise TypeError).
          """
          if not self.soup:
              self.download()
          if not self.soup:
              return None

          link = self.soup.find('link', rel = 'video_src')
          video_url = link.get('href') if link else None
          if not video_url:
              return None
          return dict(video_id = video_url,
                      type = self.domains[0])
  class BreakScraper(MediaScraper):
      """Scraper for break.com pages."""
      domains = ['break.com']
      height = 421
      width = 520
      media_template = '<object width="520" height="421"><param name="movie" value="$video_id"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" allowScriptAccess="always" width="520" height="421"></embed></object>'
      video_id_rx = re.compile('.*/index/([^/]+).*')

      def video_id_extract(self):
          """Return the href of the page's video_src <link>, or None."""
          if not self.soup:
              self.download()

          if self.soup:
              video_src = self.soup.find('link', rel = 'video_src')
              # .get() so that a link without an href yields None instead of
              # raising KeyError
              href = video_src.get('href') if video_src else None
              if href:
                  return href
          return None
  class TheOnionScraper(MediaScraper):
      """Scraper for theonion.com /video/ pages."""
      domains = ['theonion.com']
      height = 430
      width = 480
      # Written as adjacent literals, one per line of the original
      # triple-quoted template; leading whitespace inside the template is not
      # recoverable from this view, so lines carry none.
      media_template = (
          '<object width="480" height="430">\n'
          '<param name="allowfullscreen" value="true" />\n'
          '<param name="allowscriptaccess" value="always" />\n'
          '<param name="movie" value="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf?&amp;videoid=$video_id" />\n'
          '<param name="wmode" value="transparent" />\n'
          '<embed src="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf"\n'
          'width="480" height="430"\n'
          'wmode="transparent"\n'
          'pluginspage="http://www.macromedia.com/go/getflashplayer"\n'
          'type="application/x-shockwave-flash"\n'
          'flashvars="videoid=$video_id" >\n'
          '</embed>\n'
          '</object>')
      video_id_rx = re.compile('.*/video/([^/?#]+).*')

      def media_object(self):
          """Return a dict whose video id is the page's <meta name="nid"> content.

          Returns None when the page cannot be downloaded/parsed or carries no
          usable nid meta tag (a missing tag used to raise TypeError).
          """
          if not self.soup:
              self.download()
          if not self.soup:
              return None

          meta = self.soup.find('meta', attrs={'name': 'nid'})
          video_url = meta.get('content') if meta else None
          if not video_url:
              return None
          return dict(video_id = video_url,
                      type = self.domains[0])
  class CollegeHumorScraper(MediaScraper):
      """Scraper for collegehumor.com "video:<digits>" URLs."""
      domains = ['collegehumor.com']
      width = 520
      height = 390
      video_id_rx = re.compile(r'.*video:(\d+).*')
      media_template = '<object type="application/x-shockwave-flash" data="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" width="520" height="390" ><param name="allowfullscreen" value="true" /><param name="AllowScriptAccess" value="true" /><param name="movie" quality="best" value="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" /></object>'
  class FunnyOrDieScraper(MediaScraper):
      """Scraper for funnyordie.com /videos/<id>/ URLs."""
      domains = ['funnyordie.com']
      width = 464
      height = 438
      video_id_rx = re.compile('.*/videos/([^/]+)/.*')
      thumbnail_template = 'http://assets1.ordienetworks.com/tmbs/$video_id/medium_2.jpg?c79e63ac'
      media_template = '<object width="464" height="438" classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000" id="fodplayer"><param name="movie" value="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac?key=$video_id" /><param name="flashvars" value="key=$video_id&autostart=true&internal=true" /><param name="allowfullscreen" value="true" /><embed width="464" height="438" flashvars="key=$video_id&autostart=true" allowfullscreen="true" quality="high" src="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac" name="fodplayer" type="application/x-shockwave-flash"></embed></object>'
  class ComedyCentralScraper(MediaScraper):
      """Scraper for comedycentral.com URLs carrying a videoId=<digits> parameter."""
      domains = ['comedycentral.com']
      width = 332
      height = 316
      video_id_rx = re.compile(r'.*videoId=(\d+).*')
      media_template = '<embed FlashVars="videoId=$video_id" src="http://www.comedycentral.com/sitewide/video_player/view/default/swf.jhtml" quality="high" bgcolor="#cccccc" width="332" height="316" name="comedy_central_player" align="middle" allowScriptAccess="always" allownetworking="external" type="application/x-shockwave-flash" pluginspage="http://www.macromedia.com/go/getflashplayer"></embed>'
  class TheDailyShowScraper(MediaScraper):
      """Scraper for thedailyshow.com pages."""
      domains = ['thedailyshow.com']
      height = 353
      width = 360
      media_template = """<embed style='display:block' src='http://media.mtvnservices.com/mgid:cms:item:comedycentral.com:$video_id' width='360' height='301' type='application/x-shockwave-flash' wmode='window' allowFullscreen='true' flashvars='autoPlay=false' allowscriptaccess='always' allownetworking='all' bgcolor='#000000'></embed>"""

      def video_id_extract(self):
          """Return the id embedded in the promo div's "promo_<id>" id attribute.

          This is a bit of a hack. Returns None when the page cannot be
          downloaded/parsed, the div is absent, or its id is missing or does
          not start with "promo_".
          """
          if not self.soup:
              self.download()
          if not self.soup:
              return None

          embed_container = self.soup.find('div', {'class': 'videoplayerPromo module'})
          if not embed_container:
              return None
          # .get() so that a div without an id no longer raises KeyError
          container_id = embed_container.get('id') or ''
          if container_id.startswith('promo_'):
              return container_id.split('_')[1]
          return None
  class ColbertNationScraper(ComedyCentralScraper):
      """ComedyCentralScraper variant for colbertnation.com videos/<digits>/ URLs."""
      video_id_rx = re.compile(r'.*videos/(\d+)/.*')
      domains = ['colbertnation.com']
  class LiveLeakScraper(MediaScraper):
      """Scraper for liveleak.com URLs carrying an i=<id> parameter."""
      domains = ['liveleak.com']
      height = 370
      width = 450
      media_template = '<object width="450" height="370"><param name="movie" value="http://www.liveleak.com/e/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.liveleak.com/e/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="450" height="370"></embed></object>'
      video_id_rx = re.compile('.*i=([a-zA-Z0-9_]+).*')

      def largest_image_url(self):
          """Return the href of the page's videothumbnail <link>, or None."""
          if not self.soup:
              self.download()

          if self.soup:
              # find() returns None when the page has no such link;
              # subscripting that result used to raise TypeError
              link = self.soup.find('link', rel = 'videothumbnail')
              if link:
                  return link.get('href')
          return None
  class DailyMotionScraper(MediaScraper):
      """Scraper for dailymotion.com /video/<id>_... pages."""
      domains = ['dailymotion.com']
      height = 381
      width = 480
      media_template = '<object width="480" height="381"><param name="movie" value="$video_id"></param><param name="allowFullScreen" value="true"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" width="480" height="381" allowFullScreen="true" allowScriptAccess="always"></embed></object>'
      video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)_.*')

      def media_object(self):
          """Return a dict with the page's video_src link as the video id.

          Returns None when the page cannot be downloaded/parsed or has no
          usable video_src <link> (a missing link used to raise TypeError).
          """
          if not self.soup:
              self.download()
          if not self.soup:
              return None

          link = self.soup.find('link', rel = 'video_src')
          video_url = link.get('href') if link else None
          if not video_url:
              return None
          return dict(video_id = video_url,
                      type = self.domains[0])
  class RevverScraper(MediaScraper):
      """Scraper for revver.com /video/<id>/ URLs."""
      domains = ['revver.com']
      width = 480
      height = 392
      video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)/.*')
      media_template = '<script src="http://flash.revver.com/player/1.0/player.js?mediaId:$video_id;width:480;height:392;" type="text/javascript"></script>'
  class EscapistScraper(MediaScraper):
      """Scraper for escapistmagazine.com /videos/view/<show>/<digits> URLs."""
      domains = ['escapistmagazine.com']
      height = 294
      width = 480
      media_template = """<script src="http://www.escapistmagazine.com/videos/embed/$video_id"></script>"""
      # The show-name class used to read [A-Za-z-9-], which matches letters,
      # '-' and the single digit 9; the "0" of the 0-9 range had been dropped,
      # so names containing any other digit failed to match.
      video_id_rx = re.compile('.*/videos/view/[A-Za-z0-9-]+/([0-9]+).*')
  class JustintvScraper(MediaScraper):
      """Can grab streams from justin.tv, but not clips"""
      domains = ['justin.tv']
      width = 353
      height = 295
      video_id_rx = re.compile('^http://www.justin.tv/([a-zA-Z0-9_]+)[^/]*$')
      stream_media_template = """<object type="application/x-shockwave-flash" height="295" width="353" id="jtv_player_flash" data="http://www.justin.tv/widgets/jtv_player.swf?channel=$video_id" bgcolor="#000000"><param name="allowFullScreen" value="true" /><param name="allowScriptAccess" value="always" /><param name="allowNetworking" value="all" /><param name="movie" value="http://www.justin.tv/widgets/jtv_player.swf" /><param name="flashvars" value="channel=$video_id&auto_play=false&start_volume=25" /></object>"""

      @classmethod
      def media_embed(cls, video_id, **kw):
          """Return a MediaEmbed for the stream named by *video_id*.

          Always uses the class's own height and width; extra keyword
          arguments are accepted and ignored.
          """
          markup = cls.stream_media_template.replace('$video_id', video_id)
          return MediaEmbed(cls.height, cls.width, markup)
  class SoundcloudScraper(MediaScraper):
      """soundcloud.com"""
      domains = ['soundcloud.com']
      height = 81
      width = 400
      # Written as adjacent literals, one per line of the original
      # triple-quoted template; leading whitespace inside the template is not
      # recoverable from this view, so lines carry none.
      # The template used to open a <div> without ever closing it; the
      # closing tag is added on the last line.
      media_template = (
          '<div style="font-size: 11px;">\n'
          '<object height="81" width="100%">\n'
          '<param name="movie"\n'
          'value="http://player.soundcloud.com/player.swf?track=$video_id">\n'
          '</param>\n'
          '<param name="allowscriptaccess" value="always"></param>\n'
          '<embed allowscriptaccess="always" height="81"\n'
          'src="http://player.soundcloud.com/player.swf?track=$video_id"\n'
          'type="application/x-shockwave-flash"\n'
          'width="100%">\n'
          '</embed>\n'
          '</object>\n'
          '</div>')
      video_id_rx = re.compile('^http://soundcloud.com/[a-zA-Z0-9_-]+/([a-zA-Z0-9_-]+)')
  class CraigslistScraper(MediaScraper):
      """Scraper for craigslist.org pages; embeds the page's "userbody" div."""
      domains = ['craigslist.org']
      width = 640
      height = 480
      # upper bound (in KB) on the markup accepted by media_object()
      max_size_kb = 50

      def video_id_extract(self):
          """The page URL itself is used as the id."""
          return self.url

      def media_object(self):
          """Return a dict holding the markup of <div id="userbody">, or None.

          None is returned when the page cannot be downloaded/parsed, the div
          is missing, or its markup is longer than max_size_kb * 1024.
          """
          if not self.soup:
              self.download()
          if not self.soup:
              return None

          userbody = self.soup.find('div', {'id': 'userbody'})
          if not userbody:
              return None
          markup = str(userbody)
          if len(markup) > self.max_size_kb * 1024:
              return None
          return {'content': markup,
                  'type': self.domains[0]}

      @classmethod
      def media_embed(cls, content, **kw):
          """Return a scrolling MediaEmbed wrapping *content* at the class size."""
          return MediaEmbed(cls.height, cls.width, content, scrolling = True)
  531. ########## oembed rich-media scrapers ##########
  class OEmbed(Scraper):
      """
      Oembed Scraper
      ==============
      Tries to use the oembed standard to create a media object.

      url_re: Regular Expression to match the incoming url against.
      api_endpoint: Url of the api end point you are using.
      api_params: Default Params to be sent with the outgoing request.
      """
      url_re = ''
      api_endpoint = ''
      api_params = {}

      def __init__(self, url):
          Scraper.__init__(self, url)
          self.oembed = None

          #Fallback to the scraper if the url doesn't match
          if not self.url_re.match(self.url):
              self.__class__ = Scraper

      def __repr__(self):
          return "%s(%r)" % (self.__class__.__name__, self.url)

      def download(self):
          """Call the oembed endpoint for self.url and store the decoded reply.

          On success self.oembed holds the parsed JSON. When the call returns
          no content or invalid JSON, a message is logged and self.oembed is
          left unchanged. Always returns None.
          """
          # Work on a copy: api_params is a class attribute shared by every
          # instance, and updating it in place leaked each request's url
          # into that shared dict.
          params = dict(self.api_params)
          params['url'] = self.url
          query = urllib.urlencode(params)
          api_url = "%s?%s" % (self.api_endpoint, query)

          self.content_type, self.content = fetch_url(api_url)

          #Either a 404 or 500.
          if not self.content:
              log.warn('oEmbed call (%s) failed to return content for %s'
                       %(api_url, self.url))
              return None

          try:
              self.oembed = json.loads(self.content)
          except ValueError:
              log.error('oEmbed call (%s) return invalid json for %s'
                        %(api_url, self.url))
              return None

      def image_urls(self):
          """Yield the photo url, or else the thumbnail_url, of the reply."""
          #if the original url was an image, use that
          if self.oembed and self.oembed.get('type') =='photo':
              yield self.oembed.get('url')
          elif self.oembed and self.oembed.get('thumbnail_url'):
              yield self.oembed.get('thumbnail_url')

      def largest_image_url(self):
          """Return the photo url, or else the thumbnail_url, of the reply.

          Triggers download() first when no reply has been stored yet;
          returns None when neither value is available.
          """
          #Seems to be the default place to check if the download has happened.
          if not self.oembed:
              self.download()

          #if the original url was of the photo type
          if self.oembed and self.oembed.get('type') =='photo':
              return self.oembed.get('url')
          elif self.oembed and self.oembed.get('thumbnail_url'):
              return self.oembed.get('thumbnail_url')
          return None

      def media_object(self):
          """Return dict(type=<domain>, oembed=<reply>) for video/rich replies.

          The type is the first entry of self.domains that occurs in the url;
          None is returned when the reply is of another type or no domain
          matches.
          """
          #Seems to be the default place to check if the download has happened.
          if not self.oembed:
              self.download()

          if self.oembed and self.oembed.get('type') in ['video', 'rich']:
              for domain in self.domains:
                  if domain in self.url:
                      return dict(type=domain, oembed=self.oembed)
          return None

      @classmethod
      def media_embed(cls, video_id = None, height = None, width = None, **kw):
          """Build a MediaEmbed from the ``oembed`` keyword argument.

          Returns None unless that dict provides html, height and width;
          the video_id, height and width parameters are not used.
          """
          content = None
          oembed = kw.get('oembed')

          # check if oembed is there and has html
          if oembed and oembed.get('html'):
              content = oembed.get('html')
          if content and oembed.get('height') and oembed.get('width'):
              return MediaEmbed(height = oembed['height'],
                                width = oembed['width'],
                                content = content)
          return None
  604. class EmbedlyOEmbed(OEmbed):
  605. """
  606. Embedly oEmbed Provider
  607. =======================
  608. documentation: http://api.embed.ly
  609. """
  610. domains = ['23hq.com', '5min.com', '99dollarmusicvideos.com',
  611. 'abcnews.go.com', 'achewood.com', 'allthingsd.com', 'amazon.com',
  612. 'aniboom.com', 'animoto.com', 'asofterworld.com', 'atom.com',
  613. 'audioboo.com', 'bambuser.com', 'bandcamp.com', 'barelydigital.com',
  614. 'barelypolitical.com', 'bigthink.com', 'blip.tv', 'bnter.com',
  615. 'boston.com', 'brainbird.net', 'bravotv.com', 'break.com',
  616. 'brizzly.com', 'cbsnews.com', 'channelfrederator.com', 'chart.ly',
  617. 'cl.ly', 'clikthrough.com', 'clipfish.de', 'clipshack.com', 'cnbc.com',
  618. 'cnn.com', 'colbertnation.com', 'collegehumor.com', 'color.com',
  619. 'comedycentral.com', 'compete.com', 'confreaks.net', 'crackle.com',
  620. 'craigslist.org', 'crocodoc.com', 'crunchbase.com', 'dailybooth.com',
  621. 'dailymile.com', 'dailymotion.com', 'deviantart.com', 'digg.com',
  622. 'dipdive.com', 'discovery.com', 'dotsub.com', 'dribbble.com',
  623. 'edition.cnn.com', 'emberapp.com', 'escapistmagazine.com',
  624. 'espn.go.com', 'facebook.com', 'fancast.com', 'flickr.com', 'fora.tv',
  625. 'formspring.me', 'fotopedia.com', 'freemusicarchive.org',
  626. 'funnyordie.com', 'gametrailers.com', 'gist.github.com',
  627. 'globalpost.com', 'godtube.com', 'gogoyoko.com', 'google.com',
  628. 'graphicly.com', 'grindtv.com', 'grooveshark.com', 'guardian.co.uk',
  629. 'hark.com', 'howcast.com', 'huffduffer.com', 'hulu.com',
  630. 'hungrynation.tv', 'ifood.tv', 'img.ly', 'imgur.com', 'indenti.ca',
  631. 'indymogul.com', 'instagr.am', 'issuu.com', 'itunes.apple.com',
  632. 'justin.tv', 'kickstarter.com', 'kinomap.com', 'kiva.org',
  633. 'koldcast.tv', 'last.fm', 'lightbox.com', 'liveleak.com',
  634. 'livestream.com', 'lockerz.com', 'logotv.com', 'lonelyplanet.com',
  635. 'maps.google.com', 'meadd.com', 'mediamatters.org', 'meetup.com',
  636. 'metacafe.com', 'metacdn.com', 'mixcloud.com', 'mixergy.com',
  637. 'mlkshk.com', 'mobypicture.com', 'money.cnn.com', 'movies.yahoo.com',
  638. 'msnbc.com', 'my.opera.com', 'myloc.me', 'myvideo.de',
  639. 'nationalgeographic.com', 'nfb.ca', 'npr.org', 'nzonscreen.com',
  640. 'overstream.net', 'ow.ly', 'pastebin.com', 'pastie.org',
  641. 'phodroid.com', 'photobucket.com', 'photozou.jp',
  642. 'picasaweb.google.com', 'picplz.com', 'pikchur.com', 'ping.fm',
  643. 'polldaddy.com', 'polleverywhere.com', 'posterous.com', 'prezi.com',
  644. 'qik.com', 'quantcast.com', 'questionablecontent.net', 'qwantz.com',
  645. 'qwiki.com', 'radionomy.com', 'radioreddit.com', 'rdio.com',
  646. 'recordsetter.com','redux.com', 'revision3.com', 'revver.com',
  647. 'saynow.com', 'schooltube.com', 'sciencestage.com', 'scrapblog.com',
  648. 'screencast.com', 'screenr.com', 'scribd.com', 'sendables.jibjab.com',
  649. 'share.ovi.com', 'shitmydadsays.com', 'shopstyle.com', 'skitch.com',
  650. 'slideshare.net', 'smugmug.com', 'snotr.com', 'socialcam.com',
  651. 'someecards.com', 'soundcloud.com', 'speakerdeck.com', 'spike.com',
  652. 'statsheet.com', 'status.net', 'storify.com', 'streetfire.net',
  653. 'studivz.net', 'tangle.com', 'teachertube.com', 'techcrunch.tv',
  654. 'ted.com', 'thedailyshow.com', 'theonion.com', 'threadbanger.com',
  655. 'timetoast.com', 'tinypic.com', 'tmiweekly.com', 'traileraddict.com',
  656. 'trailerspy.com', 'trooptube.tv', 'trutv.com', 'tumblr.com',
  657. 'twitgoo.com', 'twitlonger.com', 'twitpic.com', 'twitrpix.com',
  658. 'twitter.com', 'twitvid.com', 'ultrakawaii.com', 'urtak.com',
  659. 'uservoice.com', 'ustream.com', 'viddler.com', 'video.forbes.com',
  660. 'video.google.com', 'video.jardenberg.com', 'video.pbs.org',
  661. 'video.yahoo.com', 'videos.nymag.com', 'vids.myspace.com', 'vimeo.com',
  662. 'vodcars.com', 'washingtonpost.com', 'whitehouse.gov', 'whosay.com',
  663. 'wikimedia.org', 'wikipedia.org', 'wistia.com', 'wordpress.tv',
  664. 'worldstarhiphop.com', 'xiami.com', 'xkcd.com', 'xtranormal.com',
  665. 'yfrog.com', 'youku.com', 'youtu.be', 'youtube.com', 'zapiks.com',
  666. 'zero-inch.com']
  667. url_re = re.compile(
  668. 'http:\\/\\/.*youtube\\.com\\/watch.*|' +
  669. 'http:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' +
  670. 'https:\\/\\/.*youtube\\.com\\/watch.*|' +
  671. 'https:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' +
  672. 'http:\\/\\/youtu\\.be\\/.*|' +
  673. 'http:\\/\\/.*\\.youtube\\.com\\/user\\/.*|' +
  674. 'http:\\/\\/.*\\.youtube\\.com\\/.*\\#.*\\/.*|' +
  675. 'http:\\/\\/m\\.youtube\\.com\\/watch.*|' +
  676. 'http:\\/\\/m\\.youtube\\.com\\/index.*|' +
  677. 'http:\\/\\/.*\\.youtube\\.com\\/profile.*|' +
  678. 'http:\\/\\/.*\\.youtube\\.com\\/view_play_list.*|' +
  679. 'http:\\/\\/.*\\.youtube\\.com\\/playlist.*|' +
  680. 'http:\\/\\/.*justin\\.tv\\/.*|' +
  681. 'http:\\/\\/.*justin\\.tv\\/.*\\/b\\/.*|' +
  682. 'http:\\/\\/.*justin\\.tv\\/.*\\/w\\/.*|' +
  683. 'http:\\/\\/www\\.ustream\\.tv\\/recorded\\/.*|' +
  684. 'http:\\/\\/www\\.ustream\\.tv\\/channel\\/.*|' +
  685. 'http:\\/\\/www\\.ustream\\.tv\\/.*|' +
  686. 'http:\\/\\/qik\\.com\\/video\\/.*|' +
  687. 'http:\\/\\/qik\\.com\\/.*|' +
  688. 'http:\\/\\/qik\\.ly\\/.*|' +
  689. 'http:\\/\\/.*revision3\\.com\\/.*|' +
  690. 'http:\\/\\/.*\\.dailymotion\\.com\\/video\\/.*|' +
  691. 'http:\\/\\/.*\\.dailymotion\\.com\\/.*\\/video\\/.*|' +
  692. 'http:\\/\\/collegehumor\\.com\\/video:.*|' +
  693. 'http:\\/\\/collegehumor\\.com\\/video\\/.*|' +
  694. 'http:\\/\\/www\\.collegehumor\\.com\\/video:.*|' +
  695. 'http:\\/\\/www\\.collegehumor\\.com\\/video\\/.*|' +
  696. 'http:\\/\\/.*twitvid\\.com\\/.*|' +
  697. 'http:\\/\\/www\\.break\\.com\\/.*\\/.*|' +
  698. 'http:\\/\\/vids\\.myspace\\.com\\/index\\.cfm\\?fuseaction=vids\\.individual&videoid.*|' +
  699. 'http:\\/\\/www\\.myspace\\.com\\/index\\.cfm\\?fuseaction=.*&videoid.*|' +
  700. 'http:\\/\\/www\\.metacafe\\.com\\/watch\\/.*|' +
  701. 'http:\\/\\/www\\.metacafe\\.com\\/w\\/.*|' +
  702. 'http:\\/\\/blip\\.tv\\/.*\\/.*|' +
  703. 'http:\\/\\/.*\\.blip\\.tv\\/.*\\/.*|' +
  704. 'http:\\/\\/video\\.google\\.com\\/videoplay\\?.*|' +
  705. 'http:\\/\\/.*revver\\.com\\/video\\/.*|' +
  706. 'http:\\/\\/video\\.yahoo\\.com\\/watch\\/.*\\/.*|' +
  707. 'http:\\/\\/video\\.yahoo\\.com\\/network\\/.*|' +
  708. 'http:\\/\\/.*viddler\\.com\\/explore\\/.*\\/videos\\/.*|' +
  709. 'http:\\/\\/liveleak\\.com\\/view\\?.*|' +
  710. 'http:\\/\\/www\\.liveleak\\.com\\/view\\?.*|' +
  711. 'http:\\/\\/animoto\\.com\\/play\\/.*|' +
  712. 'http:\\/\\/dotsub\\.com\\/view\\/.*|' +
  713. 'http:\\/\\/www\\.overstream\\.net\\/view\\.php\\?oid=.*|' +
  714. 'http:\\/\\/www\\.livestream\\.com\\/.*|' +
  715. 'http:\\/\\/www\\.worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' +
  716. 'http:\\/\\/worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' +
  717. 'http:\\/\\/teachertube\\.com\\/viewVideo\\.php.*|' +
  718. 'http:\\/\\/www\\.teachertube\\.com\\/viewVideo\\.php.*|' +
  719. 'http:\\/\\/www1\\.teachertube\\.com\\/viewVideo\\.php.*|' +
  720. 'http:\\/\\/www2\\.teachertube\\.com\\/viewVideo\\.php.*|' +
  721. 'http:\\/\\/bambuser\\.com\\/v\\/.*|' +
  722. 'http:\\/\\/bambuser\\.com\\/channel\\/.*|' +
  723. 'http:\\/\\/bambuser\\.com\\/channel\\/.*\\/broadcast\\/.*|' +
  724. 'http:\\/\\/www\\.schooltube\\.com\\/video\\/.*\\/.*|' +
  725. 'http:\\/\\/bigthink\\.com\\/ideas\\/.*|' +
  726. 'http:\\/\\/bigthink\\.com\\/series\\/.*|' +
  727. 'http:\\/\\/sendables\\.jibjab\\.com\\/view\\/.*|' +
  728. 'http:\\/\\/sendables\\.jibjab\\.com\\/originals\\/.*|' +
  729. 'http:\\/\\/www\\.xtranormal\\.com\\/watch\\/.*|' +
  730. 'http:\\/\\/socialcam\\.com\\/v\\/.*|' +
  731. 'http:\\/\\/www\\.socialcam\\.com\\/v\\/.*|' +
  732. 'http:\\/\\/dipdive\\.com\\/media\\/.*|' +
  733. 'http:\\/\\/dipdive\\.com\\/member\\/.*\\/media\\/.*|' +
  734. 'http:\\/\\/dipdive\\.com\\/v\\/.*|' +
  735. 'http:\\/\\/.*\\.dipdive\\.com\\/media\\/.*|' +
  736. 'http:\\/\\/.*\\.dipdive\\.com\\/v\\/.*|' +
  737. 'http:\\/\\/v\\.youku\\.com\\/v_show\\/.*\\.html|' +
  738. 'http:\\/\\/v\\.youku\\.com\\/v_playlist\\/.*\\.html|' +
  739. 'http:\\/\\/www\\.snotr\\.com\\/video\\/.*|' +
  740. 'http:\\/\\/snotr\\.com\\/video\\/.*|' +
  741. 'http:\\/\\/video\\.jardenberg\\.se\\/.*|' +
  742. 'http:\\/\\/www\\.clipfish\\.de\\/.*\\/.*\\/video\\/.*|' +
  743. 'http:\\/\\/www\\.myvideo\\.de\\/watch\\/.*|' +
  744. 'http:\\/\\/www\\.whitehouse\\.gov\\/photos-and-video\\/video\\/.*|' +
  745. 'http:\\/\\/www\\.whitehouse\\.gov\\/video\\/.*|' +
  746. 'http:\\/\\/wh\\.gov\\/photos-and-video\\/video\\/.*|' +
  747. 'http:\\/\\/wh\\.gov\\/video\\/.*|' +
  748. 'http:\\/\\/www\\.hulu\\.com\\/watch.*|' +
  749. 'http:\\/\\/www\\.hulu\\.com\\/w\\/.*|' +
  750. 'http:\\/\\/hulu\\.com\\/watch.*|' +
  751. 'http:\\/\\/hulu\\.com\\/w\\/.*|' +
  752. 'http:\\/\\/.*crackle\\.com\\/c\\/.*|' +
  753. 'http:\\/\\/www\\.fancast\\.com\\/.*\\/videos|' +
  754. 'http:\\/\\/www\\.funnyordie\\.com\\/videos\\/.*|' +
  755. 'http:\\/\\/www\\.funnyordie\\.com\\/m\\/.*|' +
  756. 'http:\\/\\/funnyordie\\.com\\/videos\\/.*|' +
  757. 'http:\\/\\/funnyordie\\.com\\/m\\/.*|' +
  758. 'http:\\/\\/www\\.vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' +
  759. 'http:\\/\\/www\\.vimeo\\.com\\/.*|' +
  760. 'http:\\/\\/vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' +
  761. 'http:\\/\\/vimeo\\.com\\/.*|' +
  762. 'http:\\/\\/vimeo\\.com\\/m\\/\\#\\/.*|' +
  763. 'http:\\/\\/www\\.ted\\.com\\/talks\\/.*\\.html.*|' +
  764. 'http:\\/\\/www\\.ted\\.com\\/talks\\/lang\\/.*\\/.*\\.html.*|' +
  765. 'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/.*\\.html.*|' +
  766. 'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/lang\\/.*\\/.*\\.html.*|' +
  767. 'http:\\/\\/.*nfb\\.ca\\/film\\/.*|' +
  768. 'http:\\/\\/www\\.thedailyshow\\.com\\/watch\\/.*|' +
  769. 'http:\\/\\/www\\.thedailyshow\\.com\\/full-episodes\\/.*|' +
  770. 'http:\\/\\/www\\.thedailyshow\\.com\\/collection\\/.*\\/.*\\/.*|' +
  771. 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video\\/.*|' +
  772. 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/trailer|' +
  773. 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video|' +
  774. 'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-collections\\/.*|' +
  775. 'http:\\/\\/www\\.colbertnation\\.com\\/full-episodes\\/.*|' +
  776. 'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-videos\\/.*|' +
  777. 'http:\\/\\/www\\.comedycentral\\.com\\/videos\\/index\\.jhtml\\?.*|' +
  778. 'http:\\/\\/www\\.theonion\\.com\\/video\\/.*|' +
  779. 'http:\\/\\/theonion\\.com\\/video\\/.*|' +
  780. 'http:\\/\\/wordpress\\.tv\\/.*\\/.*\\/.*\\/.*\\/|' +
  781. 'http:\\/\\/www\\.traileraddict\\.com\\/trailer\\/.*|' +
  782. 'http:\\/\\/www\\.traileraddict\\.com\\/clip\\/.*|' +
  783. 'http:\\/\\/www\\.traileraddict\\.com\\/poster\\/.*|' +
  784. 'http:\\/\\/www\\.escapistmagazine\\.com\\/videos\\/.*|' +
  785. 'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*\\/.*|' +
  786. 'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*|' +
  787. 'http:\\/\\/www\\.trailerspy\\.com\\/view_video\\.php.*|' +
  788. 'http:\\/\\/www\\.atom\\.com\\/.*\\/.*\\/|' +
  789. 'http:\\/\\/fora\\.tv\\/.*\\/.*\\/.*\\/.*|' +
  790. 'http:\\/\\/www\\.spike\\.com\\/video\\/.*|' +
  791. 'http:\\/\\/www\\.gametrailers\\.com\\/video\\/.*|' +
  792. 'http:\\/\\/gametrailers\\.com\\/video\\/.*|' +
  793. 'http:\\/\\/www\\.koldcast\\.tv\\/video\\/.*|' +
  794. 'http:\\/\\/www\\.koldcast\\.tv\\/\\#video:.*|' +
  795. 'http:\\/\\/techcrunch\\.tv\\/watch.*|' +
  796. 'http:\\/\\/techcrunch\\.tv\\/.*\\/watch.*|' +
  797. 'http:\\/\\/mixergy\\.com\\/.*|' +
  798. 'http:\\/\\/video\\.pbs\\.org\\/video\\/.*|' +
  799. 'http:\\/\\/www\\.zapiks\\.com\\/.*|' +
  800. 'http:\\/\\/tv\\.digg\\.com\\/diggnation\\/.*|' +
  801. 'http:\\/\\/tv\\.digg\\.com\\/diggreel\\/.*|' +
  802. 'http:\\/\\/tv\\.digg\\.com\\/diggdialogg\\/.*|' +
  803. 'http:\\/\\/www\\.trutv\\.com\\/video\\/.*|' +
  804. 'http:\\/\\/www\\.nzonscreen\\.com\\/title\\/.*|' +
  805. 'http:\\/\\/nzonscreen\\.com\\/title\\/.*|' +
  806. 'http:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' +
  807. 'https:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' +
  808. 'http:\\/\\/hungrynation\\.tv\\/.*\\/episode\\/.*|' +
  809. 'http:\\/\\/www\\.hungrynation\\.tv\\/.*\\/episode\\/.*|' +
  810. 'http:\\/\\/hungrynation\\.tv\\/episode\\/.*|' +
  811. 'http:\\/\\/www\\.hungrynation\\.tv\\/episode\\/.*|' +
  812. 'http:\\/\\/indymogul\\.com\\/.*\\/episode\\/.*|' +
  813. 'http:\\/\\/www\\.indymogul\\.com\\/.*\\/episode\\/.*|' +
  814. 'http:\\/\\/indymogul\\.com\\/episode\\/.*|' +
  815. 'http:\\/\\/www\\.indymogul\\.com\\/episode\\/.*|' +
  816. 'http:\\/\\/channelfrederator\\.com\\/.*\\/episode\\/.*|' +
  817. 'http:\\/\\/www\\.channelfrederator\\.com\\/.*\\/episode\\/.*|' +
  818. 'http:\\/\\/channelfrederator\\.com\\/episode\\/.*|' +
  819. 'http:\\/\\/www\\.channelfrederator\\.com\\/episode\\/.*|' +
  820. 'http:\\/\\/tmiweekly\\.com\\/.*\\/episode\\/.*|' +
  821. 'http:\\/\\/www\\.tmiweekly\\.com\\/.*\\/episode\\/.*|' +
  822. 'http:\\/\\/tmiweekly\\.com\\/episode\\/.*|' +
  823. 'http:\\/\\/www\\.tmiweekly\\.com\\/episode\\/.*|' +
  824. 'http:\\/\\/99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' +
  825. 'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' +
  826. 'http:\\/\\/99dollarmusicvideos\\.com\\/episode\\/.*|' +
  827. 'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/episode\\/.*|' +
  828. 'http:\\/\\/ultrakawaii\\.com\\/.*\\/episode\\/.*|' +
  829. 'http:\\/\\/www\\.ultrakawaii\\.com\\/.*\\/episode\\/.*|' +
  830. 'http:\\/\\/ultrakawaii\\.com\\/episode\\/.*|' +
  831. 'http:\\/\\/www\\.ultrakawaii\\.com\\/episode\\/.*|' +
  832. 'http:\\/\\/barelypolitical\\.com\\/.*\\/episode\\/.*|' +
  833. 'http:\\/\\/www\\.barelypolitical\\.com\\/.*\\/episode\\/.*|' +
  834. 'http:\\/\\/barelypolitical\\.com\\/episode\\/.*|' +
  835. 'http:\\/\\/www\\.barelypolitical\\.com\\/episode\\/.*|' +
  836. 'http:\\/\\/barelydigital\\.com\\/.*\\/episode\\/.*|' +
  837. 'http:\\/\\/www\\.barelydigital\\.com\\/.*\\/episode\\/.*|' +
  838. 'http:\\/\\/barelydigital\\.com\\/episode\\/.*|' +
  839. 'http:\\/\\/www\\.barelydigital\\.com\\/episode\\/.*|' +
  840. 'http:\\/\\/threadbanger\\.com\\/.*\\/episode\\/.*|' +
  841. 'http:\\/\\/www\\.threadbanger\\.com\\/.*\\/episode\\/.*|' +
  842. 'http:\\/\\/threadbanger\\.com\\/episode\\/.*|' +
  843. 'http:\\/\\/www\\.threadbanger\\.com\\/episode\\/.*|' +
  844. 'http:\\/\\/vodcars\\.com\\/.*\\/episode\\/.*|' +
  845. 'http:\\/\\/www\\.vodcars\\.com\\/.*\\/episode\\/.*|' +
  846. 'http:\\/\\/vodcars\\.com\\/episode\\/.*|' +
  847. 'http:\\/\\/www\\.vodcars\\.com\\/episode\\/.*|' +
  848. 'http:\\/\\/confreaks\\.net\\/videos\\/.*|' +
  849. 'http:\\/\\/www\\.confreaks\\.net\\/videos\\/.*|' +
  850. 'http:\\/\\/video\\.allthingsd\\.com\\/video\\/.*|' +
  851. 'http:\\/\\/videos\\.nymag\\.com\\/.*|' +
  852. 'http:\\/\\/aniboom\\.com\\/animation-video\\/.*|' +
  853. 'http:\\/\\/www\\.aniboom\\.com\\/animation-video\\/.*|' +
  854. 'http:\\/\\/clipshack\\.com\\/Clip\\.aspx\\?.*|' +
  855. 'http:\\/\\/www\\.clipshack\\.com\\/Clip\\.aspx\\?.*|' +
  856. 'http:\\/\\/grindtv\\.com\\/.*\\/video\\/.*|' +
  857. 'http:\\/\\/www\\.grindtv\\.com\\/.*\\/video\\/.*|' +
  858. 'http:\\/\\/ifood\\.tv\\/recipe\\/.*|' +
  859. 'http:\\/\\/ifood\\.tv\\/video\\/.*|' +
  860. 'http:\\/\\/ifood\\.tv\\/channel\\/user\\/.*|' +
  861. 'http:\\/\\/www\\.ifood\\.tv\\/recipe\\/.*|' +
  862. 'http:\\/\\/www\\.ifood\\.tv\\/video\\/.*|' +
  863. 'http:\\/\\/www\\.ifood\\.tv\\/channel\\/user\\/.*|' +
  864. 'http:\\/\\/logotv\\.com\\/video\\/.*|' +
  865. 'http:\\/\\/www\\.logotv\\.com\\/video\\/.*|' +
  866. 'http:\\/\\/lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' +
  867. 'http:\\/\\/www\\.lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' +
  868. 'http:\\/\\/streetfire\\.net\\/video\\/.*\\.htm.*|' +
  869. 'http:\\/\\/www\\.streetfire\\.net\\/video\\/.*\\.htm.*|' +
  870. 'http:\\/\\/trooptube\\.tv\\/videos\\/.*|' +
  871. 'http:\\/\\/www\\.trooptube\\.tv\\/videos\\/.*|' +
  872. 'http:\\/\\/sciencestage\\.com\\/v\\/.*\\.html|' +
  873. 'http:\\/\\/sciencestage\\.com\\/a\\/.*\\.html|' +
  874. 'http:\\/\\/www\\.sciencestage\\.com\\/v\\/.*\\.html|' +
  875. 'http:\\/\\/www\\.sciencestage\\.com\\/a\\/.*\\.html|' +
  876. 'http:\\/\\/www\\.godtube\\.com\\/featured\\/video\\/.*|' +
  877. 'http:\\/\\/godtube\\.com\\/featured\\/video\\/.*|' +
  878. 'http:\\/\\/www\\.godtube\\.com\\/watch\\/.*|' +
  879. 'http:\\/\\/godtube\\.com\\/watch\\/.*|' +
  880. 'http:\\/\\/www\\.tangle\\.com\\/view_video.*|' +
  881. 'http:\\/\\/mediamatters\\.org\\/mmtv\\/.*|' +
  882. 'http:\\/\\/www\\.clikthrough\\.com\\/theater\\/video\\/.*|' +
  883. 'http:\\/\\/gist\\.github\\.com\\/.*|' +
  884. 'http:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' +
  885. 'http:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' +
  886. 'http:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' +
  887. 'http:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
  888. 'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' +
  889. 'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
  890. 'https:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' +
  891. 'https:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' +
  892. 'https:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' +
  893. 'https:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
  894. 'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' +
  895. 'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
  896. 'http:\\/\\/www\\.crunchbase\\.com\\/.*\\/.*|' +
  897. 'http:\\/\\/crunchbase\\.com\\/.*\\/.*|' +
  898. 'http:\\/\\/www\\.slideshare\\.net\\/.*\\/.*|' +
  899. 'http:\\/\\/www\\.slideshare\\.net\\/mobile\\/.*\\/.*|' +
  900. 'http:\\/\\/slidesha\\.re\\/.*|' +
  901. 'http:\\/\\/scribd\\.com\\/doc\\/.*|' +
  902. 'http:\\/\\/www\\.scribd\\.com\\/doc\\/.*|' +
  903. 'http:\\/\\/scribd\\.com\\/mobile\\/documents\\/.*|' +
  904. 'http:\\/\\/www\\.scribd\\.com\\/mobile\\/documents\\/.*|' +
  905. 'http:\\/\\/screenr\\.com\\/.*|' +
  906. 'http:\\/\\/polldaddy\\.com\\/community\\/poll\\/.*|' +
  907. 'http:\\/\\/polldaddy\\.com\\/poll\\/.*|' +
  908. 'http:\\/\\/answers\\.polldaddy\\.com\\/poll\\/.*|' +
  909. 'http:\\/\\/www\\.5min\\.com\\/Video\\/.*|' +
  910. 'http:\\/\\/www\\.howcast\\.com\\/videos\\/.*|' +
  911. 'http:\\/\\/www\\.screencast\\.com\\/.*\\/media\\/.*|' +
  912. 'http:\\/\\/screencast\\.com\\/.*\\/media\\/.*|' +
  913. 'http:\\/\\/www\\.screencast\\.com\\/t\\/.*|' +
  914. 'http:\\/\\/screencast\\.com\\/t\\/.*|' +
  915. 'http:\\/\\/issuu\\.com\\/.*\\/docs\\/.*|' +
  916. 'http:\\/\\/www\\.kickstarter\\.com\\/projects\\/.*\\/.*|' +
  917. 'http:\\/\\/www\\.scrapblog\\.com\\/viewer\\/viewer\\.aspx.*|' +
  918. 'http:\\/\\/ping\\.fm\\/p\\/.*|' +
  919. 'http:\\/\\/chart\\.ly\\/symbols\\/.*|' +
  920. 'http:\\/\\/chart\\.ly\\/.*|' +
  921. 'http:\\/\\/maps\\.google\\.com\\/maps\\?.*|' +
  922. 'http:\\/\\/maps\\.google\\.com\\/\\?.*|' +
  923. 'http:\\/\\/maps\\.google\\.com\\/maps\\/ms\\?.*|' +
  924. 'http:\\/\\/.*\\.craigslist\\.org\\/.*\\/.*|' +
  925. 'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/show\\.dml\\?id=.*|' +
  926. 'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/showpic\\.dml\\?album=.*&picture=.*|' +
  927. 'http:\\/\\/tumblr\\.com\\/.*|' +
  928. 'http:\\/\\/.*\\.tumblr\\.com\\/post\\/.*|' +
  929. 'http:\\/\\/www\\.polleverywhere\\.com\\/polls\\/.*|' +
  930. 'http:\\/\\/www\\.polleverywhere\\.com\\/multiple_choice_polls\\/.*|' +
  931. 'http:\\/\\/www\\.polleverywhere\\.com\\/free_text_polls\\/.*|' +
  932. 'http:\\/\\/www\\.quantcast\\.com\\/wd:.*|' +
  933. 'http:\\/\\/www\\.quantcast\\.com\\/.*|' +
  934. 'http:\\/\\/siteanalytics\\.compete\\.com\\/.*|' +
  935. 'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/.*\\/.*\\/.*\\/.*|' +
  936. 'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/e\\/.*|' +
  937. 'http:\\/\\/statsheet\\.com\\/.*\\/teams\\/.*\\/.*|' +
  938. 'http:\\/\\/statsheet\\.com\\/tools\\/chartlets\\?chart=.*|' +
  939. 'http:\\/\\/.*\\.status\\.net\\/notice\\/.*|' +
  940. 'http:\\/\\/identi\\.ca\\/notice\\/.*|' +
  941. 'http:\\/\\/brainbird\\.net\\/notice\\/.*|' +
  942. 'http:\\/\\/shitmydadsays\\.com\\/notice\\/.*|' +
  943. 'http:\\/\\/www\\.studivz\\.net\\/Profile\\/.*|' +
  944. 'http:\\/\\/www\\.studivz\\.net\\/l\\/.*|' +
  945. 'http:\\/\\/www\\.studivz\\.net\\/Groups\\/Overview\\/.*|' +
  946. 'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Info\\/.*|' +
  947. 'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Install\\/.*|' +
  948. 'http:\\/\\/www\\.studivz\\.net\\/.*|' +
  949. 'http:\\/\\/www\\.meinvz\\.net\\/Profile\\/.*|' +
  950. 'http:\\/\\/www\\.meinvz\\.net\\/l\\/.*|' +
  951. 'http:\\/\\/www\\.meinvz\\.net\\/Groups\\/Overview\\/.*|' +
  952. 'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Info\\/.*|' +
  953. 'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Install\\/.*|…

Large files are truncated, but you can click here to view the full file