/r2/r2/lib/scraper.py
Python | 1850 lines | 1739 code | 74 blank | 37 comment | 68 complexity | 4c3265578333dbb2d9c7ead0fd61795f MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, Apache-2.0
Large files files are truncated, but you can click here to view the full file
- # The contents of this file are subject to the Common Public Attribution
- # License Version 1.0. (the "License"); you may not use this file except in
- # compliance with the License. You may obtain a copy of the License at
- # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
- # License Version 1.1, but Sections 14 and 15 have been added to cover use of
- # software over a computer network and provide for limited attribution for the
- # Original Developer. In addition, Exhibit A has been modified to be consistent
- # with Exhibit B.
- #
- # Software distributed under the License is distributed on an "AS IS" basis,
- # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
- # the specific language governing rights and limitations under the License.
- #
- # The Original Code is reddit.
- #
- # The Original Developer is the Initial Developer. The Initial Developer of
- # the Original Code is reddit Inc.
- #
- # All portions of the code written by reddit are Copyright (c) 2006-2012 reddit
- # Inc. All Rights Reserved.
- ###############################################################################
- from pylons import g
- from r2.lib import utils
- from r2.lib.memoize import memoize
- import simplejson as json
- from urllib2 import Request, HTTPError, URLError, urlopen
- from httplib import InvalidURL
- import urlparse, re, urllib, logging, StringIO, logging
- import Image, ImageFile, math
- from BeautifulSoup import BeautifulSoup
- log = g.log
- useragent = g.useragent
- chunk_size = 1024
- thumbnail_size = 70, 70
- def image_to_str(image):
- s = StringIO.StringIO()
- image.save(s, image.format)
- s.seek(0)
- return s.read()
- def str_to_image(s):
- s = StringIO.StringIO(s)
- s.seek(0)
- image = Image.open(s)
- return image
- def prepare_image(image):
- image = square_image(image)
- image.thumbnail(thumbnail_size, Image.ANTIALIAS)
- return image
- def image_entropy(img):
- """calculate the entropy of an image"""
- hist = img.histogram()
- hist_size = sum(hist)
- hist = [float(h) / hist_size for h in hist]
- return -sum([p * math.log(p, 2) for p in hist if p != 0])
- def square_image(img):
- """if the image is taller than it is wide, square it off. determine
- which pieces to cut off based on the entropy pieces."""
- x,y = img.size
- while y > x:
- #slice 10px at a time until square
- slice_height = min(y - x, 10)
- bottom = img.crop((0, y - slice_height, x, y))
- top = img.crop((0, 0, x, slice_height))
- #remove the slice with the least entropy
- if image_entropy(bottom) < image_entropy(top):
- img = img.crop((0, 0, x, y - slice_height))
- else:
- img = img.crop((0, slice_height, x, y))
- x,y = img.size
- return img
- def clean_url(url):
- """url quotes unicode data out of urls"""
- s = url
- url = url.encode('utf8')
- url = ''.join([urllib.quote(c) if ord(c) >= 127 else c for c in url])
- return url
- def fetch_url(url, referer = None, retries = 1, dimension = False):
- cur_try = 0
- log.debug('fetching: %s' % url)
- nothing = None if dimension else (None, None)
- url = clean_url(url)
- #just basic urls
- if not (url.startswith('http://') or url.startswith('https://')):
- return nothing
- while True:
- try:
- req = Request(url)
- if useragent:
- req.add_header('User-Agent', useragent)
- if referer:
- req.add_header('Referer', referer)
- open_req = urlopen(req)
- #if we only need the dimension of the image, we may not
- #need to download the entire thing
- if dimension:
- content = open_req.read(chunk_size)
- else:
- content = open_req.read()
- content_type = open_req.headers.get('content-type')
- if not content_type:
- return nothing
- if 'image' in content_type:
- p = ImageFile.Parser()
- new_data = content
- while not p.image and new_data:
- p.feed(new_data)
- new_data = open_req.read(chunk_size)
- content += new_data
- #return the size, or return the data
- if dimension and p.image:
- return p.image.size
- elif dimension:
- return nothing
- elif dimension:
- #expected an image, but didn't get one
- return nothing
- return content_type, content
- except (URLError, HTTPError, InvalidURL), e:
- cur_try += 1
- if cur_try >= retries:
- log.debug('error while fetching: %s referer: %s' % (url, referer))
- log.debug(e)
- return nothing
- finally:
- if 'open_req' in locals():
- open_req.close()
- @memoize('media.fetch_size')
- def fetch_size(url, referer = None, retries = 1):
- return fetch_url(url, referer, retries, dimension = True)
- class MediaEmbed(object):
- width = None
- height = None
- content = None
- scrolling = False
- def __init__(self, height, width, content, scrolling = False):
- self.height = int(height)
- self.width = int(width)
- self.content = content
- self.scrolling = scrolling
- class Scraper:
- def __init__(self, url):
- self.url = url
- self.content = None
- self.content_type = None
- self.soup = None
- def __repr__(self):
- return "%s(%r)" % (self.__class__.__name__, self.url)
- def download(self):
- self.content_type, self.content = fetch_url(self.url)
- if self.content_type and 'html' in self.content_type and self.content:
- self.soup = BeautifulSoup(self.content)
- def image_urls(self):
- #if the original url was an image, use that
- if 'image' in self.content_type:
- yield self.url
- elif self.soup:
- images = self.soup.findAll('img', src = True)
- for i in images:
- image_url = urlparse.urljoin(self.url, i['src'])
- yield image_url
- def largest_image_url(self):
- if not self.content:
- self.download()
- #if download didn't work
- if not self.content or not self.content_type:
- return None
- max_area = 0
- max_url = None
- if self.soup:
- og_image = self.soup.find('meta', property='og:image')
- if og_image and og_image['content']:
- log.debug("Using og:image")
- return og_image['content']
- thumbnail_spec = self.soup.find('link', rel = 'image_src')
- if thumbnail_spec and thumbnail_spec['href']:
- log.debug("Using image_src")
- return thumbnail_spec['href']
- for image_url in self.image_urls():
- size = fetch_size(image_url, referer = self.url)
- if not size:
- continue
- area = size[0] * size[1]
- #ignore little images
- if area < 5000:
- log.debug('ignore little %s' % image_url)
- continue
- #ignore excessively long/wide images
- if max(size) / min(size) > 1.5:
- log.debug('ignore dimensions %s' % image_url)
- continue
- #penalize images with "sprite" in their name
- if 'sprite' in image_url.lower():
- log.debug('penalizing sprite %s' % image_url)
- area /= 10
- if area > max_area:
- max_area = area
- max_url = image_url
- return max_url
- def thumbnail(self):
- image_url = self.largest_image_url()
- if image_url:
- content_type, image_str = fetch_url(image_url, referer = self.url)
- if image_str:
- image = str_to_image(image_str)
- try:
- image = prepare_image(image)
- except IOError, e:
- #can't read interlaced PNGs, ignore
- if 'interlaced' in e.message:
- return
- raise
- return image
- def media_object(self):
- for deepscraper in deepscrapers:
- ds = deepscraper()
- found = ds.find_media_object(self)
- if found:
- return found
- @classmethod
- def media_embed(cls):
- raise NotImplementedError
- class MediaScraper(Scraper):
- media_template = ""
- thumbnail_template = ""
- video_id = None
- video_id_rx = None
- def __init__(self, url):
- Scraper.__init__(self, url)
- # first try the simple regex against the URL. If that fails,
- # see if the MediaScraper subclass has its own extraction
- # function
- if self.video_id_rx:
- m = self.video_id_rx.match(url)
- if m:
- self.video_id = m.groups()[0]
- if not self.video_id:
- video_id = self.video_id_extract()
- if video_id:
- self.video_id = video_id
- if not self.video_id:
- #if we still can't find the id just treat it like a normal page
- log.debug('reverting to regular scraper: %s' % url)
- self.__class__ = Scraper
- def video_id_extract(self):
- return None
- def largest_image_url(self):
- if self.thumbnail_template:
- return self.thumbnail_template.replace('$video_id', self.video_id)
- else:
- return Scraper.largest_image_url(self)
- def media_object(self):
- return dict(video_id = self.video_id,
- type = self.domains[0])
- @classmethod
- def media_embed(cls, video_id = None, height = None, width = None, **kw):
- content = cls.media_template.replace('$video_id', video_id)
- return MediaEmbed(height = height or cls.height,
- width = width or cls.width,
- content = content)
-
- def youtube_in_google(google_url):
- h = Scraper(google_url)
- h.download()
- try:
- youtube_url = h.soup.find('div', 'original-text').findNext('a')['href']
- log.debug('%s is really %s' % (google_url, youtube_url))
- return youtube_url
- except AttributeError, KeyError:
- pass
- def make_scraper(url):
- domain = utils.domain(url)
- scraper = Scraper
- for suffix, clses in scrapers.iteritems():
- for cls in clses:
- if domain.endswith(suffix):
- scraper = cls
- break
-
- #sometimes youtube scrapers masquerade as google scrapers
- if scraper == GootubeScraper:
- youtube_url = youtube_in_google(url)
- if youtube_url:
- return make_scraper(youtube_url)
- return scraper(url)
- ########## site-specific video scrapers ##########
- class YoutubeScraper(MediaScraper):
- domains = ['youtube.com']
- height = 295
- width = 480
- media_template = '<object width="490" height="295"><param name="movie" value="http://www.youtube.com/v/$video_id&fs=1"></param><param name="wmode" value="transparent"></param><param name="allowFullScreen" value="true"></param><embed src="http://www.youtube.com/v/$video_id&fs=1" type="application/x-shockwave-flash" wmode="transparent" allowFullScreen="true" width="480" height="295"></embed></object>'
- thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg'
- video_id_rx = re.compile('.*v=([A-Za-z0-9-_]+).*')
- video_deeplink_rx = re.compile('.*#t=(\d+)m(\d+)s.*')
- def video_id_extract(self):
- vid = self.video_id_rx.match(self.url)
- if(vid):
- video_id = vid.groups()[0]
- d = self.video_deeplink_rx.match(self.url)
- if(d):
- seconds = int(d.groups()[0])*60 + int(d.groups()[1])
- video_id += "&start=%d" % seconds
- return video_id
- def largest_image_url(self):
- # Remove the deeplink part from the video id
- return self.thumbnail_template.replace("$video_id",
- self.video_id.split("&")[0])
- class TedScraper(MediaScraper):
- domains = ['ted.com']
- height = 326
- width = 446
- media_template = '<object width="446" height="326"><param name="movie" value="http://video.ted.com/assets/player/swf/EmbedPlayer.swf"></param><param name="allowFullScreen" value="true" /><param name="wmode" value="transparent"></param><param name="bgColor" value="#ffffff"></param> <param name="flashvars" value="$video_id" /><embed src="http://video.ted.com/assets/player/swf/EmbedPlayer.swf" pluginspace="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash" wmode="transparent" bgColor="#ffffff" width="446" height="326" allowFullScreen="true" flashvars="$video_id"></embed></object>'
- flashvars_rx = re.compile('.*flashvars="(.*)".*')
- def video_id_extract(self):
- if "/talks/" in self.url:
- content_type, content = fetch_url(self.url.replace("/talks/","/talks/embed/"))
- if content:
- m = self.flashvars_rx.match(content)
- if m:
- return m.groups()[0]
- def largest_image_url(self):
- if not self.soup:
- self.download()
- if self.soup:
- return self.soup.find('link', rel = 'image_src')['href']
- class MetacafeScraper(MediaScraper):
- domains = ['metacafe.com']
- height = 345
- width = 400
- media_template = '<embed src="$video_id" width="400" height="345" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
- video_id_rx = re.compile('.*/watch/([^/]+)/.*')
- def media_object(self):
- if not self.soup:
- self.download()
- if self.soup:
- video_url = self.soup.find('link', rel = 'video_src')['href']
- return dict(video_id = video_url,
- type = self.domains[0])
- class GootubeScraper(MediaScraper):
- domains = ['video.google.com']
- height = 326
- width = 400
- media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=$video_id&hl=en" flashvars=""> </embed>'
- video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')
- gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)
- def largest_image_url(self):
- if not self.content:
- self.download()
- if not self.content:
- return None
- m = self.gootube_thumb_rx.match(self.content)
- if m:
- image_url = m.groups()[0]
- image_url = utils.safe_eval_str(image_url)
- return image_url
- class VimeoScraper(MediaScraper):
- domains = ['vimeo.com']
- height = 448
- width = 520
- media_template = '<embed src="$video_id" width="520" height="448" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
- video_id_rx = re.compile('.*/(.*)')
- def media_object(self):
- if not self.soup:
- self.download()
- if self.soup:
- video_url = self.soup.find('link', rel = 'video_src')['href']
- return dict(video_id = video_url,
- type = self.domains[0])
- class BreakScraper(MediaScraper):
- domains = ['break.com']
- height = 421
- width = 520
- media_template = '<object width="520" height="421"><param name="movie" value="$video_id"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" allowScriptAccess="always" width="520" height="421"></embed></object>'
- video_id_rx = re.compile('.*/index/([^/]+).*');
- def video_id_extract(self):
- if not self.soup:
- self.download()
- if self.soup:
- video_src = self.soup.find('link', rel = 'video_src')
- if video_src and video_src['href']:
- return video_src['href']
- class TheOnionScraper(MediaScraper):
- domains = ['theonion.com']
- height = 430
- width = 480
- media_template = """<object width="480" height="430">
- <param name="allowfullscreen" value="true" />
- <param name="allowscriptaccess" value="always" />
- <param name="movie" value="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf?&videoid=$video_id" />
- <param name="wmode" value="transparent" />
- <embed src="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf"
- width="480" height="430"
- wmode="transparent"
- pluginspage="http://www.macromedia.com/go/getflashplayer"
- type="application/x-shockwave-flash"
- flashvars="videoid=$video_id" >
- </embed>
- </object>"""
- video_id_rx = re.compile('.*/video/([^/?#]+).*')
- def media_object(self):
- if not self.soup:
- self.download()
- if self.soup:
- video_url = self.soup.find('meta', attrs={'name': 'nid'})['content']
- return dict(video_id = video_url,
- type = self.domains[0])
- class CollegeHumorScraper(MediaScraper):
- domains = ['collegehumor.com']
- height = 390
- width = 520
- media_template = '<object type="application/x-shockwave-flash" data="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" width="520" height="390" ><param name="allowfullscreen" value="true" /><param name="AllowScriptAccess" value="true" /><param name="movie" quality="best" value="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" /></object>'
- video_id_rx = re.compile('.*video:(\d+).*');
- class FunnyOrDieScraper(MediaScraper):
- domains = ['funnyordie.com']
- height = 438
- width = 464
- media_template = '<object width="464" height="438" classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000" id="fodplayer"><param name="movie" value="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac?key=$video_id" /><param name="flashvars" value="key=$video_id&autostart=true&internal=true" /><param name="allowfullscreen" value="true" /><embed width="464" height="438" flashvars="key=$video_id&autostart=true" allowfullscreen="true" quality="high" src="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac" name="fodplayer" type="application/x-shockwave-flash"></embed></object>'
- thumbnail_template = 'http://assets1.ordienetworks.com/tmbs/$video_id/medium_2.jpg?c79e63ac'
- video_id_rx = re.compile('.*/videos/([^/]+)/.*')
- class ComedyCentralScraper(MediaScraper):
- domains = ['comedycentral.com']
- height = 316
- width = 332
- media_template = '<embed FlashVars="videoId=$video_id" src="http://www.comedycentral.com/sitewide/video_player/view/default/swf.jhtml" quality="high" bgcolor="#cccccc" width="332" height="316" name="comedy_central_player" align="middle" allowScriptAccess="always" allownetworking="external" type="application/x-shockwave-flash" pluginspage="http://www.macromedia.com/go/getflashplayer"></embed>'
- video_id_rx = re.compile('.*videoId=(\d+).*')
- class TheDailyShowScraper(MediaScraper):
- domains = ['thedailyshow.com']
- height = 353
- width = 360
- media_template = """<embed style='display:block' src='http://media.mtvnservices.com/mgid:cms:item:comedycentral.com:$video_id' width='360' height='301' type='application/x-shockwave-flash' wmode='window' allowFullscreen='true' flashvars='autoPlay=false' allowscriptaccess='always' allownetworking='all' bgcolor='#000000'></embed>"""
- def video_id_extract(self):
- "This is a bit of a hack"
- if not self.soup:
- self.download()
- if self.soup:
- embed_container = self.soup.find('div', {'class': 'videoplayerPromo module'})
- if embed_container:
- if embed_container['id'].startswith('promo_'):
- video_id = embed_container['id'].split('_')[1]
- return video_id
- class ColbertNationScraper(ComedyCentralScraper):
- domains = ['colbertnation.com']
- video_id_rx = re.compile('.*videos/(\d+)/.*')
- class LiveLeakScraper(MediaScraper):
- domains = ['liveleak.com']
- height = 370
- width = 450
- media_template = '<object width="450" height="370"><param name="movie" value="http://www.liveleak.com/e/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.liveleak.com/e/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="450" height="370"></embed></object>'
- video_id_rx = re.compile('.*i=([a-zA-Z0-9_]+).*')
- def largest_image_url(self):
- if not self.soup:
- self.download()
- if self.soup:
- return self.soup.find('link', rel = 'videothumbnail')['href']
- class DailyMotionScraper(MediaScraper):
- domains = ['dailymotion.com']
- height = 381
- width = 480
- media_template = '<object width="480" height="381"><param name="movie" value="$video_id"></param><param name="allowFullScreen" value="true"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" width="480" height="381" allowFullScreen="true" allowScriptAccess="always"></embed></object>'
- video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)_.*')
- def media_object(self):
- if not self.soup:
- self.download()
- if self.soup:
- video_url = self.soup.find('link', rel = 'video_src')['href']
- return dict(video_id = video_url,
- type = self.domains[0])
- class RevverScraper(MediaScraper):
- domains = ['revver.com']
- height = 392
- width = 480
- media_template = '<script src="http://flash.revver.com/player/1.0/player.js?mediaId:$video_id;width:480;height:392;" type="text/javascript"></script>'
- video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)/.*')
- class EscapistScraper(MediaScraper):
- domains = ['escapistmagazine.com']
- height = 294
- width = 480
- media_template = """<script src="http://www.escapistmagazine.com/videos/embed/$video_id"></script>"""
- video_id_rx = re.compile('.*/videos/view/[A-Za-z-9-]+/([0-9]+).*')
- class JustintvScraper(MediaScraper):
- """Can grab streams from justin.tv, but not clips"""
- domains = ['justin.tv']
- height = 295
- width = 353
- stream_media_template = """<object type="application/x-shockwave-flash" height="295" width="353" id="jtv_player_flash" data="http://www.justin.tv/widgets/jtv_player.swf?channel=$video_id" bgcolor="#000000"><param name="allowFullScreen" value="true" /><param name="allowScriptAccess" value="always" /><param name="allowNetworking" value="all" /><param name="movie" value="http://www.justin.tv/widgets/jtv_player.swf" /><param name="flashvars" value="channel=$video_id&auto_play=false&start_volume=25" /></object>"""
- video_id_rx = re.compile('^http://www.justin.tv/([a-zA-Z0-9_]+)[^/]*$')
- @classmethod
- def media_embed(cls, video_id, **kw):
- content = cls.stream_media_template.replace('$video_id', video_id)
- return MediaEmbed(height = cls.height,
- width = cls.width,
- content = content)
- class SoundcloudScraper(MediaScraper):
- """soundcloud.com"""
- domains = ['soundcloud.com']
- height = 81
- width = 400
- media_template = """<div style="font-size: 11px;">
- <object height="81" width="100%">
- <param name="movie"
- value="http://player.soundcloud.com/player.swf?track=$video_id">
- </param>
- <param name="allowscriptaccess" value="always"></param>
- <embed allowscriptaccess="always" height="81"
- src="http://player.soundcloud.com/player.swf?track=$video_id"
- type="application/x-shockwave-flash"
- width="100%">
- </embed>
- </object>"""
- video_id_rx = re.compile('^http://soundcloud.com/[a-zA-Z0-9_-]+/([a-zA-Z0-9_-]+)')
- class CraigslistScraper(MediaScraper):
- domains = ['craigslist.org']
- height = 480
- width = 640
- max_size_kb = 50
- def video_id_extract(self):
- return self.url
- def media_object(self):
- if not self.soup:
- self.download()
- if self.soup:
- ub = self.soup.find('div', {'id': 'userbody'})
- if ub:
- ub = str(ub)
- if len(ub) <= self.max_size_kb * 1024:
- return dict(content = ub,
- type = self.domains[0])
- @classmethod
- def media_embed(cls, content, **kw):
- return MediaEmbed(height = cls.height,
- width = cls.width,
- content = content,
- scrolling = True)
-
- ########## oembed rich-media scrapers ##########
- class OEmbed(Scraper):
- """
- Oembed Scraper
- ==============
- Tries to use the oembed standard to create a media object.
-
- url_re: Regular Expression to match the incoming url against.
- api_endpoint: Url of the api end point you are using.
- api_params: Default Params to be sent with the outgoing request.
- """
- url_re = ''
- api_endpoint = ''
- api_params = {}
-
- def __init__(self, url):
- Scraper.__init__(self, url)
- self.oembed = None
-
- #Fallback to the scraper if the url doesn't match
- if not self.url_re.match(self.url):
- self.__class__ = Scraper
-
- def __repr__(self):
- return "%s(%r)" % (self.__class__.__name__, self.url)
- def download(self):
- self.api_params.update( { 'url':self.url})
- query = urllib.urlencode(self.api_params)
- api_url = "%s?%s" % (self.api_endpoint, query)
- self.content_type, self.content = fetch_url(api_url)
- #Either a 404 or 500.
- if not self.content:
- #raise ValueError('ISSUE CALLING %s' %api_url)
- log.warn('oEmbed call (%s) failed to return content for %s'
- %(api_url, self.url))
- return None
- try:
- self.oembed = json.loads(self.content)
- except ValueError, e:
- log.error('oEmbed call (%s) return invalid json for %s'
- %(api_url, self.url))
- return None
- def image_urls(self):
- #if the original url was an image, use that
- if self.oembed and self.oembed.get('type') =='photo':
- yield self.oembed.get('url')
- elif self.oembed and self.oembed.get('thumbnail_url'):
- yield self.oembed.get('thumbnail_url')
- def largest_image_url(self):
- #Seems to be the default place to check if the download has happened.
- if not self.oembed:
- self.download()
- #if the original url was of the photo type
- if self.oembed and self.oembed.get('type') =='photo':
- return self.oembed.get('url')
- elif self.oembed and self.oembed.get('thumbnail_url'):
- return self.oembed.get('thumbnail_url')
- def media_object(self):
- #Seems to be the default place to check if the download has happened.
- if not self.oembed:
- self.download()
- if self.oembed and self.oembed.get('type') in ['video', 'rich']:
- for domain in self.domains:
- if self.url.find(domain) > -1:
- return dict(type=domain, oembed=self.oembed)
- return None
- @classmethod
- def media_embed(cls, video_id = None, height = None, width = None, **kw):
- content = None
- oembed = kw.get('oembed')
- # check if oembed is there and has html
- if oembed and oembed.get('html'):
- content = oembed.get('html')
- if content and oembed.get('height') and oembed.get('width'):
- return MediaEmbed(height = oembed['height'],
- width = oembed['width'],
- content = content)
- class EmbedlyOEmbed(OEmbed):
- """
- Embedly oEmbed Provider
- =======================
- documentation: http://api.embed.ly
- """
- domains = ['23hq.com', '5min.com', '99dollarmusicvideos.com',
- 'abcnews.go.com', 'achewood.com', 'allthingsd.com', 'amazon.com',
- 'aniboom.com', 'animoto.com', 'asofterworld.com', 'atom.com',
- 'audioboo.com', 'bambuser.com', 'bandcamp.com', 'barelydigital.com',
- 'barelypolitical.com', 'bigthink.com', 'blip.tv', 'bnter.com',
- 'boston.com', 'brainbird.net', 'bravotv.com', 'break.com',
- 'brizzly.com', 'cbsnews.com', 'channelfrederator.com', 'chart.ly',
- 'cl.ly', 'clikthrough.com', 'clipfish.de', 'clipshack.com', 'cnbc.com',
- 'cnn.com', 'colbertnation.com', 'collegehumor.com', 'color.com',
- 'comedycentral.com', 'compete.com', 'confreaks.net', 'crackle.com',
- 'craigslist.org', 'crocodoc.com', 'crunchbase.com', 'dailybooth.com',
- 'dailymile.com', 'dailymotion.com', 'deviantart.com', 'digg.com',
- 'dipdive.com', 'discovery.com', 'dotsub.com', 'dribbble.com',
- 'edition.cnn.com', 'emberapp.com', 'escapistmagazine.com',
- 'espn.go.com', 'facebook.com', 'fancast.com', 'flickr.com', 'fora.tv',
- 'formspring.me', 'fotopedia.com', 'freemusicarchive.org',
- 'funnyordie.com', 'gametrailers.com', 'gist.github.com',
- 'globalpost.com', 'godtube.com', 'gogoyoko.com', 'google.com',
- 'graphicly.com', 'grindtv.com', 'grooveshark.com', 'guardian.co.uk',
- 'hark.com', 'howcast.com', 'huffduffer.com', 'hulu.com',
- 'hungrynation.tv', 'ifood.tv', 'img.ly', 'imgur.com', 'indenti.ca',
- 'indymogul.com', 'instagr.am', 'issuu.com', 'itunes.apple.com',
- 'justin.tv', 'kickstarter.com', 'kinomap.com', 'kiva.org',
- 'koldcast.tv', 'last.fm', 'lightbox.com', 'liveleak.com',
- 'livestream.com', 'lockerz.com', 'logotv.com', 'lonelyplanet.com',
- 'maps.google.com', 'meadd.com', 'mediamatters.org', 'meetup.com',
- 'metacafe.com', 'metacdn.com', 'mixcloud.com', 'mixergy.com',
- 'mlkshk.com', 'mobypicture.com', 'money.cnn.com', 'movies.yahoo.com',
- 'msnbc.com', 'my.opera.com', 'myloc.me', 'myvideo.de',
- 'nationalgeographic.com', 'nfb.ca', 'npr.org', 'nzonscreen.com',
- 'overstream.net', 'ow.ly', 'pastebin.com', 'pastie.org',
- 'phodroid.com', 'photobucket.com', 'photozou.jp',
- 'picasaweb.google.com', 'picplz.com', 'pikchur.com', 'ping.fm',
- 'polldaddy.com', 'polleverywhere.com', 'posterous.com', 'prezi.com',
- 'qik.com', 'quantcast.com', 'questionablecontent.net', 'qwantz.com',
- 'qwiki.com', 'radionomy.com', 'radioreddit.com', 'rdio.com',
- 'recordsetter.com','redux.com', 'revision3.com', 'revver.com',
- 'saynow.com', 'schooltube.com', 'sciencestage.com', 'scrapblog.com',
- 'screencast.com', 'screenr.com', 'scribd.com', 'sendables.jibjab.com',
- 'share.ovi.com', 'shitmydadsays.com', 'shopstyle.com', 'skitch.com',
- 'slideshare.net', 'smugmug.com', 'snotr.com', 'socialcam.com',
- 'someecards.com', 'soundcloud.com', 'speakerdeck.com', 'spike.com',
- 'statsheet.com', 'status.net', 'storify.com', 'streetfire.net',
- 'studivz.net', 'tangle.com', 'teachertube.com', 'techcrunch.tv',
- 'ted.com', 'thedailyshow.com', 'theonion.com', 'threadbanger.com',
- 'timetoast.com', 'tinypic.com', 'tmiweekly.com', 'traileraddict.com',
- 'trailerspy.com', 'trooptube.tv', 'trutv.com', 'tumblr.com',
- 'twitgoo.com', 'twitlonger.com', 'twitpic.com', 'twitrpix.com',
- 'twitter.com', 'twitvid.com', 'ultrakawaii.com', 'urtak.com',
- 'uservoice.com', 'ustream.com', 'viddler.com', 'video.forbes.com',
- 'video.google.com', 'video.jardenberg.com', 'video.pbs.org',
- 'video.yahoo.com', 'videos.nymag.com', 'vids.myspace.com', 'vimeo.com',
- 'vodcars.com', 'washingtonpost.com', 'whitehouse.gov', 'whosay.com',
- 'wikimedia.org', 'wikipedia.org', 'wistia.com', 'wordpress.tv',
- 'worldstarhiphop.com', 'xiami.com', 'xkcd.com', 'xtranormal.com',
- 'yfrog.com', 'youku.com', 'youtu.be', 'youtube.com', 'zapiks.com',
- 'zero-inch.com']
- url_re = re.compile(
- 'http:\\/\\/.*youtube\\.com\\/watch.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' +
- 'https:\\/\\/.*youtube\\.com\\/watch.*|' +
- 'https:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' +
- 'http:\\/\\/youtu\\.be\\/.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/user\\/.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/.*\\#.*\\/.*|' +
- 'http:\\/\\/m\\.youtube\\.com\\/watch.*|' +
- 'http:\\/\\/m\\.youtube\\.com\\/index.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/profile.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/view_play_list.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/playlist.*|' +
- 'http:\\/\\/.*justin\\.tv\\/.*|' +
- 'http:\\/\\/.*justin\\.tv\\/.*\\/b\\/.*|' +
- 'http:\\/\\/.*justin\\.tv\\/.*\\/w\\/.*|' +
- 'http:\\/\\/www\\.ustream\\.tv\\/recorded\\/.*|' +
- 'http:\\/\\/www\\.ustream\\.tv\\/channel\\/.*|' +
- 'http:\\/\\/www\\.ustream\\.tv\\/.*|' +
- 'http:\\/\\/qik\\.com\\/video\\/.*|' +
- 'http:\\/\\/qik\\.com\\/.*|' +
- 'http:\\/\\/qik\\.ly\\/.*|' +
- 'http:\\/\\/.*revision3\\.com\\/.*|' +
- 'http:\\/\\/.*\\.dailymotion\\.com\\/video\\/.*|' +
- 'http:\\/\\/.*\\.dailymotion\\.com\\/.*\\/video\\/.*|' +
- 'http:\\/\\/collegehumor\\.com\\/video:.*|' +
- 'http:\\/\\/collegehumor\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.collegehumor\\.com\\/video:.*|' +
- 'http:\\/\\/www\\.collegehumor\\.com\\/video\\/.*|' +
- 'http:\\/\\/.*twitvid\\.com\\/.*|' +
- 'http:\\/\\/www\\.break\\.com\\/.*\\/.*|' +
- 'http:\\/\\/vids\\.myspace\\.com\\/index\\.cfm\\?fuseaction=vids\\.individual&videoid.*|' +
- 'http:\\/\\/www\\.myspace\\.com\\/index\\.cfm\\?fuseaction=.*&videoid.*|' +
- 'http:\\/\\/www\\.metacafe\\.com\\/watch\\/.*|' +
- 'http:\\/\\/www\\.metacafe\\.com\\/w\\/.*|' +
- 'http:\\/\\/blip\\.tv\\/.*\\/.*|' +
- 'http:\\/\\/.*\\.blip\\.tv\\/.*\\/.*|' +
- 'http:\\/\\/video\\.google\\.com\\/videoplay\\?.*|' +
- 'http:\\/\\/.*revver\\.com\\/video\\/.*|' +
- 'http:\\/\\/video\\.yahoo\\.com\\/watch\\/.*\\/.*|' +
- 'http:\\/\\/video\\.yahoo\\.com\\/network\\/.*|' +
- 'http:\\/\\/.*viddler\\.com\\/explore\\/.*\\/videos\\/.*|' +
- 'http:\\/\\/liveleak\\.com\\/view\\?.*|' +
- 'http:\\/\\/www\\.liveleak\\.com\\/view\\?.*|' +
- 'http:\\/\\/animoto\\.com\\/play\\/.*|' +
- 'http:\\/\\/dotsub\\.com\\/view\\/.*|' +
- 'http:\\/\\/www\\.overstream\\.net\\/view\\.php\\?oid=.*|' +
- 'http:\\/\\/www\\.livestream\\.com\\/.*|' +
- 'http:\\/\\/www\\.worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' +
- 'http:\\/\\/worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' +
- 'http:\\/\\/teachertube\\.com\\/viewVideo\\.php.*|' +
- 'http:\\/\\/www\\.teachertube\\.com\\/viewVideo\\.php.*|' +
- 'http:\\/\\/www1\\.teachertube\\.com\\/viewVideo\\.php.*|' +
- 'http:\\/\\/www2\\.teachertube\\.com\\/viewVideo\\.php.*|' +
- 'http:\\/\\/bambuser\\.com\\/v\\/.*|' +
- 'http:\\/\\/bambuser\\.com\\/channel\\/.*|' +
- 'http:\\/\\/bambuser\\.com\\/channel\\/.*\\/broadcast\\/.*|' +
- 'http:\\/\\/www\\.schooltube\\.com\\/video\\/.*\\/.*|' +
- 'http:\\/\\/bigthink\\.com\\/ideas\\/.*|' +
- 'http:\\/\\/bigthink\\.com\\/series\\/.*|' +
- 'http:\\/\\/sendables\\.jibjab\\.com\\/view\\/.*|' +
- 'http:\\/\\/sendables\\.jibjab\\.com\\/originals\\/.*|' +
- 'http:\\/\\/www\\.xtranormal\\.com\\/watch\\/.*|' +
- 'http:\\/\\/socialcam\\.com\\/v\\/.*|' +
- 'http:\\/\\/www\\.socialcam\\.com\\/v\\/.*|' +
- 'http:\\/\\/dipdive\\.com\\/media\\/.*|' +
- 'http:\\/\\/dipdive\\.com\\/member\\/.*\\/media\\/.*|' +
- 'http:\\/\\/dipdive\\.com\\/v\\/.*|' +
- 'http:\\/\\/.*\\.dipdive\\.com\\/media\\/.*|' +
- 'http:\\/\\/.*\\.dipdive\\.com\\/v\\/.*|' +
- 'http:\\/\\/v\\.youku\\.com\\/v_show\\/.*\\.html|' +
- 'http:\\/\\/v\\.youku\\.com\\/v_playlist\\/.*\\.html|' +
- 'http:\\/\\/www\\.snotr\\.com\\/video\\/.*|' +
- 'http:\\/\\/snotr\\.com\\/video\\/.*|' +
- 'http:\\/\\/video\\.jardenberg\\.se\\/.*|' +
- 'http:\\/\\/www\\.clipfish\\.de\\/.*\\/.*\\/video\\/.*|' +
- 'http:\\/\\/www\\.myvideo\\.de\\/watch\\/.*|' +
- 'http:\\/\\/www\\.whitehouse\\.gov\\/photos-and-video\\/video\\/.*|' +
- 'http:\\/\\/www\\.whitehouse\\.gov\\/video\\/.*|' +
- 'http:\\/\\/wh\\.gov\\/photos-and-video\\/video\\/.*|' +
- 'http:\\/\\/wh\\.gov\\/video\\/.*|' +
- 'http:\\/\\/www\\.hulu\\.com\\/watch.*|' +
- 'http:\\/\\/www\\.hulu\\.com\\/w\\/.*|' +
- 'http:\\/\\/hulu\\.com\\/watch.*|' +
- 'http:\\/\\/hulu\\.com\\/w\\/.*|' +
- 'http:\\/\\/.*crackle\\.com\\/c\\/.*|' +
- 'http:\\/\\/www\\.fancast\\.com\\/.*\\/videos|' +
- 'http:\\/\\/www\\.funnyordie\\.com\\/videos\\/.*|' +
- 'http:\\/\\/www\\.funnyordie\\.com\\/m\\/.*|' +
- 'http:\\/\\/funnyordie\\.com\\/videos\\/.*|' +
- 'http:\\/\\/funnyordie\\.com\\/m\\/.*|' +
- 'http:\\/\\/www\\.vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' +
- 'http:\\/\\/www\\.vimeo\\.com\\/.*|' +
- 'http:\\/\\/vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' +
- 'http:\\/\\/vimeo\\.com\\/.*|' +
- 'http:\\/\\/vimeo\\.com\\/m\\/\\#\\/.*|' +
- 'http:\\/\\/www\\.ted\\.com\\/talks\\/.*\\.html.*|' +
- 'http:\\/\\/www\\.ted\\.com\\/talks\\/lang\\/.*\\/.*\\.html.*|' +
- 'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/.*\\.html.*|' +
- 'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/lang\\/.*\\/.*\\.html.*|' +
- 'http:\\/\\/.*nfb\\.ca\\/film\\/.*|' +
- 'http:\\/\\/www\\.thedailyshow\\.com\\/watch\\/.*|' +
- 'http:\\/\\/www\\.thedailyshow\\.com\\/full-episodes\\/.*|' +
- 'http:\\/\\/www\\.thedailyshow\\.com\\/collection\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video\\/.*|' +
- 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/trailer|' +
- 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video|' +
- 'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-collections\\/.*|' +
- 'http:\\/\\/www\\.colbertnation\\.com\\/full-episodes\\/.*|' +
- 'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-videos\\/.*|' +
- 'http:\\/\\/www\\.comedycentral\\.com\\/videos\\/index\\.jhtml\\?.*|' +
- 'http:\\/\\/www\\.theonion\\.com\\/video\\/.*|' +
- 'http:\\/\\/theonion\\.com\\/video\\/.*|' +
- 'http:\\/\\/wordpress\\.tv\\/.*\\/.*\\/.*\\/.*\\/|' +
- 'http:\\/\\/www\\.traileraddict\\.com\\/trailer\\/.*|' +
- 'http:\\/\\/www\\.traileraddict\\.com\\/clip\\/.*|' +
- 'http:\\/\\/www\\.traileraddict\\.com\\/poster\\/.*|' +
- 'http:\\/\\/www\\.escapistmagazine\\.com\\/videos\\/.*|' +
- 'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*\\/.*|' +
- 'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*|' +
- 'http:\\/\\/www\\.trailerspy\\.com\\/view_video\\.php.*|' +
- 'http:\\/\\/www\\.atom\\.com\\/.*\\/.*\\/|' +
- 'http:\\/\\/fora\\.tv\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.spike\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.gametrailers\\.com\\/video\\/.*|' +
- 'http:\\/\\/gametrailers\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.koldcast\\.tv\\/video\\/.*|' +
- 'http:\\/\\/www\\.koldcast\\.tv\\/\\#video:.*|' +
- 'http:\\/\\/techcrunch\\.tv\\/watch.*|' +
- 'http:\\/\\/techcrunch\\.tv\\/.*\\/watch.*|' +
- 'http:\\/\\/mixergy\\.com\\/.*|' +
- 'http:\\/\\/video\\.pbs\\.org\\/video\\/.*|' +
- 'http:\\/\\/www\\.zapiks\\.com\\/.*|' +
- 'http:\\/\\/tv\\.digg\\.com\\/diggnation\\/.*|' +
- 'http:\\/\\/tv\\.digg\\.com\\/diggreel\\/.*|' +
- 'http:\\/\\/tv\\.digg\\.com\\/diggdialogg\\/.*|' +
- 'http:\\/\\/www\\.trutv\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.nzonscreen\\.com\\/title\\/.*|' +
- 'http:\\/\\/nzonscreen\\.com\\/title\\/.*|' +
- 'http:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' +
- 'https:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' +
- 'http:\\/\\/hungrynation\\.tv\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.hungrynation\\.tv\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/hungrynation\\.tv\\/episode\\/.*|' +
- 'http:\\/\\/www\\.hungrynation\\.tv\\/episode\\/.*|' +
- 'http:\\/\\/indymogul\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.indymogul\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/indymogul\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.indymogul\\.com\\/episode\\/.*|' +
- 'http:\\/\\/channelfrederator\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.channelfrederator\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/channelfrederator\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.channelfrederator\\.com\\/episode\\/.*|' +
- 'http:\\/\\/tmiweekly\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.tmiweekly\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/tmiweekly\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.tmiweekly\\.com\\/episode\\/.*|' +
- 'http:\\/\\/99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/99dollarmusicvideos\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/episode\\/.*|' +
- 'http:\\/\\/ultrakawaii\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.ultrakawaii\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/ultrakawaii\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.ultrakawaii\\.com\\/episode\\/.*|' +
- 'http:\\/\\/barelypolitical\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.barelypolitical\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/barelypolitical\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.barelypolitical\\.com\\/episode\\/.*|' +
- 'http:\\/\\/barelydigital\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.barelydigital\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/barelydigital\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.barelydigital\\.com\\/episode\\/.*|' +
- 'http:\\/\\/threadbanger\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.threadbanger\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/threadbanger\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.threadbanger\\.com\\/episode\\/.*|' +
- 'http:\\/\\/vodcars\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.vodcars\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/vodcars\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.vodcars\\.com\\/episode\\/.*|' +
- 'http:\\/\\/confreaks\\.net\\/videos\\/.*|' +
- 'http:\\/\\/www\\.confreaks\\.net\\/videos\\/.*|' +
- 'http:\\/\\/video\\.allthingsd\\.com\\/video\\/.*|' +
- 'http:\\/\\/videos\\.nymag\\.com\\/.*|' +
- 'http:\\/\\/aniboom\\.com\\/animation-video\\/.*|' +
- 'http:\\/\\/www\\.aniboom\\.com\\/animation-video\\/.*|' +
- 'http:\\/\\/clipshack\\.com\\/Clip\\.aspx\\?.*|' +
- 'http:\\/\\/www\\.clipshack\\.com\\/Clip\\.aspx\\?.*|' +
- 'http:\\/\\/grindtv\\.com\\/.*\\/video\\/.*|' +
- 'http:\\/\\/www\\.grindtv\\.com\\/.*\\/video\\/.*|' +
- 'http:\\/\\/ifood\\.tv\\/recipe\\/.*|' +
- 'http:\\/\\/ifood\\.tv\\/video\\/.*|' +
- 'http:\\/\\/ifood\\.tv\\/channel\\/user\\/.*|' +
- 'http:\\/\\/www\\.ifood\\.tv\\/recipe\\/.*|' +
- 'http:\\/\\/www\\.ifood\\.tv\\/video\\/.*|' +
- 'http:\\/\\/www\\.ifood\\.tv\\/channel\\/user\\/.*|' +
- 'http:\\/\\/logotv\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.logotv\\.com\\/video\\/.*|' +
- 'http:\\/\\/lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' +
- 'http:\\/\\/www\\.lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' +
- 'http:\\/\\/streetfire\\.net\\/video\\/.*\\.htm.*|' +
- 'http:\\/\\/www\\.streetfire\\.net\\/video\\/.*\\.htm.*|' +
- 'http:\\/\\/trooptube\\.tv\\/videos\\/.*|' +
- 'http:\\/\\/www\\.trooptube\\.tv\\/videos\\/.*|' +
- 'http:\\/\\/sciencestage\\.com\\/v\\/.*\\.html|' +
- 'http:\\/\\/sciencestage\\.com\\/a\\/.*\\.html|' +
- 'http:\\/\\/www\\.sciencestage\\.com\\/v\\/.*\\.html|' +
- 'http:\\/\\/www\\.sciencestage\\.com\\/a\\/.*\\.html|' +
- 'http:\\/\\/www\\.godtube\\.com\\/featured\\/video\\/.*|' +
- 'http:\\/\\/godtube\\.com\\/featured\\/video\\/.*|' +
- 'http:\\/\\/www\\.godtube\\.com\\/watch\\/.*|' +
- 'http:\\/\\/godtube\\.com\\/watch\\/.*|' +
- 'http:\\/\\/www\\.tangle\\.com\\/view_video.*|' +
- 'http:\\/\\/mediamatters\\.org\\/mmtv\\/.*|' +
- 'http:\\/\\/www\\.clikthrough\\.com\\/theater\\/video\\/.*|' +
- 'http:\\/\\/gist\\.github\\.com\\/.*|' +
- 'http:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' +
- 'http:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'http:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' +
- 'http:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' +
- 'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'https:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' +
- 'https:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'https:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' +
- 'https:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' +
- 'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'http:\\/\\/www\\.crunchbase\\.com\\/.*\\/.*|' +
- 'http:\\/\\/crunchbase\\.com\\/.*\\/.*|' +
- 'http:\\/\\/www\\.slideshare\\.net\\/.*\\/.*|' +
- 'http:\\/\\/www\\.slideshare\\.net\\/mobile\\/.*\\/.*|' +
- 'http:\\/\\/slidesha\\.re\\/.*|' +
- 'http:\\/\\/scribd\\.com\\/doc\\/.*|' +
- 'http:\\/\\/www\\.scribd\\.com\\/doc\\/.*|' +
- 'http:\\/\\/scribd\\.com\\/mobile\\/documents\\/.*|' +
- 'http:\\/\\/www\\.scribd\\.com\\/mobile\\/documents\\/.*|' +
- 'http:\\/\\/screenr\\.com\\/.*|' +
- 'http:\\/\\/polldaddy\\.com\\/community\\/poll\\/.*|' +
- 'http:\\/\\/polldaddy\\.com\\/poll\\/.*|' +
- 'http:\\/\\/answers\\.polldaddy\\.com\\/poll\\/.*|' +
- 'http:\\/\\/www\\.5min\\.com\\/Video\\/.*|' +
- 'http:\\/\\/www\\.howcast\\.com\\/videos\\/.*|' +
- 'http:\\/\\/www\\.screencast\\.com\\/.*\\/media\\/.*|' +
- 'http:\\/\\/screencast\\.com\\/.*\\/media\\/.*|' +
- 'http:\\/\\/www\\.screencast\\.com\\/t\\/.*|' +
- 'http:\\/\\/screencast\\.com\\/t\\/.*|' +
- 'http:\\/\\/issuu\\.com\\/.*\\/docs\\/.*|' +
- 'http:\\/\\/www\\.kickstarter\\.com\\/projects\\/.*\\/.*|' +
- 'http:\\/\\/www\\.scrapblog\\.com\\/viewer\\/viewer\\.aspx.*|' +
- 'http:\\/\\/ping\\.fm\\/p\\/.*|' +
- 'http:\\/\\/chart\\.ly\\/symbols\\/.*|' +
- 'http:\\/\\/chart\\.ly\\/.*|' +
- 'http:\\/\\/maps\\.google\\.com\\/maps\\?.*|' +
- 'http:\\/\\/maps\\.google\\.com\\/\\?.*|' +
- 'http:\\/\\/maps\\.google\\.com\\/maps\\/ms\\?.*|' +
- 'http:\\/\\/.*\\.craigslist\\.org\\/.*\\/.*|' +
- 'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/show\\.dml\\?id=.*|' +
- 'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/showpic\\.dml\\?album=.*&picture=.*|' +
- 'http:\\/\\/tumblr\\.com\\/.*|' +
- 'http:\\/\\/.*\\.tumblr\\.com\\/post\\/.*|' +
- 'http:\\/\\/www\\.polleverywhere\\.com\\/polls\\/.*|' +
- 'http:\\/\\/www\\.polleverywhere\\.com\\/multiple_choice_polls\\/.*|' +
- 'http:\\/\\/www\\.polleverywhere\\.com\\/free_text_polls\\/.*|' +
- 'http:\\/\\/www\\.quantcast\\.com\\/wd:.*|' +
- 'http:\\/\\/www\\.quantcast\\.com\\/.*|' +
- 'http:\\/\\/siteanalytics\\.compete\\.com\\/.*|' +
- 'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/e\\/.*|' +
- 'http:\\/\\/statsheet\\.com\\/.*\\/teams\\/.*\\/.*|' +
- 'http:\\/\\/statsheet\\.com\\/tools\\/chartlets\\?chart=.*|' +
- 'http:\\/\\/.*\\.status\\.net\\/notice\\/.*|' +
- 'http:\\/\\/identi\\.ca\\/notice\\/.*|' +
- 'http:\\/\\/brainbird\\.net\\/notice\\/.*|' +
- 'http:\\/\\/shitmydadsays\\.com\\/notice\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/Profile\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/l\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/Groups\\/Overview\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Info\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Install\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/Profile\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/l\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/Groups\\/Overview\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Info\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Install\\/.*|…
Large files files are truncated, but you can click here to view the full file