
/plugin.video.alqaheraalyoum/resources/lib/qaheraalyoum/scraper.py

https://github.com/araber14/repository.arabic.xbmc-addons
Python | 54 lines
Possible License(s): GPL-3.0, AGPL-1.0
import re
from urllib2 import urlopen
from urlparse import urljoin
from utils import get_redirect_flv_stream_url
from BeautifulSoup import BeautifulSoup

BASE_URL = 'http://www.alqaheraalyoum.net/videos/newvideos.php'


def _url(path=''):
    """Returns a full URL for the given path."""
    return urljoin(BASE_URL, path)


def get(url):
    """Performs a GET request for the given URL and returns the response body."""
    conn = urlopen(url)
    resp = conn.read()
    conn.close()
    return resp


def _html(url):
    """Downloads the resource at the given URL and parses it with BeautifulSoup."""
    return BeautifulSoup(get(url), convertEntities=BeautifulSoup.HTML_ENTITIES)


def get_stream_url(clip_url):
    # Rewriting the clip URL (playvideo.php -> videos.php) usually yields the
    # streaming URL directly. Try that first and return the result if it resolves.
    streamUrl = re.sub('playvideo.php', 'videos.php', clip_url)
    flvUrl = get_redirect_flv_stream_url(streamUrl)
    if flvUrl != '':
        return flvUrl
    # Otherwise fall back to fetching the clip's page and extracting the
    # stream link from the embedded player configuration.
    html = get(clip_url)
    matchObj = re.search(r"file: '(.*)'", html, re.M | re.I)
    return matchObj.group(1)


def get_clips():
    """Returns a list of clips from the site. Each clip is a dict with
    'name', 'url', 'thumbnail', 'date' and 'addedWhen' keys."""
    url = _url()
    html = _html(url)
    clips = html.find('div', {'id': 'newvideos_results'}).findAll('tr', {'class': None})
    return [_get_clip(clip) for clip in clips]


def _get_clip(el):
    cells = el.findAll('td')
    return {
        'thumbnail': el.find('img')['src'],
        'url': el.find('a')['href'],
        'name': cells[1].contents[0],
        'addedWhen': cells[3].contents[0],
        'date': cells[2].find('a').contents[0],
    }
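
The helper get_redirect_flv_stream_url is imported from the add-on's utils module, which is not part of this file. A minimal sketch of what it might do, assuming it simply follows the HTTP redirect for the rewritten URL and returns the final location when it points at an FLV stream (empty string otherwise, so the caller can fall back to scraping the clip page):

# Hypothetical sketch only; the real utils.get_redirect_flv_stream_url is not shown here.
from urllib2 import urlopen, URLError

def get_redirect_flv_stream_url(url):
    try:
        conn = urlopen(url)        # urllib2 follows HTTP redirects on its own
        final_url = conn.geturl()  # URL after any redirects
        conn.close()
    except URLError:
        return ''
    # Only accept the result when the redirect landed on an .flv resource
    if final_url.lower().endswith('.flv'):
        return final_url
    return ''

For reference, a rough usage example of the scraper from the add-on side could look like the following (the import path is assumed from the file layout above):

# Illustrative only; the actual plugin entry point is not shown in this file.
from resources.lib.qaheraalyoum import scraper

for clip in scraper.get_clips():
    print clip['name'], scraper.get_stream_url(clip['url'])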