PageRenderTime 32ms CodeModel.GetById 1ms RepoModel.GetById 0ms app.codeStats 0ms

/plugin.video.vodie/resources/lib/TV3Scrapper.py

http://xbmc-vodie.googlecode.com/
Python | 253 lines | 200 code | 24 blank | 29 comment | 23 complexity | 207eeb0334c497285f0911e04a105468 MD5 | raw file
Possible License(s): AGPL-1.0
  1. #!/usr/bin/python
  2. """
  3. VODie
  4. kitesurfing@kitesurfing.ie
  5. """
  6. import re
  7. import sys
  8. from BeautifulSoup import SoupStrainer, MinimalSoup as BeautifulSoup, BeautifulStoneSoup
  9. import urllib, urllib2
  10. from TVSeriesUtil import Util
  11. import MenuConstants
  12. from datetime import date
  13. import simplejson as S
  14. # Url Constants
  15. KNOWN_TV3_SHOWS_URL = 'http://xbmc-vodie.googlecode.com/svn/trunk/plugin.video.vodie/xml/tv3shows.json'
  16. TV3_URL = 'http://www.tv3.ie/'
  17. #MAINURL = TV3_URL + 'includes/ajax/video_all_shows.php'
  18. MAINURL = TV3_URL + 'index.php'
  19. #EPISODE_URL = TV3_URL + 'videos.php?locID=%s'
  20. EPISODE_URL = TV3_URL + 'shows.php?request=%s'
  21. # Channel Constants
  22. CHANNEL = 'TV3'
  23. TV3LOGO = 'http://www.tv3.ie/graphics/global/image_logo_tv3_new.png'
  24. class TV3:
  25. def __init__(self):
  26. page = urllib2.urlopen(KNOWN_TV3_SHOWS_URL)
  27. #page = open('../../xml/tv3shows.json', 'r')
  28. self.KNOWN_TV3_SHOWS = S.load(page)
  29. print len(self.KNOWN_TV3_SHOWS)
  30. def getChannelDetail(self):
  31. return {'Channel' : CHANNEL,
  32. 'Thumb' : TV3LOGO,
  33. 'Title' : 'TV3',
  34. 'mode' : MenuConstants.MODE_MAINMENU,
  35. 'Plot' : 'TV3'
  36. }
  37. def getStringFor(self, parent, tagName, attrName = None, default = 'None'):
  38. if parent.find(tagName):
  39. if attrName is None:
  40. return str(parent.find(tagName).string.strip())
  41. else:
  42. return str(parent.find(tagName)[attrName])
  43. else:
  44. print "Error: Cannot find tagName: %s in %s"%(tagName, entry)
  45. return default
  46. def getVideoDetails(self, url):
  47. # yield {'Channel' : CHANNEL,
  48. # 'Title' : CHANNEL,
  49. # 'Director' : CHANNEL,
  50. # 'Genre' : CHANNEL,
  51. # 'Plot' : CHANNEL,
  52. # 'PlotOutline' : CHANNEL,
  53. # 'id' : url,
  54. # 'url' : url
  55. # }
  56. #
  57. # return
  58. # Load and read the URL
  59. f = urllib2.urlopen(url)
  60. soup = BeautifulStoneSoup(f)
  61. f.close()
  62. # Grab the data we need
  63. metabase = self.getStringFor(soup, 'meta', 'base')
  64. videosrc = self.getStringFor(soup, 'video', 'src').replace('&','&')
  65. yield {'Channel' : CHANNEL,
  66. 'Title' : CHANNEL,
  67. 'Director' : CHANNEL,
  68. 'Genre' : CHANNEL,
  69. 'Plot' : CHANNEL,
  70. 'PlotOutline' : CHANNEL,
  71. 'id' : url,
  72. 'url' : '%s playpath=%s' % (metabase, videosrc)
  73. }
  74. def getMainMenu(self):
  75. # Load and read the URL
  76. f = urllib2.urlopen(MAINURL)
  77. text = f.read()
  78. f.close()
  79. REGEXP = '<a href="(.*?)" class="dropDown" title="(.*?)">(.*?)</a>'
  80. for mymatch in re.findall(REGEXP, text):
  81. title = str(mymatch[1])
  82. pic = TV3LOGO
  83. try:
  84. fanart = self.KNOWN_TV3_SHOWS[title]['Fanart_Image']
  85. yield {'Channel' : CHANNEL,
  86. 'Thumb' : fanart,
  87. 'url' : mymatch[0],
  88. 'Title' : title,
  89. 'mode' : MenuConstants.MODE_GETEPISODES,
  90. 'Fanart_Image' : fanart}
  91. except:
  92. yield {'Channel' : CHANNEL,
  93. 'Thumb' : pic,
  94. 'url' : mymatch[0],
  95. 'Title' : title,
  96. 'mode' : MenuConstants.MODE_GETEPISODES}
  97. def getEpisodes(self, showID):
  98. # Load and read the URL
  99. f = urllib2.urlopen(EPISODE_URL % (showID))
  100. text = f.read()
  101. f.close()
  102. TITLEREGEXP = '<title>(.*?) - TV3</title>'
  103. for mymatch in re.findall(TITLEREGEXP, text, re.MULTILINE):
  104. the_title = mymatch.strip()
  105. REGEXP = '^<a class="whiteLink" href="(videos.php\?video=.*?&date=(\d\d\d\d-\d\d-\d\d)&date_mode=&page=1&show_cal=\d*&newspanel=&showspanel=&web_only=&full_episodes=)">\s+<img src=(.*?) height="84" alt="(.*?)" title="(.*?)"'
  106. REGEXP = '<div id="panel_video_menu_entry"onclick="window.open\(\'(.*?)\',\'_self\'\)" onMouseOver="style.cursor=\'pointer\'">\s+<p class="video_menu_entry"><img class="float_left" src="(.*?)" width="116" height="64" alt="" border="0">\s+<strong>(.*?)</strong>\s+<br />(.*?)</p>'
  107. for mymatch in re.findall(REGEXP, text, re.MULTILINE):
  108. # Default values
  109. description = 'None'
  110. link = 'None'
  111. # ListItem properties
  112. img = mymatch[1]
  113. datestr = mymatch[2]
  114. description = mymatch[3].strip()
  115. # Look for the higher resolution image
  116. img = img.replace('thumbnail.jpg','preview_vp.jpg')
  117. # Format the date
  118. date_array = datestr.split()
  119. if len(date_array) == 4:
  120. month = date_array[2][:-1].lower()
  121. if month.find('jan') > -1:
  122. month = 1
  123. elif month.find('feb') > -1:
  124. month = 2
  125. elif month.find('mar') > -1:
  126. month = 3
  127. elif month.find('apr') > -1:
  128. month = 4
  129. elif month.find('may') > -1:
  130. month = 5
  131. elif month.find('jun') > -1:
  132. month = 6
  133. elif month.find('jul') > -1:
  134. month = 7
  135. elif month.find('aug') > -1:
  136. month = 8
  137. elif month.find('sep') > -1:
  138. month = 9
  139. elif month.find('oct') > -1:
  140. month = 10
  141. elif month.find('nov') > -1:
  142. month = 11
  143. elif month.find('dec') > -1:
  144. month = 12
  145. else:
  146. month = 0
  147. if month > 0:
  148. datestr = "%02d-%02d-%s" % ( int(date_array[1].replace('th','').replace('st','').replace('nd','')), month, '2011')
  149. title = the_title
  150. else:
  151. title = the_title + ' - ' + datestr
  152. datestr = date.today().strftime("%d-%m-%Y")
  153. else:
  154. title = the_title + ' - ' + datestr
  155. datestr = date.today().strftime("%d-%m-%Y")
  156. year = 2011
  157. # Load the URL for this episode
  158. f2 = urllib2.urlopen(TV3_URL + mymatch[0])
  159. text2 = f2.read()
  160. # Get link for the mp4
  161. mp4re = 'url: \"(.*?mp4)\"'
  162. for mymatch2 in re.findall(mp4re, text2, re.MULTILINE):
  163. link = mymatch2
  164. yield {'Channel' : CHANNEL,
  165. 'Thumb' : img,
  166. 'Fanart_Image': img,
  167. 'url' : link,
  168. 'Title' : title,
  169. 'mode' : MenuConstants.MODE_PLAYVIDEO,
  170. 'Plot' : description,
  171. 'plotoutline' : title,
  172. 'Date' : datestr,
  173. 'Year' : year,
  174. 'Studio' : CHANNEL
  175. }
  176. def convertHTML(self, text):
  177. if not text == '':
  178. return BeautifulStoneSoup(text,
  179. convertEntities=BeautifulStoneSoup.HTML_ENTITIES).contents[0].encode( "utf-8" )
  180. else:
  181. return 'None'
  182. def generateShowsAndSave(self):
  183. f = open('../../xml/tv3shows.json', 'w')
  184. for show in self.getMainMenu():
  185. # Load and read the URL
  186. f2 = urllib2.urlopen(EPISODE_URL % (show['url']))
  187. text = f2.read()
  188. f2.close()
  189. key = show['Title']
  190. try:
  191. showkeys = self.KNOWN_TV3_SHOWS[key].keys()
  192. print 'Updating ' + show['Title']
  193. self.KNOWN_TV3_SHOWS[key]['']
  194. except:
  195. print 'Adding ' + show['Title']
  196. self.KNOWN_TV3_SHOWS[key] = {}
  197. self.KNOWN_TV3_SHOWS[key]['Title'] = show['Title']
  198. REGEXP = '<div id="content" style="background-image: url\((.*?)\)">'
  199. for mymatch in re.findall(REGEXP, text, re.MULTILINE):
  200. fanart = mymatch
  201. print fanart
  202. self.KNOWN_TV3_SHOWS[key]['Fanart_Image'] = fanart
  203. S.dump(self.KNOWN_TV3_SHOWS, f, indent=4)
  204. f.close()
  205. if __name__ == '__main__':
  206. # TV3().generateShowsAndSave()
  207. # exit(1)
  208. items = TV3().getMainMenu()
  209. for item in items:
  210. print item
  211. episodes = TV3().getEpisodes(item['url'])
  212. for episode in episodes:
  213. print episode
  214. for detail in TV3().getVideoDetails(episode['url']):
  215. print detail