PageRenderTime 51ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/plugins/video/Easynews/IMDbClient.py

http://xbmc-addons.googlecode.com/
Python | 377 lines | 352 code | 11 blank | 14 comment | 2 complexity | 1179c27260a910edacaebb8d0f5087cd MD5 | raw file
Possible License(s): AGPL-1.0, GPL-3.0, GPL-2.0, BSD-2-Clause
  1. """
  2. IMDb api client module
  3. Nuka1195
  4. """
  5. import sys
  6. import os
  7. if ( __name__ != "__main__" ):
  8. import xbmc
  9. import urllib
  10. import urllib2
  11. import re
  12. class _Info:
  13. def __init__( self, *args, **kwargs ):
  14. self.__dict__.update( kwargs )
  15. class _IMDbParser:
  16. """
  17. Parser Class: parses an html document for movie info
  18. """
  19. # HTML cleaning regex pattern
  20. pattern_clean = re.compile( '<.+?>' )
  21. # IMDb regex patterns
  22. pattern_title = re.compile( '<meta name="title" content="([^"]*) \(([0-9]*)' )
  23. pattern_year = re.compile( 'a href="/Sections/Years/([0-9]+)/">' )
  24. pattern_top250 = re.compile( 'Top 250: #([0-9]*)</a>' )
  25. pattern_user_rating = re.compile( '<b>([0-9.]+)/10</b>[^<]*<a href="ratings" class="tn15more">([0-9,]+) votes</a>' )
  26. pattern_director = re.compile( '<h5>Directors?[^:]*:</h5>[\n]*(.*)' )
  27. pattern_director2 = re.compile( '<a href="/name/[^>]*>([^<]*)' )
  28. pattern_writer = re.compile( '<h5>Writers?[^:]*:</h5>[\n]*(.*)' )
  29. pattern_writer2 = re.compile( '<a href="/name/[^>]*>(.*?)<br/>' )
  30. pattern_release_date = re.compile( '<h5>Release Date:</h5>[^0-9]*([0-9]* [A-Za-z]* [0-9]*)' )
  31. pattern_genres = re.compile( '"/Sections/Genres/[^/]*/">([^<]*)</a>' )
  32. pattern_tagline = re.compile( '<h5>Tagline:</h5>([^<]*)' )
  33. pattern_plot = re.compile( '<h5>Plot(| Outline| Summary):</h5>([^<]*)' )
  34. pattern_awards = re.compile( '<h5>Awards:</h5>([^<]*)' )
  35. pattern_user_comments = re.compile( '<h5>User Comments:</h5>([^<]*)' )
  36. pattern_mpaa = re.compile( 'MPAA</a>:</h5>([^<]*)' )
  37. pattern_duration = re.compile( '<h5>Runtime:</h5>[^0-9]*([^<]*)' )
  38. pattern_countries = re.compile( '<h5>Countr[ies|y]:</h5>[^>]*>([^<]*)' )
  39. pattern_language = re.compile( '<h5>Language:</h5>[^>]*>([^<]*)' )
  40. pattern_aspect_ratio = re.compile( '<h5>Aspect Ratio:</h5>([^<]*)' )
  41. pattern_sound_mix = re.compile( 'sound-mix=[^>]+>.*\s([^<]+)' )
  42. pattern_certification = re.compile( '<a href="/List\?certificates=[^"]*">([^<]*)</a>[^<]*(<i>([^<]*)</i>)?' )
  43. pattern_locations = re.compile( '<h5>Filming Locations:</h5>.*\s.*\s.*\s([^<]+)' )
  44. pattern_movie_meter = re.compile( '<h5>MOVIEmeter:.*?\n.*?\n[^"]*"([^"]*)">[^>]*>([^<]*)</span>(.*)' )
  45. pattern_studio = re.compile( '"/company/[^/]*/">([^<]*)</a>' )
  46. pattern_trivia = re.compile( '<h5>Trivia:</h5>[\n]*(.*)' )
  47. pattern_goofs = re.compile( '<h5>Goofs:</h5>[\n]*(.*)' )
  48. pattern_quotes = re.compile( '<h5>Quotes:.*?</div>', re.DOTALL )
  49. pattern_quotes2 = re.compile( '/name/[^>]*>([^<]+)?.*\s(\[[^\]]*\])*([^<]*)' )
  50. pattern_quotes3 = re.compile( '<b>([^<]+)</b>(:).*\s(.*)' )
  51. pattern_poster = re.compile( '<a name="poster".*?src="([^"]*)' )
  52. pattern_cast = re.compile( '<table class="cast">.*' )
  53. pattern_cast2 = re.compile( 'href="/name/nm[0-9]*/">([^<]*).*?<td class="char">(.*?)</td>' )
  54. #pattern_trailer = re.compile( '<a href="([^"]*)" onClick=[^>]*link=/video/|link=/title/[^\']*screenplay' )
  55. #pattern_trailer2 = re.compile( 'so.addVariable\("file", "([^"]*)' )
  56. pattern_trailer = re.compile( '<a style=".*?href="([^"]*)".*?>view trailer</a>' )
  57. pattern_trailer2 = re.compile( 'screenplay-([^-]*)-(.*)' )
  58. def __init__( self ):
  59. self.info = _Info()
  60. def parse( self, htmlSource ):
  61. # title
  62. self.info.title = ""
  63. matches = self.pattern_title.findall( htmlSource )
  64. if ( matches ):
  65. self.info.title = self._clean_text( matches[ 0 ][ 0 ] )
  66. # year
  67. self.info.year = 0
  68. matches = self.pattern_year.findall( htmlSource )
  69. if ( matches ):
  70. self.info.year = int( self._clean_text( matches[ 0 ] ) )
  71. # top 250
  72. self.info.top_250 = ""
  73. matches = self.pattern_top250.findall( htmlSource )
  74. if ( matches ):
  75. self.info.top_250 = self._clean_text( matches[ 0 ] )
  76. # user rating
  77. self.info.user_rating = 0.0
  78. self.info.user_votes = ""
  79. matches = self.pattern_user_rating.findall( htmlSource )
  80. if ( matches ):
  81. self.info.user_rating = float( matches[ 0 ][ 0 ] )
  82. self.info.user_votes = matches[ 0 ][ 1 ]
  83. # director
  84. self.info.director = ""
  85. # the first match gets all html code
  86. matches = self.pattern_director.search( htmlSource )
  87. if ( matches ):
  88. # we need to assign element 0 to a variable
  89. text = matches.groups()[ 0 ]
  90. # now findall directors
  91. matches = self.pattern_director2.findall( text )
  92. if ( matches ):
  93. self.info.director = self._clean_text( ' / '.join( matches ) )
  94. # writer
  95. self.info.writer = ""
  96. # the first match gets all html code
  97. matches = self.pattern_writer.search( htmlSource )
  98. if ( matches ):
  99. # we need to assign element 0 to a variable
  100. text = matches.groups()[ 0 ]
  101. # now findall writers
  102. matches = self.pattern_writer2.findall( text )
  103. if ( matches ):
  104. self.info.writer = self._clean_text( ' '.join( matches ) )
  105. # release date
  106. self.info.release_date = ""
  107. matches = self.pattern_release_date.findall( htmlSource )
  108. if ( matches ):
  109. self.info.release_date = self._clean_text( matches[ 0 ] )
  110. # genres
  111. self.info.genre = ""
  112. matches = self.pattern_genres.findall( htmlSource )
  113. if ( matches ):
  114. self.info.genre = self._clean_text( ' / '.join( matches ) )
  115. # tagline
  116. self.info.tagline = ""
  117. matches = self.pattern_tagline.findall( htmlSource )
  118. if ( matches ):
  119. self.info.tagline = self._clean_text( matches[ 0 ] )
  120. # plot
  121. self.info.plot = ""
  122. matches = self.pattern_plot.findall( htmlSource )
  123. if ( matches ):
  124. self.info.plot = self._clean_text( matches[ 0 ][ 1 ] )
  125. # awards
  126. self.info.awards = ""
  127. matches = self.pattern_awards.findall( htmlSource )
  128. if ( matches ):
  129. self.info.awards = self._clean_text( matches[ 0 ].replace( "\n", " " ) )
  130. # user comments
  131. self.info.user_comments = ""
  132. matches = self.pattern_user_comments.findall( htmlSource )
  133. if ( matches ):
  134. self.info.user_comments = self._clean_text( matches[ 0 ] )
  135. # mpaa
  136. self.info.mpaa = ""
  137. matches = self.pattern_mpaa.findall( htmlSource )
  138. if ( matches ):
  139. self.info.mpaa = self._clean_text( matches[ 0 ] )
  140. # duration
  141. self.info.duration = ""
  142. matches = self.pattern_duration.findall( htmlSource )
  143. if ( matches ):
  144. self.info.duration = self._clean_text( matches[ 0 ] )
  145. # countries
  146. self.info.countries = ""
  147. matches = self.pattern_countries.findall( htmlSource )
  148. if ( matches ):
  149. self.info.countries = self._clean_text( matches[ 0 ] )
  150. # language
  151. self.info.language = ""
  152. matches = self.pattern_language.findall( htmlSource )
  153. if ( matches ):
  154. self.info.language = self._clean_text( matches[ 0 ] )
  155. # aspect ratio
  156. self.info.aspect_ratio = ""
  157. matches = self.pattern_aspect_ratio.findall( htmlSource )
  158. if ( matches ):
  159. self.info.aspect_ratio = self._clean_text( matches[ 0 ] )
  160. # sound mix
  161. self.info.sound_mix = ""
  162. # the first match gets all html code
  163. matches = self.pattern_sound_mix.findall( htmlSource )
  164. if ( matches ):
  165. self.info.sound_mix = self._clean_text( ' / '.join( matches ) )
  166. # certification
  167. self.info.certification = ""
  168. matches = self.pattern_certification.findall( htmlSource )
  169. if ( matches ):
  170. for match in matches:
  171. self.info.certification += "%s%s%s%s" % ( ( "", " / ", )[ self.info.certification != "" ], match[ 0 ], ( "", " ", )[ match[ 2 ] != "" ], match[ 2 ], )
  172. self.info.certification = self.info.certification.replace( "\n", "" )
  173. # filming locations
  174. self.info.locations = ""
  175. matches = self.pattern_locations.findall( htmlSource )
  176. if ( matches ):
  177. self.info.locations = self._clean_text( matches[ 0 ] )
  178. # movie meter
  179. self.info.movie_meter = ""
  180. matches = self.pattern_movie_meter.findall( htmlSource )
  181. if ( matches ):
  182. self.info.movie_meter = self._clean_text( ''.join( matches[ 0 ] ) )
  183. # studio
  184. self.info.studio = ""
  185. matches = self.pattern_studio.findall( htmlSource )
  186. if ( matches ):
  187. self.info.studio = self._clean_text( matches[ 0 ] )
  188. # trivia
  189. self.info.trivia = ""
  190. matches = self.pattern_trivia.findall( htmlSource )
  191. if ( matches ):
  192. self.info.trivia = self._clean_text( matches[ 0 ] )
  193. # goofs
  194. self.info.goofs = ""
  195. matches = self.pattern_goofs.findall( htmlSource )
  196. if ( matches ):
  197. self.info.goofs = self._clean_text( matches[ 0 ] )
  198. # quotes
  199. self.info.quotes = []
  200. # the first match gets all html code
  201. matches = self.pattern_quotes.search( htmlSource )
  202. if ( matches ):
  203. # we need to assign element 0 to a variable
  204. text = matches.group()
  205. # now find all quotes
  206. matches = self.pattern_quotes2.findall( text )
  207. # if none try the other format
  208. if ( not matches ):
  209. matches = self.pattern_quotes3.findall( text )
  210. if ( matches ):
  211. for actor, scene, quote in matches:
  212. self.info.quotes += [ '%s%s%s - "%s"' % ( self._clean_text( actor ), ( "", " ", )[ self._clean_text( scene ) != "" ], self._clean_text( scene ), self._clean_text( quote ), ) ]
  213. # poster
  214. self.info.poster = ""
  215. matches = self.pattern_poster.findall( htmlSource )
  216. if ( matches ):
  217. self.info.poster = matches[ 0 ]#.replace( "m.jpg", "f.jpg")
  218. # cast
  219. self.info.cast = []
  220. # the first match gets all html code
  221. matches = self.pattern_cast.search( htmlSource )
  222. if ( matches ):
  223. # we need to assign element 0 to a variable
  224. text = matches.group()
  225. # now find all sound mixes
  226. matches = self.pattern_cast2.findall( text )
  227. if ( matches ):
  228. for actor, role in matches:
  229. self.info.cast += [ ( self._clean_text( actor ), self._clean_text( role ), ) ]
  230. # trailer url
  231. self.info.trailer = ""
  232. matches = self.pattern_trailer.findall( htmlSource )
  233. if ( len( matches ) and matches[ 0 ] ):
  234. codes = self.pattern_trailer2.findall( matches[ 0 ] )
  235. if ( codes ):
  236. self.info.trailer = "http://progressive.totaleclips.com.edgesuite.net/%s/%s_f27.flv?eclipid=%s&bitrateid=%s&vendorid=102&type=.flv" % ( codes[ 0 ][ 0 ][ 1 : 4 ], codes[ 0 ][ 0 ].lower(), codes[ 0 ][ 0 ], codes[ 0 ][ 1 ], )
  237. def _clean_text( self, text ):
  238. # remove html source
  239. text = re.sub( self.pattern_clean, '', text ).strip()
  240. # replace entities and return iso-8859-1 unicode
  241. return unicode( urllib.unquote( text ).replace( "&lt;", "<" ).replace( "&gt;", ">" ).replace( "&quot;", '"' ).replace( "&#38;", "&" ).replace( "&#39;", "'" ).replace( "&amp;", "&" ), "iso-8859-1" )
  242. class IMDbFetcher:
  243. def __init__( self ):
  244. # create the cache folder if it does not exist
  245. self.base_cache_path = self._create_base_cache_path()
  246. def _create_base_cache_path( self ):
  247. """ creates the base cache folder """
  248. # split our path into folders, we replace / with \ for compatability
  249. base_cache_path = os.getcwd().replace( ";", "" ).replace( "/", "\\" )
  250. # if not debugging set the path to the profiles directory
  251. if ( __name__ != "__main__" ):
  252. path = base_cache_path.split( "\\" )
  253. # join the main plugin folders to create our base path
  254. base_cache_path = os.path.join( "P:\\plugin_data", path[ -2 ], path[ -1 ], "cache" )
  255. # if cache path does not exist, create it
  256. if ( not os.path.isdir( xbmc.translatePath( base_cache_path ) ) ):
  257. os.makedirs( xbmc.translatePath( base_cache_path ) )
  258. # return our cache path
  259. return base_cache_path
  260. def fetch_info( self, url, poster_size ):
  261. print url
  262. """ Fetch showtimes if available else return a list of theaters """
  263. try:
  264. # create the cache filename
  265. file_path = self._get_cache_name( url )
  266. # Open url or local cache file
  267. # TODO: check date and refresh if more than a week old
  268. usock = urllib.urlopen( url )
  269. # read source
  270. htmlSource = usock.read()
  271. # close socket
  272. usock.close()
  273. # Save htmlSource to a file
  274. if ( not os.path.exists( file_path ) ):
  275. file_object = open( file_path, "w" )
  276. file_object.write( htmlSource )
  277. file_object.close()
  278. # Parse htmlSource for showtimes
  279. self.parser = _IMDbParser()
  280. self.parser.parse( htmlSource )
  281. # fetch trailer poster
  282. self._fetch_poster( self.parser.info.poster, file_path, poster_size )
  283. # return the IMDb info
  284. print self.parser.info.title
  285. return self.parser.info
  286. except:
  287. # oops print error message
  288. print "ERROR: %s::%s (%d) - %s" % ( self.__class__.__name__, sys.exc_info()[ 2 ].tb_frame.f_code.co_name, sys.exc_info()[ 2 ].tb_lineno, sys.exc_info()[ 1 ], )
  289. return None
  290. def _fetch_poster( self, url, file_path, poster_size ):
  291. # substitute our prefered poster size
  292. url = re.sub( "_SX[0-9]+_SY[0-9]+_.jpg", "_SX%s_SY%s_.jpg" % ( poster_size, poster_size, ), url )
  293. # create the cache filename
  294. file_path += ".jpg"
  295. try:
  296. if ( url ):
  297. #urllib.urlretrieve( url, file_path )
  298. self.parser.info.poster = url
  299. except:
  300. urllib.urlcleanup()
  301. remove_tries = 3
  302. while remove_tries and os.path.isfile( filepath ):
  303. try:
  304. os.remove( filepath )
  305. except:
  306. remove_tries -= 1
  307. xbmc.sleep( 1000 )
  308. def _get_cache_name( self, url ):
  309. # get the imdb title code
  310. title = url.split( "/" )[ -2 ]
  311. # append imdb title code to cache path
  312. file_path = os.path.join( self.base_cache_path, title )
  313. # return our complete file path
  314. if ( __name__ != "__main__" ):
  315. return xbmc.translatePath( file_path )
  316. else:
  317. return file_path
  318. if ( __name__ == "__main__" ):
  319. url = [ "http://www.imdb.com/title/tt0910970/", "http://imdb.com/title/tt0997047/", "http://imdb.com/title/tt0443649/", "http://www.imdb.com/title/tt1073498/", "http://www.imdb.com/title/tt0760329/", "http://www.imdb.com/title/tt0880578/", "http://www.imdb.com/title/tt0080684/", "http://www.imdb.com/title/tt0472062/", "http://www.imdb.com/title/tt0462499/", "http://www.imdb.com/title/tt0389790/", "http://www.imdb.com/title/tt0442933/", "http://www.imdb.com/title/tt0085106/" ]
  320. for cnt in range( 1,2 ):
  321. info = IMDbFetcher().fetch_info( url[ cnt ], "1024" )
  322. if ( info ):
  323. for attr in dir( info ):
  324. if ( not attr.startswith( "__" ) ):
  325. print "%s:" % attr.replace( "_", " " ).title(), getattr( info, attr ), type( getattr( info, attr ) )
  326. print "---------------\n"