PageRenderTime 48ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/myTV/resources/lib/IMDbLib.py

http://xbmc-scripting.googlecode.com/
Python | 508 lines | 430 code | 30 blank | 48 comment | 35 complexity | 5bf1b0392675727fac6850928efa470b MD5 | raw file
Possible License(s): BSD-2-Clause
""" IMDbLib Movie Information Library:
Movie Information parsing from http://www.imdb.com
Originally Written By:
---------------------
No1CaNTeL
AllTheKillx
MODIFIED by BigBellyBilly AT gmail DOT com
-------------------------
Fixed and modified in several areas. DO NOT REPLACE THIS LIBRARY FILE
Changelog:
02/11/06 Change: IMDBSearch lookupstr to ensure it always replaces space with +
28/04/07 Fix: IMDb site changes and added extra tags.
05/06/07 Fix: Gallery regex
09/07/07 Fix: Search for title. caused by site change
11/12/07 Fix: Scraping regex (see date comments)
18/01/08 Fix: regex for Cast
29/08/08 Fix: looks for Popular and Exact, Approx matches, fix unicode lookups
03/10/08 Change: Improved title name cleaning.
10/03/09 Fix: Exact search re
"""
  21. import os,sys,re,urllib,string, urlparse,traceback,unicodedata
  22. from string import find, translate
  23. DEBUG = True
  24. def log(s):
  25. if DEBUG:
  26. print s
  27. fixReg = re.compile(r"&#[0-9]{1,3};")
  28. reFlags = re.MULTILINE+re.IGNORECASE+re.DOTALL
  29. # full page RE
  30. titleReg = '<title>(.*?)<'
  31. taglineReg = '>Tagline:<.*?>([^<]*)' # 11/12/07
  32. plotOutlineReg = 'h5>Plot.*?:<.*?>(.*?)<' # 06/06/08
  33. yearReg = 'Sections/Years/.*?>(.*?)<'
  34. ratingReg = '<b>User Rating:</b>[^<]*<b>(.*?)<.*?">(.*?)<' # 11/12/07
  35. runtimeReg = '>Runtime:<.*?>([^<]*)' # 11/12/07
  36. countriesReg = '>Country:<.*?>([^<]*)</a' # 11/12/07
  37. languagesReg = '>Language:<.*?>([^<]*)</a' # 11/12/07
  38. awardsReg = '>Awards:<.*?>([^<]*)' # 11/12/07
  39. releaseDateReg = '>Release Date:<.*?>(.*?)<'
  40. userCommentReg = '>User Comments:<.*?>(.*?)<'
  41. akaReg = '>Also Known As:<.*?>(.*?)<'
  42. aspectReg = '>Aspect Ratio:<.*?>(.*?)<'
  43. triviaReg = '>Trivia:<.*?>(.*?)</div' # 11/12/07
  44. goofsReg = '>Goofs:<.*?>(.*?)</div' # 11/12/07
  45. soundtrackReg = '>Soundtrack:<.*?>(.*?)</div' # 11/12/07
  46. soundMixReg = '>Sound Mix:<.*?>(.*?)</div' # 11/12/07
  47. posterURLReg = '"photo">.*?src="(.*?)" height="(\d+)" width="(\d+)"'
  48. companyReg = '"/company/[^/]*/">([^<]*)</a>' # 11/12/07
  49. genresReg = re.compile(r'/Genres/.*?>(.*?)<', reFlags)
  50. certsReg = re.compile(r'certificates=.*?>(.*?)<', reFlags)
  51. # per line RE
  52. directorsReg = re.compile(r'.*?Director:.*')
  53. writersReg = re.compile(r'.*?Writer.*')
  54. castReg = re.compile(r'.*?class="cast".*')
  55. creatorsReg = re.compile(r'.*?Creator.*')
  56. in_directorsReg = re.compile(r'href="/name/.*?>(.*?)</')
  57. in_writersReg = re.compile(r'href="/name/.*?>(.*?)</')
  58. in_castReg = re.compile(r'href="/name/[nm0-9]*/">(.*?)<.*?(?:/character/|"char").*?>(.*?)(?:</a|</t)') # 25/01/08
  59. in_creatorsReg = re.compile(r'href="/name/.*?>(.*?)</')
  60. # gallery RE
  61. #galleriesRE = re.compile('href="photogallery(-.*?)"',reFlags)
  62. #galleryRE = re.compile('<a href="/gallery.*? src="(.*?)".*?(?:width|height)="(\d+)".*?(?:width|height)="(\d+)".*?</a>',reFlags)
  63. galleryThumbsRE = re.compile('img alt="(.*?)".*?src="(.*?)"') # 18/03/08
  64. galleryPageCountRE = re.compile(r'page=(\d+)', reFlags)
  65. class IMDb:
  66. def __init__(self,url):
  67. log("IMDb()")
  68. NA = 'N/A'
  69. self.Title = NA
  70. self.Tagline = NA
  71. self.PlotOutline = NA
  72. self.Year = NA
  73. self.Rating = NA
  74. self.Runtime = NA
  75. self.Countries = NA
  76. self.Languages = NA
  77. self.Awards = NA
  78. self.ReleaseDate = NA
  79. self.UserComment = NA
  80. self.AKA = ''
  81. self.Aspect = NA
  82. self.Trivia = NA
  83. self.Goofs = NA
  84. self.Soundtrack = NA
  85. self.SoundMix = NA
  86. self.PosterURL = None
  87. self.Cast = []
  88. self.Certs = NA
  89. self.Creators = NA
  90. self.Writers = NA
  91. self.Directors = NA
  92. self.Genres = NA
  93. page = readPage(url)
  94. if not page:
  95. return
  96. # FUL PAGE REGEX
  97. log("do full page regex searches ...")
  98. matches = re.search(titleReg, page, reFlags)
  99. if matches:
  100. self.Title = clean(matches.group(1))
  101. # TAGLINE
  102. matches = re.search(taglineReg, page, reFlags)
  103. if matches:
  104. self.Tagline = clean(matches.group(1))
  105. # YEARS
  106. matches = re.search(yearReg, page, reFlags)
  107. if matches:
  108. self.Year = clean(matches.group(1))
  109. # PLOT SUMMARY
  110. matches = re.search(plotOutlineReg, page, reFlags)
  111. if matches:
  112. self.PlotOutline = clean(matches.group(1)).replace('full summary','')
  113. # RATING
  114. matches = re.search(ratingReg, page, reFlags)
  115. if matches:
  116. self.Rating = "%s %s" % (matches.group(1).strip(), matches.group(2).strip())
  117. # RUNTIME
  118. matches = re.search(runtimeReg, page, reFlags)
  119. if matches:
  120. self.Runtime = clean(matches.group(1))
  121. # COUNTRY
  122. matches = re.search(countriesReg, page, reFlags)
  123. if matches:
  124. self.Countries = clean(matches.group(1))
  125. # LANG
  126. matches = re.search(languagesReg, page, reFlags)
  127. if matches:
  128. self.Languages = clean(matches.group(1))
  129. # AWARDS
  130. matches = re.search(awardsReg, page, reFlags)
  131. if matches:
  132. self.Awards = clean(matches.group(1)).replace('more','')
  133. # RELEASE DATE
  134. matches = re.search(releaseDateReg, page, reFlags)
  135. if matches:
  136. self.ReleaseDate = clean(matches.group(1))
  137. # USER COMMENT
  138. matches = re.search(userCommentReg, page, reFlags)
  139. if matches:
  140. self.UserComment = clean(matches.group(1)).replace('more','')
  141. # AKA
  142. matches = re.search(akaReg, page, reFlags)
  143. if matches:
  144. self.AKA = clean(matches.group(1))
  145. # ASPECT
  146. matches = re.search(aspectReg, page, reFlags)
  147. if matches:
  148. self.Aspect = clean(matches.group(1))
  149. # TRIVIA
  150. matches = re.search(triviaReg, page, reFlags)
  151. if matches:
  152. self.Trivia = clean(matches.group(1)).replace('more','')
  153. # GOOFS
  154. matches = re.search(goofsReg, page, reFlags)
  155. if matches:
  156. self.Goofs = clean(matches.group(1)).replace('more','')
  157. # SOUNDTRACK
  158. matches = re.search(soundtrackReg, page, reFlags)
  159. if matches:
  160. self.Soundtrack = clean(matches.group(1)).replace('more','')
  161. # SOUNDMIX
  162. matches = re.search(soundMixReg, page, reFlags)
  163. if matches:
  164. self.SoundMix = clean(matches.group(1)).replace('more','')
  165. # POSTER PHOTO
  166. matches = re.search(posterURLReg, page, reFlags)
  167. if matches:
  168. self.PosterURL = clean(matches.group(1))
  169. # GENRES
  170. matches = genresReg.findall(page)
  171. if matches:
  172. self.Genres = ','.join(matches)
  173. # CERTS
  174. matches = certsReg.findall(page)
  175. if matches:
  176. self.Certs = ','.join(matches)
  177. log("do pre line regex searches ...")
  178. pageLines = page.split('\n')
  179. i = 0
  180. while i < len(pageLines):
  181. if directorsReg.match(pageLines[i]):
  182. i += 1
  183. matches = in_directorsReg.findall(pageLines[i])
  184. if matches:
  185. self.Directors = clean(','.join(matches))
  186. elif writersReg.match(pageLines[i]):
  187. i += 1
  188. matches = in_writersReg.findall(pageLines[i])
  189. if matches:
  190. self.Writers = clean(','.join(matches))
  191. elif castReg.match(pageLines[i]):
  192. matches = in_castReg.findall(pageLines[i])
  193. for actor,role in matches:
  194. self.Cast.append([clean(actor), clean(role)])
  195. if creatorsReg.match(pageLines[i]):
  196. i += 1
  197. matches = in_creatorsReg.findall(pageLines[i])
  198. if matches:
  199. self.Creators = clean(','.join(matches))
  200. i+=1
  201. if DEBUG: # switch on for debug results
  202. print "Title =%s" % self.Title
  203. print "Tagline =%s" % self.Tagline
  204. print "PlotOutline =%s" % self.PlotOutline
  205. print "Year =%s" % self.Year
  206. print "Rating =%s" % self.Rating
  207. print "Runtime =%s" % self.Runtime
  208. print "Countries =%s" % self.Countries
  209. print "Languages =%s" % self.Languages
  210. print "Awards =%s" % self.Awards
  211. print "ReleaseDate =%s" % self.ReleaseDate
  212. print "UserComment =%s" % self.UserComment
  213. print "AKA =%s" % self.AKA
  214. print "Aspect =%s" % self.Aspect
  215. print "Trivia =%s" % self.Trivia
  216. print "Goofs =%s" % self.Goofs
  217. print "Soundtrack =%s" % self.Soundtrack
  218. print "SoundMix =%s" % self.SoundMix
  219. print "PosterURL =%s" % self.PosterURL
  220. print "Certs =%s" % self.Certs
  221. print "Creators =%s" % self.Creators
  222. print "Writers =%s" % self.Writers
  223. print "Directors =%s" % self.Directors
  224. print "Genres =%s" % self.Genres
  225. print "Cast =%s" % self.Cast
  226. ###########################################################################################################
  227. class IMDbSearch:
  228. def __init__(self, findStr):
  229. log("IMDbSearch()")
  230. self.YEAR = 0
  231. self.TITLE = 1
  232. self.URL = 2
  233. try:
  234. trantab = string.maketrans('','')
  235. lookupStr = findStr.strip().translate(trantab,'~!@#$%^&*()_+`-={}|[]\:";\'<>?,./').replace(' ','+')
  236. except:
  237. try:
  238. lookupStr = findStr.replace(' ','+')
  239. except:
  240. lookupStr = findStr
  241. log( "lookupStr assigned" )
  242. self.SearchResults = None
  243. # url = 'http://www.imdb.com/find?s=tt&q='
  244. url = 'http://www.imdb.com/find?s=tt&q=' + lookupStr
  245. # if isinstance(lookupStr,unicode):
  246. # try:
  247. # url += lookupStr.encode('latin-1')
  248. # except:
  249. # url += lookupStr
  250. # else:
  251. # url += lookupStr
  252. page = readPage(url)
  253. if not page:
  254. return
  255. self.SearchResults = []
  256. if string.find(page, 'No Matches') != -1:
  257. log( "found 'No Matches'" )
  258. elif string.find(page, '>Popular') != -1 or string.find(page, 'Exact Matches') != -1 \
  259. or string.find(page, 'Approx Matches') != -1:
  260. log( "found 'Popular' and/or 'Exact' or 'Approx' matches" )
  261. # title code, title, year
  262. search = re.compile('href="/title/([t0-9]*)/".*?>(.*?)</a> *\(([0-9]*)') # updated 10/03/2009
  263. matches = search.findall(page)
  264. if matches:
  265. for a in matches:
  266. url = "http://www.imdb.com/title/%s/" % (a[0])
  267. title = clean(a[1])
  268. year = a[2]
  269. self.SearchResults.append((year, title, url))
  270. # sort resul tuple into year reverse order
  271. if self.SearchResults:
  272. self.SearchResults.sort()
  273. self.SearchResults.reverse()
  274. else:
  275. log( "no matches on page" )
  276. elif string.find(page, '<h1>'+findStr) or string.find(page, '>User Rating:<'):
  277. log("exact match")
  278. search = re.compile('/Years/([0-9]*).*?/title/([t0-9]*)/', re.DOTALL + re.MULTILINE + re.IGNORECASE)
  279. matches = search.findall(page)
  280. if matches:
  281. year = matches[0][0]
  282. url = "http://www.imdb.com/title/%s/" % (matches[0][1])
  283. self.SearchResults.append((year, findStr, url))
  284. else:
  285. log( "no matches on page" )
  286. else:
  287. log( "search 'Years'" )
  288. search = re.compile('/Years/([0-9]*)\".*?/title/([t0-9]*)/', re.DOTALL + re.MULTILINE + re.IGNORECASE)
  289. matches = search.findall(page)
  290. if matches:
  291. year = matches[0][0]
  292. url = "http://www.imdb.com/title/%s/" % (matches[0][1])
  293. self.SearchResults.append((year, findStr, url))
  294. else:
  295. log( "no matches on page" )
  296. def getSearchResults(self):
  297. return self.SearchResults
  298. def getSearchResult(self, idx):
  299. try:
  300. return self.SearchResults[idx]
  301. except:
  302. return None
  303. def getTitle(self, idx):
  304. self.SearchResults[idx][self.TITLE]
  305. def getURL(self, idx):
  306. self.SearchResults[idx][self.URL]
  307. def getYear(self, idx):
  308. self.SearchResults[idx][self.YEAR]
  309. ###########################################################################################################
  310. # http://www.imdb.com/title/tt0076759/mediaindex
  311. ###########################################################################################################
  312. class IMDbGallery:
  313. def __init__(self, url):
  314. if url[-1] != '/': url += '/'
  315. self.baseURL = url + 'mediaindex'
  316. log("IMDbGallery() " + self.baseURL)
  317. self.galleyThumbs = [] # [[url title], [url title], ... ]
  318. self.pageCount = 1
  319. page = readPage(self.baseURL)
  320. if not page:
  321. return
  322. # how many pages ?
  323. matches = galleryPageCountRE.findall(page)
  324. if matches:
  325. # finds them twice, so half the count
  326. self.pageCount = int(len(matches) /2)
  327. else:
  328. self.pageCount = 1
  329. log("pageCount=%s" % self.pageCount)
  330. # get thumb urls from each page
  331. for pageIdx in range(1, self.pageCount+1):
  332. log("fetching pageIdx=%s" % pageIdx)
  333. if pageIdx > 1:
  334. url = self.baseURL + '?page=%s' % pageIdx
  335. urlList = self.getGalleryThumbs(url)
  336. else:
  337. urlList = self.getGalleryThumbs(doc=page)
  338. if urlList:
  339. self.galleyThumbs += urlList
  340. # print self.galleyThumbs
  341. log("galleyThumbs count=%s" % len(self.galleyThumbs))
  342. # GET GALLERY THUMB IMAGES
  343. def getGalleryThumbs(self, url='', doc=''):
  344. if url:
  345. doc = readPage(url)
  346. if doc:
  347. return galleryThumbsRE.findall(doc)
  348. return None
  349. def getGalleryImageCount(self):
  350. return len(self.galleyThumbs)
  351. def getThumb(self, idx):
  352. try:
  353. return self.galleyThumbs[idx]
  354. except:
  355. return ('','')
  356. def getThumbTitle(self, idx):
  357. try:
  358. return self.getThumb(idx)[0]
  359. except:
  360. return ''
  361. def getThumbURL(self, idx):
  362. try:
  363. return self.getThumb(idx)[1]
  364. except:
  365. return ''
  366. def getLargeThumbURL(self, idx):
  367. url = self.getThumbURL(idx)
  368. # url_info = urlparse.urlsplit(url) # ( )
  369. # print url_info
  370. # urlpath = urlparse.urlsplit(url)[2] # dir/dir/dir/name.ext
  371. # head, tail = os.path.split(urlpath) # dir/dir/dir/, name.ext
  372. # name, ext = os.path.splitext(tail) # name, ext
  373. # url = "%s://%s%s" % (url_info[0],url_info[1], head + '/VM._SY400_SX600_' + ext)
  374. head, tail = os.path.split(url)
  375. name, ext = os.path.splitext(tail)
  376. # try reg repl of known fn format -
  377. # eg /VM._CR0,0,255,255_SS80_.jpg -> /VM._SY400_SX600_.jpg
  378. # eg. _V1._CR0,0,216,216_SS80_.jpg -> _V1._SY400_SX600_.jpg
  379. result = re.sub(r"CR\d+,\d+,\d+,\d+_.*?_", 'SY400_SX600_', name)
  380. if result != name: # if sub done OK, will be diff
  381. # re matched and replaced
  382. url = head + '/' + result + ext
  383. else:
  384. # normal replace which works most of the time
  385. url = head + '/VM._SY400_SX600_' + ext
  386. log("getLargeThumbURL() " + url)
  387. return url
  388. ###########################################################################################################
  389. def readPage(url, readLines=False):
  390. log("readPage() readLines=%s" % readLines)
  391. page = None
  392. try:
  393. safe_url = urllib.quote_plus(url,'/:&?=+#@')
  394. log("readPage() " + safe_url)
  395. sock = urllib.urlopen(safe_url)
  396. if not readLines:
  397. page = sock.read()
  398. else:
  399. page = sock.readlines()
  400. sock.close()
  401. if not page:
  402. log("page not found")
  403. except:
  404. log("urlopen() except")
  405. e=sys.exc_info()
  406. print traceback.format_exception(e[0],e[1],e[2],3)
  407. page = None
  408. return page
  409. ###########################################################################################################
  410. def clean(text):
  411. return urlTextToASCII(cleanHTML(text)).strip()
  412. ###########################################################################################################
  413. def cleanHTML(data):
  414. try:
  415. reobj = re.compile('<.+?>', re.IGNORECASE+re.DOTALL+re.MULTILINE)
  416. return (re.sub(reobj, '', data)).strip()
  417. except:
  418. return data
  419. ###########################################################################################################
  420. def urlTextToASCII(text):
  421. try:
  422. compile_obj = re.compile('(&#x(.*?);)', re.IGNORECASE + re.MULTILINE + re.DOTALL)
  423. match_obj = compile_obj.findall(text)
  424. for match in match_obj:
  425. ch = chr(int('0x'+match[1], 16))
  426. text = text.replace(match[0], ch)
  427. compile_obj = re.compile('(&#(\d+);)', re.IGNORECASE + re.MULTILINE + re.DOTALL)
  428. match_obj = compile_obj.findall(text)
  429. for match in match_obj:
  430. ch = chr(int(match[1]))
  431. text = text.replace(match[0], ch)
  432. text = text.replace('&amp;','&').replace('\n',' ').replace('\r',' ').replace('\t',' ')
  433. compile_obj = re.compile('(&(.*?);)', re.IGNORECASE+re.DOTALL+re.MULTILINE)
  434. text = (re.sub(compile_obj, '', text))
  435. except: pass
  436. return text