PageRenderTime 79ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/DVDProfiler/resources/lib/IMDbLib.py

http://xbmc-scripting.googlecode.com/
Python | 507 lines | 430 code | 30 blank | 47 comment | 35 complexity | b8e6bb954842fb4bcd96c88dac6dff45 MD5 | raw file
Possible License(s): BSD-2-Clause
  1. """ IMDbLib Movie Information Library:
  2. Movie Information parsing from http://www.imdb.com
  3. Originally Written By:
  4. ---------------------
  5. No1CaNTeL
  6. AllTheKillx
  7. MODIFIED by BigBellyBilly AT gmail DOT com
  8. -------------------------
  9. Fixed and modified in several areas. DO NOT REPLACE THIS LIBRARY FILE
  10. Changelog:
  11. 02/11/06 Change: IMDBSearch lookupstr to ensure it always replaces space with +
  12. 28/04/07 Fix: IMDb site changes and added extra tags.
  13. 05/06/07 Fix: Gallery regex
  14. 09/07/07 Fix: Search for title, caused by site change
  15. 11/12/07 Fix: Scraping regex (see date comments)
  16. 18/01/08 Fix: regex for Cast
  17. 29/08/08 Fix: looks for Popular and Exact , Approx matches, fix unicode lookups
  18. 03/10/08 Change: Improved title name cleaning.
  19. """
  20. import os,sys,re,urllib,string, urlparse,traceback,unicodedata
  21. from string import find, translate
  22. DEBUG = True
  23. def log(s):
  24. if DEBUG:
  25. print s
  26. fixReg = re.compile(r"&#[0-9]{1,3};")
  27. reFlags = re.MULTILINE+re.IGNORECASE+re.DOTALL
  28. # full page RE
  29. titleReg = '<title>(.*?)<'
  30. taglineReg = '>Tagline:<.*?>([^<]*)' # 11/12/07
  31. plotOutlineReg = 'h5>Plot.*?:<.*?>(.*?)<' # 06/06/08
  32. yearReg = 'Sections/Years/.*?>(.*?)<'
  33. ratingReg = '<b>User Rating:</b>[^<]*<b>(.*?)<.*?">(.*?)<' # 11/12/07
  34. runtimeReg = '>Runtime:<.*?>([^<]*)' # 11/12/07
  35. countriesReg = '>Country:<.*?>([^<]*)</a' # 11/12/07
  36. languagesReg = '>Language:<.*?>([^<]*)</a' # 11/12/07
  37. awardsReg = '>Awards:<.*?>([^<]*)' # 11/12/07
  38. releaseDateReg = '>Release Date:<.*?>(.*?)<'
  39. userCommentReg = '>User Comments:<.*?>(.*?)<'
  40. akaReg = '>Also Known As:<.*?>(.*?)<'
  41. aspectReg = '>Aspect Ratio:<.*?>(.*?)<'
  42. triviaReg = '>Trivia:<.*?>(.*?)</div' # 11/12/07
  43. goofsReg = '>Goofs:<.*?>(.*?)</div' # 11/12/07
  44. soundtrackReg = '>Soundtrack:<.*?>(.*?)</div' # 11/12/07
  45. soundMixReg = '>Sound Mix:<.*?>(.*?)</div' # 11/12/07
  46. posterURLReg = '"photo">.*?src="(.*?)" height="(\d+)" width="(\d+)"'
  47. companyReg = '"/company/[^/]*/">([^<]*)</a>' # 11/12/07
  48. genresReg = re.compile(r'/Genres/.*?>(.*?)<', reFlags)
  49. certsReg = re.compile(r'certificates=.*?>(.*?)<', reFlags)
  50. # per line RE
  51. directorsReg = re.compile(r'.*?Director:.*')
  52. writersReg = re.compile(r'.*?Writer.*')
  53. castReg = re.compile(r'.*?class="cast".*')
  54. creatorsReg = re.compile(r'.*?Creator.*')
  55. in_directorsReg = re.compile(r'href="/name/.*?>(.*?)</')
  56. in_writersReg = re.compile(r'href="/name/.*?>(.*?)</')
  57. in_castReg = re.compile(r'href="/name/[nm0-9]*/">(.*?)<.*?(?:/character/|"char").*?>(.*?)(?:</a|</t)') # 25/01/08
  58. in_creatorsReg = re.compile(r'href="/name/.*?>(.*?)</')
  59. # gallery RE
  60. #galleriesRE = re.compile('href="photogallery(-.*?)"',reFlags)
  61. #galleryRE = re.compile('<a href="/gallery.*? src="(.*?)".*?(?:width|height)="(\d+)".*?(?:width|height)="(\d+)".*?</a>',reFlags)
  62. galleryThumbsRE = re.compile('img alt="(.*?)".*?src="(.*?)"') # 18/03/08
  63. galleryPageCountRE = re.compile(r'page=(\d+)', reFlags)
  64. class IMDb:
  65. def __init__(self,url):
  66. log("IMDb()")
  67. NA = 'N/A'
  68. self.Title = NA
  69. self.Tagline = NA
  70. self.PlotOutline = NA
  71. self.Year = NA
  72. self.Rating = NA
  73. self.Runtime = NA
  74. self.Countries = NA
  75. self.Languages = NA
  76. self.Awards = NA
  77. self.ReleaseDate = NA
  78. self.UserComment = NA
  79. self.AKA = ''
  80. self.Aspect = NA
  81. self.Trivia = NA
  82. self.Goofs = NA
  83. self.Soundtrack = NA
  84. self.SoundMix = NA
  85. self.PosterURL = None
  86. self.Cast = []
  87. self.Certs = NA
  88. self.Creators = NA
  89. self.Writers = NA
  90. self.Directors = NA
  91. self.Genres = NA
  92. page = readPage(url)
  93. if not page:
  94. return
  95. # FUL PAGE REGEX
  96. log("do full page regex searches ...")
  97. matches = re.search(titleReg, page, reFlags)
  98. if matches:
  99. self.Title = clean(matches.group(1))
  100. # TAGLINE
  101. matches = re.search(taglineReg, page, reFlags)
  102. if matches:
  103. self.Tagline = clean(matches.group(1))
  104. # YEARS
  105. matches = re.search(yearReg, page, reFlags)
  106. if matches:
  107. self.Year = clean(matches.group(1))
  108. # PLOT SUMMARY
  109. matches = re.search(plotOutlineReg, page, reFlags)
  110. if matches:
  111. self.PlotOutline = clean(matches.group(1)).replace('full summary','')
  112. # RATING
  113. matches = re.search(ratingReg, page, reFlags)
  114. if matches:
  115. self.Rating = "%s %s" % (matches.group(1).strip(), matches.group(2).strip())
  116. # RUNTIME
  117. matches = re.search(runtimeReg, page, reFlags)
  118. if matches:
  119. self.Runtime = clean(matches.group(1))
  120. # COUNTRY
  121. matches = re.search(countriesReg, page, reFlags)
  122. if matches:
  123. self.Countries = clean(matches.group(1))
  124. # LANG
  125. matches = re.search(languagesReg, page, reFlags)
  126. if matches:
  127. self.Languages = clean(matches.group(1))
  128. # AWARDS
  129. matches = re.search(awardsReg, page, reFlags)
  130. if matches:
  131. self.Awards = clean(matches.group(1)).replace('more','')
  132. # RELEASE DATE
  133. matches = re.search(releaseDateReg, page, reFlags)
  134. if matches:
  135. self.ReleaseDate = clean(matches.group(1))
  136. # USER COMMENT
  137. matches = re.search(userCommentReg, page, reFlags)
  138. if matches:
  139. self.UserComment = clean(matches.group(1)).replace('more','')
  140. # AKA
  141. matches = re.search(akaReg, page, reFlags)
  142. if matches:
  143. self.AKA = clean(matches.group(1))
  144. # ASPECT
  145. matches = re.search(aspectReg, page, reFlags)
  146. if matches:
  147. self.Aspect = clean(matches.group(1))
  148. # TRIVIA
  149. matches = re.search(triviaReg, page, reFlags)
  150. if matches:
  151. self.Trivia = clean(matches.group(1)).replace('more','')
  152. # GOOFS
  153. matches = re.search(goofsReg, page, reFlags)
  154. if matches:
  155. self.Goofs = clean(matches.group(1)).replace('more','')
  156. # SOUNDTRACK
  157. matches = re.search(soundtrackReg, page, reFlags)
  158. if matches:
  159. self.Soundtrack = clean(matches.group(1)).replace('more','')
  160. # SOUNDMIX
  161. matches = re.search(soundMixReg, page, reFlags)
  162. if matches:
  163. self.SoundMix = clean(matches.group(1)).replace('more','')
  164. # POSTER PHOTO
  165. matches = re.search(posterURLReg, page, reFlags)
  166. if matches:
  167. self.PosterURL = clean(matches.group(1))
  168. # GENRES
  169. matches = genresReg.findall(page)
  170. if matches:
  171. self.Genres = ','.join(matches)
  172. # CERTS
  173. matches = certsReg.findall(page)
  174. if matches:
  175. self.Certs = ','.join(matches)
  176. log("do pre line regex searches ...")
  177. pageLines = page.split('\n')
  178. i = 0
  179. while i < len(pageLines):
  180. if directorsReg.match(pageLines[i]):
  181. i += 1
  182. matches = in_directorsReg.findall(pageLines[i])
  183. if matches:
  184. self.Directors = clean(','.join(matches))
  185. elif writersReg.match(pageLines[i]):
  186. i += 1
  187. matches = in_writersReg.findall(pageLines[i])
  188. if matches:
  189. self.Writers = clean(','.join(matches))
  190. elif castReg.match(pageLines[i]):
  191. matches = in_castReg.findall(pageLines[i])
  192. for actor,role in matches:
  193. self.Cast.append([clean(actor), clean(role)])
  194. if creatorsReg.match(pageLines[i]):
  195. i += 1
  196. matches = in_creatorsReg.findall(pageLines[i])
  197. if matches:
  198. self.Creators = clean(','.join(matches))
  199. i+=1
  200. if DEBUG: # switch on for debug results
  201. print "Title =%s" % self.Title
  202. print "Tagline =%s" % self.Tagline
  203. print "PlotOutline =%s" % self.PlotOutline
  204. print "Year =%s" % self.Year
  205. print "Rating =%s" % self.Rating
  206. print "Runtime =%s" % self.Runtime
  207. print "Countries =%s" % self.Countries
  208. print "Languages =%s" % self.Languages
  209. print "Awards =%s" % self.Awards
  210. print "ReleaseDate =%s" % self.ReleaseDate
  211. print "UserComment =%s" % self.UserComment
  212. print "AKA =%s" % self.AKA
  213. print "Aspect =%s" % self.Aspect
  214. print "Trivia =%s" % self.Trivia
  215. print "Goofs =%s" % self.Goofs
  216. print "Soundtrack =%s" % self.Soundtrack
  217. print "SoundMix =%s" % self.SoundMix
  218. print "PosterURL =%s" % self.PosterURL
  219. print "Certs =%s" % self.Certs
  220. print "Creators =%s" % self.Creators
  221. print "Writers =%s" % self.Writers
  222. print "Directors =%s" % self.Directors
  223. print "Genres =%s" % self.Genres
  224. print "Cast =%s" % self.Cast
  225. ###########################################################################################################
  226. class IMDbSearch:
  227. def __init__(self, findStr):
  228. log("IMDbSearch()")
  229. self.YEAR = 0
  230. self.TITLE = 1
  231. self.URL = 2
  232. try:
  233. trantab = string.maketrans('','')
  234. lookupStr = findStr.strip().translate(trantab,'~!@#$%^&*()_+`-={}|[]\:";\'<>?,./').replace(' ','+')
  235. except:
  236. try:
  237. lookupStr = findStr.replace(' ','+')
  238. except:
  239. lookupStr = findStr
  240. log( "lookupStr assigned" )
  241. self.SearchResults = None
  242. # url = 'http://www.imdb.com/find?s=tt&q='
  243. url = 'http://www.imdb.com/find?s=tt&q=' + lookupStr
  244. # if isinstance(lookupStr,unicode):
  245. # try:
  246. # url += lookupStr.encode('latin-1')
  247. # except:
  248. # url += lookupStr
  249. # else:
  250. # url += lookupStr
  251. page = readPage(url)
  252. if not page:
  253. return
  254. self.SearchResults = []
  255. if string.find(page, 'No Matches') != -1:
  256. log( "found 'No Matches'" )
  257. elif string.find(page, '>Popular') != -1 or string.find(page, 'Exact Matches') != -1 \
  258. or string.find(page, 'Approx Matches') != -1:
  259. log( "found 'Popular' and/or 'Exact' or 'Approx' matches" )
  260. # title code, title, year
  261. search = re.compile('href="/title/([t0-9]*)/">(.*?)</a> *\(([0-9]*)') # updated 11/12/2007
  262. matches = search.findall(page)
  263. if matches:
  264. for a in matches:
  265. url = "http://www.imdb.com/title/%s/" % (a[0])
  266. title = clean(a[1])
  267. year = a[2]
  268. self.SearchResults.append((year, title, url))
  269. # sort resul tuple into year reverse order
  270. if self.SearchResults:
  271. self.SearchResults.sort()
  272. self.SearchResults.reverse()
  273. else:
  274. log( "no matches on page" )
  275. elif string.find(page, '<h1>'+findStr) or string.find(page, '>User Rating:<'):
  276. log("exact match")
  277. search = re.compile('/Years/([0-9]*).*?/title/([t0-9]*)/', re.DOTALL + re.MULTILINE + re.IGNORECASE)
  278. matches = search.findall(page)
  279. if matches:
  280. year = matches[0][0]
  281. url = "http://www.imdb.com/title/%s/" % (matches[0][1])
  282. self.SearchResults.append((year, findStr, url))
  283. else:
  284. log( "no matches on page" )
  285. else:
  286. log( "search 'Years'" )
  287. search = re.compile('/Years/([0-9]*)\".*?/title/([t0-9]*)/', re.DOTALL + re.MULTILINE + re.IGNORECASE)
  288. matches = search.findall(page)
  289. if matches:
  290. year = matches[0][0]
  291. url = "http://www.imdb.com/title/%s/" % (matches[0][1])
  292. self.SearchResults.append((year, findStr, url))
  293. else:
  294. log( "no matches on page" )
  295. def getSearchResults(self):
  296. return self.SearchResults
  297. def getSearchResult(self, idx):
  298. try:
  299. return self.SearchResults[idx]
  300. except:
  301. return None
  302. def getTitle(self, idx):
  303. self.SearchResults[idx][self.TITLE]
  304. def getURL(self, idx):
  305. self.SearchResults[idx][self.URL]
  306. def getYear(self, idx):
  307. self.SearchResults[idx][self.YEAR]
  308. ###########################################################################################################
  309. # http://www.imdb.com/title/tt0076759/mediaindex
  310. ###########################################################################################################
  311. class IMDbGallery:
  312. def __init__(self, url):
  313. if url[-1] != '/': url += '/'
  314. self.baseURL = url + 'mediaindex'
  315. log("IMDbGallery() " + self.baseURL)
  316. self.galleyThumbs = [] # [[url title], [url title], ... ]
  317. self.pageCount = 1
  318. page = readPage(self.baseURL)
  319. if not page:
  320. return
  321. # how many pages ?
  322. matches = galleryPageCountRE.findall(page)
  323. if matches:
  324. # finds them twice, so half the count
  325. self.pageCount = int(len(matches) /2)
  326. else:
  327. self.pageCount = 1
  328. log("pageCount=%s" % self.pageCount)
  329. # get thumb urls from each page
  330. for pageIdx in range(1, self.pageCount+1):
  331. log("fetching pageIdx=%s" % pageIdx)
  332. if pageIdx > 1:
  333. url = self.baseURL + '?page=%s' % pageIdx
  334. urlList = self.getGalleryThumbs(url)
  335. else:
  336. urlList = self.getGalleryThumbs(doc=page)
  337. if urlList:
  338. self.galleyThumbs += urlList
  339. # print self.galleyThumbs
  340. log("galleyThumbs count=%s" % len(self.galleyThumbs))
  341. # GET GALLERY THUMB IMAGES
  342. def getGalleryThumbs(self, url='', doc=''):
  343. if url:
  344. doc = readPage(url)
  345. if doc:
  346. return galleryThumbsRE.findall(doc)
  347. return None
  348. def getGalleryImageCount(self):
  349. return len(self.galleyThumbs)
  350. def getThumb(self, idx):
  351. try:
  352. return self.galleyThumbs[idx]
  353. except:
  354. return ('','')
  355. def getThumbTitle(self, idx):
  356. try:
  357. return self.getThumb(idx)[0]
  358. except:
  359. return ''
  360. def getThumbURL(self, idx):
  361. try:
  362. return self.getThumb(idx)[1]
  363. except:
  364. return ''
  365. def getLargeThumbURL(self, idx):
  366. url = self.getThumbURL(idx)
  367. # url_info = urlparse.urlsplit(url) # ( )
  368. # print url_info
  369. # urlpath = urlparse.urlsplit(url)[2] # dir/dir/dir/name.ext
  370. # head, tail = os.path.split(urlpath) # dir/dir/dir/, name.ext
  371. # name, ext = os.path.splitext(tail) # name, ext
  372. # url = "%s://%s%s" % (url_info[0],url_info[1], head + '/VM._SY400_SX600_' + ext)
  373. head, tail = os.path.split(url)
  374. name, ext = os.path.splitext(tail)
  375. # try reg repl of known fn format -
  376. # eg /VM._CR0,0,255,255_SS80_.jpg -> /VM._SY400_SX600_.jpg
  377. # eg. _V1._CR0,0,216,216_SS80_.jpg -> _V1._SY400_SX600_.jpg
  378. result = re.sub(r"CR\d+,\d+,\d+,\d+_.*?_", 'SY400_SX600_', name)
  379. if result != name: # if sub done OK, will be diff
  380. # re matched and replaced
  381. url = head + '/' + result + ext
  382. else:
  383. # normal replace which works most of the time
  384. url = head + '/VM._SY400_SX600_' + ext
  385. log("getLargeThumbURL() " + url)
  386. return url
  387. ###########################################################################################################
  388. def readPage(url, readLines=False):
  389. log("readPage() readLines=%s" % readLines)
  390. page = None
  391. try:
  392. safe_url = urllib.quote_plus(url,'/:&?=+#@')
  393. log("readPage() " + safe_url)
  394. sock = urllib.urlopen(safe_url)
  395. if not readLines:
  396. page = sock.read()
  397. else:
  398. page = sock.readlines()
  399. sock.close()
  400. if not page:
  401. log("page not found")
  402. except:
  403. log("urlopen() except")
  404. e=sys.exc_info()
  405. print traceback.format_exception(e[0],e[1],e[2],3)
  406. page = None
  407. return page
  408. ###########################################################################################################
  409. def clean(text):
  410. return urlTextToASCII(cleanHTML(text)).strip()
  411. ###########################################################################################################
  412. def cleanHTML(data):
  413. try:
  414. reobj = re.compile('<.+?>', re.IGNORECASE+re.DOTALL+re.MULTILINE)
  415. return (re.sub(reobj, '', data)).strip()
  416. except:
  417. return data
  418. ###########################################################################################################
  419. def urlTextToASCII(text):
  420. try:
  421. compile_obj = re.compile('(&#x(.*?);)', re.IGNORECASE + re.MULTILINE + re.DOTALL)
  422. match_obj = compile_obj.findall(text)
  423. for match in match_obj:
  424. ch = chr(int('0x'+match[1], 16))
  425. text = text.replace(match[0], ch)
  426. compile_obj = re.compile('(&#(\d+);)', re.IGNORECASE + re.MULTILINE + re.DOTALL)
  427. match_obj = compile_obj.findall(text)
  428. for match in match_obj:
  429. ch = chr(int(match[1]))
  430. text = text.replace(match[0], ch)
  431. text = text.replace('&amp;','&').replace('\n',' ').replace('\r',' ').replace('\t',' ')
  432. compile_obj = re.compile('(&(.*?);)', re.IGNORECASE+re.DOTALL+re.MULTILINE)
  433. text = (re.sub(compile_obj, '', text))
  434. except: pass
  435. return text