PageRenderTime 55ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/imdb/parser/mobile/__init__.py

https://gitlab.com/akila-33/Sick-Beard
Python | 844 lines | 799 code | 10 blank | 35 comment | 64 complexity | e3de09fe1579b1ce7c34d28ab8ce9126 MD5 | raw file
  1. """
  2. parser.mobile package (imdb package).
  3. This package provides the IMDbMobileAccessSystem class used to access
  4. IMDb's data for mobile systems.
  5. the imdb.IMDb function will return an instance of this class when
  6. called with the 'accessSystem' argument set to "mobile".
  7. Copyright 2005-2011 Davide Alberani <da@erlug.linux.it>
  8. This program is free software; you can redistribute it and/or modify
  9. it under the terms of the GNU General Public License as published by
  10. the Free Software Foundation; either version 2 of the License, or
  11. (at your option) any later version.
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program; if not, write to the Free Software
  18. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. """
  20. import re
  21. import logging
  22. from urllib import unquote
  23. from imdb.Movie import Movie
  24. from imdb.utils import analyze_title, analyze_name, canonicalName, \
  25. date_and_notes
  26. from imdb._exceptions import IMDbDataAccessError
  27. from imdb.parser.http import IMDbHTTPAccessSystem
  28. from imdb.parser.http.utils import subXMLRefs, subSGMLRefs, build_person, \
  29. build_movie, re_spaces
  30. # XXX NOTE: the first version of this module was heavily based on
  31. # regular expressions. This new version replace regexps with
  32. # find() strings' method calls; despite being less flexible, it
  33. # seems to be at least as fast and, hopefully, much more
  34. # lightweight. Yes: the regexp-based version was too heavyweight
  35. # for systems with very limited CPU power and memory footprint.
  36. re_spacessub = re_spaces.sub
  37. # Strip html.
  38. re_unhtml = re.compile(r'<.+?>')
  39. re_unhtmlsub = re_unhtml.sub
  40. # imdb person or movie ids.
  41. re_imdbID = re.compile(r'(?<=nm|tt|ch)([0-9]{7})\b')
  42. # movie AKAs.
  43. re_makas = re.compile('(<p class="find-aka">.*?</p>)')
  44. # Remove episode numbers.
  45. re_filmo_episodes = re.compile('<div class="filmo-episodes">.*?</div>',
  46. re.M | re.I)
  47. def _unHtml(s):
  48. """Return a string without tags and no multiple spaces."""
  49. return subSGMLRefs(re_spacessub(' ', re_unhtmlsub('', s)).strip())
  50. _inttype = type(0)
  51. def _getTagsWith(s, cont, toClosure=False, maxRes=None):
  52. """Return the html tags in the 's' string containing the 'cont'
  53. string; if toClosure is True, everything between the opening
  54. tag and the closing tag is returned."""
  55. lres = []
  56. bi = s.find(cont)
  57. if bi != -1:
  58. btag = s[:bi].rfind('<')
  59. if btag != -1:
  60. if not toClosure:
  61. etag = s[bi+1:].find('>')
  62. if etag != -1:
  63. endidx = bi+2+etag
  64. lres.append(s[btag:endidx])
  65. if maxRes is not None and len(lres) >= maxRes: return lres
  66. lres += _getTagsWith(s[endidx:], cont,
  67. toClosure=toClosure)
  68. else:
  69. spaceidx = s[btag:].find(' ')
  70. if spaceidx != -1:
  71. ctag = '</%s>' % s[btag+1:btag+spaceidx]
  72. closeidx = s[bi:].find(ctag)
  73. if closeidx != -1:
  74. endidx = bi+closeidx+len(ctag)
  75. lres.append(s[btag:endidx])
  76. if maxRes is not None and len(lres) >= maxRes:
  77. return lres
  78. lres += _getTagsWith(s[endidx:], cont,
  79. toClosure=toClosure)
  80. return lres
  81. def _findBetween(s, begins, ends, beginindx=0, maxRes=None, lres=None):
  82. """Return the list of strings from the 's' string which are included
  83. between the 'begins' and 'ends' strings."""
  84. if lres is None:
  85. lres = []
  86. bi = s.find(begins, beginindx)
  87. if bi != -1:
  88. lbegins = len(begins)
  89. if isinstance(ends, (list, tuple)):
  90. eset = [s.find(end, bi+lbegins) for end in ends]
  91. eset[:] = [x for x in eset if x != -1]
  92. if not eset: ei = -1
  93. else: ei = min(eset)
  94. else:
  95. ei = s.find(ends, bi+lbegins)
  96. if ei != -1:
  97. match = s[bi+lbegins:ei]
  98. lres.append(match)
  99. if maxRes is not None and len(lres) >= maxRes: return lres
  100. _findBetween(s, begins, ends, beginindx=ei, maxRes=maxRes,
  101. lres=lres)
  102. return lres
  103. class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
  104. """The class used to access IMDb's data through the web for
  105. mobile terminals."""
  106. accessSystem = 'mobile'
  107. _mobile_logger = logging.getLogger('imdbpy.parser.mobile')
  108. def __init__(self, isThin=0, *arguments, **keywords):
  109. self.accessSystem = 'mobile'
  110. IMDbHTTPAccessSystem.__init__(self, isThin, *arguments, **keywords)
  111. def _clean_html(self, html):
  112. """Normalize the retrieve html."""
  113. html = re_spaces.sub(' ', html)
  114. # Remove silly &nbsp;&raquo; chars.
  115. html = html.replace('&nbsp;&raquo;', '')
  116. return subXMLRefs(html)
  117. def _mretrieve(self, url, size=-1):
  118. """Retrieve an html page and normalize it."""
  119. cont = self._retrieve(url, size=size)
  120. return self._clean_html(cont)
  121. def _getPersons(self, s, sep='<br/>'):
  122. """Return a list of Person objects, from the string s; items
  123. are assumed to be separated by the sep string."""
  124. names = s.split(sep)
  125. pl = []
  126. plappend = pl.append
  127. counter = 1
  128. for name in names:
  129. pid = re_imdbID.findall(name)
  130. if not pid: continue
  131. characters = _getTagsWith(name, 'class="char"',
  132. toClosure=True, maxRes=1)
  133. chpids = []
  134. if characters:
  135. for ch in characters[0].split(' / '):
  136. chid = re_imdbID.findall(ch)
  137. if not chid:
  138. chpids.append(None)
  139. else:
  140. chpids.append(chid[-1])
  141. if not chpids:
  142. chpids = None
  143. elif len(chpids) == 1:
  144. chpids = chpids[0]
  145. name = _unHtml(name)
  146. # Catch unclosed tags.
  147. gt_indx = name.find('>')
  148. if gt_indx != -1:
  149. name = name[gt_indx+1:].lstrip()
  150. if not name: continue
  151. if name.endswith('...'):
  152. name = name[:-3]
  153. p = build_person(name, personID=str(pid[0]), billingPos=counter,
  154. modFunct=self._defModFunct, roleID=chpids,
  155. accessSystem=self.accessSystem)
  156. plappend(p)
  157. counter += 1
  158. return pl
  159. def _search_movie(self, title, results):
  160. ##params = urllib.urlencode({'tt': 'on','mx': str(results),'q': title})
  161. ##params = 'q=%s&tt=on&mx=%s' % (urllib.quote_plus(title), str(results))
  162. ##cont = self._mretrieve(imdbURL_search % params)
  163. cont = subXMLRefs(self._get_search_content('tt', title, results))
  164. title = _findBetween(cont, '<title>', '</title>', maxRes=1)
  165. res = []
  166. if not title:
  167. self._mobile_logger.error('no title tag searching for movie %s',
  168. title)
  169. return res
  170. tl = title[0].lower()
  171. if not tl.startswith('imdb title'):
  172. # a direct hit!
  173. title = _unHtml(title[0])
  174. mid = None
  175. midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
  176. if midtag:
  177. mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
  178. if not (mid and title):
  179. self._mobile_logger.error('no direct hit title/movieID for' \
  180. ' title %s', title)
  181. return res
  182. if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
  183. title += ' (mini)'
  184. res[:] = [(str(mid[0]), analyze_title(title))]
  185. else:
  186. # XXX: this results*3 prevents some recursion errors, but...
  187. # it's not exactly understandable (i.e.: why 'results' is
  188. # not enough to get all the results?)
  189. lis = _findBetween(cont, 'td valign="top">', '</td>',
  190. maxRes=results*3)
  191. for li in lis:
  192. akas = re_makas.findall(li)
  193. for idx, aka in enumerate(akas):
  194. aka = aka.replace('" - ', '::', 1)
  195. aka = _unHtml(aka)
  196. if aka.startswith('aka "'):
  197. aka = aka[5:].strip()
  198. if aka[-1] == '"':
  199. aka = aka[:-1]
  200. akas[idx] = aka
  201. imdbid = re_imdbID.findall(li)
  202. li = re_makas.sub('', li)
  203. mtitle = _unHtml(li)
  204. if not (imdbid and mtitle):
  205. self._mobile_logger.debug('no title/movieID parsing' \
  206. ' %s searching for title %s', li,
  207. title)
  208. continue
  209. mtitle = mtitle.replace('(TV mini-series)', '(mini)')
  210. resd = analyze_title(mtitle)
  211. if akas:
  212. resd['akas'] = akas
  213. res.append((str(imdbid[0]), resd))
  214. return res
  215. def get_movie_main(self, movieID):
  216. cont = self._mretrieve(self.urls['movie_main'] % movieID + 'maindetails')
  217. title = _findBetween(cont, '<title>', '</title>', maxRes=1)
  218. if not title:
  219. raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
  220. title = _unHtml(title[0])
  221. if title.endswith(' - IMDb'):
  222. title = title[:-7]
  223. if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
  224. title += ' (mini)'
  225. d = analyze_title(title)
  226. kind = d.get('kind')
  227. tv_series = _findBetween(cont, 'TV Series:</h5>', '</a>', maxRes=1)
  228. if tv_series: mid = re_imdbID.findall(tv_series[0])
  229. else: mid = None
  230. if tv_series and mid:
  231. s_title = _unHtml(tv_series[0])
  232. s_data = analyze_title(s_title)
  233. m = Movie(movieID=str(mid[0]), data=s_data,
  234. accessSystem=self.accessSystem,
  235. modFunct=self._defModFunct)
  236. d['kind'] = kind = u'episode'
  237. d['episode of'] = m
  238. if kind in ('tv series', 'tv mini series'):
  239. years = _findBetween(cont, '<h1>', '</h1>', maxRes=1)
  240. if years:
  241. years[:] = _findBetween(years[0], 'TV series', '</span>',
  242. maxRes=1)
  243. if years:
  244. d['series years'] = years[0].strip()
  245. air_date = _findBetween(cont, 'Original Air Date:</h5>', '</div>',
  246. maxRes=1)
  247. if air_date:
  248. air_date = air_date[0]
  249. vi = air_date.find('(')
  250. if vi != -1:
  251. date = _unHtml(air_date[:vi]).strip()
  252. if date != '????':
  253. d['original air date'] = date
  254. air_date = air_date[vi:]
  255. season = _findBetween(air_date, 'Season', ',', maxRes=1)
  256. if season:
  257. season = season[0].strip()
  258. try: season = int(season)
  259. except: pass
  260. if season or type(season) is _inttype:
  261. d['season'] = season
  262. episode = _findBetween(air_date, 'Episode', ')', maxRes=1)
  263. if episode:
  264. episode = episode[0].strip()
  265. try: episode = int(episode)
  266. except: pass
  267. if episode or type(season) is _inttype:
  268. d['episode'] = episode
  269. direct = _findBetween(cont, '<h5>Director', ('</div>', '<br/> <br/>'),
  270. maxRes=1)
  271. if direct:
  272. direct = direct[0]
  273. h5idx = direct.find('/h5>')
  274. if h5idx != -1:
  275. direct = direct[h5idx+4:]
  276. direct = self._getPersons(direct)
  277. if direct: d['director'] = direct
  278. if kind in ('tv series', 'tv mini series', 'episode'):
  279. if kind != 'episode':
  280. seasons = _findBetween(cont, 'Seasons:</h5>', '</div>',
  281. maxRes=1)
  282. if seasons:
  283. d['number of seasons'] = seasons[0].count('|') + 1
  284. creator = _findBetween(cont, 'Created by</h5>', ('class="tn15more"',
  285. '</div>',
  286. '<br/> <br/>'),
  287. maxRes=1)
  288. if not creator:
  289. # They change 'Created by' to 'Creator' and viceversa
  290. # from time to time...
  291. # XXX: is 'Creators' also used?
  292. creator = _findBetween(cont, 'Creator:</h5>',
  293. ('class="tn15more"', '</div>',
  294. '<br/> <br/>'), maxRes=1)
  295. if creator:
  296. creator = creator[0]
  297. if creator.find('tn15more'): creator = '%s>' % creator
  298. creator = self._getPersons(creator)
  299. if creator: d['creator'] = creator
  300. writers = _findBetween(cont, '<h5>Writer', ('</div>', '<br/> <br/>'),
  301. maxRes=1)
  302. if writers:
  303. writers = writers[0]
  304. h5idx = writers.find('/h5>')
  305. if h5idx != -1:
  306. writers = writers[h5idx+4:]
  307. writers = self._getPersons(writers)
  308. if writers: d['writer'] = writers
  309. cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
  310. if cvurl:
  311. cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
  312. if cvurl: d['cover url'] = cvurl[0]
  313. genres = _findBetween(cont, 'href="/genre/', '"')
  314. if genres:
  315. d['genres'] = list(set(genres))
  316. ur = _findBetween(cont, 'id="star-bar-user-rate">', '</div>',
  317. maxRes=1)
  318. if ur:
  319. rat = _findBetween(ur[0], '<b>', '</b>', maxRes=1)
  320. if rat:
  321. if rat:
  322. d['rating'] = rat[0].strip()
  323. else:
  324. self._mobile_logger.warn('wrong rating: %s', rat)
  325. vi = ur[0].rfind('href="ratings"')
  326. if vi != -1 and ur[0][vi+10:].find('await') == -1:
  327. try:
  328. votes = _findBetween(ur[0][vi:], "title='",
  329. " IMDb", maxRes=1)
  330. votes = int(votes[0].replace(',', ''))
  331. d['votes'] = votes
  332. except (ValueError, IndexError):
  333. self._mobile_logger.warn('wrong votes: %s', ur)
  334. top250 = _findBetween(cont, 'href="/chart/top?', '</a>', maxRes=1)
  335. if top250:
  336. fn = top250[0].rfind('#')
  337. if fn != -1:
  338. try:
  339. td = int(top250[0][fn+1:])
  340. d['top 250 rank'] = td
  341. except ValueError:
  342. self._mobile_logger.warn('wrong top250: %s', top250)
  343. castdata = _findBetween(cont, 'Cast overview', '</table>', maxRes=1)
  344. if not castdata:
  345. castdata = _findBetween(cont, 'Credited cast', '</table>', maxRes=1)
  346. if not castdata:
  347. castdata = _findBetween(cont, 'Complete credited cast', '</table>',
  348. maxRes=1)
  349. if not castdata:
  350. castdata = _findBetween(cont, 'Series Cast Summary', '</table>',
  351. maxRes=1)
  352. if not castdata:
  353. castdata = _findBetween(cont, 'Episode Credited cast', '</table>',
  354. maxRes=1)
  355. if castdata:
  356. castdata = castdata[0]
  357. # Reintegrate the fist tag.
  358. fl = castdata.find('href=')
  359. if fl != -1: castdata = '<a ' + castdata[fl:]
  360. # Exclude the 'rest of cast listed alphabetically' row.
  361. smib = castdata.find('<tr><td align="center" colspan="4"><small>')
  362. if smib != -1:
  363. smie = castdata.rfind('</small></td></tr>')
  364. if smie != -1:
  365. castdata = castdata[:smib].strip() + \
  366. castdata[smie+18:].strip()
  367. castdata = castdata.replace('/tr> <tr', '/tr><tr')
  368. cast = self._getPersons(castdata, sep='</tr><tr')
  369. if cast: d['cast'] = cast
  370. akas = _findBetween(cont, 'Also Known As:</h5>', '</div>', maxRes=1)
  371. if akas:
  372. # For some reason, here <br> is still used in place of <br/>.
  373. akas[:] = [x for x in akas[0].split('<br>') if x.strip()]
  374. akas = [_unHtml(x).replace('" - ','::', 1).lstrip('"').strip()
  375. for x in akas]
  376. if 'See more' in akas: akas.remove('See more')
  377. akas[:] = [x for x in akas if x]
  378. if akas:
  379. d['akas'] = akas
  380. mpaa = _findBetween(cont, 'MPAA</a>:', '</div>', maxRes=1)
  381. if mpaa: d['mpaa'] = _unHtml(mpaa[0])
  382. runtimes = _findBetween(cont, 'Runtime:</h5>', '</div>', maxRes=1)
  383. if runtimes:
  384. runtimes = runtimes[0]
  385. runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
  386. for x in runtimes.split('|')]
  387. d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
  388. if kind == 'episode':
  389. # number of episodes.
  390. epsn = _findBetween(cont, 'title="Full Episode List">', '</a>',
  391. maxRes=1)
  392. if epsn:
  393. epsn = epsn[0].replace(' Episodes', '').strip()
  394. if epsn:
  395. try:
  396. epsn = int(epsn)
  397. except:
  398. self._mobile_logger.warn('wrong episodes #: %s', epsn)
  399. d['number of episodes'] = epsn
  400. country = _findBetween(cont, 'Country:</h5>', '</div>', maxRes=1)
  401. if country:
  402. country[:] = country[0].split(' | ')
  403. country[:] = ['<a %s' % x for x in country if x]
  404. country[:] = [_unHtml(x.replace(' <i>', '::')) for x in country]
  405. if country: d['countries'] = country
  406. lang = _findBetween(cont, 'Language:</h5>', '</div>', maxRes=1)
  407. if lang:
  408. lang[:] = lang[0].split(' | ')
  409. lang[:] = ['<a %s' % x for x in lang if x]
  410. lang[:] = [_unHtml(x.replace(' <i>', '::')) for x in lang]
  411. if lang: d['languages'] = lang
  412. col = _findBetween(cont, '"/search/title?colors=', '</div>')
  413. if col:
  414. col[:] = col[0].split(' | ')
  415. col[:] = ['<a %s' % x for x in col if x]
  416. col[:] = [_unHtml(x.replace(' <i>', '::')) for x in col]
  417. if col: d['color info'] = col
  418. sm = _findBetween(cont, '/search/title?sound_mixes=', '</div>',
  419. maxRes=1)
  420. if sm:
  421. sm[:] = sm[0].split(' | ')
  422. sm[:] = ['<a %s' % x for x in sm if x]
  423. sm[:] = [_unHtml(x.replace(' <i>', '::')) for x in sm]
  424. if sm: d['sound mix'] = sm
  425. cert = _findBetween(cont, 'Certification:</h5>', '</div>', maxRes=1)
  426. if cert:
  427. cert[:] = cert[0].split(' | ')
  428. cert[:] = [_unHtml(x.replace(' <i>', '::')) for x in cert]
  429. if cert: d['certificates'] = cert
  430. plotoutline = _findBetween(cont, 'Plot:</h5>', ['<a ', '</div>'],
  431. maxRes=1)
  432. if plotoutline:
  433. plotoutline = plotoutline[0].strip()
  434. plotoutline = plotoutline.rstrip('|').rstrip()
  435. if plotoutline: d['plot outline'] = _unHtml(plotoutline)
  436. aratio = _findBetween(cont, 'Aspect Ratio:</h5>', ['<a ', '</div>'],
  437. maxRes=1)
  438. if aratio:
  439. aratio = aratio[0].strip().replace(' (', '::(', 1)
  440. if aratio:
  441. d['aspect ratio'] = _unHtml(aratio)
  442. return {'data': d}
  443. def get_movie_plot(self, movieID):
  444. cont = self._mretrieve(self.urls['movie_main'] % movieID + 'plotsummary')
  445. plot = _findBetween(cont, '<p class="plotpar">', '</p>')
  446. plot[:] = [_unHtml(x) for x in plot]
  447. for i in xrange(len(plot)):
  448. p = plot[i]
  449. wbyidx = p.rfind(' Written by ')
  450. if wbyidx != -1:
  451. plot[i] = '%s::%s' % \
  452. (p[:wbyidx].rstrip(),
  453. p[wbyidx+12:].rstrip().replace('{','<').replace('}','>'))
  454. if plot: return {'data': {'plot': plot}}
  455. return {'data': {}}
  456. def _search_person(self, name, results):
  457. ##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name})
  458. ##params = 'q=%s&nm=on&mx=%s' % (urllib.quote_plus(name), str(results))
  459. ##cont = self._mretrieve(imdbURL_search % params)
  460. cont = subXMLRefs(self._get_search_content('nm', name, results))
  461. name = _findBetween(cont, '<title>', '</title>', maxRes=1)
  462. res = []
  463. if not name:
  464. self._mobile_logger.warn('no title tag searching for name %s', name)
  465. return res
  466. nl = name[0].lower()
  467. if not nl.startswith('imdb name'):
  468. # a direct hit!
  469. name = _unHtml(name[0])
  470. name = name.replace('- Filmography by type' , '').strip()
  471. pid = None
  472. pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
  473. if pidtag:
  474. pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
  475. if not (pid and name):
  476. self._mobile_logger.error('no direct hit name/personID for' \
  477. ' name %s', name)
  478. return res
  479. res[:] = [(str(pid[0]), analyze_name(name, canonical=1))]
  480. else:
  481. lis = _findBetween(cont, 'td valign="top">', '</td>',
  482. maxRes=results*3)
  483. for li in lis:
  484. akas = _findBetween(li, '<em>"', '"</em>')
  485. for sep in ['<small', '<br> aka', '<br> birth name']:
  486. sepIdx = li.find(sep)
  487. if sepIdx != -1:
  488. li = li[:sepIdx]
  489. pid = re_imdbID.findall(li)
  490. pname = _unHtml(li)
  491. if not (pid and pname):
  492. self._mobile_logger.debug('no name/personID parsing' \
  493. ' %s searching for name %s', li,
  494. name)
  495. continue
  496. resd = analyze_name(pname, canonical=1)
  497. if akas:
  498. resd['akas'] = akas
  499. res.append((str(pid[0]), resd))
  500. return res
  501. def get_person_main(self, personID, _parseChr=False):
  502. if not _parseChr:
  503. url = self.urls['person_main'] % personID + 'maindetails'
  504. else:
  505. url = self.urls['character_main'] % personID
  506. s = self._mretrieve(url)
  507. r = {}
  508. name = _findBetween(s, '<title>', '</title>', maxRes=1)
  509. if not name:
  510. if _parseChr: w = 'characterID'
  511. else: w = 'personID'
  512. raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
  513. name = _unHtml(name[0].replace(' - IMDb', ''))
  514. if _parseChr:
  515. name = name.replace('(Character)', '').strip()
  516. name = name.replace('- Filmography by type', '').strip()
  517. else:
  518. name = name.replace('- Filmography by', '').strip()
  519. r = analyze_name(name, canonical=not _parseChr)
  520. for dKind in ('Born', 'Died'):
  521. date = _findBetween(s, '%s:</h4>' % dKind.capitalize(),
  522. ('<div class', '</div>', '<br/><br/>'), maxRes=1)
  523. if date:
  524. date = _unHtml(date[0])
  525. if date:
  526. #date, notes = date_and_notes(date)
  527. # TODO: fix to handle real names.
  528. date_notes = date.split(' in ', 1)
  529. notes = u''
  530. date = date_notes[0]
  531. if len(date_notes) == 2:
  532. notes = date_notes[1]
  533. dtitle = 'birth'
  534. if dKind == 'Died':
  535. dtitle = 'death'
  536. if date:
  537. r['%s date' % dtitle] = date
  538. if notes:
  539. r['%s notes' % dtitle] = notes
  540. akas = _findBetween(s, 'Alternate Names:</h4>', ('</div>',
  541. '<br/><br/>'), maxRes=1)
  542. if akas:
  543. akas = akas[0]
  544. if akas:
  545. akas = _unHtml(akas)
  546. if akas.find(' | ') != -1:
  547. akas = akas.split(' | ')
  548. else:
  549. akas = akas.split(' / ')
  550. if akas: r['akas'] = filter(None, [x.strip() for x in akas])
  551. hs = _findBetween(s, "rel='image_src'", '>', maxRes=1)
  552. if not hs:
  553. hs = _findBetween(s, 'rel="image_src"', '>', maxRes=1)
  554. if not hs:
  555. hs = _findBetween(s, '<a name="headshot"', '</a>', maxRes=1)
  556. if hs:
  557. hsl = _findBetween(hs[0], "href='", "'", maxRes=1)
  558. if not hsl:
  559. hsl = _findBetween(hs[0], 'href="', '"', maxRes=1)
  560. if hsl and 'imdb-share-logo' not in hsl[0]:
  561. r['headshot'] = hsl[0]
  562. # Build a list of tuples such [('hrefLink', 'section name')]
  563. workkind = _findBetween(s, 'id="jumpto_', '</a>')
  564. ws = []
  565. for work in workkind:
  566. sep = '" >'
  567. if '">' in work:
  568. sep = '">'
  569. wsplit = work.split(sep, 1)
  570. if len(wsplit) == 2:
  571. sect = wsplit[0]
  572. if '"' in sect:
  573. sect = sect[:sect.find('"')]
  574. ws.append((sect, wsplit[1].lower()))
  575. # XXX: I think "guest appearances" are gone.
  576. if s.find('<a href="#guest-appearances"') != -1:
  577. ws.append(('guest-appearances', 'notable tv guest appearances'))
  578. #if _parseChr:
  579. # ws.append(('filmography', 'filmography'))
  580. for sect, sectName in ws:
  581. raws = u''
  582. if sectName == 'self':
  583. sect = 'Self'
  584. # Everything between the current section link and the end
  585. # of the <ol> tag.
  586. if _parseChr and sect == 'filmography':
  587. inisect = s.find('<div class="filmo">')
  588. else:
  589. inisect = s.find('<a name="%s' % sect)
  590. if inisect != -1:
  591. endsect = s[inisect:].find('<div id="filmo-head-')
  592. if endsect == -1:
  593. endsect = s[inisect:].find('<div class="article"')
  594. if endsect != -1: raws = s[inisect:inisect+endsect]
  595. #if not raws: continue
  596. mlist = _findBetween(raws, '<div class="filmo-row',
  597. ('<div class="clear"/>',))
  598. for m in mlist:
  599. fCB = m.find('>')
  600. if fCB != -1:
  601. m = m[fCB+1:].lstrip()
  602. m = re_filmo_episodes.sub('', m)
  603. # For every movie in the current section.
  604. movieID = re_imdbID.findall(m)
  605. if not movieID:
  606. self._mobile_logger.debug('no movieID in %s', m)
  607. continue
  608. m = m.replace('<br/>', ' .... ', 1)
  609. if not _parseChr:
  610. chrIndx = m.find(' .... ')
  611. else:
  612. chrIndx = m.find(' Played by ')
  613. chids = []
  614. if chrIndx != -1:
  615. chrtxt = m[chrIndx+6:]
  616. if _parseChr:
  617. chrtxt = chrtxt[5:]
  618. for ch in chrtxt.split(' / '):
  619. chid = re_imdbID.findall(ch)
  620. if not chid:
  621. chids.append(None)
  622. else:
  623. chids.append(chid[-1])
  624. if not chids:
  625. chids = None
  626. elif len(chids) == 1:
  627. chids = chids[0]
  628. movieID = str(movieID[0])
  629. # Search the status.
  630. stidx = m.find('<i>')
  631. status = u''
  632. if stidx != -1:
  633. stendidx = m.rfind('</i>')
  634. if stendidx != -1:
  635. status = _unHtml(m[stidx+3:stendidx])
  636. m = m.replace(m[stidx+3:stendidx], '')
  637. year = _findBetween(m, 'year_column">', '</span>', maxRes=1)
  638. if year:
  639. year = year[0]
  640. m = m.replace('<span class="year_column">%s</span>' % year,
  641. '')
  642. else:
  643. year = None
  644. m = _unHtml(m)
  645. if not m:
  646. self._mobile_logger.warn('no title for movieID %s', movieID)
  647. continue
  648. movie = build_movie(m, movieID=movieID, status=status,
  649. roleID=chids, modFunct=self._defModFunct,
  650. accessSystem=self.accessSystem,
  651. _parsingCharacter=_parseChr, year=year)
  652. sectName = sectName.split(':')[0]
  653. r.setdefault(sectName, []).append(movie)
  654. # If available, take the always correct name from a form.
  655. itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
  656. if not itag:
  657. itag = _getTagsWith(s, 'name="primary"', maxRes=1)
  658. if itag:
  659. vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
  660. if not vtag:
  661. vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
  662. if vtag:
  663. try:
  664. vtag = unquote(str(vtag[0]))
  665. vtag = unicode(vtag, 'latin_1')
  666. r.update(analyze_name(vtag))
  667. except UnicodeEncodeError:
  668. pass
  669. return {'data': r, 'info sets': ('main', 'filmography')}
  670. def get_person_biography(self, personID):
  671. cont = self._mretrieve(self.urls['person_main'] % personID + 'bio')
  672. d = {}
  673. spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
  674. maxRes=1)
  675. if spouses:
  676. sl = []
  677. for spouse in spouses[0].split('</tr>'):
  678. if spouse.count('</td>') > 1:
  679. spouse = spouse.replace('</td>', '::</td>', 1)
  680. spouse = _unHtml(spouse)
  681. spouse = spouse.replace(':: ', '::').strip()
  682. if spouse: sl.append(spouse)
  683. if sl: d['spouse'] = sl
  684. nnames = _findBetween(cont, '<h5>Nickname</h5>', ('<br/> <br/>','<h5>'),
  685. maxRes=1)
  686. if nnames:
  687. nnames = nnames[0]
  688. if nnames:
  689. nnames = [x.strip().replace(' (', '::(', 1)
  690. for x in nnames.split('<br/>')]
  691. if nnames:
  692. d['nick names'] = nnames
  693. misc_sects = _findBetween(cont, '<h5>', '<br/>')
  694. misc_sects[:] = [x.split('</h5>') for x in misc_sects]
  695. misc_sects[:] = [x for x in misc_sects if len(x) == 2]
  696. for sect, data in misc_sects:
  697. sect = sect.lower().replace(':', '').strip()
  698. if d.has_key(sect) and sect != 'mini biography': continue
  699. elif sect in ('spouse', 'nickname'): continue
  700. if sect == 'salary': sect = 'salary history'
  701. elif sect == 'where are they now': sect = 'where now'
  702. elif sect == 'personal quotes': sect = 'quotes'
  703. data = data.replace('</p><p>', '::')
  704. data = data.replace('<br><br>', ' ') # for multi-paragraphs 'bio'
  705. data = data.replace('</td> <td valign="top">', '@@@@')
  706. data = data.replace('</td> </tr>', '::')
  707. data = _unHtml(data)
  708. data = [x.strip() for x in data.split('::')]
  709. data[:] = [x.replace('@@@@', '::') for x in data if x]
  710. if sect == 'height' and data: data = data[0]
  711. elif sect == 'birth name': data = canonicalName(data[0])
  712. elif sect == 'date of birth':
  713. date, notes = date_and_notes(data[0])
  714. if date:
  715. d['birth date'] = date
  716. if notes:
  717. d['birth notes'] = notes
  718. continue
  719. elif sect == 'date of death':
  720. date, notes = date_and_notes(data[0])
  721. if date:
  722. d['death date'] = date
  723. if notes:
  724. d['death notes'] = notes
  725. continue
  726. elif sect == 'mini biography':
  727. ndata = []
  728. for bio in data:
  729. byidx = bio.rfind('IMDb Mini Biography By')
  730. if byidx != -1:
  731. bioAuth = bio[:byidx].rstrip()
  732. else:
  733. bioAuth = 'Anonymous'
  734. bio = u'%s::%s' % (bioAuth, bio[byidx+23:].lstrip())
  735. ndata.append(bio)
  736. data[:] = ndata
  737. if 'mini biography' in d:
  738. d['mini biography'].append(ndata[0])
  739. continue
  740. d[sect] = data
  741. return {'data': d}
  742. def _search_character(self, name, results):
  743. cont = subXMLRefs(self._get_search_content('char', name, results))
  744. name = _findBetween(cont, '<title>', '</title>', maxRes=1)
  745. res = []
  746. if not name:
  747. self._mobile_logger.error('no title tag searching character %s',
  748. name)
  749. return res
  750. nl = name[0].lower()
  751. if not (nl.startswith('imdb search') or nl.startswith('imdb search') \
  752. or nl.startswith('imdb character')):
  753. # a direct hit!
  754. name = _unHtml(name[0]).replace('(Character)', '').strip()
  755. pid = None
  756. pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
  757. if pidtag:
  758. pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1)
  759. if not (pid and name):
  760. self._mobile_logger.error('no direct hit name/characterID for' \
  761. ' character %s', name)
  762. return res
  763. res[:] = [(str(pid[0]), analyze_name(name))]
  764. else:
  765. sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>',
  766. maxRes=results*3)
  767. sects += _findBetween(cont, '<b>Characters', '</table>',
  768. maxRes=results*3)
  769. for sect in sects:
  770. lis = _findBetween(sect, '<a href="/character/',
  771. ['<small', '</td>', '<br'])
  772. for li in lis:
  773. li = '<%s' % li
  774. pid = re_imdbID.findall(li)
  775. pname = _unHtml(li)
  776. if not (pid and pname):
  777. self._mobile_logger.debug('no name/characterID' \
  778. ' parsing %s searching for' \
  779. ' character %s', li, name)
  780. continue
  781. res.append((str(pid[0]), analyze_name(pname)))
  782. return res
  783. def get_character_main(self, characterID):
  784. return self.get_person_main(characterID, _parseChr=True)
  785. def get_character_biography(self, characterID):
  786. cont = self._mretrieve(self.urls['character_main'] % characterID + 'bio')
  787. d = {}
  788. intro = _findBetween(cont, '<div class="display">',
  789. ('<span>', '<h4>'), maxRes=1)
  790. if intro:
  791. intro = _unHtml(intro[0]).strip()
  792. if intro:
  793. d['introduction'] = intro
  794. tocidx = cont.find('<table id="toc..')
  795. if tocidx != -1:
  796. cont = cont[tocidx:]
  797. bios = _findBetween(cont, '<h4>', ('<h4>', '</div>'))
  798. if bios:
  799. for bio in bios:
  800. bio = bio.replace('</h4>', '::')
  801. bio = bio.replace('\n', ' ')
  802. bio = bio.replace('<br>', '\n')
  803. bio = bio.replace('<br/>', '\n')
  804. bio = subSGMLRefs(re_unhtmlsub('', bio).strip())
  805. bio = bio.replace(' ::', '::').replace(':: ', '::')
  806. bio = bio.replace('::', ': ', 1)
  807. if bio:
  808. d.setdefault('biography', []).append(bio)
  809. return {'data': d}