/lib/imdb/parser/mobile/__init__.py
Python | 844 lines | 799 code | 10 blank | 35 comment | 64 complexity | e3de09fe1579b1ce7c34d28ab8ce9126 MD5 | raw file
- """
- parser.mobile package (imdb package).
- This package provides the IMDbMobileAccessSystem class used to access
- IMDb's data for mobile systems.
- The imdb.IMDb function will return an instance of this class when
- called with the 'accessSystem' argument set to "mobile".
- Copyright 2005-2011 Davide Alberani <da@erlug.linux.it>
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- """
- import re
- import logging
- from urllib import unquote
- from imdb.Movie import Movie
- from imdb.utils import analyze_title, analyze_name, canonicalName, \
- date_and_notes
- from imdb._exceptions import IMDbDataAccessError
- from imdb.parser.http import IMDbHTTPAccessSystem
- from imdb.parser.http.utils import subXMLRefs, subSGMLRefs, build_person, \
- build_movie, re_spaces
# XXX NOTE: the first version of this module was heavily based on
#           regular expressions.  This new version replaces regexps with
#           find() string method calls; despite being less flexible, it
#           seems to be at least as fast and, hopefully, much more
#           lightweight.  Yes: the regexp-based version was too heavyweight
#           for systems with very limited CPU power and memory footprint.

# Bound 'sub' method of the re_spaces pattern imported from
# imdb.parser.http.utils.
re_spacessub = re_spaces.sub
# Strip html: non-greedy match of anything enclosed in '<' and '>'.
re_unhtml = re.compile(r'<.+?>')
re_unhtmlsub = re_unhtml.sub
# imdb person, movie or character ids: seven digits that directly follow
# an 'nm', 'tt' or 'ch' prefix; only the digits are captured.
re_imdbID = re.compile(r'(?<=nm|tt|ch)([0-9]{7})\b')

# movie AKAs: a whole <p class="find-aka">...</p> paragraph.
re_makas = re.compile('(<p class="find-aka">.*?</p>)')

# Remove episode numbers: a whole <div class="filmo-episodes">...</div>.
re_filmo_episodes = re.compile('<div class="filmo-episodes">.*?</div>',
                                re.M | re.I)
def _unHtml(s):
    """Return 's' stripped of tags and of multiple spaces.

    Tags are removed first, then every re_spaces match is replaced by a
    single space, the ends are trimmed and the result is passed through
    subSGMLRefs.
    """
    untagged = re_unhtmlsub('', s)
    single_spaced = re_spacessub(' ', untagged)
    return subSGMLRefs(single_spaced.strip())
# The built-in integer type, kept for "type(x) is _inttype" identity checks.
_inttype = type(0)
- def _getTagsWith(s, cont, toClosure=False, maxRes=None):
- """Return the html tags in the 's' string containing the 'cont'
- string; if toClosure is True, everything between the opening
- tag and the closing tag is returned."""
- lres = []
- bi = s.find(cont)
- if bi != -1:
- btag = s[:bi].rfind('<')
- if btag != -1:
- if not toClosure:
- etag = s[bi+1:].find('>')
- if etag != -1:
- endidx = bi+2+etag
- lres.append(s[btag:endidx])
- if maxRes is not None and len(lres) >= maxRes: return lres
- lres += _getTagsWith(s[endidx:], cont,
- toClosure=toClosure)
- else:
- spaceidx = s[btag:].find(' ')
- if spaceidx != -1:
- ctag = '</%s>' % s[btag+1:btag+spaceidx]
- closeidx = s[bi:].find(ctag)
- if closeidx != -1:
- endidx = bi+closeidx+len(ctag)
- lres.append(s[btag:endidx])
- if maxRes is not None and len(lres) >= maxRes:
- return lres
- lres += _getTagsWith(s[endidx:], cont,
- toClosure=toClosure)
- return lres
- def _findBetween(s, begins, ends, beginindx=0, maxRes=None, lres=None):
- """Return the list of strings from the 's' string which are included
- between the 'begins' and 'ends' strings."""
- if lres is None:
- lres = []
- bi = s.find(begins, beginindx)
- if bi != -1:
- lbegins = len(begins)
- if isinstance(ends, (list, tuple)):
- eset = [s.find(end, bi+lbegins) for end in ends]
- eset[:] = [x for x in eset if x != -1]
- if not eset: ei = -1
- else: ei = min(eset)
- else:
- ei = s.find(ends, bi+lbegins)
- if ei != -1:
- match = s[bi+lbegins:ei]
- lres.append(match)
- if maxRes is not None and len(lres) >= maxRes: return lres
- _findBetween(s, begins, ends, beginindx=ei, maxRes=maxRes,
- lres=lres)
- return lres
class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
    """The class used to access IMDb's data through the web for
    mobile terminals.

    Pages are parsed with plain string searches (_findBetween and
    _getTagsWith) and a few small regular expressions.
    """

    accessSystem = 'mobile'
    _mobile_logger = logging.getLogger('imdbpy.parser.mobile')

    def __init__(self, isThin=0, *arguments, **keywords):
        """Initialize the access system; every argument is handed over
        to IMDbHTTPAccessSystem.__init__."""
        self.accessSystem = 'mobile'
        IMDbHTTPAccessSystem.__init__(self, isThin, *arguments, **keywords)

    def _clean_html(self, html):
        """Normalize the retrieved html."""
        html = re_spaces.sub(' ', html)
        # Remove the silly "&nbsp;&raquo;" sequences.  They are matched as
        # entity references: references are expanded only by the
        # subXMLRefs call below.
        html = html.replace('&nbsp;&raquo;', '')
        return subXMLRefs(html)

    def _mretrieve(self, url, size=-1):
        """Retrieve an html page and normalize it."""
        cont = self._retrieve(url, size=size)
        return self._clean_html(cont)

    @staticmethod
    def _roleIDs(text):
        """Return the character IDs found in the ' / ' separated 'text'.

        Every chunk contributes the last imdbID it contains, or None if
        it contains none; a single result is returned as it is, more
        results are returned as a list.
        """
        roleIDs = []
        for chunk in text.split(' / '):
            found = re_imdbID.findall(chunk)
            if found:
                roleIDs.append(found[-1])
            else:
                roleIDs.append(None)
        if len(roleIDs) == 1:
            return roleIDs[0]
        return roleIDs

    def _pipeList(self, cont, begins):
        """Return the cleaned items of the ' | ' separated list found in
        'cont' between 'begins' and the following '</div>'; an empty
        list is returned if 'begins' is not found."""
        items = _findBetween(cont, begins, '</div>', maxRes=1)
        if not items:
            return items
        # Every item is prefixed with '<a ' before the tags are stripped;
        # presumably this re-opens anchors cut in the middle by 'begins'.
        items = ['<a %s' % x for x in items[0].split(' | ') if x]
        return [_unHtml(x.replace(' <i>', '::')) for x in items]

    def _getPersons(self, s, sep='<br/>'):
        """Return a list of Person objects, from the string s; items
        are assumed to be separated by the sep string.

        Items without an imdbID or without a name are skipped; the
        billing position counts only the items actually returned.
        """
        pl = []
        counter = 1
        for name in s.split(sep):
            pid = re_imdbID.findall(name)
            if not pid:
                continue
            characters = _getTagsWith(name, 'class="char"',
                                        toClosure=True, maxRes=1)
            chpids = None
            if characters:
                chpids = self._roleIDs(characters[0])
            name = _unHtml(name)
            # Catch unclosed tags.
            gt_indx = name.find('>')
            if gt_indx != -1:
                name = name[gt_indx+1:].lstrip()
            if not name:
                continue
            if name.endswith('...'):
                name = name[:-3]
            p = build_person(name, personID=str(pid[0]), billingPos=counter,
                            modFunct=self._defModFunct, roleID=chpids,
                            accessSystem=self.accessSystem)
            pl.append(p)
            counter += 1
        return pl

    def _search_movie(self, title, results):
        """Return a list of (movieID, data) tuples for the movies matching
        'title'; 'data' is the dictionary built by analyze_title.

        An empty list is returned if the page cannot be parsed.
        """
        cont = subXMLRefs(self._get_search_content('tt', title, results))
        # The searched title is kept in 'title', so that it can be logged.
        ptitle = _findBetween(cont, '<title>', '</title>', maxRes=1)
        res = []
        if not ptitle:
            self._mobile_logger.error('no title tag searching for movie %s',
                                    title)
            return res
        if not ptitle[0].lower().startswith('imdb title'):
            # a direct hit!
            mtitle = _unHtml(ptitle[0])
            mid = None
            midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
            if midtag:
                mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
            if not (mid and mtitle):
                self._mobile_logger.error('no direct hit title/movieID for' \
                                        ' title %s', title)
                return res
            if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
                mtitle += ' (mini)'
            res[:] = [(str(mid[0]), analyze_title(mtitle))]
        else:
            # XXX: this results*3 prevents some recursion errors, but...
            #      it's not exactly understandable (i.e.: why 'results' is
            #      not enough to get all the results?)
            lis = _findBetween(cont, 'td valign="top">', '</td>',
                                maxRes=results*3)
            for li in lis:
                akas = re_makas.findall(li)
                for idx, aka in enumerate(akas):
                    aka = aka.replace('" - ', '::', 1)
                    aka = _unHtml(aka)
                    if aka.startswith('aka "'):
                        aka = aka[5:].strip()
                    # endswith() is safe on an empty string too.
                    if aka.endswith('"'):
                        aka = aka[:-1]
                    akas[idx] = aka
                imdbid = re_imdbID.findall(li)
                li = re_makas.sub('', li)
                mtitle = _unHtml(li)
                if not (imdbid and mtitle):
                    self._mobile_logger.debug('no title/movieID parsing' \
                                            ' %s searching for title %s', li,
                                            title)
                    continue
                mtitle = mtitle.replace('(TV mini-series)', '(mini)')
                resd = analyze_title(mtitle)
                if akas:
                    resd['akas'] = akas
                res.append((str(imdbid[0]), resd))
        return res

    def get_movie_main(self, movieID):
        """Parse the 'maindetails' page of the given movie and return
        {'data': d}, where 'd' maps the information found to its value.

        Raise IMDbDataAccessError if the page has no <title> tag.
        """
        cont = self._mretrieve(self.urls['movie_main'] % movieID + 'maindetails')
        title = _findBetween(cont, '<title>', '</title>', maxRes=1)
        if not title:
            raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
        title = _unHtml(title[0])
        if title.endswith(' - IMDb'):
            title = title[:-7]
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            title += ' (mini)'
        d = analyze_title(title)
        kind = d.get('kind')
        # An episode links the series it belongs to.
        tv_series = _findBetween(cont, 'TV Series:</h5>', '</a>', maxRes=1)
        if tv_series:
            mid = re_imdbID.findall(tv_series[0])
        else:
            mid = None
        if tv_series and mid:
            s_title = _unHtml(tv_series[0])
            s_data = analyze_title(s_title)
            m = Movie(movieID=str(mid[0]), data=s_data,
                        accessSystem=self.accessSystem,
                        modFunct=self._defModFunct)
            d['kind'] = kind = u'episode'
            d['episode of'] = m
        if kind in ('tv series', 'tv mini series'):
            years = _findBetween(cont, '<h1>', '</h1>', maxRes=1)
            if years:
                years[:] = _findBetween(years[0], 'TV series', '</span>',
                                        maxRes=1)
                if years:
                    d['series years'] = years[0].strip()
        air_date = _findBetween(cont, 'Original Air Date:</h5>', '</div>',
                                maxRes=1)
        if air_date:
            air_date = air_date[0]
            vi = air_date.find('(')
            if vi != -1:
                date = _unHtml(air_date[:vi]).strip()
                if date != '????':
                    d['original air date'] = date
                air_date = air_date[vi:]
                season = _findBetween(air_date, 'Season', ',', maxRes=1)
                if season:
                    season = season[0].strip()
                    try:
                        season = int(season)
                    except ValueError:
                        # Keep the text as it is.
                        pass
                    # An integer is stored even when it is zero.
                    if season or isinstance(season, int):
                        d['season'] = season
                episode = _findBetween(air_date, 'Episode', ')', maxRes=1)
                if episode:
                    episode = episode[0].strip()
                    try:
                        episode = int(episode)
                    except ValueError:
                        pass
                    if episode or isinstance(episode, int):
                        d['episode'] = episode
        direct = _findBetween(cont, '<h5>Director', ('</div>', '<br/> <br/>'),
                                maxRes=1)
        if direct:
            direct = direct[0]
            h5idx = direct.find('/h5>')
            if h5idx != -1:
                direct = direct[h5idx+4:]
            direct = self._getPersons(direct)
            if direct:
                d['director'] = direct
        if kind in ('tv series', 'tv mini series', 'episode'):
            if kind != 'episode':
                seasons = _findBetween(cont, 'Seasons:</h5>', '</div>',
                                        maxRes=1)
                if seasons:
                    d['number of seasons'] = seasons[0].count('|') + 1
            creatorEnds = ('class="tn15more"', '</div>', '<br/> <br/>')
            creator = _findBetween(cont, 'Created by</h5>', creatorEnds,
                                    maxRes=1)
            if not creator:
                # They change 'Created by' to 'Creator' and viceversa
                # from time to time...
                # XXX: is 'Creators' also used?
                creator = _findBetween(cont, 'Creator:</h5>', creatorEnds,
                                        maxRes=1)
            if creator:
                creator = creator[0]
                # NOTE: find() returns -1 (a true value) when the text is
                #       missing, so '>' is appended unless the string
                #       starts with 'tn15more'.  Left untouched on purpose:
                #       the '>' closes the tag left open when the match
                #       ended at 'class="tn15more"'.
                if creator.find('tn15more'):
                    creator = '%s>' % creator
                creator = self._getPersons(creator)
                if creator:
                    d['creator'] = creator
        writers = _findBetween(cont, '<h5>Writer', ('</div>', '<br/> <br/>'),
                                maxRes=1)
        if writers:
            writers = writers[0]
            h5idx = writers.find('/h5>')
            if h5idx != -1:
                writers = writers[h5idx+4:]
            writers = self._getPersons(writers)
            if writers:
                d['writer'] = writers
        cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
        if cvurl:
            cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
            if cvurl:
                d['cover url'] = cvurl[0]
        genres = _findBetween(cont, 'href="/genre/', '"')
        if genres:
            d['genres'] = list(set(genres))
        ur = _findBetween(cont, 'id="star-bar-user-rate">', '</div>',
                            maxRes=1)
        if ur:
            rat = _findBetween(ur[0], '<b>', '</b>', maxRes=1)
            if rat:
                d['rating'] = rat[0].strip()
            vi = ur[0].rfind('href="ratings"')
            if vi != -1 and ur[0][vi+10:].find('await') == -1:
                try:
                    votes = _findBetween(ur[0][vi:], "title='",
                                        " IMDb", maxRes=1)
                    votes = int(votes[0].replace(',', ''))
                    d['votes'] = votes
                except (ValueError, IndexError):
                    self._mobile_logger.warning('wrong votes: %s', ur)
        top250 = _findBetween(cont, 'href="/chart/top?', '</a>', maxRes=1)
        if top250:
            fn = top250[0].rfind('#')
            if fn != -1:
                try:
                    td = int(top250[0][fn+1:])
                    d['top 250 rank'] = td
                except ValueError:
                    self._mobile_logger.warning('wrong top250: %s', top250)
        # The cast table is introduced by one of these labels; the first
        # one that is found wins.
        castdata = []
        for castLabel in ('Cast overview', 'Credited cast',
                            'Complete credited cast', 'Series Cast Summary',
                            'Episode Credited cast'):
            castdata = _findBetween(cont, castLabel, '</table>', maxRes=1)
            if castdata:
                break
        if castdata:
            castdata = castdata[0]
            # Reintegrate the first tag.
            fl = castdata.find('href=')
            if fl != -1:
                castdata = '<a ' + castdata[fl:]
            # Exclude the 'rest of cast listed alphabetically' row.
            smib = castdata.find('<tr><td align="center" colspan="4"><small>')
            if smib != -1:
                smie = castdata.rfind('</small></td></tr>')
                if smie != -1:
                    # 18 is the length of '</small></td></tr>'.
                    castdata = castdata[:smib].strip() + \
                                castdata[smie+18:].strip()
            castdata = castdata.replace('/tr> <tr', '/tr><tr')
            cast = self._getPersons(castdata, sep='</tr><tr')
            if cast:
                d['cast'] = cast
        akas = _findBetween(cont, 'Also Known As:</h5>', '</div>', maxRes=1)
        if akas:
            # For some reason, here <br> is still used in place of <br/>.
            akas[:] = [x for x in akas[0].split('<br>') if x.strip()]
            akas = [_unHtml(x).replace('" - ','::', 1).lstrip('"').strip()
                    for x in akas]
            if 'See more' in akas:
                akas.remove('See more')
            akas[:] = [x for x in akas if x]
            if akas:
                d['akas'] = akas
        mpaa = _findBetween(cont, 'MPAA</a>:', '</div>', maxRes=1)
        if mpaa:
            d['mpaa'] = _unHtml(mpaa[0])
        runtimes = _findBetween(cont, 'Runtime:</h5>', '</div>', maxRes=1)
        if runtimes:
            runtimes = runtimes[0]
            runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
                        for x in runtimes.split('|')]
            d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
        if kind == 'episode':
            # number of episodes.
            epsn = _findBetween(cont, 'title="Full Episode List">', '</a>',
                                maxRes=1)
            if epsn:
                epsn = epsn[0].replace(' Episodes', '').strip()
                if epsn:
                    try:
                        epsn = int(epsn)
                    except ValueError:
                        self._mobile_logger.warning('wrong episodes #: %s',
                                                    epsn)
                    # The text is stored as it is, if it's not a number.
                    d['number of episodes'] = epsn
        country = self._pipeList(cont, 'Country:</h5>')
        if country:
            d['countries'] = country
        lang = self._pipeList(cont, 'Language:</h5>')
        if lang:
            d['languages'] = lang
        col = self._pipeList(cont, '"/search/title?colors=')
        if col:
            d['color info'] = col
        sm = self._pipeList(cont, '/search/title?sound_mixes=')
        if sm:
            d['sound mix'] = sm
        cert = _findBetween(cont, 'Certification:</h5>', '</div>', maxRes=1)
        if cert:
            cert[:] = cert[0].split(' | ')
            cert[:] = [_unHtml(x.replace(' <i>', '::')) for x in cert]
            if cert:
                d['certificates'] = cert
        plotoutline = _findBetween(cont, 'Plot:</h5>', ['<a ', '</div>'],
                                    maxRes=1)
        if plotoutline:
            plotoutline = plotoutline[0].strip()
            plotoutline = plotoutline.rstrip('|').rstrip()
            if plotoutline:
                d['plot outline'] = _unHtml(plotoutline)
        aratio = _findBetween(cont, 'Aspect Ratio:</h5>', ['<a ', '</div>'],
                                maxRes=1)
        if aratio:
            aratio = aratio[0].strip().replace(' (', '::(', 1)
            if aratio:
                d['aspect ratio'] = _unHtml(aratio)
        return {'data': d}

    def get_movie_plot(self, movieID):
        """Parse the 'plotsummary' page of the given movie and return
        {'data': {'plot': [...]}}, or {'data': {}} if no plot is found.

        A plot ending with a ' Written by ' credit is stored in the
        'text::author' form.
        """
        cont = self._mretrieve(self.urls['movie_main'] % movieID + 'plotsummary')
        plot = _findBetween(cont, '<p class="plotpar">', '</p>')
        plot[:] = [_unHtml(x) for x in plot]
        for i, p in enumerate(plot):
            wbyidx = p.rfind(' Written by ')
            if wbyidx != -1:
                # 12 is the length of ' Written by '.
                plot[i] = '%s::%s' % \
                    (p[:wbyidx].rstrip(),
                    p[wbyidx+12:].rstrip().replace('{','<').replace('}','>'))
        if plot:
            return {'data': {'plot': plot}}
        return {'data': {}}

    def _search_person(self, name, results):
        """Return a list of (personID, data) tuples for the persons
        matching 'name'; 'data' is the dictionary built by analyze_name.

        An empty list is returned if the page cannot be parsed.
        """
        cont = subXMLRefs(self._get_search_content('nm', name, results))
        # The searched name is kept in 'name', so that it can be logged.
        ptitle = _findBetween(cont, '<title>', '</title>', maxRes=1)
        res = []
        if not ptitle:
            self._mobile_logger.warning('no title tag searching for name %s',
                                        name)
            return res
        if not ptitle[0].lower().startswith('imdb name'):
            # a direct hit!
            pname = _unHtml(ptitle[0])
            pname = pname.replace('- Filmography by type' , '').strip()
            pid = None
            pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
            if pidtag:
                pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
            if not (pid and pname):
                self._mobile_logger.error('no direct hit name/personID for' \
                                        ' name %s', name)
                return res
            res[:] = [(str(pid[0]), analyze_name(pname, canonical=1))]
        else:
            lis = _findBetween(cont, 'td valign="top">', '</td>',
                                maxRes=results*3)
            for li in lis:
                akas = _findBetween(li, '<em>"', '"</em>')
                # Cut away everything that follows the name.
                for sep in ['<small', '<br> aka', '<br> birth name']:
                    sepIdx = li.find(sep)
                    if sepIdx != -1:
                        li = li[:sepIdx]
                pid = re_imdbID.findall(li)
                pname = _unHtml(li)
                if not (pid and pname):
                    self._mobile_logger.debug('no name/personID parsing' \
                                            ' %s searching for name %s', li,
                                            name)
                    continue
                resd = analyze_name(pname, canonical=1)
                if akas:
                    resd['akas'] = akas
                res.append((str(pid[0]), resd))
        return res

    def get_person_main(self, personID, _parseChr=False):
        """Parse the main page of a person and return a dictionary with
        the 'data' and 'info sets' keys.

        If _parseChr is True, personID is a characterID and the page of
        the character is parsed instead.  Raise IMDbDataAccessError if
        the page has no <title> tag.
        """
        if not _parseChr:
            url = self.urls['person_main'] % personID + 'maindetails'
        else:
            url = self.urls['character_main'] % personID
        s = self._mretrieve(url)
        name = _findBetween(s, '<title>', '</title>', maxRes=1)
        if not name:
            if _parseChr:
                w = 'characterID'
            else:
                w = 'personID'
            raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID))
        name = _unHtml(name[0].replace(' - IMDb', ''))
        if _parseChr:
            name = name.replace('(Character)', '').strip()
            name = name.replace('- Filmography by type', '').strip()
        else:
            name = name.replace('- Filmography by', '').strip()
        r = analyze_name(name, canonical=not _parseChr)
        for dKind, dtitle in (('Born', 'birth'), ('Died', 'death')):
            date = _findBetween(s, '%s:</h4>' % dKind,
                                ('<div class', '</div>', '<br/><br/>'),
                                maxRes=1)
            if not date:
                continue
            date = _unHtml(date[0])
            if not date:
                continue
            # TODO: fix to handle real names.
            # Everything after the first ' in ' is stored as a note.
            date_notes = date.split(' in ', 1)
            date = date_notes[0]
            notes = u''
            if len(date_notes) == 2:
                notes = date_notes[1]
            if date:
                r['%s date' % dtitle] = date
            if notes:
                r['%s notes' % dtitle] = notes
        akas = _findBetween(s, 'Alternate Names:</h4>', ('</div>',
                            '<br/><br/>'), maxRes=1)
        if akas:
            akas = akas[0]
            if akas:
                akas = _unHtml(akas)
            if akas.find(' | ') != -1:
                akas = akas.split(' | ')
            else:
                akas = akas.split(' / ')
            # split() returns at least one item, so the key is always set
            # (maybe to an empty list) when the section is present.
            r['akas'] = [x for x in [y.strip() for y in akas] if x]
        hs = _findBetween(s, "rel='image_src'", '>', maxRes=1)
        if not hs:
            hs = _findBetween(s, 'rel="image_src"', '>', maxRes=1)
        if not hs:
            hs = _findBetween(s, '<a name="headshot"', '</a>', maxRes=1)
        if hs:
            hsl = _findBetween(hs[0], "href='", "'", maxRes=1)
            if not hsl:
                hsl = _findBetween(hs[0], 'href="', '"', maxRes=1)
            if hsl and 'imdb-share-logo' not in hsl[0]:
                r['headshot'] = hsl[0]
        # Build a list of tuples such [('hrefLink', 'section name')]
        workkind = _findBetween(s, 'id="jumpto_', '</a>')
        ws = []
        for work in workkind:
            sep = '" >'
            if '">' in work:
                sep = '">'
            wsplit = work.split(sep, 1)
            if len(wsplit) == 2:
                sect = wsplit[0]
                if '"' in sect:
                    sect = sect[:sect.find('"')]
                ws.append((sect, wsplit[1].lower()))
        # XXX: I think "guest appearances" are gone.
        if s.find('<a href="#guest-appearances"') != -1:
            ws.append(('guest-appearances', 'notable tv guest appearances'))
        for sect, sectName in ws:
            raws = u''
            if sectName == 'self':
                sect = 'Self'
            # The key used to store the movies of this section.
            sectKey = sectName.split(':')[0]
            # Everything between the current section link and the
            # beginning of the next section.
            if _parseChr and sect == 'filmography':
                inisect = s.find('<div class="filmo">')
            else:
                inisect = s.find('<a name="%s' % sect)
            if inisect != -1:
                endsect = s[inisect:].find('<div id="filmo-head-')
                if endsect == -1:
                    endsect = s[inisect:].find('<div class="article"')
                if endsect != -1:
                    raws = s[inisect:inisect+endsect]
            mlist = _findBetween(raws, '<div class="filmo-row',
                                ('<div class="clear"/>',))
            # For every movie in the current section.
            for m in mlist:
                fCB = m.find('>')
                if fCB != -1:
                    m = m[fCB+1:].lstrip()
                m = re_filmo_episodes.sub('', m)
                movieID = re_imdbID.findall(m)
                if not movieID:
                    self._mobile_logger.debug('no movieID in %s', m)
                    continue
                m = m.replace('<br/>', ' .... ', 1)
                if not _parseChr:
                    chrIndx = m.find(' .... ')
                else:
                    chrIndx = m.find(' Played by ')
                chids = None
                if chrIndx != -1:
                    # 6 is the length of ' .... '; the 5 more characters
                    # skipped for characters cover the 11 of ' Played by '.
                    chrtxt = m[chrIndx+6:]
                    if _parseChr:
                        chrtxt = chrtxt[5:]
                    chids = self._roleIDs(chrtxt)
                movieID = str(movieID[0])
                # Search the status.
                stidx = m.find('<i>')
                status = u''
                if stidx != -1:
                    stendidx = m.rfind('</i>')
                    if stendidx != -1:
                        status = _unHtml(m[stidx+3:stendidx])
                        m = m.replace(m[stidx+3:stendidx], '')
                year = _findBetween(m, 'year_column">', '</span>', maxRes=1)
                if year:
                    year = year[0]
                    m = m.replace('<span class="year_column">%s</span>' % year,
                                '')
                else:
                    year = None
                m = _unHtml(m)
                if not m:
                    self._mobile_logger.warning('no title for movieID %s',
                                                movieID)
                    continue
                movie = build_movie(m, movieID=movieID, status=status,
                                    roleID=chids, modFunct=self._defModFunct,
                                    accessSystem=self.accessSystem,
                                    _parsingCharacter=_parseChr, year=year)
                r.setdefault(sectKey, []).append(movie)
        # If available, take the always correct name from a form.
        itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
        if not itag:
            itag = _getTagsWith(s, 'name="primary"', maxRes=1)
        if itag:
            vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
            if not vtag:
                vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
            if vtag:
                try:
                    vtag = unquote(str(vtag[0]))
                    vtag = unicode(vtag, 'latin_1')
                    r.update(analyze_name(vtag))
                except UnicodeEncodeError:
                    # Best effort: keep the name taken from the title.
                    pass
        return {'data': r, 'info sets': ('main', 'filmography')}

    def get_person_biography(self, personID):
        """Parse the 'bio' page of the given person and return
        {'data': d}, where 'd' maps the (lowercased) section names to
        the information they contain."""
        cont = self._mretrieve(self.urls['person_main'] % personID + 'bio')
        d = {}
        spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
                                maxRes=1)
        if spouses:
            sl = []
            for spouse in spouses[0].split('</tr>'):
                if spouse.count('</td>') > 1:
                    spouse = spouse.replace('</td>', '::</td>', 1)
                spouse = _unHtml(spouse)
                spouse = spouse.replace(':: ', '::').strip()
                if spouse:
                    sl.append(spouse)
            if sl:
                d['spouse'] = sl
        nnames = _findBetween(cont, '<h5>Nickname</h5>', ('<br/> <br/>','<h5>'),
                                maxRes=1)
        if nnames:
            nnames = nnames[0]
            if nnames:
                nnames = [x.strip().replace(' (', '::(', 1)
                            for x in nnames.split('<br/>')]
                if nnames:
                    d['nick names'] = nnames
        # Section name -> prefix of the '... date' and '... notes' keys.
        dateSects = {'date of birth': 'birth', 'date of death': 'death'}
        misc_sects = _findBetween(cont, '<h5>', '<br/>')
        misc_sects[:] = [x.split('</h5>') for x in misc_sects]
        misc_sects[:] = [x for x in misc_sects if len(x) == 2]
        for sect, data in misc_sects:
            sect = sect.lower().replace(':', '').strip()
            if sect in d and sect != 'mini biography':
                continue
            elif sect in ('spouse', 'nickname'):
                continue
            if sect == 'salary':
                sect = 'salary history'
            elif sect == 'where are they now':
                sect = 'where now'
            elif sect == 'personal quotes':
                sect = 'quotes'
            data = data.replace('</p><p>', '::')
            data = data.replace('<br><br>', ' ') # for multi-paragraphs 'bio'
            # '@@@@' temporarily protects the separator between two cells
            # of the same row from the split on '::' done below.
            data = data.replace('</td> <td valign="top">', '@@@@')
            data = data.replace('</td> </tr>', '::')
            data = _unHtml(data)
            data = [x.strip() for x in data.split('::')]
            data[:] = [x.replace('@@@@', '::') for x in data if x]
            if sect == 'height' and data:
                data = data[0]
            elif sect == 'birth name' and data:
                data = canonicalName(data[0])
            elif sect in dateSects:
                # An empty section is skipped, instead of failing on data[0].
                if data:
                    date, notes = date_and_notes(data[0])
                    if date:
                        d['%s date' % dateSects[sect]] = date
                    if notes:
                        d['%s notes' % dateSects[sect]] = notes
                continue
            elif sect == 'mini biography':
                ndata = []
                for bio in data:
                    byidx = bio.rfind('IMDb Mini Biography By')
                    if byidx != -1:
                        # 23: the length of the marker, plus the character
                        # that follows it.
                        bioText = bio[:byidx].rstrip()
                        bioAuth = bio[byidx+23:].lstrip()
                    else:
                        # Without the marker the whole text is the
                        # biography; nothing must be cut away from it.
                        bioText = bio
                        bioAuth = 'Anonymous'
                    ndata.append(u'%s::%s' % (bioText, bioAuth))
                data = ndata
                if 'mini biography' in d:
                    if ndata:
                        d['mini biography'].append(ndata[0])
                    continue
            d[sect] = data
        return {'data': d}

    def _search_character(self, name, results):
        """Return a list of (characterID, data) tuples for the characters
        matching 'name'; 'data' is the dictionary built by analyze_name.

        An empty list is returned if the page cannot be parsed.
        """
        cont = subXMLRefs(self._get_search_content('char', name, results))
        # The searched name is kept in 'name', so that it can be logged.
        ptitle = _findBetween(cont, '<title>', '</title>', maxRes=1)
        res = []
        if not ptitle:
            self._mobile_logger.error('no title tag searching character %s',
                                    name)
            return res
        nl = ptitle[0].lower()
        if not (nl.startswith('imdb search')
                or nl.startswith('imdb character')):
            # a direct hit!
            cname = _unHtml(ptitle[0]).replace('(Character)', '').strip()
            pid = None
            pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
            if pidtag:
                pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1)
            if not (pid and cname):
                self._mobile_logger.error('no direct hit name/characterID for' \
                                        ' character %s', name)
                return res
            res[:] = [(str(pid[0]), analyze_name(cname))]
        else:
            sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>',
                                maxRes=results*3)
            sects += _findBetween(cont, '<b>Characters', '</table>',
                                maxRes=results*3)
            for sect in sects:
                lis = _findBetween(sect, '<a href="/character/',
                                    ['<small', '</td>', '<br'])
                for li in lis:
                    # Turn what is left of the anchor into a tag again.
                    li = '<%s' % li
                    pid = re_imdbID.findall(li)
                    pname = _unHtml(li)
                    if not (pid and pname):
                        self._mobile_logger.debug('no name/characterID' \
                                                ' parsing %s searching for' \
                                                ' character %s', li, name)
                        continue
                    res.append((str(pid[0]), analyze_name(pname)))
        return res

    def get_character_main(self, characterID):
        """Return the main information about a character, parsed by
        get_person_main with _parseChr set to True."""
        return self.get_person_main(characterID, _parseChr=True)

    def get_character_biography(self, characterID):
        """Parse the 'bio' page of the given character and return
        {'data': d}; 'd' may contain the 'introduction' string and the
        'biography' list of 'title: text' strings."""
        cont = self._mretrieve(self.urls['character_main'] % characterID + 'bio')
        d = {}
        intro = _findBetween(cont, '<div class="display">',
                            ('<span>', '<h4>'), maxRes=1)
        if intro:
            intro = _unHtml(intro[0]).strip()
            if intro:
                d['introduction'] = intro
        tocidx = cont.find('<table id="toc..')
        if tocidx != -1:
            cont = cont[tocidx:]
        bios = _findBetween(cont, '<h4>', ('<h4>', '</div>'))
        for bio in bios:
            bio = bio.replace('</h4>', '::')
            bio = bio.replace('\n', ' ')
            bio = bio.replace('<br>', '\n')
            bio = bio.replace('<br/>', '\n')
            bio = subSGMLRefs(re_unhtmlsub('', bio).strip())
            bio = bio.replace(' ::', '::').replace(':: ', '::')
            # Only the first separator (the end of the title) is
            # turned into ': '.
            bio = bio.replace('::', ': ', 1)
            if bio:
                d.setdefault('biography', []).append(bio)
        return {'data': d}