/imdb/parser/http/movieParser.py
Python | 1951 lines | 1909 code | 19 blank | 23 comment | 23 complexity | c9223950c96d878cfa96a8fd8e0a796d MD5 | raw file
Possible License(s): GPL-2.0
Large files files are truncated, but you can click here to view the full file
- """
- parser.http.movieParser module (imdb package).
- This module provides the classes (and the instances), used to parse the
- IMDb pages on the akas.imdb.com server about a movie.
- E.g., for Brian De Palma's "The Untouchables", the referred
- pages would be:
- combined details: http://akas.imdb.com/title/tt0094226/combined
- plot summary: http://akas.imdb.com/title/tt0094226/plotsummary
- ...and so on...
- Copyright 2004-2016 Davide Alberani <da@erlug.linux.it>
- 2008 H. Turgut Uyar <uyar@tekir.org>
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- """
- import re
- import urllib
- from imdb import imdbURL_base
- from imdb.Person import Person
- from imdb.Movie import Movie
- from imdb.Company import Company
- from imdb.utils import analyze_title, split_company_name_notes, _Container
- from utils import build_person, DOMParserBase, Attribute, Extractor, \
- analyze_imdbid
- # Dictionary used to convert some section's names.
- _SECT_CONV = {
- 'directed': 'director',
- 'directed by': 'director',
- 'directors': 'director',
- 'editors': 'editor',
- 'writing credits': 'writer',
- 'writers': 'writer',
- 'produced': 'producer',
- 'cinematography': 'cinematographer',
- 'film editing': 'editor',
- 'casting': 'casting director',
- 'costume design': 'costume designer',
- 'makeup department': 'make up',
- 'production management': 'production manager',
- 'second unit director or assistant director': 'assistant director',
- 'costume and wardrobe department': 'costume department',
- 'sound department': 'sound crew',
- 'stunts': 'stunt performer',
- 'other crew': 'miscellaneous crew',
- 'also known as': 'akas',
- 'country': 'countries',
- 'runtime': 'runtimes',
- 'language': 'languages',
- 'certification': 'certificates',
- 'genre': 'genres',
- 'created': 'creator',
- 'creators': 'creator',
- 'color': 'color info',
- 'plot': 'plot outline',
- 'seasons': 'number of seasons',
- 'art directors': 'art direction',
- 'assistant directors': 'assistant director',
- 'set decorators': 'set decoration',
- 'visual effects department': 'visual effects',
- 'production managers': 'production manager',
- 'miscellaneous': 'miscellaneous crew',
- 'make up department': 'make up',
- 'plot summary': 'plot outline',
- 'cinematographers': 'cinematographer',
- 'camera department': 'camera and electrical department',
- 'costume designers': 'costume designer',
- 'production designers': 'production design',
- 'production managers': 'production manager',
- 'music original': 'original music',
- 'casting directors': 'casting director',
- 'other companies': 'miscellaneous companies',
- 'producers': 'producer',
- 'special effects by': 'special effects department',
- 'special effects': 'special effects companies'
- }
def _manageRoles(mo):
    """Perform some transformation on the html, so that roleIDs can
    be easily retrieved."""
    head, body, tail = mo.group(1), mo.group(2), mo.group(3)
    wrapped = []
    for chunk in body.split(' / '):
        chunk = chunk.strip()
        if not chunk:
            continue
        imdb_id = analyze_imdbid(chunk)
        # The roleid always ends with a slash; when no ID was found the
        # slash alone is used as a placeholder.
        suffix = u'/' if imdb_id is None else imdb_id + u'/'
        wrapped.append(u'<div class="_imdbpyrole" roleid="%s">%s</div>' %
                       (suffix, chunk))
    return head + u' / '.join(wrapped) + tail

# Matches the "character" cells of the cast table, whose content is
# rewritten by _manageRoles before the DOM is built.
_reRolesMovie = re.compile(r'(<td class="char">)(.*?)(</td>)',
                           re.I | re.M | re.S)
- def _replaceBR(mo):
- """Replaces <br> tags with '::' (useful for some akas)"""
- txt = mo.group(0)
- return txt.replace('<br>', '::')
- _reAkas = re.compile(r'<h5>also known as:</h5>.*?</div>', re.I | re.M | re.S)
def makeSplitter(lstrip=None, sep='|', comments=True,
                 origNotesSep=' (', newNotesSep='::(', strip=None):
    """Return a splitter function suitable for a given set of data."""
    def splitter(value):
        # Empty/whitespace-only input is returned unchanged (falsy).
        if not value:
            return value
        value = value.strip()
        if not value:
            return value
        if lstrip is not None:
            value = value.lstrip(lstrip).lstrip()
        pieces = [piece.strip() for piece in value.split(sep)]
        pieces = [piece for piece in pieces if piece]
        if comments:
            # Only the first notes separator is rewritten.
            pieces = [piece.replace(origNotesSep, newNotesSep, 1)
                      for piece in pieces]
        if strip:
            pieces = [piece.strip(strip) for piece in pieces]
        return pieces
    return splitter
- def _toInt(val, replace=()):
- """Return the value, converted to integer, or None; if present, 'replace'
- must be a list of tuples of values to replace."""
- for before, after in replace:
- val = val.replace(before, after)
- try:
- return int(val)
- except (TypeError, ValueError):
- return None
class DOMHTMLMovieParser(DOMParserBase):
    """Parser for the "combined details" (and if instance.mdparse is
    True also for the "main details") page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        mparser = DOMHTMLMovieParser()
        result = mparser.parse(combined_details_html_string)
    """
    # The result contains Person/Movie/Company objects, so the base class
    # must propagate accessSystem/modFunct to them (see postprocess_data).
    _containsObjects = True

    extractors = [Extractor(label='title',
                            path="//h1",
                            attrs=Attribute(key='title',
                                            path=".//text()",
                                            postprocess=analyze_title)),
                  # Crew sections (director, writer, ...), grouped by the
                  # <a class="glossary" name="..."> anchors; underscores in
                  # the anchor name become spaces and act as section keys.
                  Extractor(label='glossarysections',
                            group="//a[@class='glossary']",
                            group_key="./@name",
                            group_key_normalize=lambda x: x.replace('_', ' '),
                            path="../../../..//tr",
                            attrs=Attribute(key=None,
                                            multi=True,
                                            path={'person': ".//text()",
                                                  'link': "./td[1]/a[@href]/@href"},
                                            postprocess=lambda x: \
                                            build_person(x.get('person') or u'',
                                                personID=analyze_imdbid(x.get('link')))
                                            )),
                  Extractor(label='cast',
                            path="//table[@class='cast']//tr",
                            attrs=Attribute(key="cast",
                                            multi=True,
                                            path={'person': ".//text()",
                                                  'link': "td[2]/a/@href",
                                                  # the _imdbpyrole divs are injected
                                                  # by the _manageRoles preprocessor.
                                                  'roleID': \
                                                  "td[4]/div[@class='_imdbpyrole']/@roleid"},
                                            postprocess=lambda x: \
                                            build_person(x.get('person') or u'',
                                                personID=analyze_imdbid(x.get('link')),
                                                roleID=(x.get('roleID') or u'').split('/'))
                                            )),
                  Extractor(label='genres',
                            path="//div[@class='info']//a[starts-with(@href," \
                                 " '/Sections/Genres')]",
                            attrs=Attribute(key="genres",
                                            multi=True,
                                            path="./text()")),
                  Extractor(label='myrating',
                            path="//span[@id='voteuser']",
                            attrs=Attribute(key='myrating',
                                            path=".//text()")),
                  # Generic "info" blocks, dispatched on their <h5> label.
                  Extractor(label='h5sections',
                            path="//div[@class='info']/h5/..",
                            attrs=[
                                Attribute(key="plot summary",
                                          path="./h5[starts-with(text(), " \
                                               "'Plot:')]/../div/text()",
                                          postprocess=lambda x: \
                                          x.strip().rstrip('|').rstrip()),
                                Attribute(key="aspect ratio",
                                          path="./h5[starts-with(text()," \
                                               " 'Aspect')]/../div/text()",
                                          postprocess=lambda x: x.strip()),
                                Attribute(key="mpaa",
                                          path="./h5/a[starts-with(text()," \
                                               " 'MPAA')]/../../div/text()",
                                          postprocess=lambda x: x.strip()),
                                Attribute(key="countries",
                                          path="./h5[starts-with(text(), " \
                                               "'Countr')]/../div[@class='info-content']//text()",
                                          postprocess=makeSplitter('|')),
                                Attribute(key="language",
                                          path="./h5[starts-with(text(), " \
                                               "'Language')]/..//text()",
                                          postprocess=makeSplitter('Language:')),
                                Attribute(key='color info',
                                          path="./h5[starts-with(text(), " \
                                               "'Color')]/..//text()",
                                          postprocess=makeSplitter('|')),
                                Attribute(key='sound mix',
                                          path="./h5[starts-with(text(), " \
                                               "'Sound Mix')]/..//text()",
                                          postprocess=makeSplitter('Sound Mix:')),
                                # Collects akas not enclosed in <i> tags.
                                Attribute(key='other akas',
                                          path="./h5[starts-with(text(), " \
                                               "'Also Known As')]/../div//text()",
                                          postprocess=makeSplitter(sep='::',
                                                                   origNotesSep='" - ',
                                                                   newNotesSep='::',
                                                                   strip='"')),
                                Attribute(key='runtimes',
                                          path="./h5[starts-with(text(), " \
                                               "'Runtime')]/../div/text()",
                                          postprocess=makeSplitter()),
                                Attribute(key='certificates',
                                          path="./h5[starts-with(text(), " \
                                               "'Certificat')]/..//text()",
                                          postprocess=makeSplitter('Certification:')),
                                # Seasons are listed as "1 | 2 | ..."; count
                                # the separators instead of parsing numbers.
                                Attribute(key='number of seasons',
                                          path="./h5[starts-with(text(), " \
                                               "'Seasons')]/..//text()",
                                          postprocess=lambda x: x.count('|') + 1),
                                Attribute(key='original air date',
                                          path="./h5[starts-with(text(), " \
                                               "'Original Air Date')]/../div/text()"),
                                Attribute(key='tv series link',
                                          path="./h5[starts-with(text(), " \
                                               "'TV Series')]/..//a/@href"),
                                Attribute(key='tv series title',
                                          path="./h5[starts-with(text(), " \
                                               "'TV Series')]/..//a/text()")
                            ]),
                  # Language/country codes are taken from the href
                  # (e.g. /language/en -> 'en').
                  Extractor(label='language codes',
                            path="//h5[starts-with(text(), 'Language')]/..//a[starts-with(@href, '/language/')]",
                            attrs=Attribute(key='language codes', multi=True,
                                            path="./@href",
                                            postprocess=lambda x: x.split('/')[2].strip()
                                            )),
                  Extractor(label='country codes',
                            path="//h5[starts-with(text(), 'Country')]/..//a[starts-with(@href, '/country/')]",
                            attrs=Attribute(key='country codes', multi=True,
                                            path="./@href",
                                            postprocess=lambda x: x.split('/')[2].strip()
                                            )),
                  Extractor(label='creator',
                            path="//h5[starts-with(text(), 'Creator')]/..//a",
                            attrs=Attribute(key='creator', multi=True,
                                            path={'name': "./text()",
                                                  'link': "./@href"},
                                            postprocess=lambda x: \
                                            build_person(x.get('name') or u'',
                                                personID=analyze_imdbid(x.get('link')))
                                            )),
                  # 'thin' writer/director are fallbacks, used only when the
                  # full crew sections are absent (see postprocess_data).
                  Extractor(label='thin writer',
                            path="//h5[starts-with(text(), 'Writer')]/..//a",
                            attrs=Attribute(key='thin writer', multi=True,
                                            path={'name': "./text()",
                                                  'link': "./@href"},
                                            postprocess=lambda x: \
                                            build_person(x.get('name') or u'',
                                                personID=analyze_imdbid(x.get('link')))
                                            )),
                  Extractor(label='thin director',
                            path="//h5[starts-with(text(), 'Director')]/..//a",
                            attrs=Attribute(key='thin director', multi=True,
                                            path={'name': "./text()",
                                                  'link': "@href"},
                                            postprocess=lambda x: \
                                            build_person(x.get('name') or u'',
                                                personID=analyze_imdbid(x.get('link')))
                                            )),
                  Extractor(label='top 250/bottom 100',
                            path="//div[@class='starbar-special']/" \
                                 "a[starts-with(@href, '/chart/')]",
                            attrs=Attribute(key='top/bottom rank',
                                            path="./text()")),
                  Extractor(label='series years',
                            path="//div[@id='tn15title']//span" \
                                 "[starts-with(text(), 'TV series')]",
                            attrs=Attribute(key='series years',
                                            path="./text()",
                                            postprocess=lambda x: \
                                            x.replace('TV series', '').strip())),
                  Extractor(label='number of episodes',
                            path="//a[@title='Full Episode List']",
                            attrs=Attribute(key='number of episodes',
                                            path="./text()",
                                            postprocess=lambda x: \
                                            _toInt(x, [(' Episodes', '')]))),
                  # akas inside <i class="transl"> tags.
                  Extractor(label='akas',
                            path="//i[@class='transl']",
                            attrs=Attribute(key='akas', multi=True, path='text()',
                                            postprocess=lambda x:
                                            x.replace(' ', ' ').rstrip('-').replace('" - ',
                                                '"::', 1).strip('"').replace(' ', ' '))),
                  Extractor(label='production notes/status',
                            path="//h5[starts-with(text(), 'Status:')]/..//div[@class='info-content']",
                            attrs=Attribute(key='production status',
                                            path=".//text()",
                                            postprocess=lambda x: x.strip().split('|')[0].strip().lower())),
                  Extractor(label='production notes/status updated',
                            path="//h5[starts-with(text(), 'Status Updated:')]/..//div[@class='info-content']",
                            attrs=Attribute(key='production status updated',
                                            path=".//text()",
                                            postprocess=lambda x: x.strip())),
                  Extractor(label='production notes/comments',
                            path="//h5[starts-with(text(), 'Comments:')]/..//div[@class='info-content']",
                            attrs=Attribute(key='production comments',
                                            path=".//text()",
                                            postprocess=lambda x: x.strip())),
                  Extractor(label='production notes/note',
                            path="//h5[starts-with(text(), 'Note:')]/..//div[@class='info-content']",
                            attrs=Attribute(key='production note',
                                            path=".//text()",
                                            postprocess=lambda x: x.strip())),
                  # Companies sections (production/distributors/...), keyed
                  # by the lowercased blackcatheader text.
                  Extractor(label='blackcatheader',
                            group="//b[@class='blackcatheader']",
                            group_key="./text()",
                            group_key_normalize=lambda x: x.lower(),
                            path="../ul/li",
                            attrs=Attribute(key=None,
                                            multi=True,
                                            path={'name': "./a//text()",
                                                  'comp-link': "./a/@href",
                                                  'notes': "./text()"},
                                            postprocess=lambda x: \
                                            Company(name=x.get('name') or u'',
                                                companyID=analyze_imdbid(x.get('comp-link')),
                                                notes=(x.get('notes') or u'').strip())
                                            )),
                  Extractor(label='rating',
                            path="//div[@class='starbar-meta']/b",
                            attrs=Attribute(key='rating',
                                            path=".//text()")),
                  Extractor(label='votes',
                            path="//div[@class='starbar-meta']/a[@href]",
                            attrs=Attribute(key='votes',
                                            path=".//text()")),
                  Extractor(label='cover url',
                            path="//a[@name='poster']",
                            attrs=Attribute(key='cover url',
                                            path="./img/@src"))
                  ]

    # Raw-HTML fixups applied before the DOM is built.
    preprocessors = [
        # Close/open divs around company headers, so each section is isolated.
        (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I),
         r'</div><div>\1'),
        ('<small>Full cast and crew for<br>', ''),
        ('<td> </td>', '<td>...</td>'),
        ('<span class="tv-extra">TV mini-series</span>',
         '<span class="tv-extra">(mini)</span>'),
        # Wrap character cells so roleIDs are retrievable (see _manageRoles).
        (_reRolesMovie, _manageRoles),
        # Turn <br> into '::' inside the "also known as" block.
        (_reAkas, _replaceBR)]

    def preprocess_dom(self, dom):
        """DOM-level cleanup: tag series crew sections and drop noise links."""
        # Handle series information.
        xpath = self.xpath(dom, "//b[text()='Series Crew']")
        if xpath:
            b = xpath[-1]  # In doubt, take the last one.
            # Prefix glossary anchors after "Series Crew" with 'series ',
            # so they end up in distinct result keys.
            for a in self.xpath(b, "./following::h5/a[@class='glossary']"):
                name = a.get('name')
                if name:
                    a.set('name', 'series %s' % name)
        # Remove links to IMDbPro.
        for proLink in self.xpath(dom, "//span[@class='pro-link']"):
            proLink.drop_tree()
        # Remove some 'more' links (keep others, like the one around
        # the number of votes).
        for tn15more in self.xpath(dom,
                "//a[@class='tn15more'][starts-with(@href, '/title/')]"):
            tn15more.drop_tree()
        return dom

    re_space = re.compile(r'\s+')
    re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)

    def postprocess_data(self, data):
        """Normalize section names and clean up the collected values."""
        # Convert section names (safe in Python 2, where keys() is a list
        # snapshot; would need list(data.keys()) under Python 3).
        for sect in data.keys():
            if sect in _SECT_CONV:
                data[_SECT_CONV[sect]] = data[sect]
                del data[sect]
                sect = _SECT_CONV[sect]
        # Filter out fake values.
        for key in data:
            value = data[key]
            if isinstance(value, list) and value:
                if isinstance(value[0], Person):
                    # Python 2 filter: returns a list.
                    data[key] = filter(lambda x: x.personID is not None, value)
                if isinstance(value[0], _Container):
                    for obj in data[key]:
                        obj.accessSystem = self._as
                        obj.modFunct = self._modFunct
        # Merge the two akas sources and strip trailing '" -' leftovers.
        if 'akas' in data or 'other akas' in data:
            akas = data.get('akas') or []
            other_akas = data.get('other akas') or []
            akas += other_akas
            nakas = []
            for aka in akas:
                aka = aka.strip()
                if aka.endswith('" -'):
                    aka = aka[:-3].rstrip()
                nakas.append(aka)
            if 'akas' in data:
                del data['akas']
            if 'other akas' in data:
                del data['other akas']
            if nakas:
                data['akas'] = nakas
        if 'color info' in data:
            data['color info'] = [x.replace('Color:', '', 1) for x in data['color info']]
        if 'runtimes' in data:
            data['runtimes'] = [x.replace(' min', u'')
                                for x in data['runtimes']]
        if 'original air date' in data:
            oid = self.re_space.sub(' ', data['original air date']).strip()
            data['original air date'] = oid
            aid = self.re_airdate.findall(oid)
            if aid and len(aid[0]) == 3:
                date, season, episode = aid[0]
                date = date.strip()
                try: season = int(season)
                except: pass
                try: episode = int(episode)
                except: pass
                if date and date != '????':
                    data['original air date'] = date
                else:
                    del data['original air date']
                # Handle also "episode 0".
                if season or type(season) is type(0):
                    data['season'] = season
                # NOTE(review): this checks type(season), not type(episode);
                # looks like a copy-paste slip -- confirm before changing.
                if episode or type(season) is type(0):
                    data['episode'] = episode
        # Use the 'thin' crew only when the full section is missing.
        for k in ('writer', 'director'):
            t_k = 'thin %s' % k
            if t_k not in data:
                continue
            if k not in data:
                data[k] = data[t_k]
            del data[t_k]
        if 'top/bottom rank' in data:
            tbVal = data['top/bottom rank'].lower()
            if tbVal.startswith('top'):
                tbKey = 'top 250 rank'
                tbVal = _toInt(tbVal, [('top 250: #', '')])
            else:
                tbKey = 'bottom 100 rank'
                tbVal = _toInt(tbVal, [('bottom 100: #', '')])
            if tbVal:
                data[tbKey] = tbVal
            del data['top/bottom rank']
        if 'year' in data and data['year'] == '????':
            del data['year']
        # Build the 'episode of' Movie reference for TV episodes.
        if 'tv series link' in data:
            if 'tv series title' in data:
                data['episode of'] = Movie(title=data['tv series title'],
                                           movieID=analyze_imdbid(
                                               data['tv series link']),
                                           accessSystem=self._as,
                                           modFunct=self._modFunct)
                del data['tv series title']
            del data['tv series link']
        if 'rating' in data:
            try:
                data['rating'] = float(data['rating'].replace('/10', ''))
            except (TypeError, ValueError):
                pass
        if 'votes' in data:
            try:
                votes = data['votes'].replace(',', '').replace('votes', '')
                data['votes'] = int(votes)
            except (TypeError, ValueError):
                pass
        return data
- def _process_plotsummary(x):
- """Process a plot (contributed by Rdian06)."""
- xauthor = x.get('author')
- xplot = x.get('plot', u'').strip()
- if xauthor:
- xplot += u'::%s' % xauthor
- return xplot
class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a 'plot' key, containing a list
    of string with the structure: 'summary::summary_author <author@email>'.

    Example:
        pparser = HTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    # Plot text can contain references to other titles/names.
    _defGetRefs = True

    # Notice that recently IMDb started to put the email of the
    # author only in the link, that we're not collecting, here.
    extractors = [Extractor(label='plot',
                            path="//ul[@class='zebraList']//p",
                            attrs=Attribute(key='plot',
                                            multi=True,
                                            path={'plot': './text()[1]',
                                                  'author': './span/em/a/text()'},
                                            postprocess=_process_plotsummary))]
- def _process_award(x):
- award = {}
- _award = x.get('award')
- if _award is not None:
- _award = _award.strip()
- award['award'] = _award
- if not award['award']:
- return {}
- award['year'] = x.get('year').strip()
- if award['year'] and award['year'].isdigit():
- award['year'] = int(award['year'])
- award['result'] = x.get('result').strip()
- category = x.get('category').strip()
- if category:
- award['category'] = category
- received_with = x.get('with')
- if received_with is not None:
- award['with'] = received_with.strip()
- notes = x.get('notes')
- if notes is not None:
- notes = notes.strip()
- if notes:
- award['notes'] = notes
- award['anchor'] = x.get('anchor')
- return award
class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        awparser = HTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    # 'title' when parsing a movie's awards, 'name' for a person's;
    # it decides whether recipients become 'to' (Person) or 'for' (Movie).
    subject = 'title'
    _containsObjects = True

    extractors = [
        # Award rows: grouped by the <big> header (the assigning festival).
        Extractor(label='awards',
                  group="//table//big",
                  group_key="./a",
                  path="./ancestor::tr[1]/following-sibling::tr/" \
                       "td[last()][not(@colspan)]",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'year': "../td[1]/a/text()",
                                      'result': "../td[2]/b/text()",
                                      'award': "../td[3]/text()",
                                      'category': "./text()[1]",
                                      # FIXME: takes only the first co-recipient
                                      'with': "./small[starts-with(text()," \
                                              " 'Shared with:')]/following-sibling::a[1]/text()",
                                      'notes': "./small[last()]//text()",
                                      'anchor': ".//text()"
                                  },
                                  postprocess=_process_award
                                  )),
        # Recipient links; matched back to awards via the 'anchor' text.
        Extractor(label='recipients',
                  group="//table//big",
                  group_key="./a",
                  path="./ancestor::tr[1]/following-sibling::tr/" \
                       "td[last()]/small[1]/preceding-sibling::a",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'name': "./text()",
                                      'link': "./@href",
                                      'anchor': "..//text()"
                                  }
                                  ))
    ]

    # Raw-HTML fixups that split the single awards table into one
    # table per assigner, so the group extractors work per-section.
    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
         r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
         r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
    ]

    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span - 1]:
                # if not cloned, child will be moved to new parent
                clone = self.clone(col)
                # XXX: beware that here we don't use an "adapted" function,
                #      because both BeautifulSoup and lxml uses the same
                #      "insert" method.
                tr.insert(position, clone)
        return dom

    def postprocess_data(self, data):
        """Join award entries with their recipients and flatten to a list."""
        if len(data) == 0:
            return {}
        nd = []
        for key in data.keys():
            # The group key is an HTML fragment; its first link text is
            # the name of the assigning festival/organization.
            dom = self.get_dom(key)
            assigner = self.xpath(dom, "//a/text()")[0]
            for entry in data[key]:
                # has_key: Python 2 only.
                if not entry.has_key('name'):
                    if not entry:
                        continue
                    # this is an award, not a recipient
                    entry['assigner'] = assigner.strip()
                    # find the recipients
                    matches = [p for p in data[key]
                               if p.has_key('name') and (entry['anchor'] ==
                                                         p['anchor'])]
                    if self.subject == 'title':
                        recipients = [Person(name=recipient['name'],
                                             personID=analyze_imdbid(recipient['link']))
                                      for recipient in matches]
                        entry['to'] = recipients
                    elif self.subject == 'name':
                        recipients = [Movie(title=recipient['name'],
                                            movieID=analyze_imdbid(recipient['link']))
                                      for recipient in matches]
                        entry['for'] = recipients
                    nd.append(entry)
                # The anchor was only needed for matching; drop it (the
                # dict was already appended, so nd keeps the cleaned entry).
                del entry['anchor']
        return {'awards': nd}
class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    # Matches any element whose class list contains 'soda'.
    extractors = [Extractor(label='taglines',
                            path='//*[contains(concat(" ", normalize-space(@class), " "), " soda ")]',
                            attrs=Attribute(key='taglines',
                                            multi=True,
                                            path="./text()"))]

    def postprocess_data(self, data):
        # Strip surrounding whitespace from each collected tagline.
        if 'taglines' in data:
            data['taglines'] = [tagline.strip() for tagline in data['taglines']]
        return data
class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    # Keywords are normalized to lowercase, dash-separated form.
    extractors = [Extractor(label='keywords',
                            path="//a[starts-with(@href, '/keyword/')]",
                            attrs=Attribute(key='keywords',
                                            path="./text()", multi=True,
                                            postprocess=lambda x: \
                                            x.lower().replace(' ', '-')))]
class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        avparser = HTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    # Text can contain references to other titles/names.
    _defGetRefs = True
    extractors = [Extractor(label='alternate versions',
                            path="//ul[@class='trivia']/li",
                            attrs=Attribute(key='alternate versions',
                                            multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: x.strip()))]
class DOMHTMLTriviaParser(DOMParserBase):
    """Parser for the "trivia" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTriviaParser()
        result = tparser.parse(trivia_html_string)
    """
    _defGetRefs = True
    # NOTE: the label still reads 'alternate versions' (copy-paste from
    # the parser above), but only the 'trivia' key affects the result.
    extractors = [Extractor(label='alternate versions',
                            path="//div[@class='sodatext']",
                            attrs=Attribute(key='trivia',
                                            multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: x.strip()))]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//span[@class='linksoda']"):
            qLink.drop_tree()
        return dom
class DOMHTMLSoundtrackParser(DOMHTMLAlternateVersionsParser):
    """Parser for the "soundtrack" page of a given movie; reuses the
    extractors of DOMHTMLAlternateVersionsParser and restructures the
    collected text into {title: {role: info}} dictionaries."""
    kind = 'soundtrack'

    # Keep <br> boundaries as newlines so each credit line can be split.
    preprocessors = [
        ('<br>', '\n')
    ]

    def postprocess_data(self, data):
        if 'alternate versions' in data:
            nd = []
            for x in data['alternate versions']:
                ds = x.split('\n')
                # First line is the song title (possibly quoted).
                title = ds[0]
                if title[0] == '"' and title[-1] == '"':
                    title = title[1:-1]
                nds = []
                newData = {}
                # Group continuation lines with the preceding credit line:
                # a line without a recognized connector belongs to the
                # previous one.
                for l in ds[1:]:
                    if ' with ' in l or ' by ' in l or ' from ' in l \
                            or ' of ' in l or l.startswith('From '):
                        nds.append(l)
                    else:
                        if nds:
                            nds[-1] += l
                        else:
                            nds.append(l)
                newData[title] = {}
                for l in nds:
                    skip = False
                    # Lines like "From <movie>": the key becomes 'from'.
                    for sep in ('From ',):
                        if l.startswith(sep):
                            fdix = len(sep)
                            kind = l[:fdix].rstrip().lower()
                            info = l[fdix:].lstrip()
                            newData[title][kind] = info
                            skip = True
                    if not skip:
                        # Otherwise the key is everything up to and
                        # including the first connector (e.g. 'performed by').
                        for sep in ' with ', ' by ', ' from ', ' of ':
                            fdix = l.find(sep)
                            if fdix != -1:
                                fdix = fdix + len(sep)
                                kind = l[:fdix].rstrip().lower()
                                info = l[fdix:].lstrip()
                                newData[title][kind] = info
                                break
                nd.append(newData)
            data['soundtrack'] = nd
        return data
class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    # Flatten newlines and collapse spacing in each credit text.
    extractors = [Extractor(label='crazy credits', path="//ul/li/tt",
                            attrs=Attribute(key='crazy credits', multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: \
                                            x.replace('\n', ' ').replace(' ', ' ')))]
- def _process_goof(x):
- if x['spoiler_category']:
- return x['spoiler_category'].strip() + ': SPOILER: ' + x['text'].strip()
- else:
- return x['category'].strip() + ': ' + x['text'].strip()
class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    # The category comes from the closest preceding <h4>; a nested <h4>
    # marks a spoiler entry (see _process_goof).
    extractors = [Extractor(label='goofs', path="//div[@class='soda odd']",
                            attrs=Attribute(key='goofs', multi=True,
                                            path={
                                                'text': "./text()",
                                                'category': './preceding-sibling::h4[1]/text()',
                                                'spoiler_category': './h4/text()'
                                            },
                                            postprocess=_process_goof))]
class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    # Quotes alternate between 'odd' and 'even' row classes; both are
    # collected and merged in postprocess_data.  Line breaks become '::'
    # separators between the speakers' lines.
    extractors = [
        Extractor(label='quotes_odd',
                  path="//div[@class='quote soda odd']",
                  attrs=Attribute(key='quotes_odd',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip().replace(' \n',
                                      '::').replace('::\n', '::').replace('\n', ' '))),
        Extractor(label='quotes_even',
                  path="//div[@class='quote soda even']",
                  attrs=Attribute(key='quotes_even',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip().replace(' \n',
                                      '::').replace('::\n', '::').replace('\n', ' ')))
    ]

    # Drop the "Hide options" toggle before parsing.
    preprocessors = [
        (re.compile('<a href="#" class="hidesoda hidden">Hide options</a><br>', re.I), '')
    ]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//span[@class='linksoda']"):
            qLink.drop_tree()
        for qLink in self.xpath(dom, "//div[@class='sharesoda_pre']"):
            qLink.drop_tree()
        return dom

    def postprocess_data(self, data):
        # Merge odd/even rows and split each quote into its lines.
        quotes = data.get('quotes_odd', []) + data.get('quotes_even', [])
        if not quotes:
            return {}
        quotes = [q.split('::') for q in quotes]
        return {'quotes': quotes}
class DOMHTMLReleaseinfoParser(DOMParserBase):
    """Parser for the "release dates" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rdparser = DOMHTMLReleaseinfoParser()
        result = rdparser.parse(releaseinfo_html_string)
    """
    extractors = [Extractor(label='release dates',
                            path="//table[@id='release_dates']//tr",
                            attrs=Attribute(key='release dates', multi=True,
                                            path={'country': ".//td[1]//text()",
                                                  'date': ".//td[2]//text()",
                                                  'notes': ".//td[3]//text()"})),
                  Extractor(label='akas',
                            path="//table[@id='akas']//tr",
                            attrs=Attribute(key='akas', multi=True,
                                            path={'title': "./td[1]/text()",
                                                  'countries': "./td[2]/text()"}))]

    # Wrap the akas section in a recognizable div.
    preprocessors = [
        (re.compile('(<h5><a name="?akas"?.*</table>)', re.I | re.M | re.S),
         r'<div class="_imdbpy_akas">\1</div>')]

    def postprocess_data(self, data):
        """Flatten release dates to 'country::date[notes]' strings and
        akas to 'title::country' strings."""
        if not ('release dates' in data or 'akas' in data): return data
        releases = data.get('release dates') or []
        rl = []
        for i in releases:
            country = i.get('country')
            date = i.get('date')
            # Skip rows missing either field (before and after stripping).
            if not (country and date): continue
            country = country.strip()
            date = date.strip()
            if not (country and date): continue
            notes = i['notes']
            info = u'%s::%s' % (country, date)
            if notes:
                info += notes
            rl.append(info)
        if releases:
            del data['release dates']
        if rl:
            data['release dates'] = rl
        akas = data.get('akas') or []
        nakas = []
        for aka in akas:
            title = (aka.get('title') or '').strip()
            if not title:
                continue
            countries = (aka.get('countries') or '').split(',')
            if not countries:
                nakas.append(title)
            else:
                # One entry per country the title is used in.
                for country in countries:
                    nakas.append('%s::%s' % (title, country.strip()))
        if akas:
            del data['akas']
        if nakas:
            data['akas from release info'] = nakas
        return data
class DOMHTMLRatingsParser(DOMParserBase):
    """Parser for the "user ratings" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRatingsParser()
        result = rparser.parse(userratings_html_string)
    """
    # Parses "mean = X.Y. median = Z" out of the statistics paragraph.
    re_means = re.compile('mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])',
                          re.I)
    extractors = [
        Extractor(label='number of votes',
                  path="//td[b='Percentage']/../../tr",
                  attrs=[Attribute(key='votes',
                                   multi=True,
                                   path={
                                       'votes': "td[1]//text()",
                                       'ordinal': "td[3]//text()"
                                   })]),
        Extractor(label='mean and median',
                  path="//p[starts-with(text(), 'Arithmetic mean')]",
                  attrs=Attribute(key='mean and median',
                                  path="text()")),
        Extractor(label='rating',
                  path="//a[starts-with(@href, '/search/title?user_rating=')]",
                  attrs=Attribute(key='rating',
                                  path="text()")),
        Extractor(label='demographic voters',
                  path="//td[b='Average']/../../tr",
                  attrs=Attribute(key='demographic voters',
                                  multi=True,
                                  path={
                                      'voters': "td[1]//text()",
                                      'votes': "td[2]//text()",
                                      'average': "td[3]//text()"
                                  })),
        Extractor(label='top 250',
                  path="//a[text()='top 250']",
                  attrs=Attribute(key='top 250',
                                  path="./preceding-sibling::text()[1]"))
    ]

    def postprocess_data(self, data):
        """Convert the raw rows into typed values (ints/floats/dicts)."""
        nd = {}
        votes = data.get('votes', [])
        if votes:
            nd['number of votes'] = {}
            # Rows 1..10 are the per-score vote counts (row 0 is the
            # table header).  xrange: Python 2 only.
            for i in xrange(1, 11):
                _ordinal = int(votes[i]['ordinal'])
                _strvts = votes[i]['votes'] or '0'
                nd['number of votes'][_ordinal] = \
                    int(_strvts.replace(',', ''))
        mean = data.get('mean and median', '')
        if mean:
            means = self.re_means.findall(mean)
            if means and len(means[0]) == 2:
                am, med = means[0]
                # Only store the values when conversion succeeded.
                try: am = float(am)
                except (ValueError, OverflowError): pass
                if type(am) is type(1.0):
                    nd['arithmetic mean'] = am
                try: med = int(med)
                except (ValueError, OverflowError): pass
                if type(med) is type(0):
                    nd['median'] = med
        if 'rating' in data:
            nd['rating'] = float(data['rating'])
        dem_voters = data.get('demographic voters')
        if dem_voters:
            nd['demographic'] = {}
            # Row 0 is the header; each following row is one demographic
            # bucket mapped to (votes, average rating).
            for i in xrange(1, len(dem_voters)):
                if (dem_voters[i]['votes'] is not None) \
                        and (dem_voters[i]['votes'].strip()):
                    nd['demographic'][dem_voters[i]['voters'].strip().lower()] \
                        = (int(dem_voters[i]['votes'].replace(',', '')),
                           float(dem_voters[i]['average']))
        if 'imdb users' in nd.get('demographic', {}):
            # 'imdb users' is the overall count; expose it as 'votes' and
            # rename the bucket to 'all votes'.
            nd['votes'] = nd['demographic']['imdb users'][0]
            nd['demographic']['all votes'] = nd['demographic']['imdb users']
            del nd['demographic']['imdb users']
        top250 = data.get('top 250')
        if top250:
            # The text looks like "Top 250: #NNN ..."; skip the prefix.
            sd = top250[9:]
            i = sd.find(' ')
            if i != -1:
                sd = sd[:i]
            try: sd = int(sd)
            except (ValueError, OverflowError): pass
            if type(sd) is type(0):
                nd['top 250 rank'] = sd
        return nd
class DOMHTMLEpisodesRatings(DOMParserBase):
    """Parser for the "episode ratings ... by date" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        erparser = DOMHTMLEpisodesRatings()
        result = erparser.parse(eprating_html_string)
    """
    # Episodes are returned as Movie objects.
    _containsObjects = True

    extractors = [Extractor(label='title', path="//title",
                            attrs=Attribute(key='title', path="./text()")),
                  Extractor(label='ep ratings',
                            path="//th/../..//tr",
                            attrs=Attribute(key='episodes', multi=True,
                                            path={'nr': ".//td[1]/text()",
                                                  'ep title': ".//td[2]//text()",
                                                  'movieID': ".//td[2]/a/@href",
                                                  'rating': ".//td[3]/text()",
                                                  'votes': ".//td[4]/text()"}))]

    def postprocess_data(self, data):
        """Build 'series title {episode (#nr)}' Movie objects with their
        rating and number of votes."""
        if 'title' not in data or 'episodes' not in data: return {}
        nd = []
        title = data['title']
        for i in data['episodes']:
            ept = i['ep title']
            movieID = analyze_imdbid(i['movieID'])
            votes = i['votes']
            rating = i['rating']
            # Skip incomplete rows (e.g. the header).
            if not (ept and movieID and votes and rating): continue
            try:
                votes = int(votes.replace(',', '').replace('.', ''))
            except:
                pass
            try:
                rating = float(rating)
            except:
                pass
            ept = ept.strip()
            # Long-imdb episode title: "series title {episode (#nr)}".
            ept = u'%s {%s' % (title, ept)
            nr = i['nr']
            if nr:
                ept += u' (#%s)' % nr.strip()
            ept += '}'
            if movieID is not None:
                movieID = str(movieID)
            m = Movie(title=ept, movieID=movieID, accessSystem=self._as,
                      modFunct=self._modFunct)
            epofdict = m.get('episode of')
            if epofdict is not None:
                # Promote the plain dict to a full Movie object.
                m['episode of'] = Movie(data=epofdict, accessSystem=self._as,
                                        modFunct=self._modFunct)
            nd.append({'episode': m, 'votes': votes, 'rating': rating})
        return {'episodes rating': nd}
- def _normalize_href(href):
- if (href is not None) and (not href.lower().startswith('http://')):
- if href.startswith('/'): href = href[1:]
- # TODO: imdbURL_base may be set by the user!
- href = '%s%s' % (imdbURL_base, href)
- return href
- class DOMHTMLCriticReviewsParser(DOMParserBase):
- """Parser for the "critic reviews" pages of a given movie.
- The page should be provided as a string, as taken from
- the akas.imdb.com server. The final result will be a
- dictionary, with a key for every releva…
Large files files are truncated, but you can click here to view the full file