/imdb/parser/http/movieParser.py
Python | 2710 lines | 2632 code | 29 blank | 49 comment | 43 complexity | 77fef151a68ebaa7f58ab74d4998fc06 MD5 | raw file
Possible License(s): GPL-2.0
- # -*- coding: utf-8 -*-
- # Copyright 2004-2021 Davide Alberani <da@erlug.linux.it>
- # 2008-2018 H. Turgut Uyar <uyar@tekir.org>
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- """
- This module provides the classes (and the instances) that are used to parse
- the IMDb pages on the www.imdb.com server about a movie.
- For example, for Brian De Palma's "The Untouchables", the referred pages
- would be:
- combined details
- http://www.imdb.com/title/tt0094226/reference
- plot summary
- http://www.imdb.com/title/tt0094226/plotsummary
- ...and so on.
- """
- from __future__ import absolute_import, division, print_function, unicode_literals
- import functools
- import re
- from imdb import PY2
- from imdb import imdbURL_base
- from imdb.Company import Company
- from imdb.Movie import Movie
- from imdb.Person import Person
- from imdb.utils import _Container, KIND_MAP
- from .piculet import Path, Rule, Rules, preprocessors, transformers, ElementTree
- from .utils import DOMParserBase, analyze_imdbid, build_person, build_movie
- if PY2:
- from urllib import unquote
- else:
- from urllib.parse import unquote
- # Dictionary used to convert some section's names.
- _SECT_CONV = {
- 'directed': 'director',
- 'directed by': 'director',
- 'directors': 'director',
- 'editors': 'editor',
- 'writing credits': 'writer',
- 'writers': 'writer',
- 'produced': 'producer',
- 'cinematography': 'cinematographer',
- 'film editing': 'editor',
- 'casting': 'casting director',
- 'costume design': 'costume designer',
- 'makeup department': 'make up',
- 'production management': 'production manager',
- 'second unit director or assistant director': 'assistant director',
- 'costume and wardrobe department': 'costume department',
- 'costume departmen': 'costume department',
- 'sound department': 'sound crew',
- 'stunts': 'stunt performer',
- 'other crew': 'miscellaneous crew',
- 'also known as': 'akas',
- 'country': 'countries',
- 'runtime': 'runtimes',
- 'language': 'languages',
- 'certification': 'certificates',
- 'genre': 'genres',
- 'created': 'creator',
- 'creators': 'creator',
- 'color': 'color info',
- 'plot': 'plot outline',
- 'art director': 'art direction',
- 'art directors': 'art direction',
- 'composers': 'composer',
- 'assistant directors': 'assistant director',
- 'set decorator': 'set decoration',
- 'set decorators': 'set decoration',
- 'visual effects department': 'visual effects',
- 'miscellaneous': 'miscellaneous crew',
- 'make up department': 'make up',
- 'plot summary': 'plot outline',
- 'cinematographers': 'cinematographer',
- 'camera department': 'camera and electrical department',
- 'costume designers': 'costume designer',
- 'production designer': 'production design',
- 'production designers': 'production design',
- 'production managers': 'production manager',
- 'music original': 'original music',
- 'casting directors': 'casting director',
- 'other companies': 'miscellaneous companies',
- 'producers': 'producer',
- 'special effects by': 'special effects department',
- }
- re_space = re.compile(r'\s+')
- def clean_section_name(section):
- """Clean and replace some section names."""
- section = re_space.sub(' ', section.replace('_', ' ').strip().lower())
- if section.endswith(' by'):
- section = section[:-3]
- return _SECT_CONV.get(section, section)
- def _manageRoles(mo):
- """Perform some transformation on the html, so that roleIDs can
- be easily retrieved."""
- firstHalf = mo.group(1)
- secondHalf = mo.group(2)
- newRoles = []
- roles = secondHalf.split(' / ')
- for role in roles:
- role = role.strip()
- if not role:
- continue
- roleID = analyze_imdbid(role)
- if roleID is None:
- roleID = '/'
- else:
- roleID += '/'
- newRoles.append('<div class="_imdbpyrole" roleid="%s">%s</div>' % (
- roleID, role.strip()
- ))
- return firstHalf + ' / '.join(newRoles) + mo.group(3)
- _reRolesMovie = re.compile(r'(<td class="character">)(.*?)(</td>)', re.I | re.M | re.S)
- def makeSplitter(lstrip=None, sep='|', comments=True,
- origNotesSep=' (', newNotesSep='::(', strip=None):
- """Return a splitter function suitable for a given set of data."""
- def splitter(x):
- if not x:
- return x
- x = x.strip()
- if not x:
- return x
- if lstrip is not None:
- x = x.lstrip(lstrip).lstrip()
- lx = x.split(sep)
- lx[:] = [_f for _f in [j.strip() for j in lx] if _f]
- if comments:
- lx[:] = [j.replace(origNotesSep, newNotesSep, 1) for j in lx]
- if strip:
- lx[:] = [j.strip(strip) for j in lx]
- return lx
- return splitter
- def _toInt(val, replace=()):
- """Return the value, converted to integer, or None; if present, 'replace'
- must be a list of tuples of values to replace."""
- for before, after in replace:
- val = val.replace(before, after)
- try:
- return int(val)
- except (TypeError, ValueError):
- return None
- _re_og_title = re.compile(
- r'(.*) \((?:(?:(.+)(?= ))? ?(\d{4})(?:(–)(\d{4}| ))?|(.+))\)',
- re.UNICODE
- )
- def analyze_og_title(og_title):
- data = {}
- match = _re_og_title.match(og_title)
- if og_title and not match:
- # assume it's a title in production, missing release date information
- return {'title': og_title}
- data['title'] = match.group(1)
- if match.group(3):
- data['year'] = int(match.group(3))
- kind = match.group(2) or match.group(6)
- if kind is None:
- kind = 'movie'
- else:
- kind = kind.lower()
- kind = KIND_MAP.get(kind, kind)
- data['kind'] = kind
- year_separator = match.group(4)
- # There is a year separator so assume an ongoing or ended series
- if year_separator is not None:
- end_year = match.group(5)
- if end_year is not None:
- data['series years'] = '%(year)d-%(end_year)s' % {
- 'year': data['year'],
- 'end_year': end_year.strip(),
- }
- elif kind.endswith('series'):
- data['series years'] = '%(year)d-' % {'year': data['year']}
- # No year separator and series, so assume that it ended the same year
- elif kind.endswith('series') and 'year' in data:
- data['series years'] = '%(year)d-%(year)d' % {'year': data['year']}
- if data['kind'] == 'episode' and data['title'][0] == '"':
- quote_end = data['title'].find('"', 1)
- data['tv series title'] = data['title'][1:quote_end]
- data['title'] = data['title'][quote_end + 1:].strip()
- return data
- def analyze_certificates(certificates):
- def reducer(acc, el):
- cert_re = re.compile(r'^(.+):(.+)$', re.UNICODE)
- if cert_re.match(el):
- acc.append(el)
- elif acc:
- acc[-1] = u'{}::{}'.format(
- acc[-1],
- el,
- )
- return acc
- certificates = [el.strip() for el in certificates.split('\n') if el.strip()]
- return functools.reduce(reducer, certificates, [])
- def clean_akas(aka):
- aka = re_space.sub(' ', aka).strip()
- if aka.lower().startswith('see more'):
- aka = ''
- return aka
- class DOMHTMLMovieParser(DOMParserBase):
- """Parser for the "reference" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- mparser = DOMHTMLMovieParser()
- result = mparser.parse(reference_html_string)
- """
- _containsObjects = True
- rules = [
- Rule(
- key='title',
- extractor=Path('//meta[@property="og:title"]/@content',
- transform=analyze_og_title)
- ),
- Rule(
- key='original title',
- extractor=Path('//div[@class="titlereference-header"]//span[@class="titlereference-original-title-label"]/preceding-sibling::text()',
- transform=lambda x: re_space.sub(' ', x).strip())
- ),
- Rule(
- key='original title title-year',
- extractor=Path('//div[@class="titlereference-header"]//span[@class="titlereference-title-year"]/preceding-sibling::text()',
- transform=lambda x: re_space.sub(' ', x).strip())
- ),
- Rule(
- key='localized title',
- extractor=Path('//meta[@name="title"]/@content',
- transform=lambda x: analyze_og_title(x).get('title'))
- ),
- # parser for misc sections like 'casting department', 'stunts', ...
- Rule(
- key='misc sections',
- extractor=Rules(
- foreach='//h4[contains(@class, "ipl-header__content")]',
- rules=[
- Rule(
- key=Path('./@name', transform=clean_section_name),
- extractor=Rules(
- foreach='../../following-sibling::table[1]//tr',
- rules=[
- Rule(
- key='person',
- extractor=Path('.//text()')
- ),
- Rule(
- key='link',
- extractor=Path('./td[1]/a[@href]/@href')
- )
- ],
- transform=lambda x: build_person(
- x.get('person') or '',
- personID=analyze_imdbid(x.get('link'))
- )
- )
- )
- ]
- )
- ),
- Rule(
- key='cast',
- extractor=Rules(
- foreach='//table[@class="cast_list"]//tr',
- rules=[
- Rule(
- key='person',
- extractor=Path('.//text()')
- ),
- Rule(
- key='link',
- extractor=Path('./td[2]/a/@href')
- ),
- Rule(
- key='roleID',
- extractor=Path('./td[4]//div[@class="_imdbpyrole"]/@roleid')
- )
- ],
- transform=lambda x: build_person(
- x.get('person') or '',
- personID=analyze_imdbid(x.get('link')),
- roleID=(x.get('roleID') or '').split('/')
- )
- )
- ),
- Rule(
- key='recommendations',
- extractor=Rules(
- foreach='//div[contains(@class, "rec_item")]',
- rules=[
- Rule(
- key='movieID',
- extractor=Path(
- './@data-tconst',
- transform=lambda x: (x or '').replace('tt', '')
- )
- ),
- Rule(
- key='title',
- extractor=Path(
- './/a//img/@title',
- transform=lambda x: re_space.sub(' ', x or '').strip()
- )
- ),
- ],
- transform=lambda x: build_movie(x.get('title', ''), movieID=x.get('movieID'))
- )
- ),
- Rule(
- key='myrating',
- extractor=Path('//span[@id="voteuser"]//text()')
- ),
- Rule(
- key='plot summary',
- extractor=Path('//td[starts-with(text(), "Plot")]/..//p/text()',
- transform=lambda x: x.strip().rstrip('|').rstrip())
- ),
- Rule(
- key='genres',
- extractor=Path(
- foreach='//td[starts-with(text(), "Genre")]/..//li/a',
- path='./text()'
- )
- ),
- Rule(
- key='runtimes',
- extractor=Path(
- foreach='//td[starts-with(text(), "Runtime")]/..//li',
- path='./text()',
- transform=lambda x: x.strip().replace(' min', '')
- )
- ),
- Rule(
- key='countries',
- extractor=Path(
- foreach='//td[starts-with(text(), "Countr")]/..//li/a',
- path='./text()'
- )
- ),
- Rule(
- key='country codes',
- extractor=Path(
- foreach='//td[starts-with(text(), "Countr")]/..//li/a',
- path='./@href',
- transform=lambda x: x.split('/')[2].strip().lower()
- )
- ),
- Rule(
- key='language',
- extractor=Path(
- foreach='//td[starts-with(text(), "Language")]/..//li/a',
- path='./text()'
- )
- ),
- Rule(
- key='language codes',
- extractor=Path(
- foreach='//td[starts-with(text(), "Language")]/..//li/a',
- path='./@href',
- transform=lambda x: x.split('/')[2].strip()
- )
- ),
- Rule(
- key='color info',
- extractor=Path(
- foreach='//td[starts-with(text(), "Color")]/..//li/a',
- path='./text()',
- transform=lambda x: x.replace(' (', '::(')
- )
- ),
- Rule(
- key='aspect ratio',
- extractor=Path(
- '//td[starts-with(text(), "Aspect")]/..//li/text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='sound mix',
- extractor=Path(
- foreach='//td[starts-with(text(), "Sound Mix")]/..//li/a',
- path='./text()',
- transform=lambda x: x.replace(' (', '::(')
- )
- ),
- Rule(
- key='box office',
- extractor=Rules(
- foreach='//section[contains(@class, "titlereference-section-box-office")]'
- '//table[contains(@class, "titlereference-list")]//tr',
- rules=[
- Rule(
- key='box_office_title',
- extractor=Path('./td[1]/text()')
- ),
- Rule(
- key='box_office_detail',
- extractor=Path('./td[2]/text()')
- )
- ],
- transform=lambda x: (x['box_office_title'].strip(),
- x['box_office_detail'].strip())
- ),
- ),
- Rule(
- key='certificates',
- extractor=Path(
- '//td[starts-with(text(), "Certificat")]/..//text()',
- transform=analyze_certificates
- )
- ),
- # Collects akas not encosed in <i> tags.
- Rule(
- key='other akas',
- extractor=Path(
- foreach='//section[contains(@class, "listo")]//td[starts-with(text(), "Also Known As")]/..//ul/li',
- path='.//text()',
- transform=clean_akas
- )
- ),
- Rule(
- key='creator',
- extractor=Rules(
- foreach='//div[starts-with(normalize-space(text()), "Creator")]/ul/li[1]/a',
- rules=[
- Rule(
- key='name',
- extractor=Path('./text()')
- ),
- Rule(
- key='link',
- extractor=Path('./@href')
- )
- ],
- transform=lambda x: build_person(
- x.get('name') or '',
- personID=analyze_imdbid(x.get('link'))
- )
- )
- ),
- Rule(
- key='thin writer',
- extractor=Rules(
- foreach='//div[starts-with(normalize-space(text()), "Writer")]/ul/li[1]/a',
- rules=[
- Rule(
- key='name',
- extractor=Path('./text()')
- ),
- Rule(
- key='link',
- extractor=Path('./@href')
- )
- ],
- transform=lambda x: build_person(
- x.get('name') or '',
- personID=analyze_imdbid(x.get('link'))
- )
- )
- ),
- Rule(
- key='thin director',
- extractor=Rules(
- foreach='//div[starts-with(normalize-space(text()), "Director")]/ul/li[1]/a',
- rules=[
- Rule(
- key='name',
- extractor=Path('./text()')
- ),
- Rule(
- key='link',
- extractor=Path('./@href')
- )
- ],
- transform=lambda x: build_person(
- x.get('name') or '',
- personID=analyze_imdbid(x.get('link'))
- )
- )
- ),
- Rule(
- key='top/bottom rank',
- extractor=Path(
- '//li[@class="ipl-inline-list__item"]//a[starts-with(@href, "/chart/")]/text()'
- )
- ),
- Rule(
- key='original air date',
- extractor=Path('//span[@imdbpy="airdate"]/text()')
- ),
- Rule(
- key='series years',
- extractor=Path(
- '//div[@id="tn15title"]//span[starts-with(text(), "TV series")]/text()',
- transform=lambda x: x.replace('TV series', '').strip()
- )
- ),
- Rule(
- key='season/episode',
- extractor=Path(
- '//div[@class="titlereference-overview-season-episode-section"]/ul//text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='number of episodes',
- extractor=Path(
- '//a[starts-with(text(), "All Episodes")]/text()',
- transform=lambda x: int(x.replace('All Episodes', '').strip()[1:-1])
- )
- ),
- Rule(
- key='episode number',
- extractor=Path(
- '//div[@id="tn15epnav"]/text()',
- transform=lambda x: int(re.sub(r'[^a-z0-9 ]', '',
- x.lower()).strip().split()[0]))
- ),
- Rule(
- key='previous episode',
- extractor=Path(
- '//span[@class="titlereference-overview-episodes-links"]'
- '//a[contains(text(), "Previous")]/@href',
- transform=analyze_imdbid
- )
- ),
- Rule(
- key='next episode',
- extractor=Path(
- '//span[@class="titlereference-overview-episodes-links"]'
- '//a[contains(text(), "Next")]/@href',
- transform=analyze_imdbid
- )
- ),
- Rule(
- key='number of seasons',
- extractor=Path(
- '//span[@class="titlereference-overview-years-links"]/../a[1]/text()',
- transform=int
- )
- ),
- Rule(
- key='tv series link',
- extractor=Path('//a[starts-with(text(), "All Episodes")]/@href')
- ),
- Rule(
- key='akas',
- extractor=Path(
- foreach='//i[@class="transl"]',
- path='./text()',
- transform=lambda x: x
- .replace(' ', ' ')
- .rstrip('-')
- .replace('" - ', '"::', 1)
- .strip('"')
- .replace(' ', ' ')
- )
- ),
- Rule(
- key='production status',
- extractor=Path(
- '//td[starts-with(text(), "Status:")]/..//div[@class="info-content"]//text()',
- transform=lambda x: x.strip().split('|')[0].strip().lower()
- )
- ),
- Rule(
- key='production status updated',
- extractor=Path(
- '//td[starts-with(text(), "Status Updated:")]/'
- '..//div[@class="info-content"]//text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='production comments',
- extractor=Path(
- '//td[starts-with(text(), "Comments:")]/'
- '..//div[@class="info-content"]//text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='production note',
- extractor=Path(
- '//td[starts-with(text(), "Note:")]/'
- '..//div[@class="info-content"]//text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='companies',
- extractor=Rules(
- foreach="//ul[@class='simpleList']",
- rules=[
- Rule(
- key=Path('preceding-sibling::header[1]/div/h4/text()', transform=transformers.lower),
- extractor=Rules(
- foreach='./li',
- rules=[
- Rule(
- key='name',
- extractor=Path('./a//text()')
- ),
- Rule(
- key='comp-link',
- extractor=Path('./a/@href')
- ),
- Rule(
- key='notes',
- extractor=Path('./text()')
- )
- ],
- transform=lambda x: Company(
- name=x.get('name') or '',
- accessSystem='http',
- companyID=analyze_imdbid(x.get('comp-link')),
- notes=(x.get('notes') or '').strip()
- )
- )
- )
- ]
- )
- ),
- Rule(
- key='rating',
- extractor=Path('(//span[@class="ipl-rating-star__rating"])[1]/text()')
- ),
- Rule(
- key='votes',
- extractor=Path('//span[@class="ipl-rating-star__total-votes"][1]/text()')
- ),
- Rule(
- key='cover url',
- extractor=Path('//img[@alt="Poster"]/@src')
- ),
- Rule(
- key='imdbID',
- extractor=Path('//meta[@property="pageId"]/@content',
- transform=lambda x: (x or '').replace('tt', ''))
- )
- ]
- preprocessors = [
- ('/releaseinfo">', '"><span imdbpy="airdate">'),
- (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I), r'</div><div>\1'),
- ('<small>Full cast and crew for<br>', ''),
- ('<td> </td>', '<td>...</td>'),
- (re.compile(r'<span class="tv-extra">TV mini-series(\s+.*?)</span>', re.I),
- r'<span class="tv-extra">TV series\1</span> (mini)'),
- (_reRolesMovie, _manageRoles)
- ]
- def preprocess_dom(self, dom):
- # Handle series information.
- xpath = self.xpath(dom, "//b[text()='Series Crew']")
- if xpath:
- b = xpath[-1] # In doubt, take the last one.
- for a in self.xpath(b, "./following::h5/a[@class='glossary']"):
- name = a.get('name')
- if name:
- a.set('name', 'series %s' % name)
- # Remove links to IMDbPro.
- preprocessors.remove(dom, '//span[@class="pro-link"]')
- # Remove some 'more' links (keep others, like the one around
- # the number of votes).
- preprocessors.remove(dom, '//a[@class="tn15more"][starts-with(@href, "/title/")]')
- # Remove the "rest of list" in cast.
- preprocessors.remove(dom, '//td[@colspan="4"]/..')
- return dom
- re_space = re.compile(r'\s+')
- re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)
- def postprocess_data(self, data):
- # Convert section names.
- for sect in list(data.keys()):
- if sect in _SECT_CONV:
- data[_SECT_CONV[sect]] = data[sect]
- del data[sect]
- # Filter out fake values.
- for key in data:
- value = data[key]
- if isinstance(value, list) and value:
- if isinstance(value[0], Person):
- data[key] = [x for x in value if x.personID is not None]
- if isinstance(value[0], _Container):
- for obj in data[key]:
- obj.accessSystem = self._as
- obj.modFunct = self._modFunct
- for key in ['title']:
- if (key in data) and isinstance(data[key], dict):
- subdata = data[key]
- del data[key]
- data.update(subdata)
- if not data.get('original title'):
- if 'original title title-year' in data:
- data['original title'] = data['original title title-year']
- del data['original title title-year']
- elif 'original title title-year' in data:
- del data['original title title-year']
- misc_sections = data.get('misc sections')
- if misc_sections is not None:
- for section in misc_sections:
- # skip sections with their own parsers
- if 'cast' in section.keys():
- continue
- data.update(section)
- del data['misc sections']
- if 'akas' in data or 'other akas' in data:
- akas = data.get('akas') or []
- other_akas = data.get('other akas') or []
- akas += other_akas
- nakas = []
- for aka in akas:
- aka = aka.strip()
- if not aka:
- continue
- if aka.endswith('" -'):
- aka = aka[:-3].rstrip()
- nakas.append(aka)
- if 'akas' in data:
- del data['akas']
- if 'other akas' in data:
- del data['other akas']
- if nakas:
- data['akas'] = nakas
- if 'runtimes' in data:
- data['runtimes'] = [x.replace(' min', '')
- for x in data['runtimes']]
- if 'number of seasons' in data:
- data['seasons'] = [str(i) for i in range(1, data['number of seasons'] + 1)]
- if 'season/episode' in data:
- tokens = data['season/episode'].split('Episode')
- try:
- data['season'] = int(tokens[0].split('Season')[1])
- except:
- data['season'] = 'unknown'
- try:
- data['episode'] = int(tokens[1])
- except:
- data['episode'] = 'unknown'
- del data['season/episode']
- for k in ('writer', 'director'):
- t_k = 'thin %s' % k
- if t_k not in data:
- continue
- if k not in data:
- data[k] = data[t_k]
- del data[t_k]
- if 'top/bottom rank' in data:
- tbVal = data['top/bottom rank'].lower()
- if tbVal.startswith('top'):
- tbKey = 'top 250 rank'
- tbVal = _toInt(tbVal, [('top rated movies: #', '')])
- else:
- tbKey = 'bottom 100 rank'
- tbVal = _toInt(tbVal, [('bottom rated movies: #', '')])
- if tbVal:
- data[tbKey] = tbVal
- del data['top/bottom rank']
- if 'year' in data and data['year'] == '????':
- del data['year']
- if 'tv series link' in data:
- if 'tv series title' in data:
- data['episode of'] = Movie(title=data['tv series title'],
- movieID=analyze_imdbid(data['tv series link']),
- accessSystem=self._as,
- modFunct=self._modFunct)
- data['episode of']['kind'] = 'tv series'
- del data['tv series title']
- del data['tv series link']
- if 'rating' in data:
- try:
- data['rating'] = float(data['rating'].replace('/10', ''))
- except (TypeError, ValueError):
- pass
- if data['rating'] == 0:
- del data['rating']
- if 'votes' in data:
- try:
- votes = data['votes'].replace('(', '').replace(')', '').replace(',', '').replace('votes', '')
- data['votes'] = int(votes)
- except (TypeError, ValueError):
- pass
- companies = data.get('companies')
- if companies:
- for section in companies:
- for key, value in section.items():
- if key in data:
- key = '%s companies' % key
- data.update({key: value})
- del data['companies']
- if 'box office' in data:
- data['box office'] = dict(data['box office'])
- return data
- def _process_plotsummary(x):
- """Process a plot (contributed by Rdian06)."""
- xauthor = x.get('author')
- xplot = x.get('plot', '').strip()
- if xauthor:
- xplot += '::%s' % xauthor
- return xplot
- class DOMHTMLPlotParser(DOMParserBase):
- """Parser for the "plot summary" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a 'plot' key, containing a list
- of string with the structure: 'summary::summary_author <author@email>'.
- Example::
- pparser = HTMLPlotParser()
- result = pparser.parse(plot_summary_html_string)
- """
- _defGetRefs = True
- def synopsis_reducer(nodes):
- ret=[]
- for n in nodes:
- if type(n) is ElementTree._ElementUnicodeResult:
- ret.append(n)
- return '\n\n'.join(ret)
- # Notice that recently IMDb started to put the email of the
- # author only in the link, that we're not collecting, here.
- rules = [
- Rule(
- key='plot',
- extractor=Rules(
- foreach='//ul[@id="plot-summaries-content"]/li',
- rules=[
- Rule(
- key='plot',
- extractor=Path('./p//text()')
- ),
- Rule(
- key='author',
- extractor=Path('.//div[@class="author-container"]//a/text()')
- )
- ],
- transform=_process_plotsummary
- )
- ),
- Rule(
- key='synopsis',
- extractor=Path(
- foreach='//ul[@id="plot-synopsis-content"]',
- path='.//li//node()',
- reduce=synopsis_reducer
- )
- )
- ]
- def preprocess_dom(self, dom):
- preprocessors.remove(dom, '//li[@id="no-summary-content"]')
- return dom
- def postprocess_data(self, data):
- if 'synopsis' in data and data['synopsis'][0] and 'a Synopsis for this title' in data['synopsis'][0]:
- del data['synopsis']
- return data
- def _process_award(x):
- award = {}
- _award = x.get('award')
- if _award is not None:
- _award = _award.strip()
- award['award'] = _award
- if not award['award']:
- return {}
- award['year'] = x.get('year').strip()
- if award['year'] and award['year'].isdigit():
- award['year'] = int(award['year'])
- award['result'] = x.get('result').strip()
- category = x.get('category').strip()
- if category:
- award['category'] = category
- received_with = x.get('with')
- if received_with is not None:
- award['with'] = received_with.strip()
- notes = x.get('notes')
- if notes is not None:
- notes = notes.strip().split('\n', 2)[0]
- notes = re_space.sub(' ', notes)
- if notes:
- award['notes'] = notes
- award['anchor'] = x.get('anchor')
- return award
- class DOMHTMLAwardsParser(DOMParserBase):
- """Parser for the "awards" page of a given person or movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- awparser = HTMLAwardsParser()
- result = awparser.parse(awards_html_string)
- """
- subject = 'title'
- _containsObjects = True
- rules = [
- Rule(
- key='awards',
- extractor=Rules(
- foreach='//*[@id="main"]/div[1]/div/table//tr',
- rules=[
- Rule(
- key='year',
- extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/a/text())')
- ),
- Rule(
- key='result',
- extractor=Path('./td[1]/b/text()')
- ),
- Rule(
- key='award',
- extractor=Path('./td[1]/span/text()')
- ),
- Rule(
- key='category',
- extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/text())')
- ),
- Rule(
- key='notes',
- extractor=Path('./td[2]/text()')
- ),
- Rule(
- key='anchor',
- extractor=Path('.//text()')
- )
- ],
- transform=_process_award
- )
- ),
- Rule(
- key='recipients',
- extractor=Rules(
- foreach='//*[@id="main"]/div[1]/div/table//tr/td[2]/a',
- rules=[
- Rule(
- key='name',
- extractor=Path('./text()')
- ),
- Rule(
- key='link',
- extractor=Path('./@href')
- ),
- Rule(
- key='anchor',
- extractor=Path('./ancestor::tr//text()')
- )
- ]
- )
- )
- ]
- preprocessors = [
- (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
- r'\1</table>'),
- (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
- r'</table><table class="_imdbpy">\1'),
- (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
- (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
- (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
- ]
- def preprocess_dom(self, dom):
- """Repeat td elements according to their rowspan attributes
- in subsequent tr elements.
- """
- cols = self.xpath(dom, "//td[@rowspan]")
- for col in cols:
- span = int(col.get('rowspan'))
- del col.attrib['rowspan']
- position = len(self.xpath(col, "./preceding-sibling::td"))
- row = col.getparent()
- for tr in self.xpath(row, "./following-sibling::tr")[:span - 1]:
- # if not cloned, child will be moved to new parent
- clone = self.clone(col)
- tr.insert(position, clone)
- return dom
- def postprocess_data(self, data):
- if len(data) == 0:
- return {}
- nd = []
- for award in data['awards']:
- matches = [p for p in data.get('recipients', [])
- if 'nm' in p.get('link') and award.get('anchor') == p.get('anchor')]
- if self.subject == 'title':
- recipients = [
- Person(name=recipient['name'],
- personID=analyze_imdbid(recipient['link']))
- for recipient in matches
- ]
- award['to'] = recipients
- elif self.subject == 'name':
- recipients = [
- Movie(title=recipient['name'],
- movieID=analyze_imdbid(recipient['link']))
- for recipient in matches
- ]
- award['for'] = recipients
- nd.append(award)
- if 'anchor' in award:
- del award['anchor']
- return {'awards': nd}
- class DOMHTMLTaglinesParser(DOMParserBase):
- """Parser for the "taglines" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- tparser = DOMHTMLTaglinesParser()
- result = tparser.parse(taglines_html_string)
- """
- rules = [
- Rule(
- key='taglines',
- extractor=Path(
- foreach='//div[@id="taglines_content"]/div',
- path='.//text()'
- )
- )
- ]
- def preprocess_dom(self, dom):
- preprocessors.remove(dom, '//div[@id="taglines_content"]/div[@class="header"]')
- preprocessors.remove(dom, '//div[@id="taglines_content"]/div[@id="no_content"]')
- return dom
- def postprocess_data(self, data):
- if 'taglines' in data:
- data['taglines'] = [tagline.strip() for tagline in data['taglines']]
- return data
- class DOMHTMLKeywordsParser(DOMParserBase):
- """Parser for the "keywords" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- kwparser = DOMHTMLKeywordsParser()
- result = kwparser.parse(keywords_html_string)
- """
- rules = [
- Rule(
- key='keywords',
- extractor=Path(
- foreach='//td[@data-item-keyword]',
- path='./@data-item-keyword',
- transform=lambda x: x.lower().replace(' ', '-')
- )
- ),
- Rule(
- key='relevant keywords',
- extractor=Rules(
- foreach='//td[@data-item-keyword]',
- rules=[
- Rule(
- key='keyword',
- extractor=Path('./@data-item-keyword')
- ),
- Rule(
- key='ordering',
- extractor=Path('./@data-item-votes')
- ),
- Rule(
- key='vote_str',
- extractor=Path('./div[2]/div//text()')
- )
- ],
- transform=lambda x: {
- 'keyword': x.get('keyword').lower(),
- 'keyword_dash': x.get('keyword').lower().replace(' ', '-'),
- 'ordering': x.get('ordering'),
- 'votes_str': x.get('vote_str').strip().lower()
- }
- )
- )
- ]
- def postprocess_data(self, data):
- if 'relevant keywords' in data:
- rk = []
- for x in data['relevant keywords']:
- if 'votes_str' in x:
- if 'is this relevant?' in x['votes_str']:
- x['votes_for'] = 0
- x['total_votes'] = 0
- else:
- x['votes_for'] = x['votes_str'].split('of')[0].strip()
- x['total_votes'] = re.sub(r"\D", "", x['votes_str'].split('of')[1]).strip()
- rk.append(x)
- data['relevant keywords'] = rk
- return data
- class DOMHTMLAlternateVersionsParser(DOMParserBase):
- """Parser for the "alternate versions" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- avparser = DOMHTMLAlternateVersionsParser()
- result = avparser.parse(alternateversions_html_string)
- """
- _defGetRefs = True
- rules = [
- Rule(
- key='alternate versions',
- extractor=Path(
- foreach='//ul[@class="trivia"]/li',
- path='.//text()',
- transform=transformers.strip
- )
- )
- ]
- class DOMHTMLTriviaParser(DOMParserBase):
- """Parser for the "trivia" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- tparser = DOMHTMLTriviaParser()
- result = tparser.parse(trivia_html_string)
- """
- _defGetRefs = True
- rules = [
- Rule(
- key='trivia',
- extractor=Path(
- foreach='//div[@class="sodatext"]',
- path='.//text()',
- transform=transformers.strip
- )
- )
- ]
- def preprocess_dom(self, dom):
- # Remove "link this quote" links.
- preprocessors.remove(dom, '//span[@class="linksoda"]')
- return dom
- class DOMHTMLSoundtrackParser(DOMParserBase):
- """Parser for the "soundtrack" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- stparser = DOMHTMLSoundtrackParser()
- result = stparser.parse(soundtrack_html_string)
- """
- _defGetRefs = True
- preprocessors = [('<br />', '\n'), ('<br>', '\n')]
- rules = [
- Rule(
- key='soundtrack',
- extractor=Path(
- foreach='//div[@class="list"]//div',
- path='.//text()',
- transform=transformers.strip
- )
- )
- ]
- def postprocess_data(self, data):
- if 'soundtrack' in data:
- nd = []
- for x in data['soundtrack']:
- ds = x.split('\n')
- title = ds[0]
- if title[0] == '"' and title[-1] == '"':
- title = title[1:-1]
- nds = []
- newData = {}
- for l in ds[1:]:
- if ' with ' in l or ' by ' in l or ' from ' in l \
- or ' of ' in l or l.startswith('From '):
- nds.append(l)
- else:
- if nds:
- nds[-1] += l
- else:
- nds.append(l)
- newData[title] = {}
- for l in nds:
- skip = False
- for sep in ('From ',):
- if l.startswith(sep):
- fdix = len(sep)
- kind = l[:fdix].rstrip().lower()
- info = l[fdix:].lstrip()
- newData[title][kind] = info
- skip = True
- if not skip:
- for sep in ' with ', ' by ', ' from ', ' of ':
- fdix = l.find(sep)
- if fdix != -1:
- fdix = fdix + len(sep)
- kind = l[:fdix].rstrip().lower()
- info = l[fdix:].lstrip()
- newData[title][kind] = info
- break
- nd.append(newData)
- data['soundtrack'] = nd
- return data
- class DOMHTMLCrazyCreditsParser(DOMParserBase):
- """Parser for the "crazy credits" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- ccparser = DOMHTMLCrazyCreditsParser()
- result = ccparser.parse(crazycredits_html_string)
- """
- _defGetRefs = True
- rules = [
- Rule(
- key='crazy credits',
- extractor=Path(
- foreach='//ul/li/tt',
- path='.//text()',
- transform=lambda x: x.replace('\n', ' ').replace(' ', ' ')
- )
- )
- ]
- def _process_goof(x):
- text = (x.get('text') or '').strip()
- category = (x.get('category') or 'Goof').strip()
- return {"category": category, "text": text}
- class DOMHTMLGoofsParser(DOMParserBase):
- """Parser for the "goofs" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- gparser = DOMHTMLGoofsParser()
- result = gparser.parse(goofs_html_string)
- """
- _defGetRefs = True
- rules = [
- Rule(
- key='goofs',
- extractor=Rules(
- foreach='//div[contains(@class, "soda sodavote")]',
- rules=[
- Rule(
- key='text',
- extractor=Path('./div[@class="sodatext"]/text()')
- ),
- Rule(
- key='category',
- extractor=Path('./preceding-sibling::h4[1]/text()')
- )
- ],
- transform=_process_goof
- )
- )
- ]
- class DOMHTMLQuotesParser(DOMParserBase):
- """Parser for the "memorable quotes" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- qparser = DOMHTMLQuotesParser()
- result = qparser.parse(quotes_html_string)
- """
- _defGetRefs = True
- rules = [
- Rule(
- key='quotes',
- extractor=Path(
- foreach='//div[@class="sodatext"]',
- path='.//text()',
- transform=lambda x: x
- .strip()
- .replace(' \n', '::')
- .replace('::\n', '::')
- .replace('\n', ' ')
- )
- )
- ]
- def preprocess_dom(self, dom):
- preprocessors.remove(dom, '//div[@class="did-you-know-actions"]')
- return dom
- def postprocess_data(self, data):
- quotes = data.get('quotes', [])
- if not quotes:
- return {}
- quotes = [q.split('::') for q in quotes]
- return {'quotes': quotes}
- class DOMHTMLReleaseinfoParser(DOMParserBase):
- """Parser for the "release dates" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- rdparser = DOMHTMLReleaseinfoParser()
- result = rdparser.parse(releaseinfo_html_string)
- """
- rules = [
- Rule(
- key='release dates',
- extractor=Rules(
- foreach='//table[contains(@class, "release-dates-table-test-only")]//tr',
- rules=[
- Rule(
- key='country',
- extractor=Path('.//td[1]//text()')
- ),
- Rule(
- key='country_code',
- extractor=Path('.//td[1]/a/@href')
- ),
- Rule(
- key='date',
- extractor=Path('.//td[2]//text()')
- ),
- Rule(
- key='notes',
- extractor=Path('.//td[3]//text()')
- )
- ]
- )
- ),
- Rule(
- key='akas',
- extractor=Rules(
- foreach='//table[contains(@class, "akas-table-test-only")]//tr',
- rules=[
- Rule(
- key='countries',
- extractor=Path('./td[1]/text()')
- ),
- Rule(
- key='title',
- extractor=Path('./td[2]/text()')
- )
- ]
- )
- )
- ]
- preprocessors = [
- (re.compile('(<h5><a name="?akas"?.*</table>)', re.I | re.M | re.S),
- r'<div class="_imdbpy_akas">\1</div>')
- ]
- def postprocess_data(self, data):
- if not ('release dates' in data or 'akas' in data):
- return data
- releases = data.get('release dates') or []
- rl = []
- for i in releases:
- country = i.get('country')
- date = i.get('date')
- if not (country and date):
- continue
- country = country.strip()
- date = date.strip()
- if not (country and date):
- continue
- notes = i.get('notes')
- info = '%s::%s' % (country, date)
- if notes:
- notes = notes.replace('\n', '')
- i['notes'] = notes
- info += notes
- rl.append(info)
- if releases:
- for rd in data['release dates']:
- rd['country_code'] = rd['country_code'].split('region=')[1].split('&')[0].strip().upper()
- data['raw release dates'] = data['release dates']
- del data['release dates']
- if rl:
- data['release dates'] = rl
- akas = data.get('akas') or []
- nakas = []
- for aka in akas:
- title = (aka.get('title') or '').strip()
- if not title:
- continue
- countries = (aka.get('countries') or '').split(',')
- if not countries:
- nakas.append(title)
- else:
- for country in countries:
- nakas.append('%s %s' % (title, country.strip()))
- if akas:
- if releases:
- for rd in data['raw release dates']:
- for a in data['akas']:
- if 'countries' in a:
- if rd['country'].strip() in a['countries'].strip():
- a['country_code'] = rd['country_code']
- data['raw akas'] = data['akas']
- del data['akas']
- if nakas:
- data['akas'] = data['akas from release info'] = nakas
- return data
- class DOMHTMLRatingsParser(DOMParserBase):
- """Parser for the "user ratings" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- rparser = DOMHTMLRatingsParser()
- result = rparser.parse(userratings_html_string)
- """
- re_means = re.compile(r'mean\s*=\s*([0-9]\.[0-9])\s*median\s*=\s*([0-9])', re.I)
- rules = [
- Rule(
- key='votes',
- extractor=Rules(
- foreach='//th[@class="firstTableCoulmn"]/../../tr',
- rules=[
- Rule(
- key='ordinal',
- extractor=Path('./td[1]/div//text()')
- ),
- Rule(
- key='votes',
- extractor=Path('./td[3]/div/div//text()')
- )
- ]
- )
- ),
- Rule(
- key='mean and median',
- extractor=Path(
- '//div[starts-with(normalize-space(text()), "Arithmetic mean")]/text()'
- )
- ),
- Rule(
- key='demographics',
- extractor=Rules(
- foreach='//div[@class="smallcell"]',
- rules=[
- Rule(
- key='link',
- extractor=Path('./a/@href')
- ),
- Rule(
- key='rating',
- extractor=Path('..//div[@class="bigcell"]//text()')
- ),
- Rule(
- key='votes',
- extractor=Path('./a/text()')
- )
- ]
- )
- )
- ]
- def postprocess_data(self, data):
- nd = {}
- demographics = data.get('demographics')
- if demographics:
- dem = {}
- for dem_data in demographics:
- link = (dem_data.get('link') or '').strip()
- votes = (dem_data.get('votes') or '').strip()
- rating = (dem_data.get('rating') or '').strip()
- if not (link and votes and rating):
- continue
- eq_idx = link.rfind('=')
- if eq_idx == -1:
- continue
- info = link[eq_idx + 1:].replace('_', ' ')
- try:
- votes = int(votes.replace(',', ''))
- except Exception:
- continue
- try:
- rating = float(rating)
- except Exception:
- continue
- dem[info] = {'votes': votes, 'rating': rating}
- nd['demographics'] = dem
- votes = data.get('votes', [])
- if votes:
- nd['number of votes'] = {}
- for v_info in votes:
- ordinal = v_info.get('ordinal')
- nr_votes = v_info.get('votes')
- if not (ordinal and nr_votes):
- continue
- try:
- ordinal = int(ordinal)
- except Exception:
- continue
- try:
- nr_votes = int(nr_votes.replace(',', ''))
- except Exception:
- continue
- nd['number of votes'][ordinal] = nr_votes
- mean = data.get('mean and median', '')
- if mean:
- means = self.re_means.findall(mean)
- if means and len(means[0]) == 2:
- am, med = means[0]
- try:
- am = float(am)
- except (ValueError, OverflowError):
- pass
- if isinstance(am, float):
- nd['arithmetic mean'] = am
- try:
- med = int(med)
- except (ValueError, OverflowError):
- pass
- if isinstance(med, int):
- nd['median'] = med
- return nd
- def _normalize_href(href):
- if (href is not None) and (not href.lower().startswith('http://')):
- if href.startswith('/'):
- href = href[1:]
- # TODO: imdbURL_base may be set by the user!
- href = '%s%s' % (imdbURL_base, href)
- return href
- class DOMHTMLCriticReviewsParser(DOMParserBase):
- """Parser for the "critic reviews" pages of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- crparser = DOMHTMLCriticReviewsParser()
- result = crparser.parse(criticreviews_html_string)
- """
- kind = 'critic reviews'
- rules = [
- Rule(
- key='metascore',
- extractor=Path('//div[@class="metascore_wrap"]/div/span//text()')
- ),
- Rule(
- key='metacritic url',
- extractor=Path('//div[@class="article"]/div[@class="see-more"]/a/@href')
- )
- ]
- class DOMHTMLReviewsParser(DOMParserBase):
- """Parser for the "reviews" pages of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- rparser = DOMHTMLReviewsParser()
- result = rparser.parse(reviews_html_string)
- """
- rules = [
- Rule(
- key='reviews',
- extractor=Rules(
- foreach='//div[@class="review-container"]',
- rules=[
- Rule(
- key='text',
- extractor=Path('.//div[@class="text show-more__control"]//text()')
- ),
- Rule(
- key='helpful',
- extractor=Path('.//div[@class="actions text-muted"]//text()[1]')
- ),
- Rule(
- key='title',
- extractor=Path('.//a[@class="title"]//text()')
- ),
- Rule(
- key='author',
- extractor=Path('.//span[@class="display-name-link"]/a/@href')
- ),
- Rule(
- key='date',
- extractor=Path('.//span[@class="review-date"]//text()')
- ),
- Rule(
- key='rating',
- extractor=Path('.//span[@class="point-scale"]/preceding-sibling::span[1]/text()')
- )
- ],
- transform=lambda x: ({
- 'content': x.get('text', '').replace('\n', ' ').replace(' ', ' ').strip(),
- 'helpful': [int(s) for s in x.get('helpful', '').split() if s.isdigit()],
- 'title': x.get('title', '').strip(),
- 'author': analyze_imdbid(x.get('author')),
- 'date': x.get('date', '').strip(),
- 'rating': x.get('rating', '').strip()
- })
- )
- )
- ]
- preprocessors = [('<br>', '<br>\n')]
- def postprocess_data(self, data):
- for review in data.get('reviews', []):
- if review.get('rating'):
- if isinstance(review['rating'], str):
- review['rating'] = int(review['rating'])
- elif len(review['rating']) == 2: # May be legacy code.
- review['rating'] = int(review['rating'][0])
- else:
- review['rating'] = None
- else:
- review['rating'] = None
- if review.get('helpful') and len(review['helpful']) == 2:
- review['not_helpful'] = review['helpful'][1] - review['helpful'][0]
- review['helpful'] = review['helpful'][0]
- else:
- review['helpful'] = 0
- review['not_helpful'] = 0
- review['author'] = "ur%s" % review['author']
- return data
- class DOMHTMLFullCreditsParser(DOMParserBase):
- """Parser for the "full credits" (series cast section) page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- fcparser = DOMHTMLFullCreditsParser()
- result = fcparser.parse(fullcredits_html_string)
- """
- kind = 'full credits'
- rules = [
- Rule(
- key='cast',
- extractor=Rules(
- foreach='//table[@class="cast_list"]//tr[@class="odd" or @class="even"]',
- rules=[
- Rule(
- key='person',
- extractor=Path('.//text()')
- ),
- Rule(
- key='link',
- extractor=Path('./td[2]/a/@href')
- ),
- Rule(
- key='roleID',
- extractor=Path('./td[4]//div[@class="_imdbpyrole"]/@roleid')
- ),
- Rule(
- key='headshot',
- extractor=Path('./td[@class="primary_photo"]/a/img/@loadlate')
- )
- ],
- transform=lambda x: build_person(
- x.get('person', ''),
- personID=analyze_imdbid(x.get('link')),
- roleID=(x.get('roleID', '')).split('/'),
- headshot=(x.get('headshot', ''))
- )
- )
- ),
- # parser for misc sections like 'casting department', 'stunts', ...
- Rule(
- key='misc sections',
- extractor=Rules(
- foreach='//h4[contains(@class, "dataHeaderWithBorder")]',
- rules=[
- Rule(
- key=Path('./@name', transform=clean_section_name),
- extractor=Rules(
- foreach='./following-sibling::table[1]//tr',
- rules=[
- Rule(
- key='person',
- extractor=Path('.//text()')
- ),
- Rule(
- key='link',
- extractor=Path('./td[1]/a[@href]/@href')
- )
- ],
- transform=lambda x: build_person(
- x.get('person') or '',
- personID=analyze_imdbid(x.get('link'))
- )
- )
- )
- ]
- )
- ),
- ]
- preprocessors = [
- (_reRolesMovie, _manageRoles)
- ]
- def postprocess_data(self, data):
- # Convert section names.
- clean_cast = []
- for person in data.get('cast', []):
- if person.personID and person.get('name'):
- clean_cast.append(person)
- if clean_cast:
- data['cast'] = clean_cast
- misc_sections = data.get('misc sections')
- if misc_sections is not None:
- for section in misc_sections:
- for sectName, sectData in section.items():
- # skip sections with their own parsers
- if sectName in ('cast',):
- continue
- newName = _SECT_CONV.get(sectName, sectName)
- if sectData:
- data[newName] = sectData
- del data['misc sections']
- return data
- class DOMHTMLOfficialsitesParser(DOMParserBase):
- """Parser for the "official sites", "external reviews"
- "miscellaneous links", "sound clips", "video clips" and
- "photographs" pages of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- osparser = DOMHTMLOfficialsitesParser()
- result = osparser.parse(officialsites_html_string)
- """
- rules = [
- Rule(
- foreach='//h4[@class="li_group"]',
- key=Path(
- './text()',
- transform=lambda x: x.strip().lower()
- ),
- extractor=Rules(
- foreach='./following::ul[1]/li/a',
- rules=[
- Rule(
- key='link',
- extractor=Path('./@href')
- ),
- Rule(
- key='info',
- extractor=Path('./text()')
- )
- ],
- transform=lambda x: (
- x.get('info').strip(),
- unquote(_normalize_href(x.get('link')))
- )
- )
- )
- ]
- class DOMHTMLConnectionsParser(DOMParserBase):
- """Parser for the "connections" pages of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- osparser = DOMHTMLOfficialsitesParser()
- result = osparser.parse(officialsites_html_string)
- """
- preprocessors = [
- (re.compile('(<h4 class="li_group">)', re.I), r'</div><div class="_imdbpy">\1'),
- (re.compile('(^<br />.*$)', re.I | re.M), r''),
- ]
- rules = [
- Rule(
- foreach='//div[@class="_imdbpy"]',
- key=Path(
- './h4/text()',
- transform=lambda x: x.strip().lower()
- ),
- extractor=Rules(
- foreach='./div[contains(@class, "soda")]',
- rules=[
- Rule(
- key='link',
- extractor=Path('./a/@href')
- ),
- Rule(
- key='info',
- extractor=Path('.//text()')
- )
- ],
- transform=lambda x: (
- x.get('info').strip(),
- unquote(_normalize_href(x.get('link')))
- )
- )
- )
- ]
- def postprocess_data(self, data):
- connections = {}
- for k, v in data.items():
- k = k.strip()
- if not (k and v):
- continue
- movies = []
- for title, link in v:
- title = title.strip().replace('\n', '')
- movieID = analyze_imdbid(link)
- if not (title and movieID):
- continue
- movie = Movie(title=title, movieID=movieID,
- accessSystem=self._as, modFunct=self._modFunct)
- movies.append(movie)
- if movies:
- connections[k] = movies
- return {'connections': connections}
- class DOMHTMLLocationsParser(DOMParserBase):
- """Parser for the "locations" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- lparser = DOMHTMLLocationsParser()
- result = lparser.parse(locations_html_string)
- """
- rules = [
- Rule(
- key='locations',
- extractor=Rules(
- foreach='//dt',
- rules=[
- Rule(
- key='place',
- extractor=Path('.//text()')
- ),
- Rule(
- key='note',
- extractor=Path('./following-sibling::dd[1]//text()')
- )
- ],
- transform=lambda x: ('%s::%s' % (x['place'].strip(),
- (x['note'] or '').strip())).strip(':')
- )
- )
- ]
- class DOMHTMLTechParser(DOMParserBase):
- """Parser for the "technical", "publicity" (for people) and "contacts" (for people)
- pages of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- tparser = DOMHTMLTechParser()
- result = tparser.parse(technical_html_string)
- """
- kind = 'tech'
- re_space = re.compile(r'\s+')
- rules = [
- Rule(
- key='tech',
- extractor=Rules(
- foreach='//table//tr/td[@class="label"]',
- rules=[
- Rule(
- key=Path(
- './text()',
- transform=lambda x: x.lower().strip()),
- extractor=Path(
- '..//td[2]//text()',
- transform=lambda x: [t.strip()
- for t in x.split(':::') if t.strip()]
- )
- )
- ]
- )
- )
- ]
- preprocessors = [
- (re.compile('(<h5>.*?</h5>)', re.I), r'</div>\1<div class="_imdbpy">'),
- (re.compile('((<br/>|</p>|</table>))\n?<br/>(?!<a)', re.I), r'\1</div>'),
- # the ones below are for the publicity parser
- (re.compile('<p>(.*?)</p>', re.I), r'\1<br/>'),
- (re.compile('(</td><td valign="top">)', re.I), r'\1::'),
- (re.compile('(</tr><tr>)', re.I), r'\n\1'),
- (re.compile(r'<span class="ghost">\|</span>', re.I), r':::'),
- (re.compile('<br/?>', re.I), r':::')
- # this is for splitting individual entries
- ]
- def postprocess_data(self, data):
- info = {}
- for section in data.get('tech', []):
- info.update(section)
- for key, value in info.items():
- if isinstance(value, list):
- info[key] = [self.re_space.sub(' ', x).strip() for x in value]
- else:
- info[key] = self.re_space.sub(' ', value).strip()
- return {self.kind: info}
- class DOMHTMLNewsParser(DOMParserBase):
- """Parser for the "news" page of a given movie or person.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- nwparser = DOMHTMLNewsParser()
- result = nwparser.parse(news_html_string)
- """
- _defGetRefs = True
- rules = [
- Rule(
- key='news',
- extractor=Rules(
- foreach='//h2',
- rules=[
- Rule(
- key='title',
- extractor=Path('./text()')
- ),
- Rule(
- key='fromdate',
- extractor=Path('./following-sibling::p[1]/small//text()')
- ),
- Rule(
- key='body',
- extractor=Path('../following-sibling::p[2]//text()')
- ),
- Rule(
- key='link',
- extractor=Path('../..//a[text()="Permalink"]/@href')
- ),
- Rule(
- key='fulllink',
- extractor=Path('../..//a[starts-with(text(), "See full article at")]/@href')
- )
- ],
- transform=lambda x: {
- 'title': x.get('title').strip(),
- 'date': x.get('fromdate').split('|')[0].strip(),
- 'from': x.get('fromdate').split('|')[1].replace('From ', '').strip(),
- 'body': (x.get('body') or '').strip(),
- 'link': _normalize_href(x.get('link')),
- 'full article link': _normalize_href(x.get('fulllink'))
- }
- )
- )
- ]
- preprocessors = [
- (re.compile('(<a name=[^>]+><h2>)', re.I), r'<div class="_imdbpy">\1'),
- (re.compile('(<hr/>)', re.I), r'</div>\1'),
- (re.compile('<p></p>', re.I), r'')
- ]
- def postprocess_data(self, data):
- if 'news' not in data:
- return {}
- for news in data['news']:
- if 'full article link' in news:
- if news['full article link'] is None:
- del news['full article link']
- return data
- def _parse_review(x):
- result = {}
- title = x.get('title').strip()
- if title[-1] == ':':
- title = title[:-1]
- result['title'] = title
- result['link'] = _normalize_href(x.get('link'))
- kind = x.get('kind').strip()
- if kind[-1] == ':':
- kind = kind[:-1]
- result['review kind'] = kind
- text = x.get('review').replace('\n\n', '||').replace('\n', ' ').split('||')
- review = '\n'.join(text)
- if x.get('author') is not None:
- author = x.get('author').strip()
- review = review.split(author)[0].strip()
- result['review author'] = author[2:]
- if x.get('item') is not None:
- item = x.get('item').strip()
- review = review[len(item):].strip()
- review = "%s: %s" % (item, review)
- result['review'] = review
- return result
- class DOMHTMLSeasonEpisodesParser(DOMParserBase):
- """Parser for the "episode list" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- sparser = DOMHTMLSeasonEpisodesParser()
- result = sparser.parse(episodes_html_string)
- """
- rules = [
- Rule(
- key='series link',
- extractor=Path('//div[@class="parent"]//a/@href')
- ),
- Rule(
- key='series title',
- extractor=Path('//head/meta[@property="og:title"]/@content')
- ),
- Rule(
- key='_seasons',
- extractor=Path(
- foreach='//select[@id="bySeason"]//option',
- path='./@value'
- )
- ),
- Rule(
- key='_current_season',
- extractor=Path('//select[@id="bySeason"]//option[@selected]/@value')
- ),
- Rule(
- key='episodes',
- extractor=Rules(
- foreach='//div[@class="info"]',
- rules=[
- Rule(
- key=Path('.//meta/@content',
- transform=lambda x: 'episode %s' % x),
- extractor=Rules(
- rules=[
- Rule(
- key='link',
- extractor=Path('.//strong//a[@href][1]/@href')
- ),
- Rule(
- key='original air date',
- extractor=Path('.//div[@class="airdate"]/text()')
- ),
- Rule(
- key='title',
- extractor=Path('.//strong//text()')
- ),
- Rule(
- key='rating',
- extractor=Path(
- './/div[contains(@class, "ipl-rating-star")][1]'
- '/span[@class="ipl-rating-star__rating"][1]/text()'
- )
- ),
- Rule(
- key='votes',
- extractor=Path(
- './/div[contains(@class, "ipl-rating-star")][1]'
- '/span[@class="ipl-rating-star__total-votes"][1]/text()'
- )
- ),
- Rule(
- key='plot',
- extractor=Path('.//div[@class="item_description"]//text()')
- )
- ]
- )
- )
- ]
- )
- )
- ]
- def postprocess_data(self, data):
- series_id = analyze_imdbid(data.get('series link'))
- series_title = data.get('series title', '').strip()
- selected_season = data.get('_current_season', 'unknown season').strip()
- if not (series_id and series_title):
- return {}
- series = Movie(title=series_title, movieID=str(series_id),
- accessSystem=self._as, modFunct=self._modFunct)
- if series.get('kind') == 'movie':
- series['kind'] = 'tv series'
- try:
- selected_season = int(selected_season)
- except ValueError:
- pass
- nd = {selected_season: {}}
- if 'episode -1' in data:
- counter = 1
- for episode in data['episode -1']:
- while 'episode %d' % counter in data:
- counter += 1
- k = 'episode %d' % counter
- data[k] = [episode]
- del data['episode -1']
- episodes = data.get('episodes', [])
- for ep in episodes:
- if not ep:
- continue
- episode_nr, episode = list(ep.items())[0]
- if not episode_nr.startswith('episode '):
- continue
- episode_nr = episode_nr[8:].rstrip()
- try:
- episode_nr = int(episode_nr)
- except ValueError:
- pass
- episode_id = analyze_imdbid(episode.get('link' ''))
- episode_air_date = episode.get('original air date', '').strip()
- episode_title = episode.get('title', '').strip()
- episode_plot = episode.get('plot', '')
- episode_rating = episode.get('rating', '')
- episode_votes = episode.get('votes', '')
- if not (episode_nr is not None and episode_id and episode_title):
- continue
- ep_obj = Movie(movieID=episode_id, title=episode_title,
- accessSystem=self._as, modFunct=self._modFunct)
- ep_obj['kind'] = 'episode'
- ep_obj['episode of'] = series
- ep_obj['season'] = selected_season
- ep_obj['episode'] = episode_nr
- if episode_rating:
- try:
- ep_obj['rating'] = float(episode_rating)
- except:
- pass
- if episode_votes:
- try:
- ep_obj['votes'] = int(episode_votes.replace(',', '')
- .replace('.', '').replace('(', '').replace(')', ''))
- except:
- pass
- if episode_air_date:
- ep_obj['original air date'] = episode_air_date
- if episode_air_date[-4:].isdigit():
- ep_obj['year'] = episode_air_date[-4:]
- if episode_plot:
- ep_obj['plot'] = episode_plot
- nd[selected_season][episode_nr] = ep_obj
- _seasons = data.get('_seasons') or []
- for idx, season in enumerate(_seasons):
- try:
- _seasons[idx] = int(season)
- except ValueError:
- pass
- return {'episodes': nd, '_seasons': _seasons, '_current_season': selected_season}
- def _build_episode(x):
- """Create a Movie object for a given series' episode."""
- episode_id = analyze_imdbid(x.get('link'))
- episode_title = x.get('title')
- e = Movie(movieID=episode_id, title=episode_title)
- e['kind'] = 'episode'
- oad = x.get('oad')
- if oad:
- e['original air date'] = oad.strip()
- year = x.get('year')
- if year is not None:
- year = year[5:]
- if year == 'unknown':
- year = '????'
- if year and year.isdigit():
- year = int(year)
- e['year'] = year
- else:
- if oad and oad[-4:].isdigit():
- e['year'] = int(oad[-4:])
- epinfo = x.get('episode')
- if epinfo is not None:
- season, episode = epinfo.split(':')[0].split(',')
- e['season'] = int(season[7:])
- e['episode'] = int(episode[8:])
- else:
- e['season'] = 'unknown'
- e['episode'] = 'unknown'
- plot = x.get('plot')
- if plot:
- e['plot'] = plot.strip()
- return e
- class DOMHTMLEpisodesParser(DOMParserBase):
- """Parser for the "episode list" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- eparser = DOMHTMLEpisodesParser()
- result = eparser.parse(episodes_html_string)
- """
- kind = 'episodes list'
- _episodes_path = "..//h4"
- _oad_path = "./following-sibling::span/strong[1]/text()"
- def _init(self):
- self.rules = [
- Rule(
- key='series title',
- extractor=Path('//title/text()')
- ),
- Rule(
- key='series movieID',
- extractor=Path(
- './/h1/a[@class="main"]/@href',
- transform=analyze_imdbid
- )
- ),
- Rule(
- key='episodes',
- extractor=Rules(
- foreach='//div[@class="_imdbpy"]/h3',
- rules=[
- Rule(
- key='./a/@name',
- extractor=Rules(
- foreach=self._episodes_path,
- rules=[
- Rule(
- key='link',
- extractor=Path('./a/@href')
- ),
- Rule(
- key='title',
- extractor=Path('./a/text()')
- ),
- Rule(
- key='year',
- extractor=Path('./preceding-sibling::a[1]/@name')
- ),
- Rule(
- key='episode',
- extractor=Path('./text()[1]')
- ),
- Rule(
- key='oad',
- extractor=Path(self._oad_path)
- ),
- Rule(
- key='plot',
- extractor=Path('./following-sibling::text()[1]')
- )
- ],
- transform=_build_episode
- )
- )
- ]
- )
- )
- ]
- preprocessors = [
- (re.compile('(<hr/>\n)(<h3>)', re.I), r'</div>\1<div class="_imdbpy">\2'),
- (re.compile('(</p>\n\n)</div>', re.I), r'\1'),
- (re.compile('<h3>(.*?)</h3>', re.I), r'<h4>\1</h4>'),
- (_reRolesMovie, _manageRoles),
- (re.compile('(<br/> <br/>\n)(<hr/>)', re.I), r'\1</div>\2')
- ]
- def postprocess_data(self, data):
- # A bit extreme?
- if 'series title' not in data:
- return {}
- if 'series movieID' not in data:
- return {}
- stitle = data['series title'].replace('- Episode list', '')
- stitle = stitle.replace('- Episodes list', '')
- stitle = stitle.replace('- Episode cast', '')
- stitle = stitle.replace('- Episodes cast', '')
- stitle = stitle.strip()
- if not stitle:
- return {}
- seriesID = data['series movieID']
- if seriesID is None:
- return {}
- series = Movie(title=stitle, movieID=str(seriesID),
- accessSystem=self._as, modFunct=self._modFunct)
- nd = {}
- for key in list(data.keys()):
- if key.startswith('filter-season-') or key.startswith('season-'):
- season_key = key.replace('filter-season-', '').replace('season-', '')
- try:
- season_key = int(season_key)
- except ValueError:
- pass
- nd[season_key] = {}
- ep_counter = 1
- for episode in data[key]:
- if not episode:
- continue
- episode_key = episode.get('episode')
- if episode_key is None:
- continue
- if not isinstance(episode_key, int):
- episode_key = ep_counter
- ep_counter += 1
- cast_key = 'Season %s, Episode %s:' % (season_key, episode_key)
- if cast_key in data:
- cast = data[cast_key]
- for i in range(len(cast)):
- cast[i].billingPos = i + 1
- episode['cast'] = cast
- episode['episode of'] = series
- nd[season_key][episode_key] = episode
- if len(nd) == 0:
- return {}
- return {'episodes': nd}
- class DOMHTMLFaqsParser(DOMParserBase):
- """Parser for the "FAQ" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- fparser = DOMHTMLFaqsParser()
- result = fparser.parse(faqs_html_string)
- """
- _defGetRefs = True
- rules = [
- Rule(
- key='faqs',
- extractor=Rules(
- foreach='//div[@class="section"]',
- rules=[
- Rule(
- key='question',
- extractor=Path('./h3/a/span/text()')
- ),
- Rule(
- key='answer',
- extractor=Path('../following-sibling::div[1]//text()')
- )
- ],
- transform=lambda x: '%s::%s' % (
- x.get('question').strip(),
- '\n\n'.join(x.get('answer').replace('\n\n', '\n').strip().split('||'))
- )
- )
- )
- ]
- preprocessors = [
- (re.compile('<br/><br/>', re.I), r'||'),
- (re.compile('<h4>(.*?)</h4>\n', re.I), r'||\1--'),
- (re.compile('<span class="spoiler"><span>(.*?)</span></span>', re.I),
- r'[spoiler]\1[/spoiler]')
- ]
- class DOMHTMLAiringParser(DOMParserBase):
- """Parser for the "airing" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- aparser = DOMHTMLAiringParser()
- result = aparser.parse(airing_html_string)
- """
- _containsObjects = True
- rules = [
- Rule(
- key='series title',
- extractor=Path(
- '//title/text()',
- transform=lambda x: x.replace(' - TV schedule', '')
- )
- ),
- Rule(
- key='series id',
- extractor=Path('//h1/a[@href]/@href')
- ),
- Rule(
- key='tv airings',
- extractor=Rules(
- foreach='//tr[@class]',
- rules=[
- Rule(
- key='date',
- extractor=Path('./td[1]//text()')
- ),
- Rule(
- key='time',
- extractor=Path('./td[2]//text()')
- ),
- Rule(
- key='channel',
- extractor=Path('./td[3]//text()')
- ),
- Rule(
- key='link',
- extractor=Path('./td[4]/a[1]/@href')
- ),
- Rule(
- key='title',
- extractor=Path('./td[4]//text()')
- ),
- Rule(
- key='season',
- extractor=Path('./td[5]//text()')
- )
- ],
- transform=lambda x: {
- 'date': x.get('date'),
- 'time': x.get('time'),
- 'channel': x.get('channel').strip(),
- 'link': x.get('link'),
- 'title': x.get('title'),
- 'season': (x.get('season') or '').strip()
- }
- )
- )
- ]
- def postprocess_data(self, data):
- if len(data) == 0:
- return {}
- seriesTitle = data.get('series title') or ''
- seriesID = analyze_imdbid(data.get('series id'))
- if seriesID and 'airing' in data:
- for airing in data['airing']:
- title = airing.get('title', '').strip()
- if not title:
- epsTitle = seriesTitle
- if seriesID is None:
- continue
- epsID = seriesID
- else:
- epsTitle = '%s {%s}' % (data['series title'],
- airing['title'])
- epsID = analyze_imdbid(airing['link'])
- e = Movie(title=epsTitle, movieID=epsID)
- airing['episode'] = e
- del airing['link']
- del airing['title']
- if not airing['season']:
- del airing['season']
- if 'series title' in data:
- del data['series title']
- if 'series id' in data:
- del data['series id']
- if 'airing' in data:
- data['airing'] = [_f for _f in data['airing'] if _f]
- if 'airing' not in data or not data['airing']:
- return {}
- return data
- class DOMHTMLParentsGuideParser(DOMParserBase):
- """Parser for the "parents guide" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- pgparser = HTMLParentsGuideParser()
- result = pgparser.parse(parentsguide_html_string)
- """
- rules = [
- Rule(
- key='mpaa',
- extractor=Path(
- '//tr[@id="mpaa-rating"]/td[2]//text()'
- )
- ),
- Rule(
- key='certificates',
- extractor=Rules(
- foreach='//tr[@id="certifications-list"]//li',
- rules=[
- Rule(
- key='full',
- extractor=Path('./a//text()')
- ),
- Rule(
- key='country_code',
- extractor=Path('./a/@href')
- ),
- Rule(
- key='note',
- extractor=Path('./text()')
- ),
- ],
- transform=lambda x: {
- 'country_code': x.get('country_code').split('certificates=')[1].split(':')[0].strip(),
- 'country': x.get('full').split(':')[0].strip(),
- 'certificate': x.get('full').split(':')[1].strip(),
- 'note': x.get('note').strip(),
- 'full': x.get('full').strip(),
- }
- )
- ),
- Rule(
- key='advisories',
- extractor=Rules(
- foreach='//section[starts-with(@id, "advisory-")]',
- rules=[
- Rule(key='section',
- extractor=Path('./@id')
- ),
- Rule(key='items',
- extractor=Rules(
- foreach='.//li',
- rules=[
- Rule(
- key='item',
- extractor=Path('./text()')
- )
- ],
- transform=lambda x: x.get('item').strip()
- )
- )
- ]
- )
- )
- ]
- def postprocess_data(self, data):
- if 'advisories' in data:
- for advisory in data['advisories']:
- sect = advisory.get('section', '').replace('-', ' ')
- items = [x for x in advisory.get('items', []) if x]
- if sect and items:
- data[sect] = items
- del data['advisories']
- return data
- _OBJECTS = {
- 'movie_parser': ((DOMHTMLMovieParser,), None),
- 'full_credits_parser': ((DOMHTMLFullCreditsParser,), None),
- 'plot_parser': ((DOMHTMLPlotParser,), None),
- 'movie_awards_parser': ((DOMHTMLAwardsParser,), None),
- 'taglines_parser': ((DOMHTMLTaglinesParser,), None),
- 'keywords_parser': ((DOMHTMLKeywordsParser,), None),
- 'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None),
- 'goofs_parser': ((DOMHTMLGoofsParser,), None),
- 'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None),
- 'trivia_parser': ((DOMHTMLTriviaParser,), None),
- 'soundtrack_parser': ((DOMHTMLSoundtrackParser,), None),
- 'quotes_parser': ((DOMHTMLQuotesParser,), None),
- 'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
- 'ratings_parser': ((DOMHTMLRatingsParser,), None),
- 'criticrev_parser': ((DOMHTMLCriticReviewsParser,), {'kind': 'critic reviews'}),
- 'reviews_parser': ((DOMHTMLReviewsParser,), {'kind': 'reviews'}),
- 'externalsites_parser': ((DOMHTMLOfficialsitesParser,), None),
- 'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
- 'externalrev_parser': ((DOMHTMLOfficialsitesParser,), None),
- 'misclinks_parser': ((DOMHTMLOfficialsitesParser,), None),
- 'soundclips_parser': ((DOMHTMLOfficialsitesParser,), None),
- 'videoclips_parser': ((DOMHTMLOfficialsitesParser,), None),
- 'photosites_parser': ((DOMHTMLOfficialsitesParser,), None),
- 'connections_parser': ((DOMHTMLConnectionsParser,), None),
- 'tech_parser': ((DOMHTMLTechParser,), None),
- 'locations_parser': ((DOMHTMLLocationsParser,), None),
- 'news_parser': ((DOMHTMLNewsParser,), None),
- 'episodes_parser': ((DOMHTMLEpisodesParser,), None),
- 'season_episodes_parser': ((DOMHTMLSeasonEpisodesParser,), None),
- 'movie_faqs_parser': ((DOMHTMLFaqsParser,), None),
- 'airing_parser': ((DOMHTMLAiringParser,), None),
- 'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None)
- }