/imdb/parser/http/movieParser.py
Python | 2710 lines | 2632 code | 29 blank | 49 comment | 43 complexity | 77fef151a68ebaa7f58ab74d4998fc06 MD5 | raw file
Possible License(s): GPL-2.0
Large files are truncated, but you can click here to view the full file
- # -*- coding: utf-8 -*-
- # Copyright 2004-2021 Davide Alberani <da@erlug.linux.it>
- # 2008-2018 H. Turgut Uyar <uyar@tekir.org>
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- """
- This module provides the classes (and the instances) that are used to parse
- the IMDb pages on the www.imdb.com server about a movie.
- For example, for Brian De Palma's "The Untouchables", the referred pages
- would be:
- combined details
- http://www.imdb.com/title/tt0094226/reference
- plot summary
- http://www.imdb.com/title/tt0094226/plotsummary
- ...and so on.
- """
- from __future__ import absolute_import, division, print_function, unicode_literals
- import functools
- import re
- from imdb import PY2
- from imdb import imdbURL_base
- from imdb.Company import Company
- from imdb.Movie import Movie
- from imdb.Person import Person
- from imdb.utils import _Container, KIND_MAP
- from .piculet import Path, Rule, Rules, preprocessors, transformers, ElementTree
- from .utils import DOMParserBase, analyze_imdbid, build_person, build_movie
- if PY2:
- from urllib import unquote
- else:
- from urllib.parse import unquote
# Lookup table mapping raw section names (lower-cased, as found on the
# IMDb pages) to the canonical keys used in the parsed data.
_SECT_CONV = {
    'directed': 'director',
    'directed by': 'director',
    'directors': 'director',
    'editors': 'editor',
    'writing credits': 'writer',
    'writers': 'writer',
    'produced': 'producer',
    'cinematography': 'cinematographer',
    'film editing': 'editor',
    'casting': 'casting director',
    'costume design': 'costume designer',
    'makeup department': 'make up',
    'production management': 'production manager',
    'second unit director or assistant director': 'assistant director',
    'costume and wardrobe department': 'costume department',
    'costume departmen': 'costume department',
    'sound department': 'sound crew',
    'stunts': 'stunt performer',
    'other crew': 'miscellaneous crew',
    'also known as': 'akas',
    'country': 'countries',
    'runtime': 'runtimes',
    'language': 'languages',
    'certification': 'certificates',
    'genre': 'genres',
    'created': 'creator',
    'creators': 'creator',
    'color': 'color info',
    'plot': 'plot outline',
    'art director': 'art direction',
    'art directors': 'art direction',
    'composers': 'composer',
    'assistant directors': 'assistant director',
    'set decorator': 'set decoration',
    'set decorators': 'set decoration',
    'visual effects department': 'visual effects',
    'miscellaneous': 'miscellaneous crew',
    'make up department': 'make up',
    'plot summary': 'plot outline',
    'cinematographers': 'cinematographer',
    'camera department': 'camera and electrical department',
    'costume designers': 'costume designer',
    'production designer': 'production design',
    'production designers': 'production design',
    'production managers': 'production manager',
    'music original': 'original music',
    'casting directors': 'casting director',
    'other companies': 'miscellaneous companies',
    'producers': 'producer',
    'special effects by': 'special effects department',
}

# Matches any run of whitespace; used to collapse it to a single space.
re_space = re.compile(r'\s+')


def clean_section_name(section):
    """Normalize a section name and translate it to its canonical key.

    Underscores become spaces, the text is stripped and lower-cased,
    whitespace runs are collapsed and a trailing ' by' is dropped.  The
    result is then looked up in _SECT_CONV; names without an entry are
    returned in their normalized form.
    """
    normalized = section.replace('_', ' ').strip().lower()
    normalized = re_space.sub(' ', normalized)
    suffix = ' by'
    if normalized.endswith(suffix):
        normalized = normalized[:-len(suffix)]
    try:
        return _SECT_CONV[normalized]
    except KeyError:
        return normalized
def _manageRoles(mo):
    """Rewrite the content of a "character" table cell.

    Used as a substitution callback for _reRolesMovie: every role found
    in the cell (roles are separated by ' / ') is wrapped in a
    <div class="_imdbpyrole"> element carrying a 'roleid' attribute, so
    that roleIDs can be easily retrieved later.
    """
    opening, cell_body, closing = mo.group(1), mo.group(2), mo.group(3)
    wrapped = []
    for chunk in cell_body.split(' / '):
        role = chunk.strip()
        if not role:
            continue
        roleID = analyze_imdbid(role)
        # The attribute always ends with '/', even when no ID was found.
        roleID = '/' if roleID is None else roleID + '/'
        wrapped.append(
            '<div class="_imdbpyrole" roleid="%s">%s</div>' % (roleID, role)
        )
    return opening + ' / '.join(wrapped) + closing


_reRolesMovie = re.compile(r'(<td class="character">)(.*?)(</td>)', re.I | re.M | re.S)
def makeSplitter(lstrip=None, sep='|', comments=True,
                 origNotesSep=' (', newNotesSep='::(', strip=None):
    """Build and return a function that splits a string into a list.

    The returned function strips its input, optionally removes the
    leading 'lstrip' characters, splits on 'sep' and drops empty items.
    If 'comments' is true the first 'origNotesSep' of every item is
    replaced with 'newNotesSep'; if 'strip' is given those characters
    are stripped from both ends of every item.  Falsy inputs are
    returned unchanged, and whitespace-only input yields ''.
    """
    def splitter(x):
        if not x:
            return x
        text = x.strip()
        if not text:
            return text
        if lstrip is not None:
            text = text.lstrip(lstrip).lstrip()
        pieces = []
        for raw in text.split(sep):
            piece = raw.strip()
            if not piece:
                continue
            if comments:
                piece = piece.replace(origNotesSep, newNotesSep, 1)
            if strip:
                piece = piece.strip(strip)
            pieces.append(piece)
        return pieces

    return splitter
def _toInt(val, replace=()):
    """Return 'val' converted to an integer, or None if it can't be.

    'replace', if present, must be a sequence of (before, after) tuples;
    every 'before' substring is replaced with 'after' prior to the
    conversion.  Values that do not support substitution (e.g. None or
    numbers) skip that step instead of raising, so the documented
    "integer or None" contract holds for every input.
    """
    for before, after in replace:
        try:
            val = val.replace(before, after)
        except (AttributeError, TypeError):
            # Not a (compatible) string: nothing to substitute, let int()
            # decide whether the value is usable.
            break
    try:
        return int(val)
    except (TypeError, ValueError):
        return None
_re_og_title = re.compile(
    r'(.*) \((?:(?:(.+)(?= ))? ?(\d{4})(?:(–)(\d{4}| ))?|(.+))\)',
    re.UNICODE
)


def analyze_og_title(og_title):
    """Split the content of an og:title meta tag into its components.

    Returns a dictionary that may contain the keys 'title', 'year',
    'kind', 'series years' and 'tv series title'.  An empty or missing
    value yields an empty dictionary; a value that does not match the
    "Title (kind year)" layout is assumed to be a title in production
    and is returned as the bare 'title'.
    """
    if not og_title:
        return {}
    match = _re_og_title.match(og_title)
    if match is None:
        # assume it's a title in production, missing release date information
        return {'title': og_title}
    data = {'title': match.group(1)}
    if match.group(3):
        data['year'] = int(match.group(3))
    kind = match.group(2) or match.group(6)
    if kind is None:
        kind = 'movie'
    else:
        kind = kind.lower()
        kind = KIND_MAP.get(kind, kind)
    data['kind'] = kind
    year_separator = match.group(4)
    # There is a year separator so assume an ongoing or ended series
    if year_separator is not None:
        end_year = match.group(5)
        if end_year is not None:
            data['series years'] = '%(year)d-%(end_year)s' % {
                'year': data['year'],
                'end_year': end_year.strip(),
            }
        elif kind.endswith('series'):
            data['series years'] = '%(year)d-' % {'year': data['year']}
    # No year separator and series, so assume that it ended the same year
    elif kind.endswith('series') and 'year' in data:
        data['series years'] = '%(year)d-%(year)d' % {'year': data['year']}
    title = data['title']
    # Episode titles look like '"Series Title" Episode Title'.
    if kind == 'episode' and title.startswith('"'):
        quote_end = title.find('"', 1)
        # Only split when the closing quote is really there.
        if quote_end != -1:
            data['tv series title'] = title[1:quote_end]
            data['title'] = title[quote_end + 1:].strip()
    return data
# A certificate line has the form "Country:Rating"; compiled once at import.
_re_certificate = re.compile(r'^(.+):(.+)$', re.UNICODE)


def analyze_certificates(certificates):
    """Turn the text of the certificates cell into a list of strings.

    The text is split in lines; every non-empty line that looks like
    "Country:Rating" starts a new entry, while any other line is treated
    as a note and appended to the previous entry, separated by '::'.
    Lines that precede the first certificate are discarded.  Empty or
    missing input yields an empty list.
    """
    if not certificates:
        return []
    merged = []
    for line in certificates.split('\n'):
        line = line.strip()
        if not line:
            continue
        if _re_certificate.match(line):
            merged.append(line)
        elif merged:
            merged[-1] = u'{}::{}'.format(merged[-1], line)
    return merged
def clean_akas(aka):
    """Collapse whitespace in an aka; drop "See more" placeholder items.

    Returns the cleaned string, or '' when the text starts with
    'see more' (case-insensitive).
    """
    collapsed = re_space.sub(' ', aka).strip()
    return '' if collapsed.lower().startswith('see more') else collapsed
- class DOMHTMLMovieParser(DOMParserBase):
- """Parser for the "reference" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- mparser = DOMHTMLMovieParser()
- result = mparser.parse(reference_html_string)
- """
- _containsObjects = True
- rules = [
- Rule(
- key='title',
- extractor=Path('//meta[@property="og:title"]/@content',
- transform=analyze_og_title)
- ),
- Rule(
- key='original title',
- extractor=Path('//div[@class="titlereference-header"]//span[@class="titlereference-original-title-label"]/preceding-sibling::text()',
- transform=lambda x: re_space.sub(' ', x).strip())
- ),
- Rule(
- key='original title title-year',
- extractor=Path('//div[@class="titlereference-header"]//span[@class="titlereference-title-year"]/preceding-sibling::text()',
- transform=lambda x: re_space.sub(' ', x).strip())
- ),
- Rule(
- key='localized title',
- extractor=Path('//meta[@name="title"]/@content',
- transform=lambda x: analyze_og_title(x).get('title'))
- ),
- # parser for misc sections like 'casting department', 'stunts', ...
- Rule(
- key='misc sections',
- extractor=Rules(
- foreach='//h4[contains(@class, "ipl-header__content")]',
- rules=[
- Rule(
- key=Path('./@name', transform=clean_section_name),
- extractor=Rules(
- foreach='../../following-sibling::table[1]//tr',
- rules=[
- Rule(
- key='person',
- extractor=Path('.//text()')
- ),
- Rule(
- key='link',
- extractor=Path('./td[1]/a[@href]/@href')
- )
- ],
- transform=lambda x: build_person(
- x.get('person') or '',
- personID=analyze_imdbid(x.get('link'))
- )
- )
- )
- ]
- )
- ),
- Rule(
- key='cast',
- extractor=Rules(
- foreach='//table[@class="cast_list"]//tr',
- rules=[
- Rule(
- key='person',
- extractor=Path('.//text()')
- ),
- Rule(
- key='link',
- extractor=Path('./td[2]/a/@href')
- ),
- Rule(
- key='roleID',
- extractor=Path('./td[4]//div[@class="_imdbpyrole"]/@roleid')
- )
- ],
- transform=lambda x: build_person(
- x.get('person') or '',
- personID=analyze_imdbid(x.get('link')),
- roleID=(x.get('roleID') or '').split('/')
- )
- )
- ),
- Rule(
- key='recommendations',
- extractor=Rules(
- foreach='//div[contains(@class, "rec_item")]',
- rules=[
- Rule(
- key='movieID',
- extractor=Path(
- './@data-tconst',
- transform=lambda x: (x or '').replace('tt', '')
- )
- ),
- Rule(
- key='title',
- extractor=Path(
- './/a//img/@title',
- transform=lambda x: re_space.sub(' ', x or '').strip()
- )
- ),
- ],
- transform=lambda x: build_movie(x.get('title', ''), movieID=x.get('movieID'))
- )
- ),
- Rule(
- key='myrating',
- extractor=Path('//span[@id="voteuser"]//text()')
- ),
- Rule(
- key='plot summary',
- extractor=Path('//td[starts-with(text(), "Plot")]/..//p/text()',
- transform=lambda x: x.strip().rstrip('|').rstrip())
- ),
- Rule(
- key='genres',
- extractor=Path(
- foreach='//td[starts-with(text(), "Genre")]/..//li/a',
- path='./text()'
- )
- ),
- Rule(
- key='runtimes',
- extractor=Path(
- foreach='//td[starts-with(text(), "Runtime")]/..//li',
- path='./text()',
- transform=lambda x: x.strip().replace(' min', '')
- )
- ),
- Rule(
- key='countries',
- extractor=Path(
- foreach='//td[starts-with(text(), "Countr")]/..//li/a',
- path='./text()'
- )
- ),
- Rule(
- key='country codes',
- extractor=Path(
- foreach='//td[starts-with(text(), "Countr")]/..//li/a',
- path='./@href',
- transform=lambda x: x.split('/')[2].strip().lower()
- )
- ),
- Rule(
- key='language',
- extractor=Path(
- foreach='//td[starts-with(text(), "Language")]/..//li/a',
- path='./text()'
- )
- ),
- Rule(
- key='language codes',
- extractor=Path(
- foreach='//td[starts-with(text(), "Language")]/..//li/a',
- path='./@href',
- transform=lambda x: x.split('/')[2].strip()
- )
- ),
- Rule(
- key='color info',
- extractor=Path(
- foreach='//td[starts-with(text(), "Color")]/..//li/a',
- path='./text()',
- transform=lambda x: x.replace(' (', '::(')
- )
- ),
- Rule(
- key='aspect ratio',
- extractor=Path(
- '//td[starts-with(text(), "Aspect")]/..//li/text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='sound mix',
- extractor=Path(
- foreach='//td[starts-with(text(), "Sound Mix")]/..//li/a',
- path='./text()',
- transform=lambda x: x.replace(' (', '::(')
- )
- ),
- Rule(
- key='box office',
- extractor=Rules(
- foreach='//section[contains(@class, "titlereference-section-box-office")]'
- '//table[contains(@class, "titlereference-list")]//tr',
- rules=[
- Rule(
- key='box_office_title',
- extractor=Path('./td[1]/text()')
- ),
- Rule(
- key='box_office_detail',
- extractor=Path('./td[2]/text()')
- )
- ],
- transform=lambda x: (x['box_office_title'].strip(),
- x['box_office_detail'].strip())
- ),
- ),
- Rule(
- key='certificates',
- extractor=Path(
- '//td[starts-with(text(), "Certificat")]/..//text()',
- transform=analyze_certificates
- )
- ),
- # Collects akas not enclosed in <i> tags.
- Rule(
- key='other akas',
- extractor=Path(
- foreach='//section[contains(@class, "listo")]//td[starts-with(text(), "Also Known As")]/..//ul/li',
- path='.//text()',
- transform=clean_akas
- )
- ),
- Rule(
- key='creator',
- extractor=Rules(
- foreach='//div[starts-with(normalize-space(text()), "Creator")]/ul/li[1]/a',
- rules=[
- Rule(
- key='name',
- extractor=Path('./text()')
- ),
- Rule(
- key='link',
- extractor=Path('./@href')
- )
- ],
- transform=lambda x: build_person(
- x.get('name') or '',
- personID=analyze_imdbid(x.get('link'))
- )
- )
- ),
- Rule(
- key='thin writer',
- extractor=Rules(
- foreach='//div[starts-with(normalize-space(text()), "Writer")]/ul/li[1]/a',
- rules=[
- Rule(
- key='name',
- extractor=Path('./text()')
- ),
- Rule(
- key='link',
- extractor=Path('./@href')
- )
- ],
- transform=lambda x: build_person(
- x.get('name') or '',
- personID=analyze_imdbid(x.get('link'))
- )
- )
- ),
- Rule(
- key='thin director',
- extractor=Rules(
- foreach='//div[starts-with(normalize-space(text()), "Director")]/ul/li[1]/a',
- rules=[
- Rule(
- key='name',
- extractor=Path('./text()')
- ),
- Rule(
- key='link',
- extractor=Path('./@href')
- )
- ],
- transform=lambda x: build_person(
- x.get('name') or '',
- personID=analyze_imdbid(x.get('link'))
- )
- )
- ),
- Rule(
- key='top/bottom rank',
- extractor=Path(
- '//li[@class="ipl-inline-list__item"]//a[starts-with(@href, "/chart/")]/text()'
- )
- ),
- Rule(
- key='original air date',
- extractor=Path('//span[@imdbpy="airdate"]/text()')
- ),
- Rule(
- key='series years',
- extractor=Path(
- '//div[@id="tn15title"]//span[starts-with(text(), "TV series")]/text()',
- transform=lambda x: x.replace('TV series', '').strip()
- )
- ),
- Rule(
- key='season/episode',
- extractor=Path(
- '//div[@class="titlereference-overview-season-episode-section"]/ul//text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='number of episodes',
- extractor=Path(
- '//a[starts-with(text(), "All Episodes")]/text()',
- transform=lambda x: int(x.replace('All Episodes', '').strip()[1:-1])
- )
- ),
- Rule(
- key='episode number',
- extractor=Path(
- '//div[@id="tn15epnav"]/text()',
- transform=lambda x: int(re.sub(r'[^a-z0-9 ]', '',
- x.lower()).strip().split()[0]))
- ),
- Rule(
- key='previous episode',
- extractor=Path(
- '//span[@class="titlereference-overview-episodes-links"]'
- '//a[contains(text(), "Previous")]/@href',
- transform=analyze_imdbid
- )
- ),
- Rule(
- key='next episode',
- extractor=Path(
- '//span[@class="titlereference-overview-episodes-links"]'
- '//a[contains(text(), "Next")]/@href',
- transform=analyze_imdbid
- )
- ),
- Rule(
- key='number of seasons',
- extractor=Path(
- '//span[@class="titlereference-overview-years-links"]/../a[1]/text()',
- transform=int
- )
- ),
- Rule(
- key='tv series link',
- extractor=Path('//a[starts-with(text(), "All Episodes")]/@href')
- ),
- Rule(
- key='akas',
- extractor=Path(
- foreach='//i[@class="transl"]',
- path='./text()',
- transform=lambda x: x
- .replace(' ', ' ')
- .rstrip('-')
- .replace('" - ', '"::', 1)
- .strip('"')
- .replace(' ', ' ')
- )
- ),
- Rule(
- key='production status',
- extractor=Path(
- '//td[starts-with(text(), "Status:")]/..//div[@class="info-content"]//text()',
- transform=lambda x: x.strip().split('|')[0].strip().lower()
- )
- ),
- Rule(
- key='production status updated',
- extractor=Path(
- '//td[starts-with(text(), "Status Updated:")]/'
- '..//div[@class="info-content"]//text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='production comments',
- extractor=Path(
- '//td[starts-with(text(), "Comments:")]/'
- '..//div[@class="info-content"]//text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='production note',
- extractor=Path(
- '//td[starts-with(text(), "Note:")]/'
- '..//div[@class="info-content"]//text()',
- transform=transformers.strip
- )
- ),
- Rule(
- key='companies',
- extractor=Rules(
- foreach="//ul[@class='simpleList']",
- rules=[
- Rule(
- key=Path('preceding-sibling::header[1]/div/h4/text()', transform=transformers.lower),
- extractor=Rules(
- foreach='./li',
- rules=[
- Rule(
- key='name',
- extractor=Path('./a//text()')
- ),
- Rule(
- key='comp-link',
- extractor=Path('./a/@href')
- ),
- Rule(
- key='notes',
- extractor=Path('./text()')
- )
- ],
- transform=lambda x: Company(
- name=x.get('name') or '',
- accessSystem='http',
- companyID=analyze_imdbid(x.get('comp-link')),
- notes=(x.get('notes') or '').strip()
- )
- )
- )
- ]
- )
- ),
- Rule(
- key='rating',
- extractor=Path('(//span[@class="ipl-rating-star__rating"])[1]/text()')
- ),
- Rule(
- key='votes',
- extractor=Path('//span[@class="ipl-rating-star__total-votes"][1]/text()')
- ),
- Rule(
- key='cover url',
- extractor=Path('//img[@alt="Poster"]/@src')
- ),
- Rule(
- key='imdbID',
- extractor=Path('//meta[@property="pageId"]/@content',
- transform=lambda x: (x or '').replace('tt', ''))
- )
- ]
- preprocessors = [
- ('/releaseinfo">', '"><span imdbpy="airdate">'),
- (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I), r'</div><div>\1'),
- ('<small>Full cast and crew for<br>', ''),
- ('<td> </td>', '<td>...</td>'),
- (re.compile(r'<span class="tv-extra">TV mini-series(\s+.*?)</span>', re.I),
- r'<span class="tv-extra">TV series\1</span> (mini)'),
- (_reRolesMovie, _manageRoles)
- ]
def preprocess_dom(self, dom):
    """Tweak the DOM before the rules are applied, and return it.

    Section anchors that follow a "Series Crew" header get a 'series '
    prefix in their 'name' attribute; IMDbPro links, some 'more' links
    and the "rest of list" row of the cast table are removed.
    """
    # Handle series information.
    series_headers = self.xpath(dom, "//b[text()='Series Crew']")
    if series_headers:
        last_header = series_headers[-1]  # In doubt, take the last one.
        for anchor in self.xpath(last_header, "./following::h5/a[@class='glossary']"):
            section = anchor.get('name')
            if section:
                anchor.set('name', 'series %s' % section)
    unwanted_nodes = (
        # Links to IMDbPro.
        '//span[@class="pro-link"]',
        # Some 'more' links (others, like the one around the number of
        # votes, are kept).
        '//a[@class="tn15more"][starts-with(@href, "/title/")]',
        # The "rest of list" in cast.
        '//td[@colspan="4"]/..',
    )
    for path in unwanted_nodes:
        preprocessors.remove(dom, path)
    return dom
- re_space = re.compile(r'\s+')
- re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)
def postprocess_data(self, data):
    """Normalize the dictionary extracted by the rules and return it.

    The dictionary is modified in place: section names are converted
    through _SECT_CONV, helper keys produced by the rules ('misc
    sections', 'other akas', 'season/episode', 'thin writer', ...) are
    merged into their final keys and removed, and some values are
    converted to numbers or to Movie/Person/Company objects.
    """
    # Convert section names.
    for sect in list(data.keys()):
        if sect in _SECT_CONV:
            data[_SECT_CONV[sect]] = data[sect]
            del data[sect]
    # Filter out fake values.
    for key in data:
        value = data[key]
        if isinstance(value, list) and value:
            if isinstance(value[0], Person):
                data[key] = [x for x in value if x.personID is not None]
            if isinstance(value[0], _Container):
                for obj in data[key]:
                    obj.accessSystem = self._as
                    obj.modFunct = self._modFunct
    # The 'title' rule returns a dictionary: merge it at the top level.
    for key in ['title']:
        if (key in data) and isinstance(data[key], dict):
            subdata = data[key]
            del data[key]
            data.update(subdata)
    # Fall back on the "title-year" variant when no original title exists;
    # in any case the helper key is dropped.
    if not data.get('original title'):
        if 'original title title-year' in data:
            data['original title'] = data['original title title-year']
            del data['original title title-year']
    elif 'original title title-year' in data:
        del data['original title title-year']
    misc_sections = data.get('misc sections')
    if misc_sections is not None:
        for section in misc_sections:
            # skip sections with their own parsers
            if 'cast' in section.keys():
                continue
            data.update(section)
        del data['misc sections']
    if 'akas' in data or 'other akas' in data:
        akas = data.get('akas') or []
        other_akas = data.get('other akas') or []
        akas += other_akas
        nakas = []
        for aka in akas:
            aka = aka.strip()
            if not aka:
                continue
            if aka.endswith('" -'):
                aka = aka[:-3].rstrip()
            nakas.append(aka)
        if 'akas' in data:
            del data['akas']
        if 'other akas' in data:
            del data['other akas']
        if nakas:
            data['akas'] = nakas
    if 'runtimes' in data:
        data['runtimes'] = [x.replace(' min', '')
                            for x in data['runtimes']]
    if 'number of seasons' in data:
        data['seasons'] = [str(i) for i in range(1, data['number of seasons'] + 1)]
    if 'season/episode' in data:
        tokens = data['season/episode'].split('Episode')
        # Either part may be missing or non-numeric; only those failures
        # are mapped to 'unknown', anything else is a real error.
        try:
            data['season'] = int(tokens[0].split('Season')[1])
        except (IndexError, ValueError):
            data['season'] = 'unknown'
        try:
            data['episode'] = int(tokens[1])
        except (IndexError, ValueError):
            data['episode'] = 'unknown'
        del data['season/episode']
    # The "thin" versions are used only when the full section is absent.
    for k in ('writer', 'director'):
        t_k = 'thin %s' % k
        if t_k not in data:
            continue
        if k not in data:
            data[k] = data[t_k]
        del data[t_k]
    if 'top/bottom rank' in data:
        tbVal = data['top/bottom rank'].lower()
        if tbVal.startswith('top'):
            tbKey = 'top 250 rank'
            tbVal = _toInt(tbVal, [('top rated movies: #', '')])
        else:
            tbKey = 'bottom 100 rank'
            tbVal = _toInt(tbVal, [('bottom rated movies: #', '')])
        if tbVal:
            data[tbKey] = tbVal
        del data['top/bottom rank']
    if 'year' in data and data['year'] == '????':
        del data['year']
    if 'tv series link' in data:
        if 'tv series title' in data:
            data['episode of'] = Movie(title=data['tv series title'],
                                       movieID=analyze_imdbid(data['tv series link']),
                                       accessSystem=self._as,
                                       modFunct=self._modFunct)
            data['episode of']['kind'] = 'tv series'
            del data['tv series title']
        del data['tv series link']
    if 'rating' in data:
        try:
            data['rating'] = float(data['rating'].replace('/10', ''))
        except (TypeError, ValueError):
            pass
        if data['rating'] == 0:
            del data['rating']
    if 'votes' in data:
        try:
            votes = data['votes'].replace('(', '').replace(')', '').replace(',', '').replace('votes', '')
            data['votes'] = int(votes)
        except (TypeError, ValueError):
            pass
    companies = data.get('companies')
    if companies:
        for section in companies:
            for key, value in section.items():
                # Avoid clobbering an existing key with the same name.
                if key in data:
                    key = '%s companies' % key
                data[key] = value
        del data['companies']
    if 'box office' in data:
        data['box office'] = dict(data['box office'])
    return data
def _process_plotsummary(x):
    """Process a plot (contributed by Rdian06).

    'x' is a dictionary with an optional 'plot' and an optional 'author';
    the result is the stripped plot, followed by '::author' when an
    author is present.  A missing or None plot is treated as ''.
    """
    xauthor = x.get('author')
    xplot = (x.get('plot') or '').strip()
    if xauthor:
        xplot += '::%s' % xauthor
    return xplot
class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a 'plot' key, containing a list
    of string with the structure: 'summary::summary_author <author@email>'.

    Example::

        pparser = DOMHTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    _defGetRefs = True

    def synopsis_reducer(nodes):
        """Join the text nodes found in 'nodes' with blank lines.

        Called as a plain function (no 'self') by the 'synopsis' rule
        below; nodes that are not text results are ignored.
        """
        ret = [n for n in nodes
               if isinstance(n, ElementTree._ElementUnicodeResult)]
        return '\n\n'.join(ret)

    # Notice that recently IMDb started to put the email of the
    # author only in the link, that we're not collecting, here.
    rules = [
        Rule(
            key='plot',
            extractor=Rules(
                foreach='//ul[@id="plot-summaries-content"]/li',
                rules=[
                    Rule(
                        key='plot',
                        extractor=Path('./p//text()')
                    ),
                    Rule(
                        key='author',
                        extractor=Path('.//div[@class="author-container"]//a/text()')
                    )
                ],
                transform=_process_plotsummary
            )
        ),
        Rule(
            key='synopsis',
            extractor=Path(
                foreach='//ul[@id="plot-synopsis-content"]',
                path='.//li//node()',
                reduce=synopsis_reducer
            )
        )
    ]

    def preprocess_dom(self, dom):
        """Remove the "no summary" placeholder item and return the DOM."""
        preprocessors.remove(dom, '//li[@id="no-summary-content"]')
        return dom

    def postprocess_data(self, data):
        """Drop the synopsis when it is only IMDb's placeholder text."""
        synopsis = data.get('synopsis')
        # An empty synopsis is left alone instead of being indexed.
        if synopsis and synopsis[0] and 'a Synopsis for this title' in synopsis[0]:
            del data['synopsis']
        return data
def _process_award(x):
    """Build an award dictionary from the fields of a table row.

    'x' is the dictionary extracted for one row; rows without a
    non-empty 'award' produce an empty dictionary.  Otherwise the result
    contains 'award', 'year' (an int when numeric), 'result' and
    'anchor', plus 'category', 'with' and 'notes' when available.
    Missing text fields are treated as empty strings.
    """
    _award = (x.get('award') or '').strip()
    if not _award:
        return {}
    award = {'award': _award}
    award['year'] = (x.get('year') or '').strip()
    if award['year'] and award['year'].isdigit():
        award['year'] = int(award['year'])
    award['result'] = (x.get('result') or '').strip()
    category = (x.get('category') or '').strip()
    if category:
        award['category'] = category
    received_with = x.get('with')
    if received_with is not None:
        award['with'] = received_with.strip()
    notes = x.get('notes')
    if notes is not None:
        # Keep only the first line of the notes, with whitespace collapsed.
        notes = notes.strip().split('\n', 2)[0]
        notes = re_space.sub(' ', notes)
        if notes:
            award['notes'] = notes
    award['anchor'] = x.get('anchor')
    return award
class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        awparser = DOMHTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    # Either 'title' or 'name': decides whether the recipients of an
    # award are built as Person or as Movie objects.
    subject = 'title'
    _containsObjects = True

    rules = [
        Rule(
            key='awards',
            extractor=Rules(
                foreach='//*[@id="main"]/div[1]/div/table//tr',
                rules=[
                    Rule(
                        key='year',
                        extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/a/text())')
                    ),
                    Rule(
                        key='result',
                        extractor=Path('./td[1]/b/text()')
                    ),
                    Rule(
                        key='award',
                        extractor=Path('./td[1]/span/text()')
                    ),
                    Rule(
                        key='category',
                        extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/text())')
                    ),
                    Rule(
                        key='notes',
                        extractor=Path('./td[2]/text()')
                    ),
                    Rule(
                        key='anchor',
                        extractor=Path('.//text()')
                    )
                ],
                transform=_process_award
            )
        ),
        Rule(
            key='recipients',
            extractor=Rules(
                foreach='//*[@id="main"]/div[1]/div/table//tr/td[2]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    ),
                    Rule(
                        key='anchor',
                        extractor=Path('./ancestor::tr//text()')
                    )
                ]
            )
        )
    ]

    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
         r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
         r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
    ]

    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span - 1]:
                # if not cloned, child will be moved to new parent
                clone = self.clone(col)
                tr.insert(position, clone)
        return dom

    def postprocess_data(self, data):
        """Attach the recipients to every award.

        Recipients are matched to awards through the text of the table
        row they both come from (the 'anchor'), which is removed from
        the award afterwards.  Returns {'awards': [...]}, or an empty
        dictionary when nothing was parsed.
        """
        if not data:
            return {}
        all_recipients = data.get('recipients', [])
        nd = []
        # 'awards' may be absent when only recipient links were found.
        for award in data.get('awards', []):
            matches = [p for p in all_recipients
                       if 'nm' in (p.get('link') or '')
                       and award.get('anchor') == p.get('anchor')]
            if self.subject == 'title':
                recipients = [
                    Person(name=recipient['name'],
                           personID=analyze_imdbid(recipient['link']))
                    for recipient in matches
                ]
                award['to'] = recipients
            elif self.subject == 'name':
                recipients = [
                    Movie(title=recipient['name'],
                          movieID=analyze_imdbid(recipient['link']))
                    for recipient in matches
                ]
                award['for'] = recipients
            nd.append(award)
            if 'anchor' in award:
                del award['anchor']
        return {'awards': nd}
class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page is expected as a string, as served by www.imdb.com;
    the parsed result is a dictionary with a 'taglines' key.

    Example::

        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    rules = [
        Rule(
            key='taglines',
            extractor=Path(
                foreach='//div[@id="taglines_content"]/div',
                path='.//text()'
            )
        )
    ]

    def preprocess_dom(self, dom):
        """Remove the header and "no content" divs; return the DOM."""
        for path in ('//div[@id="taglines_content"]/div[@class="header"]',
                     '//div[@id="taglines_content"]/div[@id="no_content"]'):
            preprocessors.remove(dom, path)
        return dom

    def postprocess_data(self, data):
        """Strip surrounding whitespace from every tagline."""
        try:
            raw_taglines = data['taglines']
        except KeyError:
            return data
        data['taglines'] = [text.strip() for text in raw_taglines]
        return data
class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    rules = [
        Rule(
            key='keywords',
            extractor=Path(
                foreach='//td[@data-item-keyword]',
                path='./@data-item-keyword',
                transform=lambda x: x.lower().replace(' ', '-')
            )
        ),
        Rule(
            key='relevant keywords',
            extractor=Rules(
                foreach='//td[@data-item-keyword]',
                rules=[
                    Rule(
                        key='keyword',
                        extractor=Path('./@data-item-keyword')
                    ),
                    Rule(
                        key='ordering',
                        extractor=Path('./@data-item-votes')
                    ),
                    Rule(
                        key='vote_str',
                        extractor=Path('./div[2]/div//text()')
                    )
                ],
                # Missing fields are treated as empty strings, so that a
                # cell without the votes text doesn't abort the parsing.
                transform=lambda x: {
                    'keyword': (x.get('keyword') or '').lower(),
                    'keyword_dash': (x.get('keyword') or '').lower().replace(' ', '-'),
                    'ordering': x.get('ordering'),
                    'votes_str': (x.get('vote_str') or '').strip().lower()
                }
            )
        )
    ]

    def postprocess_data(self, data):
        """Add 'votes_for' and 'total_votes' to every relevant keyword.

        The values are taken from a 'votes_str' like "X of Y ..." and
        kept as strings; keywords nobody voted on, or whose votes text
        has an unexpected layout, get 0 for both.
        """
        if 'relevant keywords' in data:
            rk = []
            for x in data['relevant keywords']:
                if 'votes_str' in x:
                    parts = x['votes_str'].split('of')
                    if 'is this relevant?' in x['votes_str'] or len(parts) < 2:
                        x['votes_for'] = 0
                        x['total_votes'] = 0
                    else:
                        x['votes_for'] = parts[0].strip()
                        x['total_votes'] = re.sub(r"\D", "", parts[1]).strip()
                rk.append(x)
            data['relevant keywords'] = rk
        return data
class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" page of a given movie.
    The page is expected as a string, as served by www.imdb.com;
    the parsed result is a dictionary with an 'alternate versions' key.

    Example::

        avparser = DOMHTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    _defGetRefs = True

    # One entry for every item of the "trivia" list, stripped.
    rules = [
        Rule(
            key='alternate versions',
            extractor=Path(
                foreach='//ul[@class="trivia"]/li',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]
class DOMHTMLTriviaParser(DOMParserBase):
    """Parser for the "trivia" page of a given movie.
    The page is expected as a string, as served by www.imdb.com;
    the parsed result is a dictionary with a 'trivia' key.

    Example::

        tparser = DOMHTMLTriviaParser()
        result = tparser.parse(trivia_html_string)
    """
    _defGetRefs = True

    # One entry for every "sodatext" div, stripped.
    rules = [
        Rule(
            key='trivia',
            extractor=Path(
                foreach='//div[@class="sodatext"]',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]

    def preprocess_dom(self, dom):
        """Strip the "link this quote" spans out of the DOM and return it."""
        link_spans = '//span[@class="linksoda"]'
        preprocessors.remove(dom, link_spans)
        return dom
class DOMHTMLSoundtrackParser(DOMParserBase):
    """Parser for the "soundtrack" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        stparser = DOMHTMLSoundtrackParser()
        result = stparser.parse(soundtrack_html_string)
    """
    _defGetRefs = True

    # Line breaks separate the title from the credits of a track.
    preprocessors = [('<br />', '\n'), ('<br>', '\n')]

    rules = [
        Rule(
            key='soundtrack',
            extractor=Path(
                foreach='//div[@class="list"]//div',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]

    def postprocess_data(self, data):
        """Convert every soundtrack entry to a {title: {kind: info}} dict.

        The first line of an entry is the title (surrounding double
        quotes are removed); the following lines are credits such as
        "Written by X", stored as {'written by': 'X'}.  Lines without a
        known separator are joined to the previous credit.  Empty
        entries are skipped.
        """
        if 'soundtrack' not in data:
            return data
        tracks = []
        for entry in data['soundtrack']:
            if not entry:
                # Nothing to use as a title.
                continue
            lines = entry.split('\n')
            title = lines[0]
            if title.startswith('"') and title.endswith('"'):
                title = title[1:-1]
            # Group continuation lines with the credit they belong to.
            credits = []
            for line in lines[1:]:
                starts_credit = (' with ' in line or ' by ' in line
                                 or ' from ' in line or ' of ' in line
                                 or line.startswith('From '))
                if starts_credit or not credits:
                    credits.append(line)
                else:
                    credits[-1] += line
            details = {}
            for line in credits:
                if line.startswith('From '):
                    details['from'] = line[len('From '):].lstrip()
                    continue
                for sep in (' with ', ' by ', ' from ', ' of '):
                    idx = line.find(sep)
                    if idx != -1:
                        end = idx + len(sep)
                        # The text up to the separator (included) is the
                        # kind of credit, the rest is its value.
                        details[line[:end].rstrip().lower()] = line[end:].lstrip()
                        break
            tracks.append({title: details})
        data['soundtrack'] = tracks
        return data
class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.

    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    The only rule defined here collects, under the 'crazy credits' key,
    the text of every ``<tt>`` element found in a list item.

    Example::

        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='crazy credits',
            extractor=Path(
                foreach='//ul/li/tt',
                path='.//text()',
                # Newlines become spaces; the double spaces this creates
                # are then collapsed (replacing a single space with a
                # single space, as before, had no effect).
                transform=lambda x: x.replace('\n', ' ').replace('  ', ' ')
            )
        )
    ]
- def _process_goof(x):
- text = (x.get('text') or '').strip()
- category = (x.get('category') or 'Goof').strip()
- return {"category": category, "text": text}
class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.

    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    For every ``<div>`` whose class contains "soda sodavote", the text
    of its "sodatext" child and the text of the closest preceding
    ``<h4>`` sibling (the category) are extracted and passed through
    ``_process_goof``.

    Example::

        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(key='goofs',
             extractor=Rules(
                 foreach='//div[contains(@class, "soda sodavote")]',
                 rules=[
                     Rule(key='text',
                          extractor=Path('./div[@class="sodatext"]/text()')),
                     Rule(key='category',
                          extractor=Path('./preceding-sibling::h4[1]/text()'))
                 ],
                 transform=_process_goof))
    ]
class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.

    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(key='quotes',
             extractor=Path(
                 foreach='//div[@class="sodatext"]',
                 path='.//text()',
                 # Mark the boundaries between the lines of a quote with
                 # '::' (split later, in postprocess_data) and flatten
                 # the remaining newlines into spaces.
                 transform=lambda x: (x.strip()
                                      .replace(' \n', '::')
                                      .replace('::\n', '::')
                                      .replace('\n', ' '))))
    ]

    def preprocess_dom(self, dom):
        """Strip the "did-you-know-actions" ``<div>`` elements from *dom*.

        The same (modified) *dom* object is returned.
        """
        preprocessors.remove(dom, '//div[@class="did-you-know-actions"]')
        return dom

    def postprocess_data(self, data):
        """Split every extracted quote into its list of lines.

        Returns a new dictionary holding only the 'quotes' key, whose
        value is a list of lists of strings (each quote split on '::'),
        or an empty dictionary when no quote was extracted.
        """
        raw_quotes = data.get('quotes', [])
        if raw_quotes:
            return {'quotes': [quote.split('::') for quote in raw_quotes]}
        return {}
- class DOMHTMLReleaseinfoParser(DOMParserBase):
- """Parser for the "release dates" page of a given movie.
- The page should be provided as a string, as taken from
- the www.imdb.com server. The final result will be a
- dictionary, with a key for every relevant section.
- Example::
- rdparser = DOMHTMLReleaseinfoParser()
- result = rdparser.parse(releaseinfo_html_string)
- """
- rules = [
- Rule(
- key='release dates',
- extractor=Rules(
- foreach='//table[contains(@class, "release-dates-table-test-only")]//tr',
- rules=[
- Rule(
- key='country',
- extractor=Path('.//td[1]//text()')
- ),
- Rule(
- key='country_code',
- extractor=Path('.//td[1]/a/@href')
- ),
- Rule(
- key='date',
- extractor=Path('.//td[2]//text()')
- ),
- Rule(
- key='notes',
- extractor=Path('.//td[3]//text()')
- )
- ]
- )
- ),
- Rule(
- key='akas',
- extractor=Rules(
- foreach='//table[contains(@class, "akas-table-test-only")]//tr',
- rules=[
- Rule(
- key='countries',
- extractor=Path('./td[1]/text()')
- ),
- Rule(
- …
Large files are truncated, but you can click here to view the full file