movieParser.py - Copyright 2004-2021 Davide Alberani <da@er…

/imdb/parser/http/movieParser.py

http://github.com/alberanid/imdbpy · Python · 2710 lines · 2504 code · 132 blank · 74 comment · 295 complexity · 77fef151a68ebaa7f58ab74d4998fc06 MD5 · raw file
Large files are truncated click here to view the full file

# -*- coding: utf-8 -*-

# Copyright 2004-2021 Davide Alberani <da@erlug.linux.it>
#           2008-2018 H. Turgut Uyar <uyar@tekir.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""
This module provides the classes (and the instances) that are used to parse
the IMDb pages on the www.imdb.com server about a movie.

For example, for Brian De Palma's "The Untouchables", the referred pages
would be:

combined details
    http://www.imdb.com/title/tt0094226/reference

plot summary
    http://www.imdb.com/title/tt0094226/plotsummary

...and so on.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import functools
import re

from imdb import PY2
from imdb import imdbURL_base
from imdb.Company import Company
from imdb.Movie import Movie
from imdb.Person import Person
from imdb.utils import _Container, KIND_MAP

from .piculet import Path, Rule, Rules, preprocessors, transformers, ElementTree
from .utils import DOMParserBase, analyze_imdbid, build_person, build_movie

if PY2:
    from urllib import unquote
else:
    from urllib.parse import unquote

# Dictionary used to convert some section's names.
# Keys are lower-case section names as found on the pages; values are the
# canonical keys used in the parsed data.  The table is consulted by
# clean_section_name() and, key by key, by
# DOMHTMLMovieParser.postprocess_data().
# Note that clean_section_name() removes a trailing ' by' before the lookup,
# so keys ending in ' by' can only match through the direct key lookup done
# in postprocess_data().
_SECT_CONV = {
    'directed': 'director',
    'directed by': 'director',
    'directors': 'director',
    'editors': 'editor',
    'writing credits': 'writer',
    'writers': 'writer',
    'produced': 'producer',
    'cinematography': 'cinematographer',
    'film editing': 'editor',
    'casting': 'casting director',
    'costume design': 'costume designer',
    'makeup department': 'make up',
    'production management': 'production manager',
    'second unit director or assistant director': 'assistant director',
    'costume and wardrobe department': 'costume department',
    # NOTE(review): looks like a truncated heading handled on purpose --
    # confirm before "fixing" the spelling of this key.
    'costume departmen': 'costume department',
    'sound department': 'sound crew',
    'stunts': 'stunt performer',
    'other crew': 'miscellaneous crew',
    'also known as': 'akas',
    'country': 'countries',
    'runtime': 'runtimes',
    'language': 'languages',
    'certification': 'certificates',
    'genre': 'genres',
    'created': 'creator',
    'creators': 'creator',
    'color': 'color info',
    'plot': 'plot outline',
    'art director': 'art direction',
    'art directors': 'art direction',
    'composers': 'composer',
    'assistant directors': 'assistant director',
    'set decorator': 'set decoration',
    'set decorators': 'set decoration',
    'visual effects department': 'visual effects',
    'miscellaneous': 'miscellaneous crew',
    'make up department': 'make up',
    'plot summary': 'plot outline',
    'cinematographers': 'cinematographer',
    'camera department': 'camera and electrical department',
    'costume designers': 'costume designer',
    'production designer': 'production design',
    'production designers': 'production design',
    'production managers': 'production manager',
    'music original': 'original music',
    'casting directors': 'casting director',
    'other companies': 'miscellaneous companies',
    'producers': 'producer',
    'special effects by': 'special effects department',
}

re_space = re.compile(r'\s+')


def clean_section_name(section):
    """Normalize a raw section name and translate it to its canonical key.

    Underscores become spaces, the text is stripped and lower-cased, runs
    of whitespace are collapsed and a trailing ' by' is dropped; the result
    is then looked up in _SECT_CONV and returned unchanged when the table
    has no entry for it.
    """
    normalized = section.replace('_', ' ').strip().lower()
    normalized = re_space.sub(' ', normalized)
    suffix = ' by'
    if normalized.endswith(suffix):
        normalized = normalized[:-len(suffix)]
    if normalized in _SECT_CONV:
        return _SECT_CONV[normalized]
    return normalized

def _manageRoles(mo):
    """Rewrite a matched "character" cell so that roleIDs are easy to fetch.

    *mo* is a match object with three groups: the opening tag, the cell
    content and the closing tag.  Every non-empty ' / '-separated role of
    the content is wrapped in a <div class="_imdbpyrole"> element carrying
    a ``roleid`` attribute: the ID returned by analyze_imdbid() followed by
    '/', or just '/' when no ID is found.
    """
    opening, content, closing = mo.group(1), mo.group(2), mo.group(3)
    wrapped = []
    for chunk in content.split(' / '):
        text = chunk.strip()
        if not text:
            continue
        found = analyze_imdbid(text)
        role_id = '/' if found is None else found + '/'
        wrapped.append(
            '<div class="_imdbpyrole" roleid="%s">%s</div>' % (role_id, text)
        )
    return opening + ' / '.join(wrapped) + closing


# Matches a whole <td class="character">...</td> cell, capturing the opening
# tag, the (possibly multi-line) content and the closing tag as three groups;
# these are the groups consumed by _manageRoles().
_reRolesMovie = re.compile(r'(<td class="character">)(.*?)(</td>)', re.I | re.M | re.S)


def makeSplitter(lstrip=None, sep='|', comments=True,
                 origNotesSep=' (', newNotesSep='::(', strip=None):
    """Build and return a function that splits a string into a list of items.

    The returned function hands back falsy or whitespace-only input as is
    (whitespace-only input comes back as an empty string).  Otherwise it
    optionally left-strips the *lstrip* characters, splits on *sep*, drops
    empty items, replaces the first *origNotesSep* of every item with
    *newNotesSep* when *comments* is true, and finally strips the *strip*
    characters from every item when *strip* is set.
    """

    def splitter(x):
        # Falsy values (None, '') are returned untouched.
        text = x.strip() if x else x
        if not text:
            return text
        if lstrip is not None:
            text = text.lstrip(lstrip).lstrip()
        items = []
        for raw in text.split(sep):
            item = raw.strip()
            if not item:
                continue
            if comments:
                item = item.replace(origNotesSep, newNotesSep, 1)
            if strip:
                item = item.strip(strip)
            items.append(item)
        return items

    return splitter


def _toInt(val, replace=()):
    """Return the value, converted to integer, or None; if present, 'replace'
    must be a list of tuples of values to replace."""
    for before, after in replace:
        val = val.replace(before, after)
    try:
        return int(val)
    except (TypeError, ValueError):
        return None


# Pattern for titles shaped like 'Title (Kind 2000–2010)'.  Groups, as used
# by analyze_og_title():
#   1: the title, everything before the last ' (...)' part;
#   2: optional text preceding the year (the kind);
#   3: the four-digit year;
#   4: the separator that follows the year, if any;
#   5: the end year (or a blank) after the separator;
#   6: the whole parenthesized text, when it contains no year.
_re_og_title = re.compile(
    r'(.*) \((?:(?:(.+)(?= ))? ?(\d{4})(?:(–)(\d{4}| ))?|(.+))\)',
    re.UNICODE
)


def analyze_og_title(og_title):
    """Split a title string like 'Title (Kind 2000–2010)' into its parts.

    Returns a dictionary that may contain the keys 'title', 'year' (int),
    'kind', 'series years' and, for episodes whose title starts with a
    quoted series name, 'tv series title'.  A string that does not match
    the expected pattern is returned as the sole 'title' key; an empty or
    missing value gives an empty dictionary.
    """
    if not og_title:
        # Nothing to analyze: matching would fail (or raise on None).
        return {}
    match = _re_og_title.match(og_title)
    if not match:
        # assume it's a title in production, missing release date information
        return {'title': og_title}
    data = {'title': match.group(1)}
    if match.group(3):
        data['year'] = int(match.group(3))
    kind = match.group(2) or match.group(6)
    if kind is None:
        kind = 'movie'
    else:
        kind = kind.lower()
        kind = KIND_MAP.get(kind, kind)
    data['kind'] = kind
    year_separator = match.group(4)
    # There is a year separator so assume an ongoing or ended series
    if year_separator is not None:
        end_year = match.group(5)
        if end_year is not None:
            data['series years'] = '%(year)d-%(end_year)s' % {
                'year': data['year'],
                'end_year': end_year.strip(),
            }
        elif kind.endswith('series'):
            data['series years'] = '%(year)d-' % {'year': data['year']}
    # No year separator and series, so assume that it ended the same year
    elif kind.endswith('series') and 'year' in data:
        data['series years'] = '%(year)d-%(year)d' % {'year': data['year']}

    # Episode titles look like '"Series title" Episode title'; startswith()
    # is used so that an empty title cannot raise an IndexError.
    if data['kind'] == 'episode' and data['title'].startswith('"'):
        quote_end = data['title'].find('"', 1)
        data['tv series title'] = data['title'][1:quote_end]
        data['title'] = data['title'][quote_end + 1:].strip()
    return data


def analyze_certificates(certificates):
    """Turn the text of the certificates section into a list of strings.

    The text is split in lines, ignoring the blank ones.  A line shaped
    like 'something:something' starts a new entry; any other line is
    appended, after a '::' separator, to the entry that precedes it, or is
    dropped when no entry has been collected yet.
    """
    cert_re = re.compile(r'^(.+):(.+)$', re.UNICODE)
    collected = []
    for line in certificates.split('\n'):
        line = line.strip()
        if not line:
            continue
        if cert_re.match(line):
            collected.append(line)
        elif collected:
            collected[-1] = u'{}::{}'.format(collected[-1], line)
    return collected


def clean_akas(aka):
    """Collapse the whitespace of an aka and discard 'see more' entries.

    Returns the cleaned string, or an empty string when the cleaned text
    starts (ignoring case) with 'see more'.
    """
    collapsed = re_space.sub(' ', aka).strip()
    return '' if collapsed.lower().startswith('see more') else collapsed


class DOMHTMLMovieParser(DOMParserBase):
    """Parser for the "reference" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        mparser = DOMHTMLMovieParser()
        result = mparser.parse(reference_html_string)
    """
    _containsObjects = True

    # Extraction rules: every Rule stores, under its key, what its extractor
    # finds in the page.  Several keys are only intermediate ('title',
    # 'misc sections', 'other akas', 'thin writer', 'thin director',
    # 'top/bottom rank', 'season/episode', 'tv series link', 'companies',
    # ...) and are converted to their final form by postprocess_data().
    rules = [
        # The og:title is analyzed into a dictionary (title, year, kind, ...).
        Rule(
            key='title',
            extractor=Path('//meta[@property="og:title"]/@content',
                           transform=analyze_og_title)
        ),
        Rule(
            key='original title',
            extractor=Path('//div[@class="titlereference-header"]//span[@class="titlereference-original-title-label"]/preceding-sibling::text()',
                           transform=lambda x: re_space.sub(' ', x).strip())

        ),
        # Used by postprocess_data() as 'original title' when that is missing.
        Rule(
            key='original title title-year',
            extractor=Path('//div[@class="titlereference-header"]//span[@class="titlereference-title-year"]/preceding-sibling::text()',
                           transform=lambda x: re_space.sub(' ', x).strip())
        ),
        Rule(
            key='localized title',
            extractor=Path('//meta[@name="title"]/@content',
                           transform=lambda x: analyze_og_title(x).get('title'))
        ),

        # parser for misc sections like 'casting department', 'stunts', ...
        Rule(
            key='misc sections',
            extractor=Rules(
                foreach='//h4[contains(@class, "ipl-header__content")]',
                rules=[
                    Rule(
                        key=Path('./@name', transform=clean_section_name),
                        extractor=Rules(
                            foreach='../../following-sibling::table[1]//tr',
                            rules=[
                                Rule(
                                    key='person',
                                    extractor=Path('.//text()')
                                ),
                                Rule(
                                    key='link',
                                    extractor=Path('./td[1]/a[@href]/@href')
                                )
                            ],
                            transform=lambda x: build_person(
                                x.get('person') or '',
                                personID=analyze_imdbid(x.get('link'))
                            )
                        )
                    )
                ]
            )
        ),
        # One person per row of the cast table; the roleID is read from the
        # "_imdbpyrole" <div> elements (see _manageRoles()).
        Rule(
            key='cast',
            extractor=Rules(
                foreach='//table[@class="cast_list"]//tr',
                rules=[
                    Rule(
                        key='person',
                        extractor=Path('.//text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./td[2]/a/@href')
                    ),
                    Rule(
                        key='roleID',
                        extractor=Path('./td[4]//div[@class="_imdbpyrole"]/@roleid')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('person') or '',
                    personID=analyze_imdbid(x.get('link')),
                    roleID=(x.get('roleID') or '').split('/')
                )
            )
        ),
        Rule(
            key='recommendations',
            extractor=Rules(
                foreach='//div[contains(@class, "rec_item")]',
                rules=[
                    Rule(
                        key='movieID',
                        extractor=Path(
                            './@data-tconst',
                            transform=lambda x: (x or '').replace('tt', '')
                        )
                    ),
                    Rule(
                        key='title',
                        extractor=Path(
                            './/a//img/@title',
                            transform=lambda x: re_space.sub(' ', x or '').strip()
                        )
                    ),
                ],
                transform=lambda x: build_movie(x.get('title', ''), movieID=x.get('movieID'))
            )
        ),
        Rule(
            key='myrating',
            extractor=Path('//span[@id="voteuser"]//text()')
        ),
        # The following rules read the row of the <td> cell whose text
        # starts with the given label.
        Rule(
            key='plot summary',
            extractor=Path('//td[starts-with(text(), "Plot")]/..//p/text()',
                           transform=lambda x: x.strip().rstrip('|').rstrip())
        ),
        Rule(
            key='genres',
            extractor=Path(
                foreach='//td[starts-with(text(), "Genre")]/..//li/a',
                path='./text()'
            )
        ),
        Rule(
            key='runtimes',
            extractor=Path(
                foreach='//td[starts-with(text(), "Runtime")]/..//li',
                path='./text()',
                transform=lambda x: x.strip().replace(' min', '')
            )
        ),
        Rule(
            key='countries',
            extractor=Path(
                foreach='//td[starts-with(text(), "Countr")]/..//li/a',
                path='./text()'
            )
        ),
        # The code is the third '/'-separated piece of the link, lower-cased.
        Rule(
            key='country codes',
            extractor=Path(
                foreach='//td[starts-with(text(), "Countr")]/..//li/a',
                path='./@href',
                transform=lambda x: x.split('/')[2].strip().lower()
            )
        ),
        # Renamed to 'languages' by postprocess_data(), through _SECT_CONV.
        Rule(
            key='language',
            extractor=Path(
                foreach='//td[starts-with(text(), "Language")]/..//li/a',
                path='./text()'
            )
        ),
        Rule(
            key='language codes',
            extractor=Path(
                foreach='//td[starts-with(text(), "Language")]/..//li/a',
                path='./@href',
                transform=lambda x: x.split('/')[2].strip()
            )
        ),
        Rule(
            key='color info',
            extractor=Path(
                foreach='//td[starts-with(text(), "Color")]/..//li/a',
                path='./text()',
                transform=lambda x: x.replace(' (', '::(')
            )
        ),
        Rule(
            key='aspect ratio',
            extractor=Path(
                '//td[starts-with(text(), "Aspect")]/..//li/text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='sound mix',
            extractor=Path(
                foreach='//td[starts-with(text(), "Sound Mix")]/..//li/a',
                path='./text()',
                transform=lambda x: x.replace(' (', '::(')
            )
        ),
        # A list of (title, detail) tuples, turned into a dictionary by
        # postprocess_data().
        Rule(
            key='box office',
            extractor=Rules(
                foreach='//section[contains(@class, "titlereference-section-box-office")]'
                        '//table[contains(@class, "titlereference-list")]//tr',
                rules=[
                    Rule(
                        key='box_office_title',
                        extractor=Path('./td[1]/text()')
                    ),
                    Rule(
                        key='box_office_detail',
                        extractor=Path('./td[2]/text()')
                    )
                ],
                transform=lambda x: (x['box_office_title'].strip(),
                                     x['box_office_detail'].strip())
            ),
        ),
        Rule(
            key='certificates',
            extractor=Path(
                '//td[starts-with(text(), "Certificat")]/..//text()',
                transform=analyze_certificates
            )
        ),
        # Collects akas not enclosed in <i> tags.
        Rule(
            key='other akas',
            extractor=Path(
                foreach='//section[contains(@class, "listo")]//td[starts-with(text(), "Also Known As")]/..//ul/li',
                path='.//text()',
                transform=clean_akas
            )
        ),
        Rule(
            key='creator',
            extractor=Rules(
                foreach='//div[starts-with(normalize-space(text()), "Creator")]/ul/li[1]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('name') or '',
                    personID=analyze_imdbid(x.get('link'))
                )
            )
        ),
        # 'thin writer' and 'thin director' are fallbacks: postprocess_data()
        # stores them as 'writer' / 'director' only when those keys are
        # missing.
        Rule(
            key='thin writer',
            extractor=Rules(
                foreach='//div[starts-with(normalize-space(text()), "Writer")]/ul/li[1]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('name') or '',
                    personID=analyze_imdbid(x.get('link'))
                )
            )
        ),
        Rule(
            key='thin director',
            extractor=Rules(
                foreach='//div[starts-with(normalize-space(text()), "Director")]/ul/li[1]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('name') or '',
                    personID=analyze_imdbid(x.get('link'))
                )
            )
        ),
        # Converted by postprocess_data() to 'top 250 rank' or
        # 'bottom 100 rank'.
        Rule(
            key='top/bottom rank',
            extractor=Path(
                '//li[@class="ipl-inline-list__item"]//a[starts-with(@href, "/chart/")]/text()'
            )
        ),
        # The imdbpy="airdate" attribute is injected by the first entry of
        # the 'preprocessors' list of this class.
        Rule(
            key='original air date',
            extractor=Path('//span[@imdbpy="airdate"]/text()')
        ),
        Rule(
            key='series years',
            extractor=Path(
                '//div[@id="tn15title"]//span[starts-with(text(), "TV series")]/text()',
                transform=lambda x: x.replace('TV series', '').strip()
            )
        ),
        # Split by postprocess_data() into the 'season' and 'episode' keys.
        Rule(
            key='season/episode',
            extractor=Path(
                '//div[@class="titlereference-overview-season-episode-section"]/ul//text()',
                transform=transformers.strip
            )
        ),
        # The first and last characters of what follows 'All Episodes' are
        # dropped before the conversion to int.
        Rule(
            key='number of episodes',
            extractor=Path(
                '//a[starts-with(text(), "All Episodes")]/text()',
                transform=lambda x: int(x.replace('All Episodes', '').strip()[1:-1])
            )
        ),
        Rule(
            key='episode number',
            extractor=Path(
                '//div[@id="tn15epnav"]/text()',
                transform=lambda x: int(re.sub(r'[^a-z0-9 ]', '',
                                               x.lower()).strip().split()[0]))
        ),
        Rule(
            key='previous episode',
            extractor=Path(
                '//span[@class="titlereference-overview-episodes-links"]'
                '//a[contains(text(), "Previous")]/@href',
                transform=analyze_imdbid
            )
        ),
        Rule(
            key='next episode',
            extractor=Path(
                '//span[@class="titlereference-overview-episodes-links"]'
                '//a[contains(text(), "Next")]/@href',
                transform=analyze_imdbid
            )
        ),
        Rule(
            key='number of seasons',
            extractor=Path(
                '//span[@class="titlereference-overview-years-links"]/../a[1]/text()',
                transform=int
            )
        ),
        # Used by postprocess_data(), together with 'tv series title', to
        # build the 'episode of' Movie object.
        Rule(
            key='tv series link',
            extractor=Path('//a[starts-with(text(), "All Episodes")]/@href')
        ),
        Rule(
            key='akas',
            extractor=Path(
                foreach='//i[@class="transl"]',
                path='./text()',
                transform=lambda x: x
                    .replace('  ', ' ')
                    .rstrip('-')
                    .replace('" - ', '"::', 1)
                    .strip('"')
                    .replace('  ', ' ')
            )
        ),
        Rule(
            key='production status',
            extractor=Path(
                '//td[starts-with(text(), "Status:")]/..//div[@class="info-content"]//text()',
                transform=lambda x: x.strip().split('|')[0].strip().lower()
            )
        ),
        Rule(
            key='production status updated',
            extractor=Path(
                '//td[starts-with(text(), "Status Updated:")]/'
                '..//div[@class="info-content"]//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='production comments',
            extractor=Path(
                '//td[starts-with(text(), "Comments:")]/'
                '..//div[@class="info-content"]//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='production note',
            extractor=Path(
                '//td[starts-with(text(), "Note:")]/'
                '..//div[@class="info-content"]//text()',
                transform=transformers.strip
            )
        ),
        # One section per "simpleList" <ul>, keyed by the lower-cased text of
        # the preceding header; flattened into the main dictionary by
        # postprocess_data().
        Rule(
            key='companies',
            extractor=Rules(
                foreach="//ul[@class='simpleList']",
                rules=[
                    Rule(
                        key=Path('preceding-sibling::header[1]/div/h4/text()', transform=transformers.lower),
                        extractor=Rules(
                            foreach='./li',
                            rules=[
                                Rule(
                                    key='name',
                                    extractor=Path('./a//text()')
                                ),
                                Rule(
                                    key='comp-link',
                                    extractor=Path('./a/@href')
                                ),
                                Rule(
                                    key='notes',
                                    extractor=Path('./text()')
                                )
                            ],
                            transform=lambda x: Company(
                                name=x.get('name') or '',
                                accessSystem='http',
                                companyID=analyze_imdbid(x.get('comp-link')),
                                notes=(x.get('notes') or '').strip()
                            )
                        )
                    )
                ]
            )
        ),
        # 'rating' and 'votes' are extracted as text and converted to float
        # and int by postprocess_data().
        Rule(
            key='rating',
            extractor=Path('(//span[@class="ipl-rating-star__rating"])[1]/text()')
        ),
        Rule(
            key='votes',
            extractor=Path('//span[@class="ipl-rating-star__total-votes"][1]/text()')
        ),
        Rule(
            key='cover url',
            extractor=Path('//img[@alt="Poster"]/@src')
        ),
        Rule(
            key='imdbID',
            extractor=Path('//meta[@property="pageId"]/@content',
                           transform=lambda x: (x or '').replace('tt', ''))
        )
    ]

    # (pattern, replacement) pairs: the pattern is either a plain string or
    # a compiled regular expression, the replacement a string or a callable.
    # NOTE(review): presumably applied to the raw HTML by DOMParserBase,
    # before the DOM is built -- confirm against the base class.
    preprocessors = [
        # Opens the <span imdbpy="airdate"> read by the 'original air date'
        # rule.
        ('/releaseinfo">', '"><span imdbpy="airdate">'),
        (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I), r'</div><div>\1'),
        ('<small>Full cast and crew for<br>', ''),
        ('<td> </td>', '<td>...</td>'),
        (re.compile(r'<span class="tv-extra">TV mini-series(\s+.*?)</span>', re.I),
         r'<span class="tv-extra">TV series\1</span> (mini)'),
        # Wraps every role of a "character" cell in a
        # <div class="_imdbpyrole" roleid="..."> element.
        (_reRolesMovie, _manageRoles)
    ]

    def preprocess_dom(self, dom):
        """Tweak the DOM before the rules are applied and return it.

        The glossary anchors that follow the last 'Series Crew' heading get
        their ``name`` attribute prefixed with 'series '; then the elements
        that would pollute the extracted data are removed.
        """
        # Handle series information.
        series_crew = self.xpath(dom, "//b[text()='Series Crew']")
        if series_crew:
            heading = series_crew[-1]  # In doubt, take the last one.
            anchors = self.xpath(heading, "./following::h5/a[@class='glossary']")
            for anchor in anchors:
                section = anchor.get('name')
                if section:
                    anchor.set('name', 'series %s' % section)
        unwanted = (
            # Links to IMDbPro.
            '//span[@class="pro-link"]',
            # Some 'more' links (others, like the one around the number
            # of votes, are kept).
            '//a[@class="tn15more"][starts-with(@href, "/title/")]',
            # The "rest of list" in cast.
            '//td[@colspan="4"]/..',
        )
        for path in unwanted:
            preprocessors.remove(dom, path)
        return dom

    # Class-level patterns.  re_space is the same expression as the
    # module-level re_space; re_airdate captures the text that precedes a
    # '(season N, episode M)' suffix and the two numbers.
    # NOTE(review): neither attribute is referenced in the visible part of
    # this class -- check for other users before removing them.
    re_space = re.compile(r'\s+')
    re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)

    def postprocess_data(self, data):
        """Convert the raw output of the rules to its final form.

        The *data* dictionary is modified in place and returned: section
        names are converted, intermediate keys ('title', 'misc sections',
        'other akas', 'season/episode', 'thin writer', 'thin director',
        'top/bottom rank', 'tv series link', 'companies', ...) are merged
        into their final keys and textual values such as 'rating' and
        'votes' are converted to numbers when possible.
        """
        # Convert section names.
        for sect in list(data.keys()):
            if sect in _SECT_CONV:
                data[_SECT_CONV[sect]] = data[sect]
                del data[sect]
        # Filter out fake values.
        for key in data:
            value = data[key]
            if isinstance(value, list) and value:
                if isinstance(value[0], Person):
                    data[key] = [x for x in value if x.personID is not None]
                if isinstance(value[0], _Container):
                    for obj in data[key]:
                        obj.accessSystem = self._as
                        obj.modFunct = self._modFunct
        # The analyzed title is a dictionary: merge its items into data.
        for key in ['title']:
            if (key in data) and isinstance(data[key], dict):
                subdata = data[key]
                del data[key]
                data.update(subdata)
        # 'original title title-year' is only a fallback for 'original title'.
        if not data.get('original title'):
            if 'original title title-year' in data:
                data['original title'] = data['original title title-year']
                del data['original title title-year']
        elif 'original title title-year' in data:
            del data['original title title-year']
        misc_sections = data.get('misc sections')
        if misc_sections is not None:
            for section in misc_sections:
                # skip sections with their own parsers
                if 'cast' in section.keys():
                    continue
                data.update(section)
            del data['misc sections']
        # Merge 'akas' and 'other akas' in a single, cleaned list.
        if 'akas' in data or 'other akas' in data:
            akas = data.get('akas') or []
            other_akas = data.get('other akas') or []
            akas += other_akas
            nakas = []
            for aka in akas:
                aka = aka.strip()
                if not aka:
                    continue
                if aka.endswith('" -'):
                    aka = aka[:-3].rstrip()
                nakas.append(aka)
            if 'akas' in data:
                del data['akas']
            if 'other akas' in data:
                del data['other akas']
            if nakas:
                data['akas'] = nakas
        if 'runtimes' in data:
            data['runtimes'] = [x.replace(' min', '')
                                for x in data['runtimes']]
        if 'number of seasons' in data:
            data['seasons'] = [str(i) for i in range(1, data['number of seasons'] + 1)]
        if 'season/episode' in data:
            # The text is expected to look like 'Season N ... Episode M';
            # a missing piece (IndexError) or a non-numeric one (ValueError)
            # is reported as 'unknown'.
            tokens = data['season/episode'].split('Episode')
            try:
                data['season'] = int(tokens[0].split('Season')[1])
            except (IndexError, ValueError):
                data['season'] = 'unknown'
            try:
                data['episode'] = int(tokens[1])
            except (IndexError, ValueError):
                data['episode'] = 'unknown'
            del data['season/episode']
        # The 'thin' values are used only when the full ones are missing.
        for k in ('writer', 'director'):
            t_k = 'thin %s' % k
            if t_k not in data:
                continue
            if k not in data:
                data[k] = data[t_k]
            del data[t_k]
        if 'top/bottom rank' in data:
            tbVal = data['top/bottom rank'].lower()
            if tbVal.startswith('top'):
                tbKey = 'top 250 rank'
                tbVal = _toInt(tbVal, [('top rated movies: #', '')])
            else:
                tbKey = 'bottom 100 rank'
                tbVal = _toInt(tbVal, [('bottom rated movies: #', '')])
            if tbVal:
                data[tbKey] = tbVal
            del data['top/bottom rank']
        if 'year' in data and data['year'] == '????':
            del data['year']
        if 'tv series link' in data:
            if 'tv series title' in data:
                data['episode of'] = Movie(title=data['tv series title'],
                                           movieID=analyze_imdbid(data['tv series link']),
                                           accessSystem=self._as,
                                           modFunct=self._modFunct)
                data['episode of']['kind'] = 'tv series'
                del data['tv series title']
            del data['tv series link']
        if 'rating' in data:
            # A value that cannot be converted is left as it is.
            try:
                data['rating'] = float(data['rating'].replace('/10', ''))
            except (TypeError, ValueError):
                pass
            if data['rating'] == 0:
                del data['rating']
        if 'votes' in data:
            try:
                votes = data['votes'].replace('(', '').replace(')', '').replace(',', '').replace('votes', '')
                data['votes'] = int(votes)
            except (TypeError, ValueError):
                pass
        companies = data.get('companies')
        if companies:
            for section in companies:
                for key, value in section.items():
                    # Do not overwrite a key that is already present.
                    if key in data:
                        key = '%s companies' % key
                    data.update({key: value})
            del data['companies']
        if 'box office' in data:
            data['box office'] = dict(data['box office'])
        return data


def _process_plotsummary(x):
    """Process a plot (contributed by Rdian06)."""
    xauthor = x.get('author')
    xplot = x.get('plot', '').strip()
    if xauthor:
        xplot += '::%s' % xauthor
    return xplot


class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a 'plot' key, containing a list
    of string with the structure: 'summary::summary_author <author@email>'.

    Example::

        pparser = DOMHTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    _defGetRefs = True

    def synopsis_reducer(nodes):
        """Join the text nodes found in *nodes*, separated by blank lines.

        This is called as a plain function (there is no ``self``): it is
        passed as the ``reduce`` callable of the 'synopsis' rule below.
        Nodes that are not text results are ignored.
        """
        return '\n\n'.join(
            node for node in nodes
            if isinstance(node, ElementTree._ElementUnicodeResult)
        )

    # Notice that recently IMDb started to put the email of the
    # author only in the link, that we're not collecting, here.
    rules = [
        Rule(
            key='plot',
            extractor=Rules(
                foreach='//ul[@id="plot-summaries-content"]/li',
                rules=[
                    Rule(
                        key='plot',
                        extractor=Path('./p//text()')
                    ),
                    Rule(
                        key='author',
                        extractor=Path('.//div[@class="author-container"]//a/text()')
                    )
                ],
                transform=_process_plotsummary
            )
        ),
        Rule(
            key='synopsis',
            extractor=Path(
                foreach='//ul[@id="plot-synopsis-content"]',
                path='.//li//node()',
                reduce=synopsis_reducer
            )
        )
    ]

    def preprocess_dom(self, dom):
        """Remove the "no summary" placeholder item and return the DOM."""
        preprocessors.remove(dom, '//li[@id="no-summary-content"]')
        return dom

    def postprocess_data(self, data):
        """Drop the synopsis when it is only the "add a synopsis" notice.

        An empty or missing synopsis is left untouched instead of raising
        an IndexError.
        """
        synopsis = data.get('synopsis')
        if synopsis and synopsis[0] and 'a Synopsis for this title' in synopsis[0]:
            del data['synopsis']
        return data


def _process_award(x):
    award = {}
    _award = x.get('award')
    if _award is not None:
        _award = _award.strip()
    award['award'] = _award
    if not award['award']:
        return {}
    award['year'] = x.get('year').strip()
    if award['year'] and award['year'].isdigit():
        award['year'] = int(award['year'])
    award['result'] = x.get('result').strip()
    category = x.get('category').strip()
    if category:
        award['category'] = category
    received_with = x.get('with')
    if received_with is not None:
        award['with'] = received_with.strip()
    notes = x.get('notes')
    if notes is not None:
        notes = notes.strip().split('\n', 2)[0]
        notes = re_space.sub(' ', notes)
        if notes:
            award['notes'] = notes
    award['anchor'] = x.get('anchor')
    return award


class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    The 'subject' attribute selects how recipients are attached to
    an award: as Person objects under the 'to' key when it is 'title',
    as Movie objects under the 'for' key when it is 'name'.

    Example::

        awparser = DOMHTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    subject = 'title'
    _containsObjects = True

    rules = [
        Rule(
            key='awards',
            extractor=Rules(
                foreach='//*[@id="main"]/div[1]/div/table//tr',
                rules=[
                    Rule(
                        key='year',
                        extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/a/text())')
                    ),
                    Rule(
                        key='result',
                        extractor=Path('./td[1]/b/text()')
                    ),
                    Rule(
                        key='award',
                        extractor=Path('./td[1]/span/text()')
                    ),
                    Rule(
                        key='category',
                        extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/text())')
                    ),
                    Rule(
                        key='notes',
                        extractor=Path('./td[2]/text()')
                    ),
                    Rule(
                        # the whole text of the row; used in postprocess_data
                        # to pair the award with its recipients.
                        key='anchor',
                        extractor=Path('.//text()')
                    )
                ],
                transform=_process_award
            )
        ),
        Rule(
            key='recipients',
            extractor=Rules(
                foreach='//*[@id="main"]/div[1]/div/table//tr/td[2]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    ),
                    Rule(
                        # the whole text of the row containing the link.
                        key='anchor',
                        extractor=Path('./ancestor::tr//text()')
                    )
                ]
            )
        )
    ]

    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
         r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
         r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
    ]

    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            # index of the cell within its own row.
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span - 1]:
                # if not cloned, child will be moved to new parent
                clone = self.clone(col)
                tr.insert(position, clone)
        return dom

    def postprocess_data(self, data):
        """Attach to every award the recipients found in the same row.

        An award and a recipient are paired when their 'anchor' values
        are equal; only the recipients whose link contains 'nm' are
        considered.  The 'anchor' key is removed from the returned awards.
        Return an empty dictionary when no award was collected.
        """
        # Also covers an empty 'data'; a missing 'awards' key used to
        # raise a KeyError.
        if 'awards' not in data:
            return {}
        recipients = data.get('recipients', [])
        nd = []
        for award in data['awards']:
            anchor = award.get('anchor')
            # "or ''": a recipient without a link is skipped, instead
            # of raising a TypeError on the 'in' test.
            matches = [p for p in recipients
                       if 'nm' in (p.get('link') or '') and anchor == p.get('anchor')]
            if self.subject == 'title':
                award['to'] = [
                    Person(name=recipient['name'],
                           personID=analyze_imdbid(recipient['link']))
                    for recipient in matches
                ]
            elif self.subject == 'name':
                award['for'] = [
                    Movie(title=recipient['name'],
                          movieID=analyze_imdbid(recipient['link']))
                    for recipient in matches
                ]
            award.pop('anchor', None)
            nd.append(award)
        return {'awards': nd}



class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page is expected as a string, as fetched from the
    www.imdb.com server.  The result is a dictionary whose
    'taglines' key, when present, holds the list of the
    taglines, stripped of the surrounding whitespace.

    Example::

        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    rules = [
        Rule(key='taglines',
             extractor=Path(foreach='//div[@id="taglines_content"]/div',
                            path='.//text()'))
    ]

    def preprocess_dom(self, dom):
        """Remove the header and the "no content" divs from the taglines."""
        for unwanted in ('//div[@id="taglines_content"]/div[@class="header"]',
                         '//div[@id="taglines_content"]/div[@id="no_content"]'):
            preprocessors.remove(dom, unwanted)
        return dom

    def postprocess_data(self, data):
        """Strip every collected tagline."""
        if 'taglines' not in data:
            return data
        cleaned = []
        for line in data['taglines']:
            cleaned.append(line.strip())
        data['taglines'] = cleaned
        return data


class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    The 'keywords' key holds the lower-cased keywords, with spaces
    replaced by dashes; 'relevant keywords' holds a list of dictionaries
    with the keys 'keyword', 'keyword_dash', 'ordering', 'votes_str',
    'votes_for' and 'total_votes'.

    Example::

        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    rules = [
        Rule(
            key='keywords',
            extractor=Path(
                foreach='//td[@data-item-keyword]',
                path='./@data-item-keyword',
                transform=lambda x: x.lower().replace(' ', '-')
            )
        ),
        Rule(
            key='relevant keywords',
            extractor=Rules(
                foreach='//td[@data-item-keyword]',
                rules=[
                    Rule(
                        key='keyword',
                        extractor=Path('./@data-item-keyword')
                    ),
                    Rule(
                        key='ordering',
                        extractor=Path('./@data-item-votes')
                    ),
                    Rule(
                        key='vote_str',
                        extractor=Path('./div[2]/div//text()')
                    )
                ],
                # "or ''": a value that was not extracted must not
                # raise an AttributeError on None.
                transform=lambda x: {
                    'keyword': (x.get('keyword') or '').lower(),
                    'keyword_dash': (x.get('keyword') or '').lower().replace(' ', '-'),
                    'ordering': x.get('ordering'),
                    'votes_str': (x.get('vote_str') or '').strip().lower()
                }
            )
        )

    ]

    def postprocess_data(self, data):
        """Add the 'votes_for' and 'total_votes' keys to the relevant keywords.

        Both are the int 0 when the vote text contains 'is this relevant?'
        or has no 'of' separator to split on; otherwise they are strings
        taken from the text before and after the first 'of'.
        """
        if 'relevant keywords' not in data:
            return data
        rk = []
        for x in data['relevant keywords']:
            if 'votes_str' not in x:
                continue
            votes_str = x['votes_str']
            parts = votes_str.split('of')
            # Without an 'of' there is no second part to read: fall back
            # to zero votes instead of raising an IndexError.
            if 'is this relevant?' in votes_str or len(parts) < 2:
                x['votes_for'] = 0
                x['total_votes'] = 0
            else:
                x['votes_for'] = parts[0].strip()
                x['total_votes'] = re.sub(r"\D", "", parts[1]).strip()
            rk.append(x)
        data['relevant keywords'] = rk
        return data


class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" page of a given movie.
    The page is expected as a string, as fetched from the
    www.imdb.com server.  The result is a dictionary; the text
    of the collected items goes under the 'alternate versions' key.

    Example::

        avparser = DOMHTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    _defGetRefs = True

    rules = [
        # one entry for every item of the "trivia" list.
        Rule(key='alternate versions',
             extractor=Path(foreach='//ul[@class="trivia"]/li',
                            path='.//text()',
                            transform=transformers.strip))
    ]


class DOMHTMLTriviaParser(DOMParserBase):
    """Parser for the "trivia" page of a given movie.
    The page is expected as a string, as fetched from the
    www.imdb.com server.  The result is a dictionary; the text
    of the collected items goes under the 'trivia' key.

    Example::

        tparser = DOMHTMLTriviaParser()
        result = tparser.parse(trivia_html_string)
    """
    _defGetRefs = True

    rules = [
        # one entry for every "sodatext" div.
        Rule(key='trivia',
             extractor=Path(foreach='//div[@class="sodatext"]',
                            path='.//text()',
                            transform=transformers.strip))
    ]

    def preprocess_dom(self, dom):
        """Get rid of the "linksoda" spans before the extraction."""
        unwanted = '//span[@class="linksoda"]'
        preprocessors.remove(dom, unwanted)
        return dom


class DOMHTMLSoundtrackParser(DOMParserBase):
    """Parser for the "soundtrack" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    The 'soundtrack' key holds a list of dictionaries in the form
    {title: {kind: info}}, e.g. {'Some Song': {'written by': 'Someone'}}.

    Example::

        stparser = DOMHTMLSoundtrackParser()
        result = stparser.parse(soundtrack_html_string)
    """
    _defGetRefs = True

    # line breaks are turned into newlines, later used to split
    # the title from the lines with the credits.
    preprocessors = [('<br />', '\n'), ('<br>', '\n')]

    rules = [
        Rule(
            key='soundtrack',
            extractor=Path(
                foreach='//div[@class="list"]//div',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]

    def postprocess_data(self, data):
        """Convert every soundtrack text into a {title: {kind: info}} dictionary.

        The first line is the title (surrounding double quotes are removed);
        every following line is split after the first separator found, checked
        in this order: a leading 'From ', then ' with ', ' by ', ' from ',
        ' of '.  What comes before (separator included, lower-cased) is
        the kind, the rest is the info.  A line without any separator is
        appended to the previous one.
        """
        if 'soundtrack' not in data:
            return data
        separators = (' with ', ' by ', ' from ', ' of ')
        nd = []
        for x in data['soundtrack']:
            ds = x.split('\n')
            title = ds[0]
            # Slices instead of indexes: an empty title must not
            # raise an IndexError.
            if title[:1] == '"' and title[-1:] == '"':
                title = title[1:-1]
            # Merge the continuation lines with the credit they belong to.
            nds = []
            for line in ds[1:]:
                is_credit = line.startswith('From ') or \
                    any(sep in line for sep in separators)
                if is_credit or not nds:
                    nds.append(line)
                else:
                    nds[-1] += line
            info_map = {}
            for line in nds:
                if line.startswith('From '):
                    fdix = len('From ')
                else:
                    for sep in separators:
                        pos = line.find(sep)
                        if pos != -1:
                            fdix = pos + len(sep)
                            break
                    else:
                        # no known separator: nothing to store for this line.
                        continue
                info_map[line[:fdix].rstrip().lower()] = line[fdix:].lstrip()
            nd.append({title: info_map})
        data['soundtrack'] = nd
        return data


class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.
    The page is expected as a string, as fetched from the
    www.imdb.com server.  The result is a dictionary; the text
    of the collected items goes under the 'crazy credits' key.

    Example::

        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(key='crazy credits',
             extractor=Path(
                 foreach='//ul/li/tt',
                 path='.//text()',
                 # newlines become spaces, then double spaces are collapsed
                 # (a single pass).
                 transform=lambda x: x.replace('\n', ' ').replace('  ', ' ')))
    ]


def _process_goof(x):
    text = (x.get('text') or '').strip()
    category = (x.get('category') or 'Goof').strip()
    return {"category": category, "text": text}


class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.
    The page is expected as a string, as fetched from the
    www.imdb.com server.  The result is a dictionary; the
    collected items, processed by _process_goof, go under
    the 'goofs' key.

    Example::

        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(key='goofs',
             extractor=Rules(
                 foreach='//div[contains(@class, "soda sodavote")]',
                 rules=[
                     Rule(key='text',
                          extractor=Path('./div[@class="sodatext"]/text()')),
                     # the category is read from the closest preceding h4.
                     Rule(key='category',
                          extractor=Path('./preceding-sibling::h4[1]/text()'))
                 ],
                 transform=_process_goof))
    ]


class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.
    The page is expected as a string, as fetched from the
    www.imdb.com server.  The result is a dictionary whose
    'quotes' key holds a list of quotes, each one being the
    list of the pieces obtained splitting its text on '::'.

    Example::

        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(key='quotes',
             extractor=Path(
                 foreach='//div[@class="sodatext"]',
                 path='.//text()',
                 # '::' marks the boundaries later used by postprocess_data;
                 # the remaining newlines become spaces.
                 transform=lambda x: x.strip().replace(' \n', '::').replace('::\n', '::').replace('\n', ' ')))
    ]

    def preprocess_dom(self, dom):
        """Get rid of the "did-you-know-actions" divs before the extraction."""
        unwanted = '//div[@class="did-you-know-actions"]'
        preprocessors.remove(dom, unwanted)
        return dom

    def postprocess_data(self, data):
        """Split every quote on '::'; return {} when there are no quotes."""
        raw_quotes = data.get('quotes', [])
        if raw_quotes:
            return {'quotes': [quote.split('::') for quote in raw_quotes]}
        return {}


class DOMHTMLReleaseinfoParser(DOMParserBase):
    """Parser for the "release dates" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        rdparser = DOMHTMLReleaseinfoParser()
        result = rdparser.parse(releaseinfo_html_string)
    """
    rules = [
        Rule(
            key='release dates',
            extractor=Rules(
                foreach='//table[contains(@class, "release-dates-table-test-only")]//tr',
                rules=[
                    Rule(
                        key='country',
                        extractor=Path('.//td[1]//text()')
                    ),
                    Rule(
                        key='country_code',
                        extractor=Path('.//td[1]/a/@href')
                    ),
                    Rule(
                        key='date',
                        extractor=Path('.//td[2]//text()')
                    ),
                    Rule(
                        key='notes',
                        extractor=Path('.//td[3]//text()')
                    )
                ]
            )
        ),
        Rule(
            key='akas',
            extractor=Rules(
                foreach='//table[contains(@class, "akas-table-test-only")]//tr',
                rules=[
                    Rule(
                        key='countries',
                        extractor=Path('./td[1]/text()')
                    ),
                    Rule(…
Tech Fingerprint

Alerts (36)

'def' Ensure functions have docstrings for documentation
144 148 184 222 223 239 703 870 907 911 1383 1387
Complexity hotspot; lines 157 to 161 (total complexity: 7)
157 158 159 160 161
Complexity hotspot; lines 733 to 737 (total complexity: 7)
733 734 735 736 737
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
772 774 792 799 810 812 820 821 828 842 913
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
786 790
'type(' Use isinstance() for type checking instead of type()
873