
/imdb/parser/http/movieParser.py

http://github.com/alberanid/imdbpy
Python | 2710 lines
Possible License(s): GPL-2.0

Large files are truncated: only the first 1322 of the file's 2710 lines are shown below.

# -*- coding: utf-8 -*-
# Copyright 2004-2021 Davide Alberani <da@erlug.linux.it>
#           2008-2018 H. Turgut Uyar <uyar@tekir.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""
This module provides the classes (and the instances) that are used to parse
the IMDb pages on the www.imdb.com server about a movie.

For example, for Brian De Palma's "The Untouchables", the referred pages
would be:

combined details
    http://www.imdb.com/title/tt0094226/reference

plot summary
    http://www.imdb.com/title/tt0094226/plotsummary

...and so on.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import functools
import re

from imdb import PY2
from imdb import imdbURL_base
from imdb.Company import Company
from imdb.Movie import Movie
from imdb.Person import Person
from imdb.utils import _Container, KIND_MAP

from .piculet import Path, Rule, Rules, preprocessors, transformers, ElementTree
from .utils import DOMParserBase, analyze_imdbid, build_person, build_movie

if PY2:
    from urllib import unquote
else:
    from urllib.parse import unquote


# Dictionary used to convert some sections' names.
_SECT_CONV = {
    'directed': 'director',
    'directed by': 'director',
    'directors': 'director',
    'editors': 'editor',
    'writing credits': 'writer',
    'writers': 'writer',
    'produced': 'producer',
    'cinematography': 'cinematographer',
    'film editing': 'editor',
    'casting': 'casting director',
    'costume design': 'costume designer',
    'makeup department': 'make up',
    'production management': 'production manager',
    'second unit director or assistant director': 'assistant director',
    'costume and wardrobe department': 'costume department',
    'costume departmen': 'costume department',
    'sound department': 'sound crew',
    'stunts': 'stunt performer',
    'other crew': 'miscellaneous crew',
    'also known as': 'akas',
    'country': 'countries',
    'runtime': 'runtimes',
    'language': 'languages',
    'certification': 'certificates',
    'genre': 'genres',
    'created': 'creator',
    'creators': 'creator',
    'color': 'color info',
    'plot': 'plot outline',
    'art director': 'art direction',
    'art directors': 'art direction',
    'composers': 'composer',
    'assistant directors': 'assistant director',
    'set decorator': 'set decoration',
    'set decorators': 'set decoration',
    'visual effects department': 'visual effects',
    'miscellaneous': 'miscellaneous crew',
    'make up department': 'make up',
    'plot summary': 'plot outline',
    'cinematographers': 'cinematographer',
    'camera department': 'camera and electrical department',
    'costume designers': 'costume designer',
    'production designer': 'production design',
    'production designers': 'production design',
    'production managers': 'production manager',
    'music original': 'original music',
    'casting directors': 'casting director',
    'other companies': 'miscellaneous companies',
    'producers': 'producer',
    'special effects by': 'special effects department',
}

re_space = re.compile(r'\s+')


def clean_section_name(section):
    """Clean and replace some section names."""
    section = re_space.sub(' ', section.replace('_', ' ').strip().lower())
    if section.endswith(' by'):
        section = section[:-3]
    return _SECT_CONV.get(section, section)
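
# Usage sketch (illustrative inputs, not from a live IMDb page): how
# clean_section_name normalizes a few representative headers, given the
# _SECT_CONV mapping above.
#
#   >>> clean_section_name('Directed by')       # ' by' suffix dropped, then mapped
#   'director'
#   >>> clean_section_name('Writing Credits')
#   'writer'
#   >>> clean_section_name('sound_department')  # underscores become spaces
#   'sound crew'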
def _manageRoles(mo):
    """Perform some transformation on the html, so that roleIDs can
    be easily retrieved."""
    firstHalf = mo.group(1)
    secondHalf = mo.group(2)
    newRoles = []
    roles = secondHalf.split(' / ')
    for role in roles:
        role = role.strip()
        if not role:
            continue
        roleID = analyze_imdbid(role)
        if roleID is None:
            roleID = '/'
        else:
            roleID += '/'
        newRoles.append('<div class="_imdbpyrole" roleid="%s">%s</div>' % (
            roleID, role.strip()
        ))
    return firstHalf + ' / '.join(newRoles) + mo.group(3)


_reRolesMovie = re.compile(r'(<td class="character">)(.*?)(</td>)', re.I | re.M | re.S)
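
# Usage sketch (hypothetical markup): _reRolesMovie feeds each cast-table
# character cell through _manageRoles, which wraps every ' / '-separated role
# in a div carrying a "roleid" attribute, roughly:
#
#   before: <td class="character"><a href="/character/ch0003980">Eliot Ness</a></td>
#   after:  <td class="character"><div class="_imdbpyrole" roleid="0003980/">
#           <a href="/character/ch0003980">Eliot Ness</a></div></td>
#
# (The exact roleid value depends on what analyze_imdbid extracts from the
# href.)  The 'cast' rule below then reads the id back via the XPath
# //div[@class="_imdbpyrole"]/@roleid.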
def makeSplitter(lstrip=None, sep='|', comments=True,
                 origNotesSep=' (', newNotesSep='::(', strip=None):
    """Return a splitter function suitable for a given set of data."""
    def splitter(x):
        if not x:
            return x
        x = x.strip()
        if not x:
            return x
        if lstrip is not None:
            x = x.lstrip(lstrip).lstrip()
        lx = x.split(sep)
        lx[:] = [_f for _f in [j.strip() for j in lx] if _f]
        if comments:
            lx[:] = [j.replace(origNotesSep, newNotesSep, 1) for j in lx]
        if strip:
            lx[:] = [j.strip(strip) for j in lx]
        return lx
    return splitter
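
# Usage sketch (illustrative input; the local name is arbitrary): with the
# defaults, a splitter breaks a pipe-separated value into a list and moves
# parenthesized notes behind the '::' marker.
#
#   >>> split_info = makeSplitter(sep='|')
#   >>> split_info('Color (Technicolor) | Black and White')
#   ['Color::(Technicolor)', 'Black and White']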
def _toInt(val, replace=()):
    """Return the value, converted to integer, or None; if present, 'replace'
    must be a list of tuples of values to replace."""
    for before, after in replace:
        val = val.replace(before, after)
    try:
        return int(val)
    except (TypeError, ValueError):
        return None
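
# Usage sketch (illustrative values):
#
#   >>> _toInt('24,821', replace=[(',', '')])
#   24821
#   >>> _toInt('N/A') is None   # unparsable values yield None
#   True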
_re_og_title = re.compile(
    r'(.*) \((?:(?:(.+)(?= ))? ?(\d{4})(?:(–)(\d{4}| ))?|(.+))\)',
    re.UNICODE
)


def analyze_og_title(og_title):
    data = {}
    match = _re_og_title.match(og_title)
    if og_title and not match:
        # assume it's a title in production, missing release date information
        return {'title': og_title}
    data['title'] = match.group(1)
    if match.group(3):
        data['year'] = int(match.group(3))
    kind = match.group(2) or match.group(6)
    if kind is None:
        kind = 'movie'
    else:
        kind = kind.lower()
        kind = KIND_MAP.get(kind, kind)
    data['kind'] = kind
    year_separator = match.group(4)
    # There is a year separator, so assume an ongoing or ended series.
    if year_separator is not None:
        end_year = match.group(5)
        if end_year is not None:
            data['series years'] = '%(year)d-%(end_year)s' % {
                'year': data['year'],
                'end_year': end_year.strip(),
            }
        elif kind.endswith('series'):
            data['series years'] = '%(year)d-' % {'year': data['year']}
    # No year separator and a series, so assume that it ended the same year.
    elif kind.endswith('series') and 'year' in data:
        data['series years'] = '%(year)d-%(year)d' % {'year': data['year']}
    if data['kind'] == 'episode' and data['title'][0] == '"':
        quote_end = data['title'].find('"', 1)
        data['tv series title'] = data['title'][1:quote_end]
        data['title'] = data['title'][quote_end + 1:].strip()
    return data
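
# Usage sketch (illustrative og:title value): a plain movie title parses to
# title, year and kind; titles carrying a kind and a year range (e.g.
# 'TV Series 2008–2013') additionally get a 'series years' entry, assuming
# KIND_MAP leaves the lowercased kind unchanged.
#
#   >>> analyze_og_title('The Untouchables (1987)')
#   {'title': 'The Untouchables', 'year': 1987, 'kind': 'movie'}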
def analyze_certificates(certificates):
    def reducer(acc, el):
        cert_re = re.compile(r'^(.+):(.+)$', re.UNICODE)
        if cert_re.match(el):
            acc.append(el)
        elif acc:
            acc[-1] = u'{}::{}'.format(
                acc[-1],
                el,
            )
        return acc
    certificates = [el.strip() for el in certificates.split('\n') if el.strip()]
    return functools.reduce(reducer, certificates, [])
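
# Usage sketch (illustrative input): lines that don't look like
# 'Country:rating' are folded into the previous certificate as a '::' note.
#
#   >>> analyze_certificates('United States:PG\nItaly:T\n(uncut)')
#   ['United States:PG', 'Italy:T::(uncut)']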
def clean_akas(aka):
    aka = re_space.sub(' ', aka).strip()
    if aka.lower().startswith('see more'):
        aka = ''
    return aka
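
# Usage sketch (illustrative input):
#
#   >>> clean_akas(' Los intocables\n (Spain) ')
#   'Los intocables (Spain)'
#   >>> clean_akas('See more »')   # "See more" link residue is dropped
#   ''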
class DOMHTMLMovieParser(DOMParserBase):
    """Parser for the "reference" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        mparser = DOMHTMLMovieParser()
        result = mparser.parse(reference_html_string)
    """
    _containsObjects = True

    rules = [
        Rule(
            key='title',
            extractor=Path('//meta[@property="og:title"]/@content',
                           transform=analyze_og_title)
        ),
        Rule(
            key='original title',
            extractor=Path(
                '//div[@class="titlereference-header"]//span[@class="titlereference-original-title-label"]/preceding-sibling::text()',
                transform=lambda x: re_space.sub(' ', x).strip()
            )
        ),
        Rule(
            key='original title title-year',
            extractor=Path(
                '//div[@class="titlereference-header"]//span[@class="titlereference-title-year"]/preceding-sibling::text()',
                transform=lambda x: re_space.sub(' ', x).strip()
            )
        ),
        Rule(
            key='localized title',
            extractor=Path('//meta[@name="title"]/@content',
                           transform=lambda x: analyze_og_title(x).get('title'))
        ),
        # parser for misc sections like 'casting department', 'stunts', ...
        Rule(
            key='misc sections',
            extractor=Rules(
                foreach='//h4[contains(@class, "ipl-header__content")]',
                rules=[
                    Rule(
                        key=Path('./@name', transform=clean_section_name),
                        extractor=Rules(
                            foreach='../../following-sibling::table[1]//tr',
                            rules=[
                                Rule(
                                    key='person',
                                    extractor=Path('.//text()')
                                ),
                                Rule(
                                    key='link',
                                    extractor=Path('./td[1]/a[@href]/@href')
                                )
                            ],
                            transform=lambda x: build_person(
                                x.get('person') or '',
                                personID=analyze_imdbid(x.get('link'))
                            )
                        )
                    )
                ]
            )
        ),
        Rule(
            key='cast',
            extractor=Rules(
                foreach='//table[@class="cast_list"]//tr',
                rules=[
                    Rule(
                        key='person',
                        extractor=Path('.//text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./td[2]/a/@href')
                    ),
                    Rule(
                        key='roleID',
                        extractor=Path('./td[4]//div[@class="_imdbpyrole"]/@roleid')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('person') or '',
                    personID=analyze_imdbid(x.get('link')),
                    roleID=(x.get('roleID') or '').split('/')
                )
            )
        ),
        Rule(
            key='recommendations',
            extractor=Rules(
                foreach='//div[contains(@class, "rec_item")]',
                rules=[
                    Rule(
                        key='movieID',
                        extractor=Path(
                            './@data-tconst',
                            transform=lambda x: (x or '').replace('tt', '')
                        )
                    ),
                    Rule(
                        key='title',
                        extractor=Path(
                            './/a//img/@title',
                            transform=lambda x: re_space.sub(' ', x or '').strip()
                        )
                    ),
                ],
                transform=lambda x: build_movie(x.get('title', ''), movieID=x.get('movieID'))
            )
        ),
        Rule(
            key='myrating',
            extractor=Path('//span[@id="voteuser"]//text()')
        ),
        Rule(
            key='plot summary',
            extractor=Path('//td[starts-with(text(), "Plot")]/..//p/text()',
                           transform=lambda x: x.strip().rstrip('|').rstrip())
        ),
        Rule(
            key='genres',
            extractor=Path(
                foreach='//td[starts-with(text(), "Genre")]/..//li/a',
                path='./text()'
            )
        ),
        Rule(
            key='runtimes',
            extractor=Path(
                foreach='//td[starts-with(text(), "Runtime")]/..//li',
                path='./text()',
                transform=lambda x: x.strip().replace(' min', '')
            )
        ),
        Rule(
            key='countries',
            extractor=Path(
                foreach='//td[starts-with(text(), "Countr")]/..//li/a',
                path='./text()'
            )
        ),
        Rule(
            key='country codes',
            extractor=Path(
                foreach='//td[starts-with(text(), "Countr")]/..//li/a',
                path='./@href',
                transform=lambda x: x.split('/')[2].strip().lower()
            )
        ),
        Rule(
            key='language',
            extractor=Path(
                foreach='//td[starts-with(text(), "Language")]/..//li/a',
                path='./text()'
            )
        ),
        Rule(
            key='language codes',
            extractor=Path(
                foreach='//td[starts-with(text(), "Language")]/..//li/a',
                path='./@href',
                transform=lambda x: x.split('/')[2].strip()
            )
        ),
        Rule(
            key='color info',
            extractor=Path(
                foreach='//td[starts-with(text(), "Color")]/..//li/a',
                path='./text()',
                transform=lambda x: x.replace(' (', '::(')
            )
        ),
        Rule(
            key='aspect ratio',
            extractor=Path(
                '//td[starts-with(text(), "Aspect")]/..//li/text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='sound mix',
            extractor=Path(
                foreach='//td[starts-with(text(), "Sound Mix")]/..//li/a',
                path='./text()',
                transform=lambda x: x.replace(' (', '::(')
            )
        ),
        Rule(
            key='box office',
            extractor=Rules(
                foreach='//section[contains(@class, "titlereference-section-box-office")]'
                        '//table[contains(@class, "titlereference-list")]//tr',
                rules=[
                    Rule(
                        key='box_office_title',
                        extractor=Path('./td[1]/text()')
                    ),
                    Rule(
                        key='box_office_detail',
                        extractor=Path('./td[2]/text()')
                    )
                ],
                transform=lambda x: (x['box_office_title'].strip(),
                                     x['box_office_detail'].strip())
            ),
        ),
        Rule(
            key='certificates',
            extractor=Path(
                '//td[starts-with(text(), "Certificat")]/..//text()',
                transform=analyze_certificates
            )
        ),
        # Collects akas not enclosed in <i> tags.
        Rule(
            key='other akas',
            extractor=Path(
                foreach='//section[contains(@class, "listo")]//td[starts-with(text(), "Also Known As")]/..//ul/li',
                path='.//text()',
                transform=clean_akas
            )
        ),
        Rule(
            key='creator',
            extractor=Rules(
                foreach='//div[starts-with(normalize-space(text()), "Creator")]/ul/li[1]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('name') or '',
                    personID=analyze_imdbid(x.get('link'))
                )
            )
        ),
        Rule(
            key='thin writer',
            extractor=Rules(
                foreach='//div[starts-with(normalize-space(text()), "Writer")]/ul/li[1]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('name') or '',
                    personID=analyze_imdbid(x.get('link'))
                )
            )
        ),
        Rule(
            key='thin director',
            extractor=Rules(
                foreach='//div[starts-with(normalize-space(text()), "Director")]/ul/li[1]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('name') or '',
                    personID=analyze_imdbid(x.get('link'))
                )
            )
        ),
        Rule(
            key='top/bottom rank',
            extractor=Path(
                '//li[@class="ipl-inline-list__item"]//a[starts-with(@href, "/chart/")]/text()'
            )
        ),
        Rule(
            key='original air date',
            extractor=Path('//span[@imdbpy="airdate"]/text()')
        ),
        Rule(
            key='series years',
            extractor=Path(
                '//div[@id="tn15title"]//span[starts-with(text(), "TV series")]/text()',
                transform=lambda x: x.replace('TV series', '').strip()
            )
        ),
        Rule(
            key='season/episode',
            extractor=Path(
                '//div[@class="titlereference-overview-season-episode-section"]/ul//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='number of episodes',
            extractor=Path(
                '//a[starts-with(text(), "All Episodes")]/text()',
                transform=lambda x: int(x.replace('All Episodes', '').strip()[1:-1])
            )
        ),
        Rule(
            key='episode number',
            extractor=Path(
                '//div[@id="tn15epnav"]/text()',
                transform=lambda x: int(re.sub(r'[^a-z0-9 ]', '',
                                               x.lower()).strip().split()[0]))
        ),
        Rule(
            key='previous episode',
            extractor=Path(
                '//span[@class="titlereference-overview-episodes-links"]'
                '//a[contains(text(), "Previous")]/@href',
                transform=analyze_imdbid
            )
        ),
        Rule(
            key='next episode',
            extractor=Path(
                '//span[@class="titlereference-overview-episodes-links"]'
                '//a[contains(text(), "Next")]/@href',
                transform=analyze_imdbid
            )
        ),
        Rule(
            key='number of seasons',
            extractor=Path(
                '//span[@class="titlereference-overview-years-links"]/../a[1]/text()',
                transform=int
            )
        ),
        Rule(
            key='tv series link',
            extractor=Path('//a[starts-with(text(), "All Episodes")]/@href')
        ),
        Rule(
            key='akas',
            extractor=Path(
                foreach='//i[@class="transl"]',
                path='./text()',
                transform=lambda x: x
                    .replace('  ', ' ')
                    .rstrip('-')
                    .replace('" - ', '"::', 1)
                    .strip('"')
                    .replace('  ', ' ')
            )
        ),
        Rule(
            key='production status',
            extractor=Path(
                '//td[starts-with(text(), "Status:")]/..//div[@class="info-content"]//text()',
                transform=lambda x: x.strip().split('|')[0].strip().lower()
            )
        ),
        Rule(
            key='production status updated',
            extractor=Path(
                '//td[starts-with(text(), "Status Updated:")]/'
                '..//div[@class="info-content"]//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='production comments',
            extractor=Path(
                '//td[starts-with(text(), "Comments:")]/'
                '..//div[@class="info-content"]//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='production note',
            extractor=Path(
                '//td[starts-with(text(), "Note:")]/'
                '..//div[@class="info-content"]//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='companies',
            extractor=Rules(
                foreach="//ul[@class='simpleList']",
                rules=[
                    Rule(
                        key=Path('preceding-sibling::header[1]/div/h4/text()',
                                 transform=transformers.lower),
                        extractor=Rules(
                            foreach='./li',
                            rules=[
                                Rule(
                                    key='name',
                                    extractor=Path('./a//text()')
                                ),
                                Rule(
                                    key='comp-link',
                                    extractor=Path('./a/@href')
                                ),
                                Rule(
                                    key='notes',
                                    extractor=Path('./text()')
                                )
                            ],
                            transform=lambda x: Company(
                                name=x.get('name') or '',
                                accessSystem='http',
                                companyID=analyze_imdbid(x.get('comp-link')),
                                notes=(x.get('notes') or '').strip()
                            )
                        )
                    )
                ]
            )
        ),
        Rule(
            key='rating',
            extractor=Path('(//span[@class="ipl-rating-star__rating"])[1]/text()')
        ),
        Rule(
            key='votes',
            extractor=Path('//span[@class="ipl-rating-star__total-votes"][1]/text()')
        ),
        Rule(
            key='cover url',
            extractor=Path('//img[@alt="Poster"]/@src')
        ),
        Rule(
            key='imdbID',
            extractor=Path('//meta[@property="pageId"]/@content',
                           transform=lambda x: (x or '').replace('tt', ''))
        )
    ]
    preprocessors = [
        ('/releaseinfo">', '"><span imdbpy="airdate">'),
        (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I), r'</div><div>\1'),
        ('<small>Full cast and crew for<br>', ''),
        ('<td> </td>', '<td>...</td>'),
        (re.compile(r'<span class="tv-extra">TV mini-series(\s+.*?)</span>', re.I),
         r'<span class="tv-extra">TV series\1</span> (mini)'),
        (_reRolesMovie, _manageRoles)
    ]

    def preprocess_dom(self, dom):
        # Handle series information.
        xpath = self.xpath(dom, "//b[text()='Series Crew']")
        if xpath:
            b = xpath[-1]  # In doubt, take the last one.
            for a in self.xpath(b, "./following::h5/a[@class='glossary']"):
                name = a.get('name')
                if name:
                    a.set('name', 'series %s' % name)
        # Remove links to IMDbPro.
        preprocessors.remove(dom, '//span[@class="pro-link"]')
        # Remove some 'more' links (keep others, like the one around
        # the number of votes).
        preprocessors.remove(dom, '//a[@class="tn15more"][starts-with(@href, "/title/")]')
        # Remove the "rest of list" in cast.
        preprocessors.remove(dom, '//td[@colspan="4"]/..')
        return dom

    re_space = re.compile(r'\s+')
    re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)

    def postprocess_data(self, data):
        # Convert section names.
        for sect in list(data.keys()):
            if sect in _SECT_CONV:
                data[_SECT_CONV[sect]] = data[sect]
                del data[sect]
        # Filter out fake values.
        for key in data:
            value = data[key]
            if isinstance(value, list) and value:
                if isinstance(value[0], Person):
                    data[key] = [x for x in value if x.personID is not None]
                if isinstance(value[0], _Container):
                    for obj in data[key]:
                        obj.accessSystem = self._as
                        obj.modFunct = self._modFunct
        for key in ['title']:
            if (key in data) and isinstance(data[key], dict):
                subdata = data[key]
                del data[key]
                data.update(subdata)
        if not data.get('original title'):
            if 'original title title-year' in data:
                data['original title'] = data['original title title-year']
                del data['original title title-year']
        elif 'original title title-year' in data:
            del data['original title title-year']
        misc_sections = data.get('misc sections')
        if misc_sections is not None:
            for section in misc_sections:
                # skip sections with their own parsers
                if 'cast' in section.keys():
                    continue
                data.update(section)
            del data['misc sections']
        if 'akas' in data or 'other akas' in data:
            akas = data.get('akas') or []
            other_akas = data.get('other akas') or []
            akas += other_akas
            nakas = []
            for aka in akas:
                aka = aka.strip()
                if not aka:
                    continue
                if aka.endswith('" -'):
                    aka = aka[:-3].rstrip()
                nakas.append(aka)
            if 'akas' in data:
                del data['akas']
            if 'other akas' in data:
                del data['other akas']
            if nakas:
                data['akas'] = nakas
        if 'runtimes' in data:
            data['runtimes'] = [x.replace(' min', '')
                                for x in data['runtimes']]
        if 'number of seasons' in data:
            data['seasons'] = [str(i) for i in range(1, data['number of seasons'] + 1)]
        if 'season/episode' in data:
            tokens = data['season/episode'].split('Episode')
            try:
                data['season'] = int(tokens[0].split('Season')[1])
            except Exception:
                data['season'] = 'unknown'
            try:
                data['episode'] = int(tokens[1])
            except Exception:
                data['episode'] = 'unknown'
            del data['season/episode']
        for k in ('writer', 'director'):
            t_k = 'thin %s' % k
            if t_k not in data:
                continue
            if k not in data:
                data[k] = data[t_k]
            del data[t_k]
        if 'top/bottom rank' in data:
            tbVal = data['top/bottom rank'].lower()
            if tbVal.startswith('top'):
                tbKey = 'top 250 rank'
                tbVal = _toInt(tbVal, [('top rated movies: #', '')])
            else:
                tbKey = 'bottom 100 rank'
                tbVal = _toInt(tbVal, [('bottom rated movies: #', '')])
            if tbVal:
                data[tbKey] = tbVal
            del data['top/bottom rank']
        if 'year' in data and data['year'] == '????':
            del data['year']
        if 'tv series link' in data:
            if 'tv series title' in data:
                data['episode of'] = Movie(title=data['tv series title'],
                                           movieID=analyze_imdbid(data['tv series link']),
                                           accessSystem=self._as,
                                           modFunct=self._modFunct)
                data['episode of']['kind'] = 'tv series'
                del data['tv series title']
            del data['tv series link']
        if 'rating' in data:
            try:
                data['rating'] = float(data['rating'].replace('/10', ''))
            except (TypeError, ValueError):
                pass
            if data['rating'] == 0:
                del data['rating']
        if 'votes' in data:
            try:
                votes = data['votes'].replace('(', '').replace(')', '').replace(',', '').replace('votes', '')
                data['votes'] = int(votes)
            except (TypeError, ValueError):
                pass
        companies = data.get('companies')
        if companies:
            for section in companies:
                for key, value in section.items():
                    if key in data:
                        key = '%s companies' % key
                    data.update({key: value})
            del data['companies']
        if 'box office' in data:
            data['box office'] = dict(data['box office'])
        return data
def _process_plotsummary(x):
    """Process a plot (contributed by Rdian06)."""
    xauthor = x.get('author')
    xplot = x.get('plot', '').strip()
    if xauthor:
        xplot += '::%s' % xauthor
    return xplot
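
# Usage sketch (illustrative extracted values):
#
#   >>> _process_plotsummary({'plot': 'A federal agent takes on Al Capone.',
#   ...                       'author': 'Rdian06'})
#   'A federal agent takes on Al Capone.::Rdian06'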
class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a 'plot' key, containing a list
    of strings with the structure: 'summary::summary_author <author@email>'.

    Example::

        pparser = DOMHTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    _defGetRefs = True

    # Used as a plain function (not a method) by the rules below.
    def synopsis_reducer(nodes):
        ret = []
        for n in nodes:
            if type(n) is ElementTree._ElementUnicodeResult:
                ret.append(n)
        return '\n\n'.join(ret)

    # Note that IMDb recently started to put the author's email only in
    # the link, which we don't collect here.
    rules = [
        Rule(
            key='plot',
            extractor=Rules(
                foreach='//ul[@id="plot-summaries-content"]/li',
                rules=[
                    Rule(
                        key='plot',
                        extractor=Path('./p//text()')
                    ),
                    Rule(
                        key='author',
                        extractor=Path('.//div[@class="author-container"]//a/text()')
                    )
                ],
                transform=_process_plotsummary
            )
        ),
        Rule(
            key='synopsis',
            extractor=Path(
                foreach='//ul[@id="plot-synopsis-content"]',
                path='.//li//node()',
                reduce=synopsis_reducer
            )
        )
    ]

    def preprocess_dom(self, dom):
        preprocessors.remove(dom, '//li[@id="no-summary-content"]')
        return dom

    def postprocess_data(self, data):
        if 'synopsis' in data and data['synopsis'][0] and 'a Synopsis for this title' in data['synopsis'][0]:
            del data['synopsis']
        return data
def _process_award(x):
    award = {}
    _award = x.get('award')
    if _award is not None:
        _award = _award.strip()
    award['award'] = _award
    if not award['award']:
        return {}
    award['year'] = x.get('year').strip()
    if award['year'] and award['year'].isdigit():
        award['year'] = int(award['year'])
    award['result'] = x.get('result').strip()
    category = x.get('category').strip()
    if category:
        award['category'] = category
    received_with = x.get('with')
    if received_with is not None:
        award['with'] = received_with.strip()
    notes = x.get('notes')
    if notes is not None:
        notes = notes.strip().split('\n', 2)[0]
        notes = re_space.sub(' ', notes)
        if notes:
            award['notes'] = notes
    award['anchor'] = x.get('anchor')
    return award
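
# Usage sketch (illustrative extracted values; 'year', 'result' and
# 'category' are expected to be present in the input):
#
#   >>> _process_award({'award': 'Oscar', 'year': '1988', 'result': 'Won',
#   ...                 'category': 'Best Actor in a Supporting Role',
#   ...                 'anchor': '#oscar'})
#   {'award': 'Oscar', 'year': 1988, 'result': 'Won',
#    'category': 'Best Actor in a Supporting Role', 'anchor': '#oscar'}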
class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        awparser = DOMHTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    subject = 'title'
    _containsObjects = True

    rules = [
        Rule(
            key='awards',
            extractor=Rules(
                foreach='//*[@id="main"]/div[1]/div/table//tr',
                rules=[
                    Rule(
                        key='year',
                        extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/a/text())')
                    ),
                    Rule(
                        key='result',
                        extractor=Path('./td[1]/b/text()')
                    ),
                    Rule(
                        key='award',
                        extractor=Path('./td[1]/span/text()')
                    ),
                    Rule(
                        key='category',
                        extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/text())')
                    ),
                    Rule(
                        key='notes',
                        extractor=Path('./td[2]/text()')
                    ),
                    Rule(
                        key='anchor',
                        extractor=Path('.//text()')
                    )
                ],
                transform=_process_award
            )
        ),
        Rule(
            key='recipients',
            extractor=Rules(
                foreach='//*[@id="main"]/div[1]/div/table//tr/td[2]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    ),
                    Rule(
                        key='anchor',
                        extractor=Path('./ancestor::tr//text()')
                    )
                ]
            )
        )
    ]

    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
         r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
         r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
    ]

    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span - 1]:
                # if not cloned, child will be moved to new parent
                clone = self.clone(col)
                tr.insert(position, clone)
        return dom

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = []
        for award in data['awards']:
            matches = [p for p in data.get('recipients', [])
                       if 'nm' in p.get('link') and award.get('anchor') == p.get('anchor')]
            if self.subject == 'title':
                recipients = [
                    Person(name=recipient['name'],
                           personID=analyze_imdbid(recipient['link']))
                    for recipient in matches
                ]
                award['to'] = recipients
            elif self.subject == 'name':
                recipients = [
                    Movie(title=recipient['name'],
                          movieID=analyze_imdbid(recipient['link']))
                    for recipient in matches
                ]
                award['for'] = recipients
            nd.append(award)
            if 'anchor' in award:
                del award['anchor']
        return {'awards': nd}
class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    rules = [
        Rule(
            key='taglines',
            extractor=Path(
                foreach='//div[@id="taglines_content"]/div',
                path='.//text()'
            )
        )
    ]

    def preprocess_dom(self, dom):
        preprocessors.remove(dom, '//div[@id="taglines_content"]/div[@class="header"]')
        preprocessors.remove(dom, '//div[@id="taglines_content"]/div[@id="no_content"]')
        return dom

    def postprocess_data(self, data):
        if 'taglines' in data:
            data['taglines'] = [tagline.strip() for tagline in data['taglines']]
        return data


class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    rules = [
        Rule(
            key='keywords',
            extractor=Path(
                foreach='//td[@data-item-keyword]',
                path='./@data-item-keyword',
                transform=lambda x: x.lower().replace(' ', '-')
            )
        ),
        Rule(
            key='relevant keywords',
            extractor=Rules(
                foreach='//td[@data-item-keyword]',
                rules=[
                    Rule(
                        key='keyword',
                        extractor=Path('./@data-item-keyword')
                    ),
                    Rule(
                        key='ordering',
                        extractor=Path('./@data-item-votes')
                    ),
                    Rule(
                        key='vote_str',
                        extractor=Path('./div[2]/div//text()')
                    )
                ],
                transform=lambda x: {
                    'keyword': x.get('keyword').lower(),
                    'keyword_dash': x.get('keyword').lower().replace(' ', '-'),
                    'ordering': x.get('ordering'),
                    'votes_str': x.get('vote_str').strip().lower()
                }
            )
        )
    ]

    def postprocess_data(self, data):
        if 'relevant keywords' in data:
            rk = []
            for x in data['relevant keywords']:
                if 'votes_str' in x:
                    if 'is this relevant?' in x['votes_str']:
                        x['votes_for'] = 0
                        x['total_votes'] = 0
                    else:
                        x['votes_for'] = x['votes_str'].split('of')[0].strip()
                        x['total_votes'] = re.sub(r"\D", "", x['votes_str'].split('of')[1]).strip()
                    rk.append(x)
            data['relevant keywords'] = rk
        return data


class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        avparser = DOMHTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='alternate versions',
            extractor=Path(
                foreach='//ul[@class="trivia"]/li',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]


class DOMHTMLTriviaParser(DOMParserBase):
    """Parser for the "trivia" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        tparser = DOMHTMLTriviaParser()
        result = tparser.parse(trivia_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='trivia',
            extractor=Path(
                foreach='//div[@class="sodatext"]',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        preprocessors.remove(dom, '//span[@class="linksoda"]')
        return dom


class DOMHTMLSoundtrackParser(DOMParserBase):
    """Parser for the "soundtrack" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        stparser = DOMHTMLSoundtrackParser()
        result = stparser.parse(soundtrack_html_string)
    """
    _defGetRefs = True

    preprocessors = [('<br />', '\n'), ('<br>', '\n')]

    rules = [
        Rule(
            key='soundtrack',
            extractor=Path(
                foreach='//div[@class="list"]//div',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]

    def postprocess_data(self, data):
        if 'soundtrack' in data:
            nd = []
            for x in data['soundtrack']:
                ds = x.split('\n')
                title = ds[0]
                if title[0] == '"' and title[-1] == '"':
                    title = title[1:-1]
                nds = []
                newData = {}
                for l in ds[1:]:
                    if ' with ' in l or ' by ' in l or ' from ' in l \
                            or ' of ' in l or l.startswith('From '):
                        nds.append(l)
                    else:
                        if nds:
                            nds[-1] += l
                        else:
                            nds.append(l)
                newData[title] = {}
                for l in nds:
                    skip = False
                    for sep in ('From ',):
                        if l.startswith(sep):
                            fdix = len(sep)
                            kind = l[:fdix].rstrip().lower()
                            info = l[fdix:].lstrip()
                            newData[title][kind] = info
                            skip = True
                    if not skip:
                        for sep in ' with ', ' by ', ' from ', ' of ':
                            fdix = l.find(sep)
                            if fdix != -1:
                                fdix = fdix + len(sep)
                                kind = l[:fdix].rstrip().lower()
                                info = l[fdix:].lstrip()
                                newData[title][kind] = info
                                break
                nd.append(newData)
            data['soundtrack'] = nd
        return data


class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='crazy credits',
            extractor=Path(
                foreach='//ul/li/tt',
                path='.//text()',
                transform=lambda x: x.replace('\n', ' ').replace('  ', ' ')
            )
        )
    ]
def _process_goof(x):
    text = (x.get('text') or '').strip()
    category = (x.get('category') or 'Goof').strip()
    return {"category": category, "text": text}
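
# Usage sketch (illustrative values): entries with no preceding category
# header fall back to the generic 'Goof' category.
#
#   >>> _process_goof({'category': 'Continuity', 'text': 'The glass is full again.'})
#   {'category': 'Continuity', 'text': 'The glass is full again.'}
#   >>> _process_goof({'text': 'Crew reflected in the window.'})
#   {'category': 'Goof', 'text': 'Crew reflected in the window.'}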
class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='goofs',
            extractor=Rules(
                foreach='//div[contains(@class, "soda sodavote")]',
                rules=[
                    Rule(
                        key='text',
                        extractor=Path('./div[@class="sodatext"]/text()')
                    ),
                    Rule(
                        key='category',
                        extractor=Path('./preceding-sibling::h4[1]/text()')
                    )
                ],
                transform=_process_goof
            )
        )
    ]


class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='quotes',
            extractor=Path(
                foreach='//div[@class="sodatext"]',
                path='.//text()',
                transform=lambda x: x
                    .strip()
                    .replace(' \n', '::')
                    .replace('::\n', '::')
                    .replace('\n', ' ')
            )
        )
    ]

    def preprocess_dom(self, dom):
        preprocessors.remove(dom, '//div[@class="did-you-know-actions"]')
        return dom

    def postprocess_data(self, data):
        quotes = data.get('quotes', [])
        if not quotes:
            return {}
        quotes = [q.split('::') for q in quotes]
        return {'quotes': quotes}


class DOMHTMLReleaseinfoParser(DOMParserBase):
    """Parser for the "release dates" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        rdparser = DOMHTMLReleaseinfoParser()
        result = rdparser.parse(releaseinfo_html_string)
    """
    rules = [
        Rule(
            key='release dates',
            extractor=Rules(
                foreach='//table[contains(@class, "release-dates-table-test-only")]//tr',
                rules=[
                    Rule(
                        key='country',
                        extractor=Path('.//td[1]//text()')
                    ),
                    Rule(
                        key='country_code',
                        extractor=Path('.//td[1]/a/@href')
                    ),
                    Rule(
                        key='date',
                        extractor=Path('.//td[2]//text()')
                    ),
                    Rule(
                        key='notes',
                        extractor=Path('.//td[3]//text()')
                    )
                ]
            )
        ),
        Rule(
            key='akas',
            extractor=Rules(
                foreach='//table[contains(@class, "akas-table-test-only")]//tr',
                rules=[
                    Rule(
                        key='countries',
                        extractor=Path('./td[1]/text()')
                    ),
                    Rule(

[file truncated here]