
/imdb/parser/http/movieParser.py

http://github.com/alberanid/imdbpy
# -*- coding: utf-8 -*-
# Copyright 2004-2021 Davide Alberani <da@erlug.linux.it>
#           2008-2018 H. Turgut Uyar <uyar@tekir.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""
This module provides the classes (and the instances) that are used to parse
the IMDb pages on the www.imdb.com server about a movie.

For example, for Brian De Palma's "The Untouchables", the referred pages
would be:

combined details
    http://www.imdb.com/title/tt0094226/reference

plot summary
    http://www.imdb.com/title/tt0094226/plotsummary

...and so on.
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import functools
import re

from imdb import PY2
from imdb import imdbURL_base
from imdb.Company import Company
from imdb.Movie import Movie
from imdb.Person import Person
from imdb.utils import _Container, KIND_MAP

from .piculet import Path, Rule, Rules, preprocessors, transformers, ElementTree
from .utils import DOMParserBase, analyze_imdbid, build_person, build_movie

if PY2:
    from urllib import unquote
else:
    from urllib.parse import unquote
# Dictionary used to convert some section's names.
_SECT_CONV = {
    'directed': 'director',
    'directed by': 'director',
    'directors': 'director',
    'editors': 'editor',
    'writing credits': 'writer',
    'writers': 'writer',
    'produced': 'producer',
    'cinematography': 'cinematographer',
    'film editing': 'editor',
    'casting': 'casting director',
    'costume design': 'costume designer',
    'makeup department': 'make up',
    'production management': 'production manager',
    'second unit director or assistant director': 'assistant director',
    'costume and wardrobe department': 'costume department',
    'costume departmen': 'costume department',
    'sound department': 'sound crew',
    'stunts': 'stunt performer',
    'other crew': 'miscellaneous crew',
    'also known as': 'akas',
    'country': 'countries',
    'runtime': 'runtimes',
    'language': 'languages',
    'certification': 'certificates',
    'genre': 'genres',
    'created': 'creator',
    'creators': 'creator',
    'color': 'color info',
    'plot': 'plot outline',
    'art director': 'art direction',
    'art directors': 'art direction',
    'composers': 'composer',
    'assistant directors': 'assistant director',
    'set decorator': 'set decoration',
    'set decorators': 'set decoration',
    'visual effects department': 'visual effects',
    'miscellaneous': 'miscellaneous crew',
    'make up department': 'make up',
    'plot summary': 'plot outline',
    'cinematographers': 'cinematographer',
    'camera department': 'camera and electrical department',
    'costume designers': 'costume designer',
    'production designer': 'production design',
    'production designers': 'production design',
    'production managers': 'production manager',
    'music original': 'original music',
    'casting directors': 'casting director',
    'other companies': 'miscellaneous companies',
    'producers': 'producer',
    'special effects by': 'special effects department',
}
re_space = re.compile(r'\s+')


def clean_section_name(section):
    """Clean and replace some section names."""
    section = re_space.sub(' ', section.replace('_', ' ').strip().lower())
    if section.endswith(' by'):
        section = section[:-3]
    return _SECT_CONV.get(section, section)
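
# A few hedged examples of the normalization above; the inputs are made-up
# section labels, but the outputs follow directly from the code:
#
#     clean_section_name('Directed by')   ->  'director'
#     clean_section_name('Music_by')      ->  'music'  (not in _SECT_CONV)
#     clean_section_name('Other   Crew')  ->  'miscellaneous crew'
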
def _manageRoles(mo):
    """Perform some transformation on the html, so that roleIDs can
    be easily retrieved."""
    firstHalf = mo.group(1)
    secondHalf = mo.group(2)
    newRoles = []
    roles = secondHalf.split(' / ')
    for role in roles:
        role = role.strip()
        if not role:
            continue
        roleID = analyze_imdbid(role)
        if roleID is None:
            roleID = '/'
        else:
            roleID += '/'
        newRoles.append('<div class="_imdbpyrole" roleid="%s">%s</div>' % (
            roleID, role.strip()
        ))
    return firstHalf + ' / '.join(newRoles) + mo.group(3)


_reRolesMovie = re.compile(r'(<td class="character">)(.*?)(</td>)', re.I | re.M | re.S)
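
# A hedged sketch of the rewrite performed by _reRolesMovie + _manageRoles:
# every role inside a <td class="character">...</td> cell is wrapped in a
# <div class="_imdbpyrole" roleid="..."> element, so the XPath rules below
# can read the role ID back from the 'roleid' attribute. Multiple roles
# separated by ' / ' each get their own div; with made-up markup:
#
#     <td class="character">RoleA / RoleB</td>
#       ->  <td class="character"><div class="_imdbpyrole" roleid="/">RoleA</div>
#           / <div class="_imdbpyrole" roleid="/">RoleB</div></td>
#
# (roleid is just '/' here because plain-text roles carry no IMDb ID for
# analyze_imdbid to extract).
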
def makeSplitter(lstrip=None, sep='|', comments=True,
                 origNotesSep=' (', newNotesSep='::(', strip=None):
    """Return a splitter function suitable for a given set of data."""
    def splitter(x):
        if not x:
            return x
        x = x.strip()
        if not x:
            return x
        if lstrip is not None:
            x = x.lstrip(lstrip).lstrip()
        lx = x.split(sep)
        lx[:] = [_f for _f in [j.strip() for j in lx] if _f]
        if comments:
            lx[:] = [j.replace(origNotesSep, newNotesSep, 1) for j in lx]
        if strip:
            lx[:] = [j.strip(strip) for j in lx]
        return lx
    return splitter
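
# Hedged usage sketch; the input string is illustrative, not a real call
# site from this module:
#
#     split_info = makeSplitter()
#     split_info('Color (Technicolor) | Black and White')
#       ->  ['Color::(Technicolor)', 'Black and White']
#
# With comments=True (the default), the first ' (' of every item is turned
# into '::(' so the parenthesized note survives later splitting on '::'.
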
def _toInt(val, replace=()):
    """Return the value, converted to integer, or None; if present, 'replace'
    must be a list of tuples of values to replace."""
    for before, after in replace:
        val = val.replace(before, after)
    try:
        return int(val)
    except (TypeError, ValueError):
        return None
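
# For example (made-up values):
#
#     _toInt('1,234', replace=[(',', '')])  ->  1234
#     _toInt('N/A')                         ->  None (int() raises ValueError)
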
_re_og_title = re.compile(
    r'(.*) \((?:(?:(.+)(?= ))? ?(\d{4})(?:(–)(\d{4}| ))?|(.+))\)',
    re.UNICODE
)


def analyze_og_title(og_title):
    data = {}
    match = _re_og_title.match(og_title)
    if og_title and not match:
        # assume it's a title in production, missing release date information
        return {'title': og_title}
    data['title'] = match.group(1)
    if match.group(3):
        data['year'] = int(match.group(3))
    kind = match.group(2) or match.group(6)
    if kind is None:
        kind = 'movie'
    else:
        kind = kind.lower()
        kind = KIND_MAP.get(kind, kind)
    data['kind'] = kind
    year_separator = match.group(4)
    # There is a year separator so assume an ongoing or ended series
    if year_separator is not None:
        end_year = match.group(5)
        if end_year is not None:
            data['series years'] = '%(year)d-%(end_year)s' % {
                'year': data['year'],
                'end_year': end_year.strip(),
            }
        elif kind.endswith('series'):
            data['series years'] = '%(year)d-' % {'year': data['year']}
    # No year separator and series, so assume that it ended the same year
    elif kind.endswith('series') and 'year' in data:
        data['series years'] = '%(year)d-%(year)d' % {'year': data['year']}

    if data['kind'] == 'episode' and data['title'][0] == '"':
        quote_end = data['title'].find('"', 1)
        data['tv series title'] = data['title'][1:quote_end]
        data['title'] = data['title'][quote_end + 1:].strip()
    return data
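
# Hedged examples of analyze_og_title(); the strings mimic IMDb og:title
# values but are reconstructed here, not captured from a live page:
#
#     'The Untouchables (1987)'
#         ->  {'title': 'The Untouchables', 'year': 1987, 'kind': 'movie'}
#     'Twin Peaks (TV Series 1990–1991)'
#         ->  {'title': 'Twin Peaks', 'year': 1990, 'kind': 'tv series',
#              'series years': '1990-1991'}
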
def analyze_certificates(certificates):
    def reducer(acc, el):
        cert_re = re.compile(r'^(.+):(.+)$', re.UNICODE)
        if cert_re.match(el):
            acc.append(el)
        elif acc:
            acc[-1] = u'{}::{}'.format(
                acc[-1],
                el,
            )
        return acc
    certificates = [el.strip() for el in certificates.split('\n') if el.strip()]
    return functools.reduce(reducer, certificates, [])
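
# For instance, a certificates cell whose text lines are (made-up):
#
#     'UK:15'
#     'USA:R'
#     '(certain cuts)'
#
# is folded into ['UK:15', 'USA:R::(certain cuts)']: lines that don't look
# like 'Country:Rating' are appended as notes to the previous certificate.
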
def clean_akas(aka):
    aka = re_space.sub(' ', aka).strip()
    if aka.lower().startswith('see more'):
        aka = ''
    return aka
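
# e.g. (made-up aka): clean_akas('Gli intoccabili\n (Italy)')
#        ->  'Gli intoccabili (Italy)'
# while any leftover "See more"-style link text collapses to ''.
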
class DOMHTMLMovieParser(DOMParserBase):
    """Parser for the "reference" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        mparser = DOMHTMLMovieParser()
        result = mparser.parse(reference_html_string)
    """
    _containsObjects = True

    rules = [
        Rule(
            key='title',
            extractor=Path('//meta[@property="og:title"]/@content',
                           transform=analyze_og_title)
        ),
        Rule(
            key='original title',
            extractor=Path('//div[@class="titlereference-header"]//span[@class="titlereference-original-title-label"]/preceding-sibling::text()',
                           transform=lambda x: re_space.sub(' ', x).strip())
        ),
        Rule(
            key='original title title-year',
            extractor=Path('//div[@class="titlereference-header"]//span[@class="titlereference-title-year"]/preceding-sibling::text()',
                           transform=lambda x: re_space.sub(' ', x).strip())
        ),
        Rule(
            key='localized title',
            extractor=Path('//meta[@name="title"]/@content',
                           transform=lambda x: analyze_og_title(x).get('title'))
        ),
        # parser for misc sections like 'casting department', 'stunts', ...
        Rule(
            key='misc sections',
            extractor=Rules(
                foreach='//h4[contains(@class, "ipl-header__content")]',
                rules=[
                    Rule(
                        key=Path('./@name', transform=clean_section_name),
                        extractor=Rules(
                            foreach='../../following-sibling::table[1]//tr',
                            rules=[
                                Rule(
                                    key='person',
                                    extractor=Path('.//text()')
                                ),
                                Rule(
                                    key='link',
                                    extractor=Path('./td[1]/a[@href]/@href')
                                )
                            ],
                            transform=lambda x: build_person(
                                x.get('person') or '',
                                personID=analyze_imdbid(x.get('link'))
                            )
                        )
                    )
                ]
            )
        ),
        Rule(
            key='cast',
            extractor=Rules(
                foreach='//table[@class="cast_list"]//tr',
                rules=[
                    Rule(
                        key='person',
                        extractor=Path('.//text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./td[2]/a/@href')
                    ),
                    Rule(
                        key='roleID',
                        extractor=Path('./td[4]//div[@class="_imdbpyrole"]/@roleid')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('person') or '',
                    personID=analyze_imdbid(x.get('link')),
                    roleID=(x.get('roleID') or '').split('/')
                )
            )
        ),
        Rule(
            key='recommendations',
            extractor=Rules(
                foreach='//div[contains(@class, "rec_item")]',
                rules=[
                    Rule(
                        key='movieID',
                        extractor=Path(
                            './@data-tconst',
                            transform=lambda x: (x or '').replace('tt', '')
                        )
                    ),
                    Rule(
                        key='title',
                        extractor=Path(
                            './/a//img/@title',
                            transform=lambda x: re_space.sub(' ', x or '').strip()
                        )
                    ),
                ],
                transform=lambda x: build_movie(x.get('title', ''), movieID=x.get('movieID'))
            )
        ),
        Rule(
            key='myrating',
            extractor=Path('//span[@id="voteuser"]//text()')
        ),
        Rule(
            key='plot summary',
            extractor=Path('//td[starts-with(text(), "Plot")]/..//p/text()',
                           transform=lambda x: x.strip().rstrip('|').rstrip())
        ),
        Rule(
            key='genres',
            extractor=Path(
                foreach='//td[starts-with(text(), "Genre")]/..//li/a',
                path='./text()'
            )
        ),
        Rule(
            key='runtimes',
            extractor=Path(
                foreach='//td[starts-with(text(), "Runtime")]/..//li',
                path='./text()',
                transform=lambda x: x.strip().replace(' min', '')
            )
        ),
        Rule(
            key='countries',
            extractor=Path(
                foreach='//td[starts-with(text(), "Countr")]/..//li/a',
                path='./text()'
            )
        ),
        Rule(
            key='country codes',
            extractor=Path(
                foreach='//td[starts-with(text(), "Countr")]/..//li/a',
                path='./@href',
                transform=lambda x: x.split('/')[2].strip().lower()
            )
        ),
        Rule(
            key='language',
            extractor=Path(
                foreach='//td[starts-with(text(), "Language")]/..//li/a',
                path='./text()'
            )
        ),
        Rule(
            key='language codes',
            extractor=Path(
                foreach='//td[starts-with(text(), "Language")]/..//li/a',
                path='./@href',
                transform=lambda x: x.split('/')[2].strip()
            )
        ),
        Rule(
            key='color info',
            extractor=Path(
                foreach='//td[starts-with(text(), "Color")]/..//li/a',
                path='./text()',
                transform=lambda x: x.replace(' (', '::(')
            )
        ),
        Rule(
            key='aspect ratio',
            extractor=Path(
                '//td[starts-with(text(), "Aspect")]/..//li/text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='sound mix',
            extractor=Path(
                foreach='//td[starts-with(text(), "Sound Mix")]/..//li/a',
                path='./text()',
                transform=lambda x: x.replace(' (', '::(')
            )
        ),
        Rule(
            key='box office',
            extractor=Rules(
                foreach='//section[contains(@class, "titlereference-section-box-office")]'
                        '//table[contains(@class, "titlereference-list")]//tr',
                rules=[
                    Rule(
                        key='box_office_title',
                        extractor=Path('./td[1]/text()')
                    ),
                    Rule(
                        key='box_office_detail',
                        extractor=Path('./td[2]/text()')
                    )
                ],
                transform=lambda x: (x['box_office_title'].strip(),
                                     x['box_office_detail'].strip())
            ),
        ),
        Rule(
            key='certificates',
            extractor=Path(
                '//td[starts-with(text(), "Certificat")]/..//text()',
                transform=analyze_certificates
            )
        ),
        # Collects akas not enclosed in <i> tags.
        Rule(
            key='other akas',
            extractor=Path(
                foreach='//section[contains(@class, "listo")]//td[starts-with(text(), "Also Known As")]/..//ul/li',
                path='.//text()',
                transform=clean_akas
            )
        ),
        Rule(
            key='creator',
            extractor=Rules(
                foreach='//div[starts-with(normalize-space(text()), "Creator")]/ul/li[1]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('name') or '',
                    personID=analyze_imdbid(x.get('link'))
                )
            )
        ),
        Rule(
            key='thin writer',
            extractor=Rules(
                foreach='//div[starts-with(normalize-space(text()), "Writer")]/ul/li[1]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('name') or '',
                    personID=analyze_imdbid(x.get('link'))
                )
            )
        ),
        Rule(
            key='thin director',
            extractor=Rules(
                foreach='//div[starts-with(normalize-space(text()), "Director")]/ul/li[1]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('name') or '',
                    personID=analyze_imdbid(x.get('link'))
                )
            )
        ),
        Rule(
            key='top/bottom rank',
            extractor=Path(
                '//li[@class="ipl-inline-list__item"]//a[starts-with(@href, "/chart/")]/text()'
            )
        ),
        Rule(
            key='original air date',
            extractor=Path('//span[@imdbpy="airdate"]/text()')
        ),
        Rule(
            key='series years',
            extractor=Path(
                '//div[@id="tn15title"]//span[starts-with(text(), "TV series")]/text()',
                transform=lambda x: x.replace('TV series', '').strip()
            )
        ),
        Rule(
            key='season/episode',
            extractor=Path(
                '//div[@class="titlereference-overview-season-episode-section"]/ul//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='number of episodes',
            extractor=Path(
                '//a[starts-with(text(), "All Episodes")]/text()',
                transform=lambda x: int(x.replace('All Episodes', '').strip()[1:-1])
            )
        ),
        Rule(
            key='episode number',
            extractor=Path(
                '//div[@id="tn15epnav"]/text()',
                transform=lambda x: int(re.sub(r'[^a-z0-9 ]', '',
                                               x.lower()).strip().split()[0]))
        ),
        Rule(
            key='previous episode',
            extractor=Path(
                '//span[@class="titlereference-overview-episodes-links"]'
                '//a[contains(text(), "Previous")]/@href',
                transform=analyze_imdbid
            )
        ),
        Rule(
            key='next episode',
            extractor=Path(
                '//span[@class="titlereference-overview-episodes-links"]'
                '//a[contains(text(), "Next")]/@href',
                transform=analyze_imdbid
            )
        ),
        Rule(
            key='number of seasons',
            extractor=Path(
                '//span[@class="titlereference-overview-years-links"]/../a[1]/text()',
                transform=int
            )
        ),
        Rule(
            key='tv series link',
            extractor=Path('//a[starts-with(text(), "All Episodes")]/@href')
        ),
        Rule(
            key='akas',
            extractor=Path(
                foreach='//i[@class="transl"]',
                path='./text()',
                transform=lambda x: x
                .replace('  ', ' ')
                .rstrip('-')
                .replace('" - ', '"::', 1)
                .strip('"')
                .replace('  ', ' ')
            )
        ),
        Rule(
            key='production status',
            extractor=Path(
                '//td[starts-with(text(), "Status:")]/..//div[@class="info-content"]//text()',
                transform=lambda x: x.strip().split('|')[0].strip().lower()
            )
        ),
        Rule(
            key='production status updated',
            extractor=Path(
                '//td[starts-with(text(), "Status Updated:")]/'
                '..//div[@class="info-content"]//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='production comments',
            extractor=Path(
                '//td[starts-with(text(), "Comments:")]/'
                '..//div[@class="info-content"]//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='production note',
            extractor=Path(
                '//td[starts-with(text(), "Note:")]/'
                '..//div[@class="info-content"]//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='companies',
            extractor=Rules(
                foreach="//ul[@class='simpleList']",
                rules=[
                    Rule(
                        key=Path('preceding-sibling::header[1]/div/h4/text()', transform=transformers.lower),
                        extractor=Rules(
                            foreach='./li',
                            rules=[
                                Rule(
                                    key='name',
                                    extractor=Path('./a//text()')
                                ),
                                Rule(
                                    key='comp-link',
                                    extractor=Path('./a/@href')
                                ),
                                Rule(
                                    key='notes',
                                    extractor=Path('./text()')
                                )
                            ],
                            transform=lambda x: Company(
                                name=x.get('name') or '',
                                accessSystem='http',
                                companyID=analyze_imdbid(x.get('comp-link')),
                                notes=(x.get('notes') or '').strip()
                            )
                        )
                    )
                ]
            )
        ),
        Rule(
            key='rating',
            extractor=Path('(//span[@class="ipl-rating-star__rating"])[1]/text()')
        ),
        Rule(
            key='votes',
            extractor=Path('//span[@class="ipl-rating-star__total-votes"][1]/text()')
        ),
        Rule(
            key='cover url',
            extractor=Path('//img[@alt="Poster"]/@src')
        ),
        Rule(
            key='imdbID',
            extractor=Path('//meta[@property="pageId"]/@content',
                           transform=lambda x: (x or '').replace('tt', ''))
        )
    ]
    preprocessors = [
        ('/releaseinfo">', '"><span imdbpy="airdate">'),
        (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I), r'</div><div>\1'),
        ('<small>Full cast and crew for<br>', ''),
        ('<td> </td>', '<td>...</td>'),
        (re.compile(r'<span class="tv-extra">TV mini-series(\s+.*?)</span>', re.I),
         r'<span class="tv-extra">TV series\1</span> (mini)'),
        (_reRolesMovie, _manageRoles)
    ]
    def preprocess_dom(self, dom):
        # Handle series information.
        xpath = self.xpath(dom, "//b[text()='Series Crew']")
        if xpath:
            b = xpath[-1]  # In doubt, take the last one.
            for a in self.xpath(b, "./following::h5/a[@class='glossary']"):
                name = a.get('name')
                if name:
                    a.set('name', 'series %s' % name)
        # Remove links to IMDbPro.
        preprocessors.remove(dom, '//span[@class="pro-link"]')
        # Remove some 'more' links (keep others, like the one around
        # the number of votes).
        preprocessors.remove(dom, '//a[@class="tn15more"][starts-with(@href, "/title/")]')
        # Remove the "rest of list" in cast.
        preprocessors.remove(dom, '//td[@colspan="4"]/..')
        return dom
    re_space = re.compile(r'\s+')
    re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)
    def postprocess_data(self, data):
        # Convert section names.
        for sect in list(data.keys()):
            if sect in _SECT_CONV:
                data[_SECT_CONV[sect]] = data[sect]
                del data[sect]
        # Filter out fake values.
        for key in data:
            value = data[key]
            if isinstance(value, list) and value:
                if isinstance(value[0], Person):
                    data[key] = [x for x in value if x.personID is not None]
                if isinstance(value[0], _Container):
                    for obj in data[key]:
                        obj.accessSystem = self._as
                        obj.modFunct = self._modFunct
        for key in ['title']:
            if (key in data) and isinstance(data[key], dict):
                subdata = data[key]
                del data[key]
                data.update(subdata)
        if not data.get('original title'):
            if 'original title title-year' in data:
                data['original title'] = data['original title title-year']
                del data['original title title-year']
        elif 'original title title-year' in data:
            del data['original title title-year']
        misc_sections = data.get('misc sections')
        if misc_sections is not None:
            for section in misc_sections:
                # skip sections with their own parsers
                if 'cast' in section.keys():
                    continue
                data.update(section)
            del data['misc sections']
        if 'akas' in data or 'other akas' in data:
            akas = data.get('akas') or []
            other_akas = data.get('other akas') or []
            akas += other_akas
            nakas = []
            for aka in akas:
                aka = aka.strip()
                if not aka:
                    continue
                if aka.endswith('" -'):
                    aka = aka[:-3].rstrip()
                nakas.append(aka)
            if 'akas' in data:
                del data['akas']
            if 'other akas' in data:
                del data['other akas']
            if nakas:
                data['akas'] = nakas
        if 'runtimes' in data:
            data['runtimes'] = [x.replace(' min', '')
                                for x in data['runtimes']]
        if 'number of seasons' in data:
            data['seasons'] = [str(i) for i in range(1, data['number of seasons'] + 1)]
        if 'season/episode' in data:
            tokens = data['season/episode'].split('Episode')
            try:
                data['season'] = int(tokens[0].split('Season')[1])
            except:
                data['season'] = 'unknown'
            try:
                data['episode'] = int(tokens[1])
            except:
                data['episode'] = 'unknown'
            del data['season/episode']
        for k in ('writer', 'director'):
            t_k = 'thin %s' % k
            if t_k not in data:
                continue
            if k not in data:
                data[k] = data[t_k]
            del data[t_k]
        if 'top/bottom rank' in data:
            tbVal = data['top/bottom rank'].lower()
            if tbVal.startswith('top'):
                tbKey = 'top 250 rank'
                tbVal = _toInt(tbVal, [('top rated movies: #', '')])
            else:
                tbKey = 'bottom 100 rank'
                tbVal = _toInt(tbVal, [('bottom rated movies: #', '')])
            if tbVal:
                data[tbKey] = tbVal
            del data['top/bottom rank']
        if 'year' in data and data['year'] == '????':
            del data['year']
        if 'tv series link' in data:
            if 'tv series title' in data:
                data['episode of'] = Movie(title=data['tv series title'],
                                           movieID=analyze_imdbid(data['tv series link']),
                                           accessSystem=self._as,
                                           modFunct=self._modFunct)
                data['episode of']['kind'] = 'tv series'
                del data['tv series title']
            del data['tv series link']
        if 'rating' in data:
            try:
                data['rating'] = float(data['rating'].replace('/10', ''))
            except (TypeError, ValueError):
                pass
            if data['rating'] == 0:
                del data['rating']
        if 'votes' in data:
            try:
                votes = data['votes'].replace('(', '').replace(')', '').replace(',', '').replace('votes', '')
                data['votes'] = int(votes)
            except (TypeError, ValueError):
                pass
        companies = data.get('companies')
        if companies:
            for section in companies:
                for key, value in section.items():
                    if key in data:
                        key = '%s companies' % key
                    data.update({key: value})
            del data['companies']
        if 'box office' in data:
            data['box office'] = dict(data['box office'])
        return data
def _process_plotsummary(x):
    """Process a plot (contributed by Rdian06)."""
    xauthor = x.get('author')
    xplot = x.get('plot', '').strip()
    if xauthor:
        xplot += '::%s' % xauthor
    return xplot
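
# For example (made-up author name):
#
#     _process_plotsummary({'plot': 'Eliot Ness builds a team...',
#                           'author': 'grendelkhan'})
#       ->  'Eliot Ness builds a team...::grendelkhan'
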
class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a 'plot' key, containing a list
    of strings with the structure: 'summary::summary_author <author@email>'.

    Example::

        pparser = DOMHTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    _defGetRefs = True

    def synopsis_reducer(nodes):
        ret = []
        for n in nodes:
            if type(n) is ElementTree._ElementUnicodeResult:
                ret.append(n)
        return '\n\n'.join(ret)

    # Note that IMDb recently started to put the author's email only in the
    # link, which we don't collect here.
    rules = [
        Rule(
            key='plot',
            extractor=Rules(
                foreach='//ul[@id="plot-summaries-content"]/li',
                rules=[
                    Rule(
                        key='plot',
                        extractor=Path('./p//text()')
                    ),
                    Rule(
                        key='author',
                        extractor=Path('.//div[@class="author-container"]//a/text()')
                    )
                ],
                transform=_process_plotsummary
            )
        ),
        Rule(
            key='synopsis',
            extractor=Path(
                foreach='//ul[@id="plot-synopsis-content"]',
                path='.//li//node()',
                reduce=synopsis_reducer
            )
        )
    ]

    def preprocess_dom(self, dom):
        preprocessors.remove(dom, '//li[@id="no-summary-content"]')
        return dom

    def postprocess_data(self, data):
        if 'synopsis' in data and data['synopsis'][0] and 'a Synopsis for this title' in data['synopsis'][0]:
            del data['synopsis']
        return data
def _process_award(x):
    award = {}
    _award = x.get('award')
    if _award is not None:
        _award = _award.strip()
    award['award'] = _award
    if not award['award']:
        return {}
    award['year'] = x.get('year').strip()
    if award['year'] and award['year'].isdigit():
        award['year'] = int(award['year'])
    award['result'] = x.get('result').strip()
    category = x.get('category').strip()
    if category:
        award['category'] = category
    received_with = x.get('with')
    if received_with is not None:
        award['with'] = received_with.strip()
    notes = x.get('notes')
    if notes is not None:
        notes = notes.strip().split('\n', 2)[0]
        notes = re_space.sub(' ', notes)
        if notes:
            award['notes'] = notes
    award['anchor'] = x.get('anchor')
    return award
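
# A hedged example of the dictionary produced for one awards-table row;
# the field values are invented for illustration:
#
#     _process_award({'award': 'Oscar', 'year': '1988', 'result': 'Won',
#                     'category': 'Best Actor in a Supporting Role',
#                     'notes': None, 'anchor': '<all row text>'})
#       ->  {'award': 'Oscar', 'year': 1988, 'result': 'Won',
#            'category': 'Best Actor in a Supporting Role',
#            'anchor': '<all row text>'}
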
class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        awparser = DOMHTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    subject = 'title'
    _containsObjects = True

    rules = [
        Rule(
            key='awards',
            extractor=Rules(
                foreach='//*[@id="main"]/div[1]/div/table//tr',
                rules=[
                    Rule(
                        key='year',
                        extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/a/text())')
                    ),
                    Rule(
                        key='result',
                        extractor=Path('./td[1]/b/text()')
                    ),
                    Rule(
                        key='award',
                        extractor=Path('./td[1]/span/text()')
                    ),
                    Rule(
                        key='category',
                        extractor=Path('normalize-space(./ancestor::table/preceding-sibling::*[1]/text())')
                    ),
                    Rule(
                        key='notes',
                        extractor=Path('./td[2]/text()')
                    ),
                    Rule(
                        key='anchor',
                        extractor=Path('.//text()')
                    )
                ],
                transform=_process_award
            )
        ),
        Rule(
            key='recipients',
            extractor=Rules(
                foreach='//*[@id="main"]/div[1]/div/table//tr/td[2]/a',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    ),
                    Rule(
                        key='anchor',
                        extractor=Path('./ancestor::tr//text()')
                    )
                ]
            )
        )
    ]

    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
         r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
         r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
    ]

    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span - 1]:
                # if not cloned, child will be moved to new parent
                clone = self.clone(col)
                tr.insert(position, clone)
        return dom

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = []
        for award in data['awards']:
            matches = [p for p in data.get('recipients', [])
                       if 'nm' in p.get('link') and award.get('anchor') == p.get('anchor')]
            if self.subject == 'title':
                recipients = [
                    Person(name=recipient['name'],
                           personID=analyze_imdbid(recipient['link']))
                    for recipient in matches
                ]
                award['to'] = recipients
            elif self.subject == 'name':
                recipients = [
                    Movie(title=recipient['name'],
                          movieID=analyze_imdbid(recipient['link']))
                    for recipient in matches
                ]
                award['for'] = recipients
            nd.append(award)
            if 'anchor' in award:
                del award['anchor']
        return {'awards': nd}
class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    rules = [
        Rule(
            key='taglines',
            extractor=Path(
                foreach='//div[@id="taglines_content"]/div',
                path='.//text()'
            )
        )
    ]

    def preprocess_dom(self, dom):
        preprocessors.remove(dom, '//div[@id="taglines_content"]/div[@class="header"]')
        preprocessors.remove(dom, '//div[@id="taglines_content"]/div[@id="no_content"]')
        return dom

    def postprocess_data(self, data):
        if 'taglines' in data:
            data['taglines'] = [tagline.strip() for tagline in data['taglines']]
        return data
class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    rules = [
        Rule(
            key='keywords',
            extractor=Path(
                foreach='//td[@data-item-keyword]',
                path='./@data-item-keyword',
                transform=lambda x: x.lower().replace(' ', '-')
            )
        ),
        Rule(
            key='relevant keywords',
            extractor=Rules(
                foreach='//td[@data-item-keyword]',
                rules=[
                    Rule(
                        key='keyword',
                        extractor=Path('./@data-item-keyword')
                    ),
                    Rule(
                        key='ordering',
                        extractor=Path('./@data-item-votes')
                    ),
                    Rule(
                        key='vote_str',
                        extractor=Path('./div[2]/div//text()')
                    )
                ],
                transform=lambda x: {
                    'keyword': x.get('keyword').lower(),
                    'keyword_dash': x.get('keyword').lower().replace(' ', '-'),
                    'ordering': x.get('ordering'),
                    'votes_str': x.get('vote_str').strip().lower()
                }
            )
        )
    ]

    def postprocess_data(self, data):
        if 'relevant keywords' in data:
            rk = []
            for x in data['relevant keywords']:
                if 'votes_str' in x:
                    if 'is this relevant?' in x['votes_str']:
                        x['votes_for'] = 0
                        x['total_votes'] = 0
                    else:
                        x['votes_for'] = x['votes_str'].split('of')[0].strip()
                        x['total_votes'] = re.sub(r"\D", "", x['votes_str'].split('of')[1]).strip()
                    rk.append(x)
            data['relevant keywords'] = rk
        return data
class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        avparser = DOMHTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='alternate versions',
            extractor=Path(
                foreach='//ul[@class="trivia"]/li',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]
class DOMHTMLTriviaParser(DOMParserBase):
    """Parser for the "trivia" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        tparser = DOMHTMLTriviaParser()
        result = tparser.parse(trivia_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='trivia',
            extractor=Path(
                foreach='//div[@class="sodatext"]',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        preprocessors.remove(dom, '//span[@class="linksoda"]')
        return dom
class DOMHTMLSoundtrackParser(DOMParserBase):
    """Parser for the "soundtrack" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        stparser = DOMHTMLSoundtrackParser()
        result = stparser.parse(soundtrack_html_string)
    """
    _defGetRefs = True

    preprocessors = [('<br />', '\n'), ('<br>', '\n')]

    rules = [
        Rule(
            key='soundtrack',
            extractor=Path(
                foreach='//div[@class="list"]//div',
                path='.//text()',
                transform=transformers.strip
            )
        )
    ]

    def postprocess_data(self, data):
        if 'soundtrack' in data:
            nd = []
            for x in data['soundtrack']:
                ds = x.split('\n')
                title = ds[0]
                if title[0] == '"' and title[-1] == '"':
                    title = title[1:-1]
                nds = []
                newData = {}
                for l in ds[1:]:
                    if ' with ' in l or ' by ' in l or ' from ' in l \
                            or ' of ' in l or l.startswith('From '):
                        nds.append(l)
                    else:
                        if nds:
                            nds[-1] += l
                        else:
                            nds.append(l)
                newData[title] = {}
                for l in nds:
                    skip = False
                    for sep in ('From ',):
                        if l.startswith(sep):
                            fdix = len(sep)
                            kind = l[:fdix].rstrip().lower()
                            info = l[fdix:].lstrip()
                            newData[title][kind] = info
                            skip = True
                    if not skip:
                        for sep in ' with ', ' by ', ' from ', ' of ':
                            fdix = l.find(sep)
                            if fdix != -1:
                                fdix = fdix + len(sep)
                                kind = l[:fdix].rstrip().lower()
                                info = l[fdix:].lstrip()
                                newData[title][kind] = info
                                break
                nd.append(newData)
            data['soundtrack'] = nd
        return data
class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='crazy credits',
            extractor=Path(
                foreach='//ul/li/tt',
                path='.//text()',
                transform=lambda x: x.replace('\n', ' ').replace('  ', ' ')
            )
        )
    ]
def _process_goof(x):
    text = (x.get('text') or '').strip()
    category = (x.get('category') or 'Goof').strip()
    return {"category": category, "text": text}
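
# e.g. (made-up goof text):
#
#     _process_goof({'text': 'The scar moves...', 'category': 'Continuity'})
#       ->  {'category': 'Continuity', 'text': 'The scar moves...'}
#
# and a missing category falls back to the generic 'Goof'.
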
class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='goofs',
            extractor=Rules(
                foreach='//div[contains(@class, "soda sodavote")]',
                rules=[
                    Rule(
                        key='text',
                        extractor=Path('./div[@class="sodatext"]/text()')
                    ),
                    Rule(
                        key='category',
                        extractor=Path('./preceding-sibling::h4[1]/text()')
                    )
                ],
                transform=_process_goof
            )
        )
    ]
class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='quotes',
            extractor=Path(
                foreach='//div[@class="sodatext"]',
                path='.//text()',
                transform=lambda x: x
                .strip()
                .replace(' \n', '::')
                .replace('::\n', '::')
                .replace('\n', ' ')
            )
        )
    ]

    def preprocess_dom(self, dom):
        preprocessors.remove(dom, '//div[@class="did-you-know-actions"]')
        return dom

    def postprocess_data(self, data):
        quotes = data.get('quotes', [])
        if not quotes:
            return {}
        quotes = [q.split('::') for q in quotes]
        return {'quotes': quotes}
class DOMHTMLReleaseinfoParser(DOMParserBase):
    """Parser for the "release dates" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        rdparser = DOMHTMLReleaseinfoParser()
        result = rdparser.parse(releaseinfo_html_string)
    """
    rules = [
        Rule(
            key='release dates',
            extractor=Rules(
                foreach='//table[contains(@class, "release-dates-table-test-only")]//tr',
                rules=[
                    Rule(
                        key='country',
                        extractor=Path('.//td[1]//text()')
                    ),
                    Rule(
                        key='country_code',
                        extractor=Path('.//td[1]/a/@href')
                    ),
                    Rule(
                        key='date',
                        extractor=Path('.//td[2]//text()')
                    ),
                    Rule(
                        key='notes',
                        extractor=Path('.//td[3]//text()')
                    )
                ]
            )
        ),
        Rule(
            key='akas',
            extractor=Rules(
                foreach='//table[contains(@class, "akas-table-test-only")]//tr',
                rules=[
                    Rule(
                        key='countries',
                        extractor=Path('./td[1]/text()')
                    ),
                    Rule(
                        key='title',
                        extractor=Path('./td[2]/text()')
                    )
                ]
            )
        )
    ]

    preprocessors = [
        (re.compile('(<h5><a name="?akas"?.*</table>)', re.I | re.M | re.S),
         r'<div class="_imdbpy_akas">\1</div>')
    ]

    def postprocess_data(self, data):
        if not ('release dates' in data or 'akas' in data):
            return data
        releases = data.get('release dates') or []
        rl = []
        for i in releases:
            country = i.get('country')
            date = i.get('date')
            if not (country and date):
                continue
            country = country.strip()
            date = date.strip()
            if not (country and date):
                continue
            notes = i.get('notes')
            info = '%s::%s' % (country, date)
            if notes:
                notes = notes.replace('\n', '')
                i['notes'] = notes
                info += notes
            rl.append(info)
        if releases:
            for rd in data['release dates']:
                rd['country_code'] = rd['country_code'].split('region=')[1].split('&')[0].strip().upper()
            data['raw release dates'] = data['release dates']
            del data['release dates']
        if rl:
            data['release dates'] = rl
        akas = data.get('akas') or []
        nakas = []
        for aka in akas:
            title = (aka.get('title') or '').strip()
            if not title:
                continue
            countries = (aka.get('countries') or '').split(',')
            if not countries:
                nakas.append(title)
            else:
                for country in countries:
                    nakas.append('%s %s' % (title, country.strip()))
        if akas:
            if releases:
                for rd in data['raw release dates']:
                    for a in data['akas']:
                        if 'countries' in a:
                            if rd['country'].strip() in a['countries'].strip():
                                a['country_code'] = rd['country_code']
            data['raw akas'] = data['akas']
            del data['akas']
        if nakas:
            data['akas'] = data['akas from release info'] = nakas
        return data
class DOMHTMLRatingsParser(DOMParserBase):
    """Parser for the "user ratings" page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        rparser = DOMHTMLRatingsParser()
        result = rparser.parse(userratings_html_string)
    """
    re_means = re.compile(r'mean\s*=\s*([0-9]\.[0-9])\s*median\s*=\s*([0-9])', re.I)

    rules = [
        Rule(
            key='votes',
            extractor=Rules(
                # "firstTableCoulmn" is not a typo here: the selector has to
                # match the class name as it appears in IMDb's own markup.
                foreach='//th[@class="firstTableCoulmn"]/../../tr',
                rules=[
                    Rule(
                        key='ordinal',
                        extractor=Path('./td[1]/div//text()')
                    ),
                    Rule(
                        key='votes',
                        extractor=Path('./td[3]/div/div//text()')
                    )
                ]
            )
        ),
        Rule(
            key='mean and median',
            extractor=Path(
                '//div[starts-with(normalize-space(text()), "Arithmetic mean")]/text()'
            )
        ),
        Rule(
            key='demographics',
            extractor=Rules(
                foreach='//div[@class="smallcell"]',
                rules=[
                    Rule(
                        key='link',
                        extractor=Path('./a/@href')
                    ),
                    Rule(
                        key='rating',
                        extractor=Path('..//div[@class="bigcell"]//text()')
                    ),
                    Rule(
                        key='votes',
                        extractor=Path('./a/text()')
                    )
                ]
            )
        )
    ]

    def postprocess_data(self, data):
        nd = {}
        demographics = data.get('demographics')
        if demographics:
            dem = {}
            for dem_data in demographics:
                link = (dem_data.get('link') or '').strip()
                votes = (dem_data.get('votes') or '').strip()
                rating = (dem_data.get('rating') or '').strip()
                if not (link and votes and rating):
                    continue
                eq_idx = link.rfind('=')
                if eq_idx == -1:
                    continue
                info = link[eq_idx + 1:].replace('_', ' ')
                try:
                    votes = int(votes.replace(',', ''))
                except Exception:
                    continue
                try:
                    rating = float(rating)
                except Exception:
                    continue
                dem[info] = {'votes': votes, 'rating': rating}
            nd['demographics'] = dem
        votes = data.get('votes', [])
        if votes:
            nd['number of votes'] = {}
            for v_info in votes:
                ordinal = v_info.get('ordinal')
                nr_votes = v_info.get('votes')
                if not (ordinal and nr_votes):
                    continue
                try:
                    ordinal = int(ordinal)
                except Exception:
                    continue
                try:
                    nr_votes = int(nr_votes.replace(',', ''))
                except Exception:
                    continue
                nd['number of votes'][ordinal] = nr_votes
        mean = data.get('mean and median', '')
        if mean:
            means = self.re_means.findall(mean)
            if means and len(means[0]) == 2:
                am, med = means[0]
                try:
                    am = float(am)
                except (ValueError, OverflowError):
                    pass
                if isinstance(am, float):
                    nd['arithmetic mean'] = am
                try:
                    med = int(med)
                except (ValueError, OverflowError):
                    pass
                if isinstance(med, int):
                    nd['median'] = med
        return nd
def _normalize_href(href):
    if (href is not None) and (not href.lower().startswith('http://')):
        if href.startswith('/'):
            href = href[1:]
        # TODO: imdbURL_base may be set by the user!
        href = '%s%s' % (imdbURL_base, href)
    return href
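
# Hedged example (the exact prefix depends on imdbURL_base, which the user
# may have overridden, as the TODO above notes):
#
#     _normalize_href('/title/tt0094226/')
#       ->  imdbURL_base + 'title/tt0094226/'
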
class DOMHTMLCriticReviewsParser(DOMParserBase):
    """Parser for the "critic reviews" pages of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        crparser = DOMHTMLCriticReviewsParser()
        result = crparser.parse(criticreviews_html_string)
    """
    kind = 'critic reviews'

    rules = [
        Rule(
            key='metascore',
            extractor=Path('//div[@class="metascore_wrap"]/div/span//text()')
        ),
        Rule(
            key='metacritic url',
            extractor=Path('//div[@class="article"]/div[@class="see-more"]/a/@href')
        )
    ]
class DOMHTMLReviewsParser(DOMParserBase):
    """Parser for the "reviews" pages of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        rparser = DOMHTMLReviewsParser()
        result = rparser.parse(reviews_html_string)
    """
    rules = [
        Rule(
            key='reviews',
            extractor=Rules(
                foreach='//div[@class="review-container"]',
                rules=[
                    Rule(
                        key='text',
                        extractor=Path('.//div[@class="text show-more__control"]//text()')
                    ),
                    Rule(
                        key='helpful',
                        extractor=Path('.//div[@class="actions text-muted"]//text()[1]')
                    ),
                    Rule(
                        key='title',
                        extractor=Path('.//a[@class="title"]//text()')
                    ),
                    Rule(
                        key='author',
                        extractor=Path('.//span[@class="display-name-link"]/a/@href')
                    ),
                    Rule(
                        key='date',
                        extractor=Path('.//span[@class="review-date"]//text()')
                    ),
                    Rule(
                        key='rating',
                        extractor=Path('.//span[@class="point-scale"]/preceding-sibling::span[1]/text()')
                    )
                ],
                transform=lambda x: ({
                    'content': x.get('text', '').replace('\n', ' ').replace('  ', ' ').strip(),
                    'helpful': [int(s) for s in x.get('helpful', '').split() if s.isdigit()],
                    'title': x.get('title', '').strip(),
                    'author': analyze_imdbid(x.get('author')),
                    'date': x.get('date', '').strip(),
                    'rating': x.get('rating', '').strip()
                })
            )
        )
    ]

    preprocessors = [('<br>', '<br>\n')]

    def postprocess_data(self, data):
        for review in data.get('reviews', []):
            if review.get('rating'):
                if isinstance(review['rating'], str):
                    review['rating'] = int(review['rating'])
                elif len(review['rating']) == 2:  # May be legacy code.
                    review['rating'] = int(review['rating'][0])
                else:
                    review['rating'] = None
            else:
                review['rating'] = None
            if review.get('helpful') and len(review['helpful']) == 2:
                review['not_helpful'] = review['helpful'][1] - review['helpful'][0]
                review['helpful'] = review['helpful'][0]
            else:
                review['helpful'] = 0
                review['not_helpful'] = 0
            review['author'] = "ur%s" % review['author']
        return data
class DOMHTMLFullCreditsParser(DOMParserBase):
    """Parser for the "full credits" (series cast section) page of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        fcparser = DOMHTMLFullCreditsParser()
        result = fcparser.parse(fullcredits_html_string)
    """
    kind = 'full credits'

    rules = [
        Rule(
            key='cast',
            extractor=Rules(
                foreach='//table[@class="cast_list"]//tr[@class="odd" or @class="even"]',
                rules=[
                    Rule(
                        key='person',
                        extractor=Path('.//text()')
                    ),
                    Rule(
                        key='link',
                        extractor=Path('./td[2]/a/@href')
                    ),
                    Rule(
                        key='roleID',
                        extractor=Path('./td[4]//div[@class="_imdbpyrole"]/@roleid')
                    ),
                    Rule(
                        key='headshot',
                        extractor=Path('./td[@class="primary_photo"]/a/img/@loadlate')
                    )
                ],
                transform=lambda x: build_person(
                    x.get('person', ''),
                    personID=analyze_imdbid(x.get('link')),
                    roleID=(x.get('roleID', '')).split('/'),
                    headshot=(x.get('headshot', ''))
                )
            )
        ),
        # parser for misc sections like 'casting department', 'stunts', ...
        Rule(
            key='misc sections',
            extractor=Rules(
                foreach='//h4[contains(@class, "dataHeaderWithBorder")]',
                rules=[
                    Rule(
                        key=Path('./@name', transform=clean_section_name),
                        extractor=Rules(
                            foreach='./following-sibling::table[1]//tr',
                            rules=[
                                Rule(
                                    key='person',
                                    extractor=Path('.//text()')
                                ),
                                Rule(
                                    key='link',
                                    extractor=Path('./td[1]/a[@href]/@href')
                                )
                            ],
                            transform=lambda x: build_person(
                                x.get('person') or '',
                                personID=analyze_imdbid(x.get('link'))
                            )
                        )
                    )
                ]
            )
        ),
    ]

    preprocessors = [
        (_reRolesMovie, _manageRoles)
    ]

    def postprocess_data(self, data):
        # Convert section names.
        clean_cast = []
        for person in data.get('cast', []):
            if person.personID and person.get('name'):
                clean_cast.append(person)
        if clean_cast:
            data['cast'] = clean_cast
        misc_sections = data.get('misc sections')
        if misc_sections is not None:
            for section in misc_sections:
                for sectName, sectData in section.items():
                    # skip sections with their own parsers
                    if sectName in ('cast',):
                        continue
                    newName = _SECT_CONV.get(sectName, sectName)
                    if sectData:
                        data[newName] = sectData
            del data['misc sections']
        return data
class DOMHTMLOfficialsitesParser(DOMParserBase):
    """Parser for the "official sites", "external reviews",
    "miscellaneous links", "sound clips", "video clips" and
    "photographs" pages of a given movie.
    The page should be provided as a string, as taken from
    the www.imdb.com server. The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        osparser = DOMHTMLOfficialsitesParser()
        result = osparser.parse(officialsites_html_string)
    """
    rules = [
        Rule(
            foreach='//h4[@class="li_group"]',
            key=Path(
                './text()',
                transform=lambda x: x.strip().lower()
            ),
            extractor=Rules(
                foreach='./following::ul[1]/li/a',
                rules=[
                    Rule(
                        key='link',
                        extractor=Path('./@href')
                    ),
                    Rule(
                        key='info',
                        extractor=Path('./text()')
                    )
                ],
                transform=lambda x: (
                    x.get('info').strip(),
                    unquote(_normalize_href(x.get('link')))
                )
            )
        )
    ]
  1729. class DOMHTMLConnectionsParser(DOMParserBase):
  1730. """Parser for the "connections" pages of a given movie.
  1731. The page should be provided as a string, as taken from
  1732. the www.imdb.com server. The final result will be a
  1733. dictionary, with a key for every relevant section.
  1734. Example::
  1735. osparser = DOMHTMLOfficialsitesParser()
  1736. result = osparser.parse(officialsites_html_string)
  1737. """
  1738. preprocessors = [
  1739. (re.compile('(<h4 class="li_group">)', re.I), r'</div><div class="_imdbpy">\1'),
  1740. (re.compile('(^<br />.*$)', re.I | re.M), r''),
  1741. ]
  1742. rules = [
  1743. Rule(
  1744. foreach='//div[@class="_imdbpy"]',
  1745. key=Path(
  1746. './h4/text()',
  1747. transform=lambda x: x.strip().lower()
  1748. ),
  1749. extractor=Rules(
  1750. foreach='./div[contains(@class, "soda")]',
  1751. rules=[
  1752. Rule(
  1753. key='link',
  1754. extractor=Path('./a/@href')
  1755. ),
  1756. Rule(
  1757. key='info',
  1758. extractor=Path('.//text()')
  1759. )
  1760. ],
  1761. transform=lambda x: (
  1762. x.get('info').strip(),
  1763. unquote(_normalize_href(x.get('link')))
  1764. )
  1765. )
  1766. )
  1767. ]
  1768. def postprocess_data(self, data):
  1769. connections = {}
  1770. for k, v in data.items():
  1771. k = k.strip()
  1772. if not (k and v):
  1773. continue
  1774. movies = []
  1775. for title, link in v:
  1776. title = title.strip().replace('\n', '')
  1777. movieID = analyze_imdbid(link)
  1778. if not (title and movieID):
  1779. continue
  1780. movie = Movie(title=title, movieID=movieID,
  1781. accessSystem=self._as, modFunct=self._modFunct)
  1782. movies.append(movie)
  1783. if movies:
  1784. connections[k] = movies
  1785. return {'connections': connections}
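# Sketch of the structure built above (hypothetical titles and IDs);
# the section names are the h4 headers, lowercased by the key transform:
#
#     {'connections': {'followed by': [<Movie ...>],
#                      'referenced in': [<Movie ...>]}}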
  1786. class DOMHTMLLocationsParser(DOMParserBase):
  1787. """Parser for the "locations" page of a given movie.
  1788. The page should be provided as a string, as taken from
  1789. the www.imdb.com server. The final result will be a
  1790. dictionary, with a key for every relevant section.
  1791. Example::
  1792. lparser = DOMHTMLLocationsParser()
  1793. result = lparser.parse(locations_html_string)
  1794. """
  1795. rules = [
  1796. Rule(
  1797. key='locations',
  1798. extractor=Rules(
  1799. foreach='//dt',
  1800. rules=[
  1801. Rule(
  1802. key='place',
  1803. extractor=Path('.//text()')
  1804. ),
  1805. Rule(
  1806. key='note',
  1807. extractor=Path('./following-sibling::dd[1]//text()')
  1808. )
  1809. ],
  1810. transform=lambda x: ('%s::%s' % (x['place'].strip(),
  1811. (x['note'] or '').strip())).strip(':')
  1812. )
  1813. )
  1814. ]
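# The transform above joins each <dt> place with its <dd> note using the
# '::' notation used elsewhere in IMDbPY; the final .strip(':') removes
# the separator again when the note is empty. For example (hypothetical
# values):
#
#     {'place': 'Rome, Italy', 'note': '(exteriors)'}
#     # -> 'Rome, Italy::(exteriors)'
#     {'place': 'Rome, Italy', 'note': ''}
#     # -> 'Rome, Italy'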
  1815. class DOMHTMLTechParser(DOMParserBase):
  1816. """Parser for the "technical", "publicity" (for people) and "contacts" (for people)
  1817. pages of a given movie.
  1818. The page should be provided as a string, as taken from
  1819. the www.imdb.com server. The final result will be a
  1820. dictionary, with a key for every relevant section.
  1821. Example::
  1822. tparser = DOMHTMLTechParser()
  1823. result = tparser.parse(technical_html_string)
  1824. """
  1825. kind = 'tech'
  1826. re_space = re.compile(r'\s+')
  1827. rules = [
  1828. Rule(
  1829. key='tech',
  1830. extractor=Rules(
  1831. foreach='//table//tr/td[@class="label"]',
  1832. rules=[
  1833. Rule(
  1834. key=Path(
  1835. './text()',
  1836. transform=lambda x: x.lower().strip()),
  1837. extractor=Path(
  1838. '..//td[2]//text()',
  1839. transform=lambda x: [t.strip()
  1840. for t in x.split(':::') if t.strip()]
  1841. )
  1842. )
  1843. ]
  1844. )
  1845. )
  1846. ]
  1847. preprocessors = [
  1848. (re.compile('(<h5>.*?</h5>)', re.I), r'</div>\1<div class="_imdbpy">'),
  1849. (re.compile('((<br/>|</p>|</table>))\n?<br/>(?!<a)', re.I), r'\1</div>'),
  1850. # the ones below are for the publicity parser
  1851. (re.compile('<p>(.*?)</p>', re.I), r'\1<br/>'),
  1852. (re.compile('(</td><td valign="top">)', re.I), r'\1::'),
  1853. (re.compile('(</tr><tr>)', re.I), r'\n\1'),
  1854. (re.compile(r'<span class="ghost">\|</span>', re.I), r':::'),
  1855. (re.compile('<br/?>', re.I), r':::')
  1856. # this is for splitting individual entries
  1857. ]
  1858. def postprocess_data(self, data):
  1859. info = {}
  1860. for section in data.get('tech', []):
  1861. info.update(section)
  1862. for key, value in info.items():
  1863. if isinstance(value, list):
  1864. info[key] = [self.re_space.sub(' ', x).strip() for x in value]
  1865. else:
  1866. info[key] = self.re_space.sub(' ', value).strip()
  1867. return {self.kind: info}
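# The preprocessors above rewrite <br/> tags and "ghost" pipe separators
# to ':::' so the extractor can split multi-valued cells. A label row
# like (hypothetical, heavily simplified HTML):
#
#     <tr><td class="label">Runtime</td><td>1 hr 50 min:::110 min</td></tr>
#
# ends up as {'tech': {'runtime': ['1 hr 50 min', '110 min']}}.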
  1868. class DOMHTMLNewsParser(DOMParserBase):
  1869. """Parser for the "news" page of a given movie or person.
  1870. The page should be provided as a string, as taken from
  1871. the www.imdb.com server. The final result will be a
  1872. dictionary, with a key for every relevant section.
  1873. Example::
  1874. nwparser = DOMHTMLNewsParser()
  1875. result = nwparser.parse(news_html_string)
  1876. """
  1877. _defGetRefs = True
  1878. rules = [
  1879. Rule(
  1880. key='news',
  1881. extractor=Rules(
  1882. foreach='//h2',
  1883. rules=[
  1884. Rule(
  1885. key='title',
  1886. extractor=Path('./text()')
  1887. ),
  1888. Rule(
  1889. key='fromdate',
  1890. extractor=Path('./following-sibling::p[1]/small//text()')
  1891. ),
  1892. Rule(
  1893. key='body',
  1894. extractor=Path('../following-sibling::p[2]//text()')
  1895. ),
  1896. Rule(
  1897. key='link',
  1898. extractor=Path('../..//a[text()="Permalink"]/@href')
  1899. ),
  1900. Rule(
  1901. key='fulllink',
  1902. extractor=Path('../..//a[starts-with(text(), "See full article at")]/@href')
  1903. )
  1904. ],
  1905. transform=lambda x: {
  1906. 'title': x.get('title').strip(),
  1907. 'date': x.get('fromdate').split('|')[0].strip(),
  1908. 'from': x.get('fromdate').split('|')[1].replace('From ', '').strip(),
  1909. 'body': (x.get('body') or '').strip(),
  1910. 'link': _normalize_href(x.get('link')),
  1911. 'full article link': _normalize_href(x.get('fulllink'))
  1912. }
  1913. )
  1914. )
  1915. ]
  1916. preprocessors = [
  1917. (re.compile('(<a name=[^>]+><h2>)', re.I), r'<div class="_imdbpy">\1'),
  1918. (re.compile('(<hr/>)', re.I), r'</div>\1'),
  1919. (re.compile('<p></p>', re.I), r'')
  1920. ]
  1921. def postprocess_data(self, data):
  1922. if 'news' not in data:
  1923. return {}
  1924. for news in data['news']:
  1925. if 'full article link' in news:
  1926. if news['full article link'] is None:
  1927. del news['full article link']
  1928. return data
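# The transform above expects the <small> date line in the form
# "date | From source" and splits it on '|'. A hypothetical news item
# would come out as:
#
#     {'title': 'Some headline', 'date': '22 February 2010',
#      'from': 'WENN', 'body': '...', 'link': '...',
#      'full article link': '...'}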
  1929. def _parse_review(x):
  1930. result = {}
  1931. title = x.get('title').strip()
1932. if title.endswith(':'):
  1933. title = title[:-1]
  1934. result['title'] = title
  1935. result['link'] = _normalize_href(x.get('link'))
  1936. kind = x.get('kind').strip()
1937. if kind.endswith(':'):
  1938. kind = kind[:-1]
  1939. result['review kind'] = kind
  1940. text = x.get('review').replace('\n\n', '||').replace('\n', ' ').split('||')
  1941. review = '\n'.join(text)
  1942. if x.get('author') is not None:
  1943. author = x.get('author').strip()
  1944. review = review.split(author)[0].strip()
  1945. result['review author'] = author[2:]
  1946. if x.get('item') is not None:
  1947. item = x.get('item').strip()
  1948. review = review[len(item):].strip()
  1949. review = "%s: %s" % (item, review)
  1950. result['review'] = review
  1951. return result
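# Sketch of _parse_review on hypothetical extracted values:
#
#     x = {'title': 'An excellent film:', 'link': '/review/example',
#          'kind': 'Review:', 'review': 'Great.\n\nReally.',
#          'author': '- John Doe'}
#     _parse_review(x)
#     # -> {'title': 'An excellent film', 'review kind': 'Review',
#     #     'review author': 'John Doe', 'review': 'Great.\nReally.',
#     #     'link': <normalized URL>}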
  1952. class DOMHTMLSeasonEpisodesParser(DOMParserBase):
  1953. """Parser for the "episode list" page of a given movie.
  1954. The page should be provided as a string, as taken from
  1955. the www.imdb.com server. The final result will be a
  1956. dictionary, with a key for every relevant section.
  1957. Example::
  1958. sparser = DOMHTMLSeasonEpisodesParser()
  1959. result = sparser.parse(episodes_html_string)
  1960. """
  1961. rules = [
  1962. Rule(
  1963. key='series link',
  1964. extractor=Path('//div[@class="parent"]//a/@href')
  1965. ),
  1966. Rule(
  1967. key='series title',
  1968. extractor=Path('//head/meta[@property="og:title"]/@content')
  1969. ),
  1970. Rule(
  1971. key='_seasons',
  1972. extractor=Path(
  1973. foreach='//select[@id="bySeason"]//option',
  1974. path='./@value'
  1975. )
  1976. ),
  1977. Rule(
  1978. key='_current_season',
  1979. extractor=Path('//select[@id="bySeason"]//option[@selected]/@value')
  1980. ),
  1981. Rule(
  1982. key='episodes',
  1983. extractor=Rules(
  1984. foreach='//div[@class="info"]',
  1985. rules=[
  1986. Rule(
  1987. key=Path('.//meta/@content',
  1988. transform=lambda x: 'episode %s' % x),
  1989. extractor=Rules(
  1990. rules=[
  1991. Rule(
  1992. key='link',
  1993. extractor=Path('.//strong//a[@href][1]/@href')
  1994. ),
  1995. Rule(
  1996. key='original air date',
  1997. extractor=Path('.//div[@class="airdate"]/text()')
  1998. ),
  1999. Rule(
  2000. key='title',
  2001. extractor=Path('.//strong//text()')
  2002. ),
  2003. Rule(
  2004. key='rating',
  2005. extractor=Path(
  2006. './/div[contains(@class, "ipl-rating-star")][1]'
  2007. '/span[@class="ipl-rating-star__rating"][1]/text()'
  2008. )
  2009. ),
  2010. Rule(
  2011. key='votes',
  2012. extractor=Path(
  2013. './/div[contains(@class, "ipl-rating-star")][1]'
  2014. '/span[@class="ipl-rating-star__total-votes"][1]/text()'
  2015. )
  2016. ),
  2017. Rule(
  2018. key='plot',
  2019. extractor=Path('.//div[@class="item_description"]//text()')
  2020. )
  2021. ]
  2022. )
  2023. )
  2024. ]
  2025. )
  2026. )
  2027. ]
  2028. def postprocess_data(self, data):
  2029. series_id = analyze_imdbid(data.get('series link'))
  2030. series_title = data.get('series title', '').strip()
  2031. selected_season = data.get('_current_season', 'unknown season').strip()
  2032. if not (series_id and series_title):
  2033. return {}
  2034. series = Movie(title=series_title, movieID=str(series_id),
  2035. accessSystem=self._as, modFunct=self._modFunct)
  2036. if series.get('kind') == 'movie':
  2037. series['kind'] = 'tv series'
  2038. try:
  2039. selected_season = int(selected_season)
  2040. except ValueError:
  2041. pass
  2042. nd = {selected_season: {}}
  2043. if 'episode -1' in data:
  2044. counter = 1
  2045. for episode in data['episode -1']:
  2046. while 'episode %d' % counter in data:
  2047. counter += 1
  2048. k = 'episode %d' % counter
  2049. data[k] = [episode]
  2050. del data['episode -1']
  2051. episodes = data.get('episodes', [])
  2052. for ep in episodes:
  2053. if not ep:
  2054. continue
  2055. episode_nr, episode = list(ep.items())[0]
  2056. if not episode_nr.startswith('episode '):
  2057. continue
  2058. episode_nr = episode_nr[8:].rstrip()
  2059. try:
  2060. episode_nr = int(episode_nr)
  2061. except ValueError:
  2062. pass
2063. episode_id = analyze_imdbid(episode.get('link', ''))
  2064. episode_air_date = episode.get('original air date', '').strip()
  2065. episode_title = episode.get('title', '').strip()
  2066. episode_plot = episode.get('plot', '')
  2067. episode_rating = episode.get('rating', '')
  2068. episode_votes = episode.get('votes', '')
  2069. if not (episode_nr is not None and episode_id and episode_title):
  2070. continue
  2071. ep_obj = Movie(movieID=episode_id, title=episode_title,
  2072. accessSystem=self._as, modFunct=self._modFunct)
  2073. ep_obj['kind'] = 'episode'
  2074. ep_obj['episode of'] = series
  2075. ep_obj['season'] = selected_season
  2076. ep_obj['episode'] = episode_nr
  2077. if episode_rating:
  2078. try:
  2079. ep_obj['rating'] = float(episode_rating)
2080. except (ValueError, TypeError):
  2081. pass
  2082. if episode_votes:
  2083. try:
  2084. ep_obj['votes'] = int(episode_votes.replace(',', '')
  2085. .replace('.', '').replace('(', '').replace(')', ''))
2086. except (ValueError, TypeError):
  2087. pass
  2088. if episode_air_date:
  2089. ep_obj['original air date'] = episode_air_date
  2090. if episode_air_date[-4:].isdigit():
  2091. ep_obj['year'] = episode_air_date[-4:]
  2092. if episode_plot:
  2093. ep_obj['plot'] = episode_plot
  2094. nd[selected_season][episode_nr] = ep_obj
  2095. _seasons = data.get('_seasons') or []
  2096. for idx, season in enumerate(_seasons):
  2097. try:
  2098. _seasons[idx] = int(season)
  2099. except ValueError:
  2100. pass
  2101. return {'episodes': nd, '_seasons': _seasons, '_current_season': selected_season}
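# The returned mapping is keyed first by season, then by episode number
# (hypothetical values):
#
#     {'episodes': {3: {1: <Movie episode>, 2: <Movie episode>}},
#      '_seasons': [1, 2, 3], '_current_season': 3}
#
# Unnumbered episodes collected under 'episode -1' are reassigned above
# to the first free 'episode N' slots before this structure is built.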
  2102. def _build_episode(x):
  2103. """Create a Movie object for a given series' episode."""
  2104. episode_id = analyze_imdbid(x.get('link'))
  2105. episode_title = x.get('title')
  2106. e = Movie(movieID=episode_id, title=episode_title)
  2107. e['kind'] = 'episode'
  2108. oad = x.get('oad')
  2109. if oad:
  2110. e['original air date'] = oad.strip()
  2111. year = x.get('year')
  2112. if year is not None:
  2113. year = year[5:]
  2114. if year == 'unknown':
  2115. year = '????'
  2116. if year and year.isdigit():
  2117. year = int(year)
  2118. e['year'] = year
  2119. else:
  2120. if oad and oad[-4:].isdigit():
  2121. e['year'] = int(oad[-4:])
  2122. epinfo = x.get('episode')
  2123. if epinfo is not None:
  2124. season, episode = epinfo.split(':')[0].split(',')
  2125. e['season'] = int(season[7:])
  2126. e['episode'] = int(episode[8:])
  2127. else:
  2128. e['season'] = 'unknown'
  2129. e['episode'] = 'unknown'
  2130. plot = x.get('plot')
  2131. if plot:
  2132. e['plot'] = plot.strip()
  2133. return e
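# Sketch of _build_episode on hypothetical extracted values (the 'year'
# value assumes anchor names of the form 'year-NNNN', matching the
# year[5:] slicing above):
#
#     x = {'link': '/title/tt0000001/', 'title': 'Pilot',
#          'year': 'year-2005', 'episode': 'Season 1, Episode 1: ...',
#          'oad': '13 September 2005', 'plot': 'Something happens.'}
#     # -> a Movie with kind 'episode', season 1, episode 1 and year 2005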
  2134. class DOMHTMLEpisodesParser(DOMParserBase):
  2135. """Parser for the "episode list" page of a given movie.
  2136. The page should be provided as a string, as taken from
  2137. the www.imdb.com server. The final result will be a
  2138. dictionary, with a key for every relevant section.
  2139. Example::
  2140. eparser = DOMHTMLEpisodesParser()
  2141. result = eparser.parse(episodes_html_string)
  2142. """
  2143. kind = 'episodes list'
  2144. _episodes_path = "..//h4"
  2145. _oad_path = "./following-sibling::span/strong[1]/text()"
  2146. def _init(self):
  2147. self.rules = [
  2148. Rule(
  2149. key='series title',
  2150. extractor=Path('//title/text()')
  2151. ),
  2152. Rule(
  2153. key='series movieID',
  2154. extractor=Path(
  2155. './/h1/a[@class="main"]/@href',
  2156. transform=analyze_imdbid
  2157. )
  2158. ),
  2159. Rule(
  2160. key='episodes',
  2161. extractor=Rules(
  2162. foreach='//div[@class="_imdbpy"]/h3',
  2163. rules=[
  2164. Rule(
2165. key=Path('./a/@name'),
  2166. extractor=Rules(
  2167. foreach=self._episodes_path,
  2168. rules=[
  2169. Rule(
  2170. key='link',
  2171. extractor=Path('./a/@href')
  2172. ),
  2173. Rule(
  2174. key='title',
  2175. extractor=Path('./a/text()')
  2176. ),
  2177. Rule(
  2178. key='year',
  2179. extractor=Path('./preceding-sibling::a[1]/@name')
  2180. ),
  2181. Rule(
  2182. key='episode',
  2183. extractor=Path('./text()[1]')
  2184. ),
  2185. Rule(
  2186. key='oad',
  2187. extractor=Path(self._oad_path)
  2188. ),
  2189. Rule(
  2190. key='plot',
  2191. extractor=Path('./following-sibling::text()[1]')
  2192. )
  2193. ],
  2194. transform=_build_episode
  2195. )
  2196. )
  2197. ]
  2198. )
  2199. )
  2200. ]
  2201. preprocessors = [
  2202. (re.compile('(<hr/>\n)(<h3>)', re.I), r'</div>\1<div class="_imdbpy">\2'),
  2203. (re.compile('(</p>\n\n)</div>', re.I), r'\1'),
  2204. (re.compile('<h3>(.*?)</h3>', re.I), r'<h4>\1</h4>'),
  2205. (_reRolesMovie, _manageRoles),
  2206. (re.compile('(<br/> <br/>\n)(<hr/>)', re.I), r'\1</div>\2')
  2207. ]
  2208. def postprocess_data(self, data):
2209. # Without a series title and movieID there is nothing to build on.
  2210. if 'series title' not in data:
  2211. return {}
  2212. if 'series movieID' not in data:
  2213. return {}
  2214. stitle = data['series title'].replace('- Episode list', '')
  2215. stitle = stitle.replace('- Episodes list', '')
  2216. stitle = stitle.replace('- Episode cast', '')
  2217. stitle = stitle.replace('- Episodes cast', '')
  2218. stitle = stitle.strip()
  2219. if not stitle:
  2220. return {}
  2221. seriesID = data['series movieID']
  2222. if seriesID is None:
  2223. return {}
  2224. series = Movie(title=stitle, movieID=str(seriesID),
  2225. accessSystem=self._as, modFunct=self._modFunct)
  2226. nd = {}
  2227. for key in list(data.keys()):
  2228. if key.startswith('filter-season-') or key.startswith('season-'):
  2229. season_key = key.replace('filter-season-', '').replace('season-', '')
  2230. try:
  2231. season_key = int(season_key)
  2232. except ValueError:
  2233. pass
  2234. nd[season_key] = {}
  2235. ep_counter = 1
  2236. for episode in data[key]:
  2237. if not episode:
  2238. continue
  2239. episode_key = episode.get('episode')
  2240. if episode_key is None:
  2241. continue
  2242. if not isinstance(episode_key, int):
  2243. episode_key = ep_counter
  2244. ep_counter += 1
  2245. cast_key = 'Season %s, Episode %s:' % (season_key, episode_key)
  2246. if cast_key in data:
  2247. cast = data[cast_key]
  2248. for i in range(len(cast)):
  2249. cast[i].billingPos = i + 1
  2250. episode['cast'] = cast
  2251. episode['episode of'] = series
  2252. nd[season_key][episode_key] = episode
  2253. if len(nd) == 0:
  2254. return {}
  2255. return {'episodes': nd}
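# Sketch of the cast merge above (hypothetical data): cast lists parsed
# under 'Season X, Episode Y:' keys are attached to the matching
# episode, with billing positions assigned in page order:
#
#     data = {'season-1': [episode1, episode2],
#             'Season 1, Episode 2:': [person_a, person_b]}
#     # -> nd[1][2]['cast'] == [person_a, person_b], with
#     #    person_a.billingPos == 1 and person_b.billingPos == 2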
  2256. class DOMHTMLFaqsParser(DOMParserBase):
  2257. """Parser for the "FAQ" page of a given movie.
  2258. The page should be provided as a string, as taken from
  2259. the www.imdb.com server. The final result will be a
  2260. dictionary, with a key for every relevant section.
  2261. Example::
  2262. fparser = DOMHTMLFaqsParser()
  2263. result = fparser.parse(faqs_html_string)
  2264. """
  2265. _defGetRefs = True
  2266. rules = [
  2267. Rule(
  2268. key='faqs',
  2269. extractor=Rules(
  2270. foreach='//div[@class="section"]',
  2271. rules=[
  2272. Rule(
  2273. key='question',
  2274. extractor=Path('./h3/a/span/text()')
  2275. ),
  2276. Rule(
  2277. key='answer',
  2278. extractor=Path('../following-sibling::div[1]//text()')
  2279. )
  2280. ],
  2281. transform=lambda x: '%s::%s' % (
  2282. x.get('question').strip(),
  2283. '\n\n'.join(x.get('answer').replace('\n\n', '\n').strip().split('||'))
  2284. )
  2285. )
  2286. )
  2287. ]
  2288. preprocessors = [
  2289. (re.compile('<br/><br/>', re.I), r'||'),
  2290. (re.compile('<h4>(.*?)</h4>\n', re.I), r'||\1--'),
  2291. (re.compile('<span class="spoiler"><span>(.*?)</span></span>', re.I),
  2292. r'[spoiler]\1[/spoiler]')
  2293. ]
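# Each FAQ entry is serialized as 'question::answer'; the preprocessors
# above turn double <br/> into the '||' paragraph marker and wrap
# spoiler spans in [spoiler]...[/spoiler] tags. A hypothetical entry:
#
#     'Is there a post-credits scene?::No.[spoiler]Nothing at all.[/spoiler]'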
  2294. class DOMHTMLAiringParser(DOMParserBase):
  2295. """Parser for the "airing" page of a given movie.
  2296. The page should be provided as a string, as taken from
  2297. the www.imdb.com server. The final result will be a
  2298. dictionary, with a key for every relevant section.
  2299. Example::
  2300. aparser = DOMHTMLAiringParser()
  2301. result = aparser.parse(airing_html_string)
  2302. """
  2303. _containsObjects = True
  2304. rules = [
  2305. Rule(
  2306. key='series title',
  2307. extractor=Path(
  2308. '//title/text()',
  2309. transform=lambda x: x.replace(' - TV schedule', '')
  2310. )
  2311. ),
  2312. Rule(
  2313. key='series id',
  2314. extractor=Path('//h1/a[@href]/@href')
  2315. ),
  2316. Rule(
2317. key='airing',  # same key expected by postprocess_data below
  2318. extractor=Rules(
  2319. foreach='//tr[@class]',
  2320. rules=[
  2321. Rule(
  2322. key='date',
  2323. extractor=Path('./td[1]//text()')
  2324. ),
  2325. Rule(
  2326. key='time',
  2327. extractor=Path('./td[2]//text()')
  2328. ),
  2329. Rule(
  2330. key='channel',
  2331. extractor=Path('./td[3]//text()')
  2332. ),
  2333. Rule(
  2334. key='link',
  2335. extractor=Path('./td[4]/a[1]/@href')
  2336. ),
  2337. Rule(
  2338. key='title',
  2339. extractor=Path('./td[4]//text()')
  2340. ),
  2341. Rule(
  2342. key='season',
  2343. extractor=Path('./td[5]//text()')
  2344. )
  2345. ],
  2346. transform=lambda x: {
  2347. 'date': x.get('date'),
  2348. 'time': x.get('time'),
  2349. 'channel': x.get('channel').strip(),
  2350. 'link': x.get('link'),
  2351. 'title': x.get('title'),
  2352. 'season': (x.get('season') or '').strip()
  2353. }
  2354. )
  2355. )
  2356. ]
  2357. def postprocess_data(self, data):
  2358. if len(data) == 0:
  2359. return {}
  2360. seriesTitle = data.get('series title') or ''
  2361. seriesID = analyze_imdbid(data.get('series id'))
  2362. if seriesID and 'airing' in data:
  2363. for airing in data['airing']:
  2364. title = airing.get('title', '').strip()
  2365. if not title:
  2366. epsTitle = seriesTitle
  2367. if seriesID is None:
  2368. continue
  2369. epsID = seriesID
  2370. else:
2371. epsTitle = '%s {%s}' % (seriesTitle,
2372. airing['title'])
  2373. epsID = analyze_imdbid(airing['link'])
  2374. e = Movie(title=epsTitle, movieID=epsID)
  2375. airing['episode'] = e
  2376. del airing['link']
  2377. del airing['title']
  2378. if not airing['season']:
  2379. del airing['season']
  2380. if 'series title' in data:
  2381. del data['series title']
  2382. if 'series id' in data:
  2383. del data['series id']
  2384. if 'airing' in data:
  2385. data['airing'] = [_f for _f in data['airing'] if _f]
  2386. if 'airing' not in data or not data['airing']:
  2387. return {}
  2388. return data
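# Sketch of a single processed airing (hypothetical values); rows
# without an episode title fall back to the series itself:
#
#     {'date': 'Mon, Jan 4', 'time': '8:00 PM', 'channel': 'CHANNEL',
#      'season': '1', 'episode': <Movie 'Series Title {Episode Title}'>}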
  2389. class DOMHTMLParentsGuideParser(DOMParserBase):
  2390. """Parser for the "parents guide" page of a given movie.
  2391. The page should be provided as a string, as taken from
  2392. the www.imdb.com server. The final result will be a
  2393. dictionary, with a key for every relevant section.
  2394. Example::
2395. pgparser = DOMHTMLParentsGuideParser()
  2396. result = pgparser.parse(parentsguide_html_string)
  2397. """
  2398. rules = [
  2399. Rule(
  2400. key='mpaa',
  2401. extractor=Path(
  2402. '//tr[@id="mpaa-rating"]/td[2]//text()'
  2403. )
  2404. ),
  2405. Rule(
  2406. key='certificates',
  2407. extractor=Rules(
  2408. foreach='//tr[@id="certifications-list"]//li',
  2409. rules=[
  2410. Rule(
  2411. key='full',
  2412. extractor=Path('./a//text()')
  2413. ),
  2414. Rule(
  2415. key='country_code',
  2416. extractor=Path('./a/@href')
  2417. ),
  2418. Rule(
  2419. key='note',
  2420. extractor=Path('./text()')
  2421. ),
  2422. ],
  2423. transform=lambda x: {
  2424. 'country_code': x.get('country_code').split('certificates=')[1].split(':')[0].strip(),
  2425. 'country': x.get('full').split(':')[0].strip(),
  2426. 'certificate': x.get('full').split(':')[1].strip(),
  2427. 'note': x.get('note').strip(),
  2428. 'full': x.get('full').strip(),
  2429. }
  2430. )
  2431. ),
  2432. Rule(
  2433. key='advisories',
  2434. extractor=Rules(
  2435. foreach='//section[starts-with(@id, "advisory-")]',
  2436. rules=[
  2437. Rule(key='section',
  2438. extractor=Path('./@id')
  2439. ),
  2440. Rule(key='items',
  2441. extractor=Rules(
  2442. foreach='.//li',
  2443. rules=[
  2444. Rule(
  2445. key='item',
  2446. extractor=Path('./text()')
  2447. )
  2448. ],
  2449. transform=lambda x: x.get('item').strip()
  2450. )
  2451. )
  2452. ]
  2453. )
  2454. )
  2455. ]
  2456. def postprocess_data(self, data):
  2457. if 'advisories' in data:
  2458. for advisory in data['advisories']:
  2459. sect = advisory.get('section', '').replace('-', ' ')
  2460. items = [x for x in advisory.get('items', []) if x]
  2461. if sect and items:
  2462. data[sect] = items
  2463. del data['advisories']
  2464. return data
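# Sketch of the advisory flattening above (hypothetical values):
#
#     data = {'advisories': [{'section': 'advisory-nudity',
#                             'items': ['A kissing scene.']}]}
#     # -> {'advisory nudity': ['A kissing scene.']}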
  2465. _OBJECTS = {
  2466. 'movie_parser': ((DOMHTMLMovieParser,), None),
  2467. 'full_credits_parser': ((DOMHTMLFullCreditsParser,), None),
  2468. 'plot_parser': ((DOMHTMLPlotParser,), None),
  2469. 'movie_awards_parser': ((DOMHTMLAwardsParser,), None),
  2470. 'taglines_parser': ((DOMHTMLTaglinesParser,), None),
  2471. 'keywords_parser': ((DOMHTMLKeywordsParser,), None),
  2472. 'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None),
  2473. 'goofs_parser': ((DOMHTMLGoofsParser,), None),
  2474. 'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None),
  2475. 'trivia_parser': ((DOMHTMLTriviaParser,), None),
  2476. 'soundtrack_parser': ((DOMHTMLSoundtrackParser,), None),
  2477. 'quotes_parser': ((DOMHTMLQuotesParser,), None),
  2478. 'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
  2479. 'ratings_parser': ((DOMHTMLRatingsParser,), None),
  2480. 'criticrev_parser': ((DOMHTMLCriticReviewsParser,), {'kind': 'critic reviews'}),
  2481. 'reviews_parser': ((DOMHTMLReviewsParser,), {'kind': 'reviews'}),
  2482. 'externalsites_parser': ((DOMHTMLOfficialsitesParser,), None),
  2483. 'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
  2484. 'externalrev_parser': ((DOMHTMLOfficialsitesParser,), None),
  2485. 'misclinks_parser': ((DOMHTMLOfficialsitesParser,), None),
  2486. 'soundclips_parser': ((DOMHTMLOfficialsitesParser,), None),
  2487. 'videoclips_parser': ((DOMHTMLOfficialsitesParser,), None),
  2488. 'photosites_parser': ((DOMHTMLOfficialsitesParser,), None),
  2489. 'connections_parser': ((DOMHTMLConnectionsParser,), None),
  2490. 'tech_parser': ((DOMHTMLTechParser,), None),
  2491. 'locations_parser': ((DOMHTMLLocationsParser,), None),
  2492. 'news_parser': ((DOMHTMLNewsParser,), None),
  2493. 'episodes_parser': ((DOMHTMLEpisodesParser,), None),
  2494. 'season_episodes_parser': ((DOMHTMLSeasonEpisodesParser,), None),
  2495. 'movie_faqs_parser': ((DOMHTMLFaqsParser,), None),
  2496. 'airing_parser': ((DOMHTMLAiringParser,), None),
  2497. 'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None)
  2498. }
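# Each value is a tuple of ((parser class,), extra attributes to set on
# the instance, or None). A rough sketch of how an instance could be
# built from this table (hypothetical helper; the actual instantiation
# is handled elsewhere in the package):
#
#     def _build_parser(name):
#         (klass,), attrs = _OBJECTS[name]
#         parser = klass()
#         for key, value in (attrs or {}).items():
#             setattr(parser, key, value)
#         return parser
#
#     criticrev_parser = _build_parser('criticrev_parser')
#     # criticrev_parser.kind == 'critic reviews'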