
/imdb/parser/http/movieParser.py

https://bitbucket.org/alberanid/imdbpy_ptdf_only
  1. """
  2. parser.http.movieParser module (imdb package).
  3. This module provides the classes (and the instances), used to parse the
  4. IMDb pages on the akas.imdb.com server about a movie.
  5. E.g., for Brian De Palma's "The Untouchables", the referred
  6. pages would be:
  7. combined details: http://akas.imdb.com/title/tt0094226/combined
  8. plot summary: http://akas.imdb.com/title/tt0094226/plotsummary
  9. ...and so on...
  10. Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
  11. 2008 H. Turgut Uyar <uyar@tekir.org>
  12. This program is free software; you can redistribute it and/or modify
  13. it under the terms of the GNU General Public License as published by
  14. the Free Software Foundation; either version 2 of the License, or
  15. (at your option) any later version.
  16. This program is distributed in the hope that it will be useful,
  17. but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. GNU General Public License for more details.
  20. You should have received a copy of the GNU General Public License
  21. along with this program; if not, write to the Free Software
  22. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  23. """

import re
import urllib

from imdb import imdbURL_base
from imdb.Person import Person
from imdb.Movie import Movie
from imdb.Company import Company
from imdb.utils import analyze_title, split_company_name_notes, _Container

from utils import build_person, DOMParserBase, Attribute, Extractor, \
        analyze_imdbid

# Dictionary used to convert some sections' names.
_SECT_CONV = {
        'directed': 'director',
        'directed by': 'director',
        'directors': 'director',
        'editors': 'editor',
        'writing credits': 'writer',
        'writers': 'writer',
        'produced': 'producer',
        'cinematography': 'cinematographer',
        'film editing': 'editor',
        'casting': 'casting director',
        'costume design': 'costume designer',
        'makeup department': 'make up',
        'production management': 'production manager',
        'second unit director or assistant director': 'assistant director',
        'costume and wardrobe department': 'costume department',
        'sound department': 'sound crew',
        'stunts': 'stunt performer',
        'other crew': 'miscellaneous crew',
        'also known as': 'akas',
        'country': 'countries',
        'runtime': 'runtimes',
        'language': 'languages',
        'certification': 'certificates',
        'genre': 'genres',
        'created': 'creator',
        'creators': 'creator',
        'color': 'color info',
        'plot': 'plot outline',
        'seasons': 'number of seasons',
        'art directors': 'art direction',
        'assistant directors': 'assistant director',
        'set decorators': 'set decoration',
        'visual effects department': 'visual effects',
        'miscellaneous': 'miscellaneous crew',
        'make up department': 'make up',
        'plot summary': 'plot outline',
        'cinematographers': 'cinematographer',
        'camera department': 'camera and electrical department',
        'costume designers': 'costume designer',
        'production designers': 'production design',
        'production managers': 'production manager',
        'music original': 'original music',
        'casting directors': 'casting director',
        'other companies': 'miscellaneous companies',
        'producers': 'producer',
        'special effects by': 'special effects department',
        'special effects': 'special effects companies'
        }
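
# For example: the combined-details block headed "Writing credits" is
# lower-cased by the extractors below and, through the mapping above,
# ends up in the result dictionary under the 'writer' key.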

def _manageRoles(mo):
    """Perform some transformation on the html, so that roleIDs can
    be easily retrieved."""
    firstHalf = mo.group(1)
    secondHalf = mo.group(2)
    newRoles = []
    roles = secondHalf.split(' / ')
    for role in roles:
        role = role.strip()
        if not role:
            continue
        roleID = analyze_imdbid(role)
        if roleID is None:
            roleID = u'/'
        else:
            roleID += u'/'
        newRoles.append(u'<div class="_imdbpyrole" roleid="%s">%s</div>' % \
                (roleID, role.strip()))
    return firstHalf + u' / '.join(newRoles) + mo.group(3)

_reRolesMovie = re.compile(r'(<td class="char">)(.*?)(</td>)',
                           re.I | re.M | re.S)
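
# As an illustration (hypothetical markup), a cast cell like
#   <td class="char"><a href="/character/ch0000001/">Some Character</a></td>
# is rewritten by _manageRoles into a cell whose content is wrapped in
#   <div class="_imdbpyrole" roleid="0000001/">...</div>
# so that the 'cast' extractor below can read the roleID attribute directly.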

def _replaceBR(mo):
    """Replaces <br> tags with '::' (useful for some akas)"""
    txt = mo.group(0)
    return txt.replace('<br>', '::')

_reAkas = re.compile(r'<h5>also known as:</h5>.*?</div>', re.I | re.M | re.S)

def makeSplitter(lstrip=None, sep='|', comments=True,
                 origNotesSep=' (', newNotesSep='::(', strip=None):
    """Return a splitter function suitable for a given set of data."""
    def splitter(x):
        if not x: return x
        x = x.strip()
        if not x: return x
        if lstrip is not None:
            x = x.lstrip(lstrip).lstrip()
        lx = x.split(sep)
        lx[:] = filter(None, [j.strip() for j in lx])
        if comments:
            lx[:] = [j.replace(origNotesSep, newNotesSep, 1) for j in lx]
        if strip:
            lx[:] = [j.strip(strip) for j in lx]
        return lx
    return splitter
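
# A minimal sketch of a generated splitter (hypothetical input, mirroring
# the 'language' attribute of the combined-details parser below):
#   makeSplitter('Language:')(u'Language: English | Italian (some scenes)')
#   ==> [u'English', u'Italian::(some scenes)']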

def _toInt(val, replace=()):
    """Return the value, converted to integer, or None; if present, 'replace'
    must be a list of tuples of values to replace."""
    for before, after in replace:
        val = val.replace(before, after)
    try:
        return int(val)
    except (TypeError, ValueError):
        return None
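
# E.g. (illustrative values) _toInt(u'top 250: #41', [('top 250: #', '')])
# returns 41, while _toInt(u'n/a') returns None.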

class DOMHTMLMovieParser(DOMParserBase):
    """Parser for the "combined details" (and if instance.mdparse is
    True also for the "main details") page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        mparser = DOMHTMLMovieParser()
        result = mparser.parse(combined_details_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='title',
                            path="//h1",
                            attrs=Attribute(key='title',
                                            path=".//text()",
                                            postprocess=analyze_title)),

                  Extractor(label='glossarysections',
                            group="//a[@class='glossary']",
                            group_key="./@name",
                            group_key_normalize=lambda x: x.replace('_', ' '),
                            path="../../../..//tr",
                            attrs=Attribute(key=None,
                                multi=True,
                                path={'person': ".//text()",
                                      'link': "./td[1]/a[@href]/@href"},
                                postprocess=lambda x: \
                                    build_person(x.get('person') or u'',
                                        personID=analyze_imdbid(x.get('link')))
                                )),

                  Extractor(label='cast',
                            path="//table[@class='cast']//tr",
                            attrs=Attribute(key="cast",
                                multi=True,
                                path={'person': ".//text()",
                                      'link': "td[2]/a/@href",
                                      'roleID': \
                                          "td[4]/div[@class='_imdbpyrole']/@roleid"},
                                postprocess=lambda x: \
                                    build_person(x.get('person') or u'',
                                        personID=analyze_imdbid(x.get('link')),
                                        roleID=(x.get('roleID') or u'').split('/'))
                                )),

                  Extractor(label='genres',
                            path="//div[@class='info']//a[starts-with(@href," \
                                    " '/Sections/Genres')]",
                            attrs=Attribute(key="genres",
                                            multi=True,
                                            path="./text()")),

                  Extractor(label='h5sections',
                            path="//div[@class='info']/h5/..",
                            attrs=[
                                Attribute(key="plot summary",
                                          path="./h5[starts-with(text(), " \
                                                  "'Plot:')]/../div/text()",
                                          postprocess=lambda x: \
                                              x.strip().rstrip('|').rstrip()),
                                Attribute(key="aspect ratio",
                                          path="./h5[starts-with(text()," \
                                                  " 'Aspect')]/../div/text()",
                                          postprocess=lambda x: x.strip()),
                                Attribute(key="mpaa",
                                          path="./h5/a[starts-with(text()," \
                                                  " 'MPAA')]/../../div/text()",
                                          postprocess=lambda x: x.strip()),
                                Attribute(key="countries",
                                          path="./h5[starts-with(text(), " \
                                                  "'Countr')]/..//a/text()",
                                          postprocess=makeSplitter(sep='\n')),
                                Attribute(key="language",
                                          path="./h5[starts-with(text(), " \
                                                  "'Language')]/..//text()",
                                          postprocess=makeSplitter('Language:')),
                                Attribute(key='color info',
                                          path="./h5[starts-with(text(), " \
                                                  "'Color')]/..//text()",
                                          postprocess=makeSplitter('Color:')),
                                Attribute(key='sound mix',
                                          path="./h5[starts-with(text(), " \
                                                  "'Sound Mix')]/..//text()",
                                          postprocess=makeSplitter('Sound Mix:')),
                                # Collects akas not enclosed in <i> tags.
                                Attribute(key='other akas',
                                          path="./h5[starts-with(text(), " \
                                                  "'Also Known As')]/../div//text()",
                                          postprocess=makeSplitter(sep='::',
                                                  origNotesSep='" - ',
                                                  newNotesSep='::',
                                                  strip='"')),
                                Attribute(key='runtimes',
                                          path="./h5[starts-with(text(), " \
                                                  "'Runtime')]/../div/text()",
                                          postprocess=makeSplitter()),
                                Attribute(key='certificates',
                                          path="./h5[starts-with(text(), " \
                                                  "'Certificat')]/..//text()",
                                          postprocess=makeSplitter('Certification:')),
                                Attribute(key='number of seasons',
                                          path="./h5[starts-with(text(), " \
                                                  "'Seasons')]/..//text()",
                                          postprocess=lambda x: x.count('|') + 1),
                                Attribute(key='original air date',
                                          path="./h5[starts-with(text(), " \
                                                  "'Original Air Date')]/../div/text()"),
                                Attribute(key='tv series link',
                                          path="./h5[starts-with(text(), " \
                                                  "'TV Series')]/..//a/@href"),
                                Attribute(key='tv series title',
                                          path="./h5[starts-with(text(), " \
                                                  "'TV Series')]/..//a/text()")
                                ]),

                  Extractor(label='creator',
                            path="//h5[starts-with(text(), 'Creator')]/..//a",
                            attrs=Attribute(key='creator', multi=True,
                                path={'name': "./text()",
                                      'link': "./@href"},
                                postprocess=lambda x: \
                                    build_person(x.get('name') or u'',
                                        personID=analyze_imdbid(x.get('link')))
                                )),

                  Extractor(label='thin writer',
                            path="//h5[starts-with(text(), 'Writer')]/..//a",
                            attrs=Attribute(key='thin writer', multi=True,
                                path={'name': "./text()",
                                      'link': "./@href"},
                                postprocess=lambda x: \
                                    build_person(x.get('name') or u'',
                                        personID=analyze_imdbid(x.get('link')))
                                )),

                  Extractor(label='thin director',
                            path="//h5[starts-with(text(), 'Director')]/..//a",
                            attrs=Attribute(key='thin director', multi=True,
                                path={'name': "./text()",
                                      'link': "@href"},
                                postprocess=lambda x: \
                                    build_person(x.get('name') or u'',
                                        personID=analyze_imdbid(x.get('link')))
                                )),

                  Extractor(label='top 250/bottom 100',
                            path="//div[@class='starbar-special']/" \
                                    "a[starts-with(@href, '/chart/')]",
                            attrs=Attribute(key='top/bottom rank',
                                            path="./text()")),

                  Extractor(label='series years',
                            path="//div[@id='tn15title']//span" \
                                    "[starts-with(text(), 'TV series')]",
                            attrs=Attribute(key='series years',
                                            path="./text()",
                                            postprocess=lambda x: \
                                                x.replace('TV series', '').strip())),

                  Extractor(label='number of episodes',
                            path="//a[@title='Full Episode List']",
                            attrs=Attribute(key='number of episodes',
                                            path="./text()",
                                            postprocess=lambda x: \
                                                _toInt(x, [(' Episodes', '')]))),

                  Extractor(label='akas',
                            path="//i[@class='transl']",
                            attrs=Attribute(key='akas', multi=True, path='text()',
                                postprocess=lambda x:
                                    x.replace('  ', ' ').rstrip('-').replace('" - ',
                                        '"::', 1).strip('"').replace('  ', ' '))),

                  Extractor(label='production notes/status',
                            path="//div[@class='info inprod']",
                            attrs=Attribute(key='production notes',
                                            path=".//text()",
                                            postprocess=lambda x: x.strip())),

                  Extractor(label='blackcatheader',
                            group="//b[@class='blackcatheader']",
                            group_key="./text()",
                            group_key_normalize=lambda x: x.lower(),
                            path="../ul/li",
                            attrs=Attribute(key=None,
                                multi=True,
                                path={'name': "./a//text()",
                                      'comp-link': "./a/@href",
                                      'notes': "./text()"},
                                postprocess=lambda x: \
                                    Company(name=x.get('name') or u'',
                                        companyID=analyze_imdbid(x.get('comp-link')),
                                        notes=(x.get('notes') or u'').strip())
                                )),

                  Extractor(label='rating',
                            path="//div[@class='starbar-meta']/b",
                            attrs=Attribute(key='rating',
                                            path=".//text()")),

                  Extractor(label='votes',
                            path="//div[@class='starbar-meta']/a[@href]",
                            attrs=Attribute(key='votes',
                                            path=".//text()")),

                  Extractor(label='cover url',
                            path="//a[@name='poster']",
                            attrs=Attribute(key='cover url',
                                            path="./img/@src"))
                  ]

    preprocessors = [
        (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I),
            r'</div><div>\1'),
        ('<small>Full cast and crew for<br></small>', ''),
        ('<td> </td>', '<td>...</td>'),
        ('<span class="tv-extra">TV mini-series</span>',
            '<span class="tv-extra">(mini)</span>'),
        (_reRolesMovie, _manageRoles),
        (_reAkas, _replaceBR)
        ]
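
    # Each preprocessor above is a (pattern, replacement) pair applied to
    # the raw HTML before the DOM is built: plain strings are substituted
    # literally, while compiled regular expressions are applied with
    # re.sub semantics (callables like _manageRoles receive the match).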

    def preprocess_dom(self, dom):
        # Handle series information.
        xpath = self.xpath(dom, "//b[text()='Series Crew']")
        if xpath:
            b = xpath[-1]  # In doubt, take the last one.
            for a in self.xpath(b, "./following::h5/a[@class='glossary']"):
                name = a.get('name')
                if name:
                    a.set('name', 'series %s' % name)
        # Remove links to IMDbPro.
        for proLink in self.xpath(dom, "//span[@class='pro-link']"):
            proLink.drop_tree()
        # Remove some 'more' links (keep others, like the one around
        # the number of votes).
        for tn15more in self.xpath(dom,
                    "//a[@class='tn15more'][starts-with(@href, '/title/')]"):
            tn15more.drop_tree()
        return dom

    re_space = re.compile(r'\s+')
    re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)

    def postprocess_data(self, data):
        # Convert section names.
        for sect in data.keys():
            if sect in _SECT_CONV:
                data[_SECT_CONV[sect]] = data[sect]
                del data[sect]
                sect = _SECT_CONV[sect]
        # Filter out fake values.
        for key in data:
            value = data[key]
            if isinstance(value, list) and value:
                if isinstance(value[0], Person):
                    data[key] = filter(lambda x: x.personID is not None,
                                       value)
                if isinstance(value[0], _Container):
                    for obj in data[key]:
                        obj.accessSystem = self._as
                        obj.modFunct = self._modFunct
        if 'akas' in data or 'other akas' in data:
            akas = data.get('akas') or []
            other_akas = data.get('other akas') or []
            akas += other_akas
            if 'akas' in data:
                del data['akas']
            if 'other akas' in data:
                del data['other akas']
            if akas:
                data['akas'] = akas
        if 'runtimes' in data:
            data['runtimes'] = [x.replace(' min', u'')
                                for x in data['runtimes']]
        if 'production notes' in data:
            pn = data['production notes'].replace('\n\nComments:',
                                '\nComments:').replace('\n\nNote:',
                                '\nNote:').replace('Note:\n\n',
                                'Note:\n').split('\n')
            for k, v in zip(pn[::2], pn[1::2]):
                v = v.strip()
                if not v:
                    continue
                k = k.lower().strip(':')
                if k == 'note':
                    k = 'status note'
                data[k] = v
            del data['production notes']
        if 'original air date' in data:
            oid = self.re_space.sub(' ', data['original air date']).strip()
            data['original air date'] = oid
            aid = self.re_airdate.findall(oid)
            if aid and len(aid[0]) == 3:
                date, season, episode = aid[0]
                date = date.strip()
                try: season = int(season)
                except: pass
                try: episode = int(episode)
                except: pass
                if date and date != '????':
                    data['original air date'] = date
                else:
                    del data['original air date']
                # Handle also "episode 0".
                if season or type(season) is type(0):
                    data['season'] = season
                if episode or type(episode) is type(0):
                    data['episode'] = episode
        for k in ('writer', 'director'):
            t_k = 'thin %s' % k
            if t_k not in data:
                continue
            if k not in data:
                data[k] = data[t_k]
            del data[t_k]
        if 'top/bottom rank' in data:
            tbVal = data['top/bottom rank'].lower()
            if tbVal.startswith('top'):
                tbKey = 'top 250 rank'
                tbVal = _toInt(tbVal, [('top 250: #', '')])
            else:
                tbKey = 'bottom 100 rank'
                tbVal = _toInt(tbVal, [('bottom 100: #', '')])
            if tbVal:
                data[tbKey] = tbVal
            del data['top/bottom rank']
        if 'year' in data and data['year'] == '????':
            del data['year']
        if 'tv series link' in data:
            if 'tv series title' in data:
                data['episode of'] = Movie(title=data['tv series title'],
                                           movieID=analyze_imdbid(
                                                    data['tv series link']),
                                           accessSystem=self._as,
                                           modFunct=self._modFunct)
                del data['tv series title']
            del data['tv series link']
        if 'rating' in data:
            try:
                data['rating'] = float(data['rating'].replace('/10', ''))
            except (TypeError, ValueError):
                pass
        if 'votes' in data:
            try:
                votes = data['votes'].replace(',', '').replace('votes', '')
                data['votes'] = int(votes)
            except (TypeError, ValueError):
                pass
        return data

def _process_plotsummary(x):
    """Process a plot (contributed by Rdian06)."""
    xauthor = x.get('author')
    if xauthor:
        xauthor = xauthor.replace('{', '<').replace('}', '>').replace('(',
                            '<').replace(')', '>').strip()
    xplot = x.get('plot', u'').strip()
    if xauthor:
        xplot += u'::%s' % xauthor
    return xplot
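
# E.g. (hypothetical values) a plot of u'A short summary.' with author
# u'Some One {someone@example.com}' comes out as
# u'A short summary.::Some One <someone@example.com>'.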

class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a 'plot' key, containing a list
    of strings with the structure: 'summary::summary_author <author@email>'.

    Example:
        pparser = DOMHTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    _defGetRefs = True

    # Notice that recently IMDb started to put the email of the
    # author only in the link, which we don't collect here.
    extractors = [Extractor(label='plot',
                            path="//p[@class='plotpar']",
                            attrs=Attribute(key='plot',
                                            multi=True,
                                            path={'plot': './text()',
                                                  'author': './i/a/text()'},
                                            postprocess=_process_plotsummary))]

def _process_award(x):
    award = {}
    award['year'] = x.get('year').strip()
    if award['year'] and award['year'].isdigit():
        award['year'] = int(award['year'])
    award['result'] = x.get('result').strip()
    award['award'] = x.get('award').strip()
    category = x.get('category').strip()
    if category:
        award['category'] = category
    received_with = x.get('with')
    if received_with is not None:
        award['with'] = received_with.strip()
    notes = x.get('notes')
    if notes is not None:
        notes = notes.strip()
        if notes:
            award['notes'] = notes
    award['anchor'] = x.get('anchor')
    return award

class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        awparser = DOMHTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    subject = 'title'
    _containsObjects = True
    _isInPtdf = False

    extractors = [
        Extractor(label='awards',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr/" \
                    "td[last()][not(@colspan)]",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'year': "../td[1]/a/text()",
                    'result': "../td[2]/b/text()",
                    'award': "../td[3]/text()",
                    'category': "./text()[1]",
                    # FIXME: takes only the first co-recipient
                    'with': "./small[starts-with(text()," \
                            " 'Shared with:')]/following-sibling::a[1]/text()",
                    'notes': "./small[last()]//text()",
                    'anchor': ".//text()"
                    },
                postprocess=_process_award
                )),
        Extractor(label='recipients',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr/" \
                    "td[last()]/small[1]/preceding-sibling::a",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'name': "./text()",
                    'link': "./@href",
                    'anchor': "..//text()"
                    }
                ))
        ]

    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
            r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
            r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
        ]

    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span-1]:
                # if not cloned, child will be moved to new parent
                clone = self.clone(col)
                # XXX: beware that here we don't use an "adapted" function,
                # because both BeautifulSoup and lxml use the same
                # "insert" method.
                tr.insert(position, clone)
        return dom
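
    # Illustrative effect of the code above (hypothetical markup):
    #   <tr><td rowspan="2">2001</td><td>Won</td></tr>
    #   <tr><td>Nominated</td></tr>
    # becomes two self-contained rows, each with its own <td>2001</td>
    # copy, so every award row carries the year cell.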

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = []
        for key in data.keys():
            dom = self.get_dom(key)
            assigner = self.xpath(dom, "//a/text()")[0]
            for entry in data[key]:
                if not entry.has_key('name'):
                    # this is an award, not a recipient
                    entry['assigner'] = assigner.strip()
                    # find the recipients
                    matches = [p for p in data[key]
                               if p.has_key('name') and (entry['anchor'] ==
                                   p['anchor'])]
                    if self.subject == 'title':
                        recipients = [Person(name=recipient['name'],
                                    personID=analyze_imdbid(recipient['link']))
                                    for recipient in matches]
                        entry['to'] = recipients
                    elif self.subject == 'name':
                        recipients = [Movie(title=recipient['name'],
                                    movieID=analyze_imdbid(recipient['link']))
                                    for recipient in matches]
                        entry['for'] = recipients
                    nd.append(entry)
                del entry['anchor']
        return {'awards': nd}

class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    _isInPtdf = False

    extractors = [Extractor(label='taglines',
                            path="//div[@id='tn15content']/p",
                            attrs=Attribute(key='taglines', multi=True,
                                            path="./text()"))]

class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    extractors = [Extractor(label='keywords',
                            path="//a[starts-with(@href, '/keyword/')]",
                            attrs=Attribute(key='keywords',
                                            path="./text()", multi=True,
                                            postprocess=lambda x: \
                                                x.lower().replace(' ', '-')))]

class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        avparser = DOMHTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='alternate versions',
                            path="//ul[@class='trivia']/li",
                            attrs=Attribute(key='alternate versions',
                                            multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: x.strip()))]

class DOMHTMLTriviaParser(DOMParserBase):
    """Parser for the "trivia" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTriviaParser()
        result = tparser.parse(trivia_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='trivia',
                            path="//div[@class='sodatext']",
                            attrs=Attribute(key='trivia',
                                            multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: x.strip()))]

class DOMHTMLSoundtrackParser(DOMHTMLAlternateVersionsParser):
    kind = 'soundtrack'

    preprocessors = [
        ('<br>', '\n')
        ]

    def postprocess_data(self, data):
        if 'soundtrack' in data:
            nd = []
            for x in data['soundtrack']:
                ds = x.split('\n')
                title = ds[0]
                if title[0] == '"' and title[-1] == '"':
                    title = title[1:-1]
                nds = []
                newData = {}
                for l in ds[1:]:
                    if ' with ' in l or ' by ' in l or ' from ' in l \
                            or ' of ' in l or l.startswith('From '):
                        nds.append(l)
                    else:
                        if nds:
                            nds[-1] += l
                        else:
                            nds.append(l)
                newData[title] = {}
                for l in nds:
                    skip = False
                    for sep in ('From ',):
                        if l.startswith(sep):
                            fdix = len(sep)
                            kind = l[:fdix].rstrip().lower()
                            info = l[fdix:].lstrip()
                            newData[title][kind] = info
                            skip = True
                    if not skip:
                        for sep in ' with ', ' by ', ' from ', ' of ':
                            fdix = l.find(sep)
                            if fdix != -1:
                                fdix = fdix + len(sep)
                                kind = l[:fdix].rstrip().lower()
                                info = l[fdix:].lstrip()
                                newData[title][kind] = info
                                break
                nd.append(newData)
            data['soundtrack'] = nd
        return data
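
# A rough sketch of the resulting structure (hypothetical entry): a block
#   "Love Theme"
#   Written by Some Composer
#   Performed by Some Band
# becomes {u'Love Theme': {'written by': u'Some Composer',
#                          'performed by': u'Some Band'}}.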

class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='crazy credits', path="//ul/li/tt",
                            attrs=Attribute(key='crazy credits', multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: \
                                                x.replace('\n', ' ').replace('  ', ' ')))]

class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='goofs', path="//ul[@class='trivia']/li",
                            attrs=Attribute(key='goofs', multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: (x or u'').strip()))]

class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='quotes',
            path="//div[@class='_imdbpy']",
            attrs=Attribute(key='quotes',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip().replace(' \n',
                            '::').replace('::\n', '::').replace('\n', ' ')))
        ]

    preprocessors = [
        (re.compile('(<a name="?qt[0-9]{7}"?></a>)', re.I),
            r'\1<div class="_imdbpy">'),
        (re.compile('<hr width="30%">', re.I), '</div>'),
        (re.compile('<hr/>', re.I), '</div>'),
        (re.compile('<script.*?</script>', re.I|re.S), ''),
        # For BeautifulSoup.
        (re.compile('<!-- sid: t-channel : MIDDLE_CENTER -->', re.I), '</div>')
        ]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//p[@class='linksoda']"):
            qLink.drop_tree()
        return dom

    def postprocess_data(self, data):
        if 'quotes' not in data:
            return {}
        for idx, quote in enumerate(data['quotes']):
            data['quotes'][idx] = quote.split('::')
        return data

class DOMHTMLReleaseinfoParser(DOMParserBase):
    """Parser for the "release dates" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rdparser = DOMHTMLReleaseinfoParser()
        result = rdparser.parse(releaseinfo_html_string)
    """
    extractors = [Extractor(label='release dates',
                            path="//th[@class='xxxx']/../../tr",
                            attrs=Attribute(key='release dates', multi=True,
                                path={'country': ".//td[1]//text()",
                                      'date': ".//td[2]//text()",
                                      'notes': ".//td[3]//text()"})),
                  Extractor(label='akas',
                            path="//div[@class='_imdbpy_akas']/table/tr",
                            attrs=Attribute(key='akas', multi=True,
                                path={'title': "./td[1]/text()",
                                      'countries': "./td[2]/text()"}))]

    preprocessors = [
        (re.compile('(<h5><a name="?akas"?.*</table>)', re.I | re.M | re.S),
            r'<div class="_imdbpy_akas">\1</div>')]

    def postprocess_data(self, data):
        if not ('release dates' in data or 'akas' in data): return data
        releases = data.get('release dates') or []
        rl = []
        for i in releases:
            country = i.get('country')
            date = i.get('date')
            if not (country and date): continue
            country = country.strip()
            date = date.strip()
            if not (country and date): continue
            notes = i['notes']
            info = u'%s::%s' % (country, date)
            if notes:
                info += notes
            rl.append(info)
        if releases:
            del data['release dates']
        if rl:
            data['release dates'] = rl
        akas = data.get('akas') or []
        nakas = []
        for aka in akas:
            title = aka.get('title', '').strip()
            if not title:
                continue
            countries = aka.get('countries', '').split('/')
            if not countries:
                nakas.append(title)
            else:
                for country in countries:
                    nakas.append('%s::%s' % (title, country.strip()))
        if akas:
            del data['akas']
        if nakas:
            data['akas from release info'] = nakas
        return data

class DOMHTMLRatingsParser(DOMParserBase):
    """Parser for the "user ratings" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRatingsParser()
        result = rparser.parse(userratings_html_string)
    """
    re_means = re.compile('mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])',
                          re.I)

    extractors = [
        Extractor(label='number of votes',
            path="//td[b='Percentage']/../../tr",
            attrs=[Attribute(key='votes',
                             multi=True,
                             path={
                                 'votes': "td[1]//text()",
                                 'ordinal': "td[3]//text()"
                                 })]),
        Extractor(label='mean and median',
            path="//p[starts-with(text(), 'Arithmetic mean')]",
            attrs=Attribute(key='mean and median',
                            path="text()")),
        Extractor(label='rating',
            path="//a[starts-with(@href, '/search/title?user_rating=')]",
            attrs=Attribute(key='rating',
                            path="text()")),
        Extractor(label='demographic voters',
            path="//td[b='Average']/../../tr",
            attrs=Attribute(key='demographic voters',
                            multi=True,
                            path={
                                'voters': "td[1]//text()",
                                'votes': "td[2]//text()",
                                'average': "td[3]//text()"
                                })),
        Extractor(label='top 250',
            path="//a[text()='top 250']",
            attrs=Attribute(key='top 250',
                            path="./preceding-sibling::text()[1]"))
        ]

    def postprocess_data(self, data):
        nd = {}
        votes = data.get('votes', [])
        if votes:
            nd['number of votes'] = {}
            for i in xrange(1, 11):
                nd['number of votes'][int(votes[i]['ordinal'])] = \
                        int(votes[i]['votes'].replace(',', ''))
        mean = data.get('mean and median', '')
        if mean:
            means = self.re_means.findall(mean)
            if means and len(means[0]) == 2:
                am, med = means[0]
                try: am = float(am)
                except (ValueError, OverflowError): pass
                if type(am) is type(1.0):
                    nd['arithmetic mean'] = am
                try: med = int(med)
                except (ValueError, OverflowError): pass
                if type(med) is type(0):
                    nd['median'] = med
        if 'rating' in data:
            nd['rating'] = float(data['rating'])
        dem_voters = data.get('demographic voters')
        if dem_voters:
            nd['demographic'] = {}
            for i in xrange(1, len(dem_voters)):
                if (dem_voters[i]['votes'] is not None) \
                        and (dem_voters[i]['votes'].strip()):
                    nd['demographic'][dem_voters[i]['voters'].strip().lower()] \
                            = (int(dem_voters[i]['votes'].replace(',', '')),
                                float(dem_voters[i]['average']))
        if 'imdb users' in nd.get('demographic', {}):
            nd['votes'] = nd['demographic']['imdb users'][0]
            nd['demographic']['all votes'] = nd['demographic']['imdb users']
            del nd['demographic']['imdb users']
        top250 = data.get('top 250')
        if top250:
            sd = top250[9:]
            i = sd.find(' ')
            if i != -1:
                sd = sd[:i]
            try: sd = int(sd)
            except (ValueError, OverflowError): pass
            if type(sd) is type(0):
                nd['top 250 rank'] = sd
        return nd

class DOMHTMLEpisodesRatings(DOMParserBase):
    """Parser for the "episode ratings ... by date" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        erparser = DOMHTMLEpisodesRatings()
        result = erparser.parse(eprating_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='title', path="//title",
                            attrs=Attribute(key='title', path="./text()")),
                  Extractor(label='ep ratings',
                            path="//th/../..//tr",
                            attrs=Attribute(key='episodes', multi=True,
                                path={'nr': ".//td[1]/text()",
                                      'ep title': ".//td[2]//text()",
                                      'movieID': ".//td[2]/a/@href",
                                      'rating': ".//td[3]/text()",
                                      'votes': ".//td[4]/text()"}))]

    def postprocess_data(self, data):
        if 'title' not in data or 'episodes' not in data: return {}
        nd = []
        title = data['title']
        for i in data['episodes']:
            ept = i['ep title']
            movieID = analyze_imdbid(i['movieID'])
            votes = i['votes']
            rating = i['rating']
            if not (ept and movieID and votes and rating): continue
            try:
                votes = int(votes.replace(',', '').replace('.', ''))
            except:
                pass
            try:
                rating = float(rating)
            except:
                pass
            ept = ept.strip()
            ept = u'%s {%s' % (title, ept)
            nr = i['nr']
            if nr:
                ept += u' (#%s)' % nr.strip()
            ept += '}'
            if movieID is not None:
                movieID = str(movieID)
            m = Movie(title=ept, movieID=movieID, accessSystem=self._as,
                      modFunct=self._modFunct)
            epofdict = m.get('episode of')
            if epofdict is not None:
                m['episode of'] = Movie(data=epofdict, accessSystem=self._as,
                                        modFunct=self._modFunct)
            nd.append({'episode': m, 'votes': votes, 'rating': rating})
        return {'episodes rating': nd}

def _normalize_href(href):
    if (href is not None) and (not href.lower().startswith('http://')):
        if href.startswith('/'): href = href[1:]
        href = '%s%s' % (imdbURL_base, href)
    return href
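
# E.g. _normalize_href(u'/title/tt0094226/') returns
# u'http://akas.imdb.com/title/tt0094226/', assuming imdbURL_base is the
# usual u'http://akas.imdb.com/'.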

class DOMHTMLOfficialsitesParser(DOMParserBase):
    """Parser for the "official sites", "external reviews", "newsgroup
    reviews", "miscellaneous links", "sound clips", "video clips" and
    "photographs" pages of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        osparser = DOMHTMLOfficialsitesParser()
        result = osparser.parse(officialsites_html_string)
    """
    _isInPtdf = False
    kind = 'official sites'

    extractors = [
        Extractor(label='site',
            path="//ol/li/a",
            attrs=Attribute(key='self.kind',
                multi=True,
                path={
                    'link': "./@href",
                    'info': "./text()"
                    },
                postprocess=lambda x: (x.get('info').strip(),
                            urllib.unquote(_normalize_href(x.get('link'))))))
        ]

class DOMHTMLConnectionParser(DOMParserBase):
    """Parser for the "connections" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        connparser = DOMHTMLConnectionParser()
        result = connparser.parse(connections_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='connection',
                            group="//div[@class='_imdbpy']",
                            group_key="./h5/text()",
                            group_key_normalize=lambda x: x.lower(),
                            path="./a",
                            attrs=Attribute(key=None,
                                            path={'title': "./text()",
                                                  'movieID': "./@href"},
                                            multi=True))]

    preprocessors = [
        ('<h5>', '</div><div class="_imdbpy"><h5>'),
        # To get the movie's year.
        ('</a> (', ' ('),
        ('\n<br/>', '</a>'),
        ('<br/> - ', '::')
        ]

    def postprocess_data(self, data):
        for key in data.keys():
            nl = []
            for v in data[key]:
                title = v['title']
                ts = title.split('::', 1)
                title = ts[0].strip()
                notes = u''
                if len(ts) == 2:
                    notes = ts[1].strip()
                m = Movie(title=title,
                          movieID=analyze_imdbid(v['movieID']),
                          accessSystem=self._as, notes=notes,
                          modFunct=self._modFunct)
                nl.append(m)
            data[key] = nl
        if not data: return {}
        return {'connections': data}

class DOMHTMLLocationsParser(DOMParserBase):
    """Parser for the "locations" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        lparser = DOMHTMLLocationsParser()
        result = lparser.parse(locations_html_string)
    """
    extractors = [Extractor(label='locations', path="//dt",
                            attrs=Attribute(key='locations', multi=True,
                                path={'place': ".//text()",
                                      'note': "./following-sibling::dd[1]" \
                                              "//text()"},
                                postprocess=lambda x: (u'%s::%s' % (
                                    x['place'].strip(),
                                    (x['note'] or u'').strip())).strip(':')))]

class DOMHTMLTechParser(DOMParserBase):
    """Parser for the "technical", "business", "literature",
    "publicity" (for people) and "contacts" (for people) pages of
    a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTechParser()
        result = tparser.parse(technical_html_string)
    """
    kind = 'tech'

    extractors = [Extractor(label='tech',
                            group="//h5",
                            group_key="./text()",
                            group_key_normalize=lambda x: x.lower(),
                            path="./following-sibling::div[1]",
                            attrs=Attribute(key=None,
                                path=".//text()",
                                postprocess=lambda x: [t.strip()
                                    for t in x.split('\n') if t.strip()]))]

    preprocessors = [
        (re.compile('(<h5>.*?</h5>)', re.I), r'\1<div class="_imdbpy">'),
        (re.compile('((<br/>|</p>|</table>))\n?<br/>(?!<a)', re.I),
            r'\1</div>'),
        # the ones below are for the publicity parser
        (re.compile('<p>(.*?)</p>', re.I), r'\1<br/>'),
        (re.compile('(</td><td valign="top">)', re.I), r'\1::'),
        (re.compile('(</tr><tr>)', re.I), r'\n\1'),
        # this is for splitting individual entries
        (re.compile('<br/>', re.I), r'\n'),
        ]

    def postprocess_data(self, data):
        for key in data:
            data[key] = filter(None, data[key])
        if self.kind in ('literature', 'business', 'contacts') and data:
            if 'screenplay/teleplay' in data:
                data['screenplay-teleplay'] = data['screenplay/teleplay']
                del data['screenplay/teleplay']
            data = {self.kind: data}
        else:
            if self.kind == 'publicity':
                if 'biography (print)' in data:
                    data['biography-print'] = data['biography (print)']
                    del data['biography (print)']
            # Tech info.
            for key in data.keys():
                if key.startswith('film negative format'):
                    data['film negative format'] = data[key]
                    del data[key]
                elif key.startswith('film length'):
                    data['film length'] = data[key]
                    del data[key]
        return data

class DOMHTMLDvdParser(DOMParserBase):
    """Parser for the "dvd" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        dparser = DOMHTMLDvdParser()
        result = dparser.parse(dvd_html_string)
    """
    _isInPtdf = False
    _defGetRefs = True

    extractors = [Extractor(label='dvd',
        path="//div[@class='base_layer']",
        attrs=[Attribute(key=None,
            multi=True,
            path={
                'title': "../table[1]//h3/text()",
                'cover': "../table[1]//img/@src",
                'region': ".//p[b='Region:']/text()",
                'asin': ".//p[b='ASIN:']/text()",
                'upc': ".//p[b='UPC:']/text()",
                'rating': ".//p/b[starts-with(text(), 'Rating:')]/../img/@alt",
                'certificate': ".//p[b='Certificate:']/text()",
                'runtime': ".//p[b='Runtime:']/text()",
                'label': ".//p[b='Label:']/text()",
                'studio': ".//p[b='Studio:']/text()",
                'release date': ".//p[b='Release Date:']/text()",
                'dvd format': ".//p[b='DVD Format:']/text()",
                'dvd features': ".//p[b='DVD Features: ']//text()",
                'supplements': "..//div[span='Supplements']" \
                        "/following-sibling::div[1]//text()",
                'review': "..//div[span='Review']/following-sibling::div[1]//text()",
                'titles': "..//div[starts-with(text(), 'Titles in this Product')]" \
                        "/..//text()",
                },
            postprocess=lambda x: {
                'title': (x.get('title') or u'').strip(),
                'cover': (x.get('cover') or u'').strip(),
                'region': (x.get('region') or u'').strip(),
                'asin': (x.get('asin') or u'').strip(),
                'upc': (x.get('upc') or u'').strip(),
                'rating': (x.get('rating') or u'Not Rated').strip().replace('Rating: ', ''),
                'certificate': (x.get('certificate') or u'').strip(),
                'runtime': (x.get('runtime') or u'').strip(),
                'label': (x.get('label') or u'').strip(),
                'studio': (x.get('studio') or u'').strip(),
                'release date': (x.get('release date') or u'').strip(),
                'dvd format': (x.get('dvd format') or u'').strip(),
                'dvd features': (x.get('dvd features') or u'').strip().replace('DVD Features: ', ''),
                'supplements': (x.get('supplements') or u'').strip(),
                'review': (x.get('review') or u'').strip(),
                'titles in this product': (x.get('titles') or u'').strip().replace('Titles in this Product::', ''),
                }
            )])]

    preprocessors = [
        (re.compile('<p>(<table class="dvd_section" .*)</p>\s*<hr\s*/>', re.I),
            r'<div class="_imdbpy">\1</div>'),
        (re.compile('<p>(<div class\s*=\s*"base_layer")', re.I), r'\1'),
        (re.compile('</p>\s*<p>(<div class="dvd_section")', re.I), r'\1'),
        (re.compile('</div><div class="dvd_row(_alt)?">', re.I), r'::')
        ]

    def postprocess_data(self, data):
        if not data:
            return data
        dvds = data['dvd']
        for dvd in dvds:
            if dvd['cover'].find('noposter') != -1:
                del dvd['cover']
            for key in dvd.keys():
                if not dvd[key]:
                    del dvd[key]
            if 'supplements' in dvd:
                dvd['supplements'] = dvd['supplements'].split('::')
        return data

class DOMHTMLRecParser(DOMParserBase):
    """Parser for the "recommendations" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRecParser()
        result = rparser.parse(recommendations_html_string)
    """
    _isInPtdf = False
    _containsObjects = True

    extractors = [Extractor(label='recommendations',
                            path="//td[@valign='middle'][1]",
                            attrs=Attribute(key='../../tr/td[1]//text()',
                                            multi=True,
                                            path={'title': ".//text()",
                                                  'movieID': ".//a/@href"}))]

    def postprocess_data(self, data):
        for key in data.keys():
            n_key = key
            n_keyl = n_key.lower()
            if n_keyl == 'suggested by the database':
                n_key = 'database'
            elif n_keyl == 'imdb users recommend':
                n_key = 'users'
            data[n_key] = [Movie(title=x['title'],
                                 movieID=analyze_imdbid(x['movieID']),
                                 accessSystem=self._as, modFunct=self._modFunct)
                           for x in data[key]]
            del data[key]
        if data: return {'recommendations': data}
        return data

class DOMHTMLNewsParser(DOMParserBase):
    """Parser for the "news" page of a given movie or person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        nwparser = DOMHTMLNewsParser()
        result = nwparser.parse(news_html_string)
    """
    _defGetRefs = True
    _isInPtdf = False

    extractors = [
        Extractor(label='news',
            path="//h2",
            attrs=Attribute(key='news',
                multi=True,
                path={
                    'title': "./text()",
                    'fromdate': "../following-sibling::p[1]/small//text()",
                    # FIXME: sometimes (see The Matrix (1999)) <p> is found
                    # inside news text.
                    'body': "../following-sibling::p[2]//text()",
                    'link': "../..//a[text()='Permalink']/@href",
                    'fulllink': "../..//a[starts-with(text(), " \
                            "'See full article at')]/@href"
                    },
                postprocess=lambda x: {
                    'title': x.get('title').strip(),
                    'date': x.get('fromdate').split('|')[0].strip(),
                    'from': x.get('fromdate').split('|')[1].replace('From ',
                            '').strip(),
                    'body': (x.get('body') or u'').strip(),
                    'link': _normalize_href(x.get('link')),
                    'full article link': _normalize_href(x.get('fulllink'))
                    }))
        ]

    preprocessors = [
        (re.compile('(<a name=[^>]+><h2>)', re.I), r'<div class="_imdbpy">\1'),
        (re.compile('(<hr/>)', re.I), r'</div>\1'),
        (re.compile('<p></p>', re.I), r'')
        ]

    def postprocess_data(self, data):
        if not data.has_key('news'):
            return {}
        for news in data['news']:
            if news.has_key('full article link'):
                if news['full article link'] is None:
                    del news['full article link']
        return data

def _parse_review(x):
    result = {}
    title = x.get('title').strip()
    if title[-1] == ':': title = title[:-1]
    result['title'] = title
    result['link'] = _normalize_href(x.get('link'))
    kind = x.get('kind').strip()
    if kind[-1] == ':': kind = kind[:-1]
    result['review kind'] = kind
    text = x.get('review').replace('\n\n', '||').replace('\n', ' ').split('||')
    review = '\n'.join(text)
    if x.get('author') is not None:
        author = x.get('author').strip()
        review = review.split(author)[0].strip()
        result['review author'] = author[2:]
    if x.get('item') is not None:
        item = x.get('item').strip()
        review = review[len(item):].strip()
        review = "%s: %s" % (item, review)
    result['review'] = review
    return result

class DOMHTMLAmazonReviewsParser(DOMParserBase):
    """Parser for the "amazon reviews" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        arparser = DOMHTMLAmazonReviewsParser()
        result = arparser.parse(amazonreviews_html_string)
    """
    _isInPtdf = False

    extractors = [
        Extractor(label='amazon reviews',
            group="//h3",
            group_key="./a/text()",
            group_key_normalize=lambda x: x[:-1],
            path="./following-sibling::p[1]/span[@class='_review']",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'title': "../preceding-sibling::h3[1]/a[1]/text()",
                    'link': "../preceding-sibling::h3[1]/a[1]/@href",
                    'kind': "./preceding-sibling::b[1]/text()",
                    'item': "./i/b/text()",
                    'review': ".//text()",
                    'author': "./i[starts-with(text(), '--')]/text()"
                    },
                postprocess=_parse_review))
        ]

    preprocessors = [
        (re.compile('<p>\n(?!<b>)', re.I), r'\n'),
        (re.compile('(\n</b>\n)', re.I), r'\1<span class="_review">'),
        (re.compile('(</p>\n\n)', re.I), r'</span>\1'),
        (re.compile('(\s\n)(<i><b>)', re.I), r'</span>\1<span class="_review">\2')
        ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = []
        for item in data.keys():
            nd = nd + data[item]
        return {'amazon reviews': nd}

def _parse_merchandising_link(x):
    result = {}
    link = x.get('link')
    result['link'] = _normalize_href(link)
    text = x.get('text')
    if text is not None:
        result['link-text'] = text.strip()
    cover = x.get('cover')
    if cover is not None:
        result['cover'] = cover
    description = x.get('description')
    if description is not None:
        shop = x.get('shop')
        if shop is not None:
            result['description'] = u'%s::%s' % (shop, description.strip())
        else:
            result['description'] = description.strip()
    return result

class DOMHTMLSalesParser(DOMParserBase):
    """Parser for the "merchandising links" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSalesParser()
        result = sparser.parse(sales_html_string)
    """
    _isInPtdf = False

    extractors = [
        Extractor(label='shops',
            group="//h5/a[@name]/..",
            group_key="./a[1]/text()",
            group_key_normalize=lambda x: x.lower(),
            path=".//following-sibling::table[1]/" \
                    "/td[@class='w_rowtable_colshop']//tr[1]",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'link': "./td[2]/a[1]/@href",
                    'text': "./td[1]/img[1]/@alt",
                    'cover': "./ancestor::td[1]/../td[1]" \
                            "/a[1]/img[1]/@src",
                    },
                postprocess=_parse_merchandising_link)),
        Extractor(label='others',
            group="//span[@class='_info']/..",
            group_key="./h5/a[1]/text()",
            group_key_normalize=lambda x: x.lower(),
            path="./span[@class='_info']",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'link': "./preceding-sibling::a[1]/@href",
                    'shop': "./preceding-sibling::a[1]/text()",
                    'description': ".//text()",
                    },
                postprocess=_parse_merchandising_link))
        ]

    preprocessors = [
        (re.compile('(<h5><a name=)', re.I), r'</div><div class="_imdbpy">\1'),
        (re.compile('(</h5>\n<br/>\n)</div>', re.I), r'\1'),
        (re.compile('(<br/><br/>\n)(\n)', re.I), r'\1</div>\2'),
        (re.compile('(\n)(Search.*?)(</a>)(\n)', re.I), r'\3\1\2\4'),
        (re.compile('(\n)(Search.*?)(\n)', re.I),
            r'\1<span class="_info">\2</span>\3')
        ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        return {'merchandising links': data}

def _build_episode(x):
    """Create a Movie object for a given series' episode."""
    episode_id = analyze_imdbid(x.get('link'))
    episode_title = x.get('title')
    e = Movie(movieID=episode_id, title=episode_title)
    e['kind'] = u'episode'
    oad = x.get('oad')
    if oad:
        e['original air date'] = oad.strip()
    year = x.get('year')
    if year is not None:
        year = year[5:]
        if year == 'unknown': year = u'????'
        if year and year.isdigit():
            year = int(year)
        e['year'] = year
    else:
        if oad and oad[-4:].isdigit():
            e['year'] = int(oad[-4:])
    epinfo = x.get('episode')
    if epinfo is not None:
        season, episode = epinfo.split(':')[0].split(',')
        e['season'] = int(season[7:])
        e['episode'] = int(episode[8:])
    else:
        e['season'] = 'unknown'
        e['episode'] = 'unknown'
    plot = x.get('plot')
    if plot:
        e['plot'] = plot.strip()
    return e
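
# Note on the slicing above: 'year' apparently comes from an anchor name
# like 'year-2005' (hence year[5:]), while 'episode' is a label like
# 'Season 1, Episode 3:'; such a label yields e['season'] == 1 and
# e['episode'] == 3.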

class DOMHTMLEpisodesParser(DOMParserBase):
    """Parser for the "episode list" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        eparser = DOMHTMLEpisodesParser()
        result = eparser.parse(episodes_html_string)
    """
    _containsObjects = True

    kind = 'episodes list'
    _episodes_path = "..//h4"
    _oad_path = "./following-sibling::span/strong[1]/text()"

    def _init(self):
        self.extractors = [
            Extractor(label='series',
                path="//html",
                attrs=[Attribute(key='series title',
                                 path=".//title/text()"),
                       Attribute(key='series movieID',
                                 path=".//h1/a[@class='main']/@href",
                                 postprocess=analyze_imdbid)
                       ]),
            Extractor(label='episodes',
                group="//div[@class='_imdbpy']/h3",
                group_key="./a/@name",
                path=self._episodes_path,
                attrs=Attribute(key=None,
                    multi=True,
                    path={
                        'link': "./a/@href",
                        'title': "./a/text()",
                        'year': "./preceding-sibling::a[1]/@name",
                        'episode': "./text()[1]",
                        'oad': self._oad_path,
                        'plot': "./following-sibling::text()[1]"
                        },
                    postprocess=_build_episode))]
        if self.kind == 'episodes cast':
            self.extractors += [
                Extractor(label='cast',
                    group="//h4",
                    group_key="./text()[1]",
                    group_key_normalize=lambda x: x.strip(),
                    path="./following-sibling::table[1]//td[@class='nm']",
                    attrs=Attribute(key=None,
                        multi=True,
                        path={'person': "..//text()",
                              'link': "./a/@href",
                              'roleID': \
                                  "../td[4]/div[@class='_imdbpyrole']/@roleid"},
                        postprocess=lambda x: \
                            build_person(x.get('person') or u'',
                                personID=analyze_imdbid(x.get('link')),
                                roleID=(x.get('roleID') or u'').split('/'),
                                accessSystem=self._as,
                                modFunct=self._modFunct)))
                ]

    preprocessors = [
        (re.compile('(<hr/>\n)(<h3>)', re.I),
            r'</div>\1<div class="_imdbpy">\2'),
        (re.compile('(</p>\n\n)</div>', re.I), r'\1'),
        (re.compile('<h3>(.*?)</h3>', re.I), r'<h4>\1</h4>'),
        (_reRolesMovie, _manageRoles),
        (re.compile('(<br/> <br/>\n)(<hr/>)', re.I), r'\1</div>\2')
        ]
  1522. def postprocess_data(self, data):
  1523. # A bit extreme?
  1524. if not 'series title' in data: return {}
  1525. if not 'series movieID' in data: return {}
  1526. stitle = data['series title'].replace('- Episode list', '')
  1527. stitle = stitle.replace('- Episodes list', '')
  1528. stitle = stitle.replace('- Episode cast', '')
  1529. stitle = stitle.replace('- Episodes cast', '')
  1530. stitle = stitle.strip()
  1531. if not stitle: return {}
  1532. seriesID = data['series movieID']
  1533. if seriesID is None: return {}
  1534. series = Movie(title=stitle, movieID=str(seriesID),
  1535. accessSystem=self._as, modFunct=self._modFunct)
  1536. nd = {}
  1537. for key in data.keys():
  1538. if key.startswith('season-'):
  1539. season_key = key[7:]
  1540. try: season_key = int(season_key)
  1541. except: pass
  1542. nd[season_key] = {}
  1543. for episode in data[key]:
  1544. if not episode: continue
  1545. episode_key = episode.get('episode')
  1546. if episode_key is None: continue
  1547. cast_key = 'Season %s, Episode %s:' % (season_key,
  1548. episode_key)
  1549. if data.has_key(cast_key):
  1550. cast = data[cast_key]
  1551. for i in xrange(len(cast)):
  1552. cast[i].billingPos = i + 1
  1553. episode['cast'] = cast
  1554. episode['episode of'] = series
  1555. nd[season_key][episode_key] = episode
  1556. if len(nd) == 0:
  1557. return {}
  1558. return {'episodes': nd}
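
# Shape of the value returned by postprocess_data above (season and
# episode numbers are hypothetical): a dictionary of seasons, each
# mapping episode numbers to Movie objects whose 'episode of' key
# points back to the series.
#
#   {'episodes': {1: {1: <Movie episode>, 2: <Movie episode>},
#                 2: {1: <Movie episode>}}}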


class DOMHTMLEpisodesCastParser(DOMHTMLEpisodesParser):
    """Parser for the "episodes cast" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        eparser = DOMHTMLEpisodesCastParser()
        result = eparser.parse(episodes_cast_html_string)
    """
    kind = 'episodes cast'
    _episodes_path = "..//h4"
    _oad_path = "./following-sibling::b[1]/text()"
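
# The subclass only swaps the 'kind' marker and the XPath hooks: with
# kind == 'episodes cast', the extra 'cast' extractor defined in
# DOMHTMLEpisodesParser._init is activated, and postprocess_data
# attaches the cast list (billingPos set to 1, 2, 3, ...) to every
# episode.  A usage sketch (indices hypothetical), looking at the
# postprocessed 'episodes' section:
#
#   ecparser = DOMHTMLEpisodesCastParser()
#   result = ecparser.parse(episodes_cast_html_string)
#   # the Movie for season 1, episode 1 then carries a 'cast' key
#   # holding a list of Person objects.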


class DOMHTMLFaqsParser(DOMParserBase):
    """Parser for the "FAQ" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        fparser = DOMHTMLFaqsParser()
        result = fparser.parse(faqs_html_string)
    """
    _isInPtdf = False
    _defGetRefs = True

    # XXX: bsoup and lxml don't match (looks like a minor issue, anyway).

    extractors = [
        Extractor(label='faqs',
                  path="//div[@class='section']",
                  attrs=Attribute(key='faqs',
                                  multi=True,
                                  path={
                                      'question': "./h3/a/span/text()",
                                      'answer': "../following-sibling::div[1]//text()"
                                  },
                                  postprocess=lambda x: u'%s::%s' % (
                                      x.get('question').strip(),
                                      '\n\n'.join(x.get('answer').replace(
                                          '\n\n', '\n').strip().split('||')))))
    ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||'),
        (re.compile('<h4>(.*?)</h4>\n', re.I), r'||\1--'),
        (re.compile('<span class="spoiler"><span>(.*?)</span></span>', re.I),
         r'[spoiler]\1[/spoiler]')
    ]
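
# Each extracted FAQ is flattened by the postprocess lambda above into
# a single unicode string of the form u'question::answer'; the answer
# paragraphs (split on the '||' markers produced by the preprocessors)
# are rejoined with blank lines, and spoiler spans survive as
# '[spoiler]...[/spoiler]' markers.  A hypothetical entry:
#
#   u'Is this based on a book?::Yes.\n\n[spoiler]The ending differs.[/spoiler]'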


class DOMHTMLAiringParser(DOMParserBase):
    """Parser for the "airing" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        aparser = DOMHTMLAiringParser()
        result = aparser.parse(airing_html_string)
    """
    _isInPtdf = False
    _containsObjects = True

    extractors = [
        Extractor(label='series title',
                  path="//title",
                  attrs=Attribute(key='series title', path="./text()",
                                  postprocess=lambda x:
                                      x.replace(' - TV schedule', u''))),
        Extractor(label='series id',
                  path="//h1/a[@href]",
                  attrs=Attribute(key='series id', path="./@href")),
        Extractor(label='tv airings',
                  path="//tr[@class]",
                  attrs=Attribute(key='airing',
                                  multi=True,
                                  path={
                                      'date': "./td[1]//text()",
                                      'time': "./td[2]//text()",
                                      'channel': "./td[3]//text()",
                                      'link': "./td[4]/a[1]/@href",
                                      'title': "./td[4]//text()",
                                      'season': "./td[5]//text()",
                                  },
                                  postprocess=lambda x: {
                                      'date': x.get('date'),
                                      'time': x.get('time'),
                                      'channel': x.get('channel').strip(),
                                      'link': x.get('link'),
                                      'title': x.get('title'),
                                      'season': (x.get('season') or '').strip()
                                  }))
    ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        seriesTitle = data['series title']
        seriesID = analyze_imdbid(data['series id'])
        if data.has_key('airing'):
            for airing in data['airing']:
                title = airing.get('title', '').strip()
                if not title:
                    epsTitle = seriesTitle
                    if seriesID is None:
                        continue
                    epsID = seriesID
                else:
                    epsTitle = '%s {%s}' % (data['series title'],
                                            airing['title'])
                    epsID = analyze_imdbid(airing['link'])
                e = Movie(title=epsTitle, movieID=epsID)
                airing['episode'] = e
                del airing['link']
                del airing['title']
                if not airing['season']:
                    del airing['season']
        if 'series title' in data:
            del data['series title']
        if 'series id' in data:
            del data['series id']
        if 'airing' in data:
            data['airing'] = filter(None, data['airing'])
        if 'airing' not in data or not data['airing']:
            return {}
        return data
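
# Shape of the dictionary returned by postprocess_data above (values
# hypothetical): a list of airings, each with the raw 'link' and
# 'title' keys replaced by an 'episode' Movie object, and empty
# 'season' entries dropped.
#
#   {'airing': [{'date': 'Mon., Jan. 1',
#                'time': '8:00 PM',
#                'channel': 'CHANNEL',
#                'season': '1',
#                'episode': <Movie 'Series {An Episode}'>}]}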


class DOMHTMLSynopsisParser(DOMParserBase):
    """Parser for the "synopsis" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSynopsisParser()
        result = sparser.parse(synopsis_html_string)
    """
    _isInPtdf = False

    extractors = [
        Extractor(label='synopsis',
                  path="//div[@class='display'][not(@style)]",
                  attrs=Attribute(key='synopsis',
                                  path=".//text()",
                                  postprocess=lambda x:
                                      '\n\n'.join(x.strip().split('||'))))
    ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||')
    ]
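
# The single 'synopsis' attribute collects the whole text of the first
# non-styled 'display' div; the '<br/><br/>' markers rewritten to '||'
# by the preprocessor become paragraph breaks.  Resulting section
# (text hypothetical):
#
#   {'synopsis': u'First paragraph...\n\nSecond paragraph...'}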


class DOMHTMLParentsGuideParser(DOMParserBase):
    """Parser for the "parents guide" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        pgparser = DOMHTMLParentsGuideParser()
        result = pgparser.parse(parentsguide_html_string)
    """
    _isInPtdf = False

    extractors = [
        Extractor(label='parents guide',
                  group="//div[@class='section']",
                  group_key="./h3/a/span/text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../following-sibling::div[1]/p",
                  attrs=Attribute(key=None,
                                  path=".//text()",
                                  postprocess=lambda x: [t.strip().replace('\n', ' ')
                                                         for t in x.split('||')
                                                         if t.strip()]))
    ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||')
    ]

    def postprocess_data(self, data):
        data2 = {}
        for key in data:
            if data[key]:
                data2[key] = data[key]
        if not data2:
            return {}
        return {'parents guide': data2}
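
# Shape of the value returned by postprocess_data above: section names
# come lowercased from the page's h3 headers, each mapping to a list
# of paragraphs (strings hypothetical).
#
#   {'parents guide': {'violence & gore': ['Some fights are shown.'],
#                      'profanity': ['Mild language.']}}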


_OBJECTS = {
    'movie_parser': ((DOMHTMLMovieParser,), None),
    'plot_parser': ((DOMHTMLPlotParser,), None),
    'movie_awards_parser': ((DOMHTMLAwardsParser,), None),
    'taglines_parser': ((DOMHTMLTaglinesParser,), None),
    'keywords_parser': ((DOMHTMLKeywordsParser,), None),
    'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None),
    'goofs_parser': ((DOMHTMLGoofsParser,), None),
    'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None),
    'trivia_parser': ((DOMHTMLTriviaParser,), None),
    'soundtrack_parser': ((DOMHTMLSoundtrackParser,), {'kind': 'soundtrack'}),
    'quotes_parser': ((DOMHTMLQuotesParser,), None),
    'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
    'ratings_parser': ((DOMHTMLRatingsParser,), None),
    'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
    'externalrev_parser': ((DOMHTMLOfficialsitesParser,),
                           {'kind': 'external reviews'}),
    'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,),
                            {'kind': 'newsgroup reviews'}),
    'misclinks_parser': ((DOMHTMLOfficialsitesParser,),
                         {'kind': 'misc links'}),
    'soundclips_parser': ((DOMHTMLOfficialsitesParser,),
                          {'kind': 'sound clips'}),
    'videoclips_parser': ((DOMHTMLOfficialsitesParser,),
                          {'kind': 'video clips'}),
    'photosites_parser': ((DOMHTMLOfficialsitesParser,),
                          {'kind': 'photo sites'}),
    'connections_parser': ((DOMHTMLConnectionParser,), None),
    'tech_parser': ((DOMHTMLTechParser,), None),
    'business_parser': ((DOMHTMLTechParser,),
                        {'kind': 'business', '_defGetRefs': 1}),
    'literature_parser': ((DOMHTMLTechParser,), {'kind': 'literature'}),
    'locations_parser': ((DOMHTMLLocationsParser,), None),
    'dvd_parser': ((DOMHTMLDvdParser,), None),
    'rec_parser': ((DOMHTMLRecParser,), None),
    'news_parser': ((DOMHTMLNewsParser,), None),
    'amazonrev_parser': ((DOMHTMLAmazonReviewsParser,), None),
    'sales_parser': ((DOMHTMLSalesParser,), None),
    'episodes_parser': ((DOMHTMLEpisodesParser,), None),
    'episodes_cast_parser': ((DOMHTMLEpisodesCastParser,), None),
    'eprating_parser': ((DOMHTMLEpisodesRatings,), None),
    'movie_faqs_parser': ((DOMHTMLFaqsParser,), None),
    'airing_parser': ((DOMHTMLAiringParser,), None),
    'synopsis_parser': ((DOMHTMLSynopsisParser,), None),
    'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None)
}
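
# A minimal sketch of how a mapping like _OBJECTS can be consumed (the
# actual factory code lives elsewhere in the package; this helper is
# hypothetical, for illustration only).  Each value pairs a tuple of
# parser classes with an optional dict of attributes to set on the
# instance:
#
#   def _example_build_parser(key):
#       classes, attrs = _OBJECTS[key]
#       parser = classes[0]()
#       for name, value in (attrs or {}).items():
#           setattr(parser, name, value)
#       return parser
#
#   # e.g. _example_build_parser('business_parser') would yield a
#   # DOMHTMLTechParser with kind='business' and _defGetRefs=1.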