
/resources/lib/IMDbPY/imdb/parser/http/movieParser.py

https://code.google.com/p/mythbox/
  1. """
  2. parser.http.movieParser module (imdb package).
  3. This module provides the classes (and the instances), used to parse the
  4. IMDb pages on the akas.imdb.com server about a movie.
  5. E.g., for Brian De Palma's "The Untouchables", the referred
  6. pages would be:
  7. combined details: http://akas.imdb.com/title/tt0094226/combined
  8. plot summary: http://akas.imdb.com/title/tt0094226/plotsummary
  9. ...and so on...
  10. Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
  11. 2008 H. Turgut Uyar <uyar@tekir.org>
  12. This program is free software; you can redistribute it and/or modify
  13. it under the terms of the GNU General Public License as published by
  14. the Free Software Foundation; either version 2 of the License, or
  15. (at your option) any later version.
  16. This program is distributed in the hope that it will be useful,
  17. but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. GNU General Public License for more details.
  20. You should have received a copy of the GNU General Public License
  21. along with this program; if not, write to the Free Software
  22. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  23. """
  24. import re
  25. import urllib
  26. from imdb import imdbURL_base
  27. from imdb.Person import Person
  28. from imdb.Movie import Movie
  29. from imdb.Company import Company
  30. from imdb.utils import analyze_title, split_company_name_notes, _Container
  31. from utils import build_person, DOMParserBase, Attribute, Extractor, \
  32. analyze_imdbid
  33. # Dictionary used to convert some section's names.
  34. _SECT_CONV = {
  35. 'directed': 'director',
  36. 'directed by': 'director',
  37. 'directors': 'director',
  38. 'editors': 'editor',
  39. 'writing credits': 'writer',
  40. 'writers': 'writer',
  41. 'produced': 'producer',
  42. 'cinematography': 'cinematographer',
  43. 'film editing': 'editor',
  44. 'casting': 'casting director',
  45. 'costume design': 'costume designer',
  46. 'makeup department': 'make up',
  47. 'production management': 'production manager',
  48. 'second unit director or assistant director': 'assistant director',
  49. 'costume and wardrobe department': 'costume department',
  50. 'sound department': 'sound crew',
  51. 'stunts': 'stunt performer',
  52. 'other crew': 'miscellaneous crew',
  53. 'also known as': 'akas',
  54. 'country': 'countries',
  55. 'runtime': 'runtimes',
  56. 'language': 'languages',
  57. 'certification': 'certificates',
  58. 'genre': 'genres',
  59. 'created': 'creator',
  60. 'creators': 'creator',
  61. 'color': 'color info',
  62. 'plot': 'plot outline',
  63. 'seasons': 'number of seasons',
  64. 'art directors': 'art direction',
  65. 'assistant directors': 'assistant director',
  66. 'set decorators': 'set decoration',
  67. 'visual effects department': 'visual effects',
  68. 'production managers': 'production manager',
  69. 'miscellaneous': 'miscellaneous crew',
  70. 'make up department': 'make up',
  71. 'plot summary': 'plot outline',
  72. 'cinematographers': 'cinematographer',
  73. 'camera department': 'camera and electrical department',
  74. 'costume designers': 'costume designer',
  75. 'production designers': 'production design',
  76. 'production managers': 'production manager',
  77. 'music original': 'original music',
  78. 'casting directors': 'casting director',
  79. 'other companies': 'miscellaneous companies',
  80. 'producers': 'producer',
  81. 'special effects by': 'special effects department',
  82. 'special effects': 'special effects companies'
  83. }
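
# Illustrative usage (not part of the original module): section names
# scraped from the page are normalized through _SECT_CONV by
# DOMHTMLMovieParser.postprocess_data(), e.g.:
#   _SECT_CONV.get('writing credits', 'writing credits')  ->  'writer'
#   _SECT_CONV.get('cast', 'cast')                        ->  'cast' (unchanged)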

def _manageRoles(mo):
    """Perform some transformation on the html, so that roleIDs can
    be easily retrieved."""
    firstHalf = mo.group(1)
    secondHalf = mo.group(2)
    newRoles = []
    roles = secondHalf.split(' / ')
    for role in roles:
        role = role.strip()
        if not role:
            continue
        roleID = analyze_imdbid(role)
        if roleID is None:
            roleID = u'/'
        else:
            roleID += u'/'
        newRoles.append(u'<div class="_imdbpyrole" roleid="%s">%s</div>' % \
                (roleID, role.strip()))
    return firstHalf + u' / '.join(newRoles) + mo.group(3)

_reRolesMovie = re.compile(r'(<td class="char">)(.*?)(</td>)',
                            re.I | re.M | re.S)
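
# Illustrative example (not in the original source): applied through
# _reRolesMovie, a cast-table cell like
#   <td class="char"><a href="/character/ch0003537/">Eliot Ness</a></td>
# would be rewritten along the lines of
#   <td class="char"><div class="_imdbpyrole" roleid="0003537/"><a
#    href="/character/ch0003537/">Eliot Ness</a></div></td>
# so the roleID can later be read back from the @roleid attribute.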

def _replaceBR(mo):
    """Replaces <br> tags with '::' (useful for some akas)"""
    txt = mo.group(0)
    return txt.replace('<br>', '::')

_reAkas = re.compile(r'<h5>also known as:</h5>.*?</div>', re.I | re.M | re.S)

def makeSplitter(lstrip=None, sep='|', comments=True,
                origNotesSep=' (', newNotesSep='::(', strip=None):
    """Return a splitter function suitable for a given set of data."""
    def splitter(x):
        if not x: return x
        x = x.strip()
        if not x: return x
        if lstrip is not None:
            x = x.lstrip(lstrip).lstrip()
        lx = x.split(sep)
        lx[:] = filter(None, [j.strip() for j in lx])
        if comments:
            lx[:] = [j.replace(origNotesSep, newNotesSep, 1) for j in lx]
        if strip:
            lx[:] = [j.strip(strip) for j in lx]
        return lx
    return splitter
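
# Illustrative usage (not in the original source):
#   split_languages = makeSplitter(lstrip='Language:')
#   split_languages('Language:English | Italian (some scenes)')
#       ->  ['English', 'Italian::(some scenes)']
# Note that lstrip is a *set* of characters handed to str.lstrip(),
# not a prefix; it works here because the payload text never starts
# with one of those characters.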

def _toInt(val, replace=()):
    """Return the value, converted to integer, or None; if present, 'replace'
    must be a list of tuples of values to replace."""
    for before, after in replace:
        val = val.replace(before, after)
    try:
        return int(val)
    except (TypeError, ValueError):
        return None
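
# Illustrative usage (not in the original source):
#   _toInt('24,852 votes', [(',', ''), (' votes', '')])  ->  24852
#   _toInt('n/a')                                        ->  None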

class DOMHTMLMovieParser(DOMParserBase):
    """Parser for the "combined details" (and if instance.mdparse is
    True also for the "main details") page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        mparser = DOMHTMLMovieParser()
        result = mparser.parse(combined_details_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='title',
                        path="//h1",
                        attrs=Attribute(key='title',
                                        path=".//text()",
                                        postprocess=analyze_title)),

                Extractor(label='glossarysections',
                        group="//a[@class='glossary']",
                        group_key="./@name",
                        group_key_normalize=lambda x: x.replace('_', ' '),
                        path="../../../..//tr",
                        attrs=Attribute(key=None,
                            multi=True,
                            path={'person': ".//text()",
                                  'link': "./td[1]/a[@href]/@href"},
                            postprocess=lambda x: \
                                    build_person(x.get('person') or u'',
                                    personID=analyze_imdbid(x.get('link')))
                            )),

                Extractor(label='cast',
                        path="//table[@class='cast']//tr",
                        attrs=Attribute(key="cast",
                            multi=True,
                            path={'person': ".//text()",
                                  'link': "td[2]/a/@href",
                                  'roleID': \
                                    "td[4]/div[@class='_imdbpyrole']/@roleid"},
                            postprocess=lambda x: \
                                    build_person(x.get('person') or u'',
                                    personID=analyze_imdbid(x.get('link')),
                                    roleID=(x.get('roleID') or u'').split('/'))
                            )),

                Extractor(label='genres',
                        path="//div[@class='info']//a[starts-with(@href," \
                                " '/Sections/Genres')]",
                        attrs=Attribute(key="genres",
                            multi=True,
                            path="./text()")),

                Extractor(label='h5sections',
                        path="//div[@class='info']/h5/..",
                        attrs=[
                            Attribute(key="plot summary",
                                path="./h5[starts-with(text(), " \
                                        "'Plot:')]/../div/text()",
                                postprocess=lambda x: \
                                        x.strip().rstrip('|').rstrip()),
                            Attribute(key="aspect ratio",
                                path="./h5[starts-with(text()," \
                                        " 'Aspect')]/../div/text()",
                                postprocess=lambda x: x.strip()),
                            Attribute(key="mpaa",
                                path="./h5/a[starts-with(text()," \
                                        " 'MPAA')]/../../div/text()",
                                postprocess=lambda x: x.strip()),
                            Attribute(key="countries",
                                path="./h5[starts-with(text(), " \
                            "'Countr')]/../div[@class='info-content']//text()",
                                postprocess=makeSplitter('|')),
                            Attribute(key="language",
                                path="./h5[starts-with(text(), " \
                                        "'Language')]/..//text()",
                                postprocess=makeSplitter('Language:')),
                            Attribute(key='color info',
                                path="./h5[starts-with(text(), " \
                                        "'Color')]/..//text()",
                                postprocess=makeSplitter('Color:')),
                            Attribute(key='sound mix',
                                path="./h5[starts-with(text(), " \
                                        "'Sound Mix')]/..//text()",
                                postprocess=makeSplitter('Sound Mix:')),
                            # Collects akas not enclosed in <i> tags.
                            Attribute(key='other akas',
                                path="./h5[starts-with(text(), " \
                                        "'Also Known As')]/../div//text()",
                                postprocess=makeSplitter(sep='::',
                                                origNotesSep='" - ',
                                                newNotesSep='::',
                                                strip='"')),
                            Attribute(key='runtimes',
                                path="./h5[starts-with(text(), " \
                                        "'Runtime')]/../div/text()",
                                postprocess=makeSplitter()),
                            Attribute(key='certificates',
                                path="./h5[starts-with(text(), " \
                                        "'Certificat')]/..//text()",
                                postprocess=makeSplitter('Certification:')),
                            Attribute(key='number of seasons',
                                path="./h5[starts-with(text(), " \
                                        "'Seasons')]/..//text()",
                                postprocess=lambda x: x.count('|') + 1),
                            Attribute(key='original air date',
                                path="./h5[starts-with(text(), " \
                                        "'Original Air Date')]/../div/text()"),
                            Attribute(key='tv series link',
                                path="./h5[starts-with(text(), " \
                                        "'TV Series')]/..//a/@href"),
                            Attribute(key='tv series title',
                                path="./h5[starts-with(text(), " \
                                        "'TV Series')]/..//a/text()")
                            ]),

                Extractor(label='creator',
                        path="//h5[starts-with(text(), 'Creator')]/..//a",
                        attrs=Attribute(key='creator', multi=True,
                            path={'name': "./text()",
                                  'link': "./@href"},
                            postprocess=lambda x: \
                                    build_person(x.get('name') or u'',
                                    personID=analyze_imdbid(x.get('link')))
                            )),

                Extractor(label='thin writer',
                        path="//h5[starts-with(text(), 'Writer')]/..//a",
                        attrs=Attribute(key='thin writer', multi=True,
                            path={'name': "./text()",
                                  'link': "./@href"},
                            postprocess=lambda x: \
                                    build_person(x.get('name') or u'',
                                    personID=analyze_imdbid(x.get('link')))
                            )),

                Extractor(label='thin director',
                        path="//h5[starts-with(text(), 'Director')]/..//a",
                        attrs=Attribute(key='thin director', multi=True,
                            path={'name': "./text()",
                                  'link': "./@href"},
                            postprocess=lambda x: \
                                    build_person(x.get('name') or u'',
                                    personID=analyze_imdbid(x.get('link')))
                            )),

                Extractor(label='top 250/bottom 100',
                        path="//div[@class='starbar-special']/" \
                                "a[starts-with(@href, '/chart/')]",
                        attrs=Attribute(key='top/bottom rank',
                                        path="./text()")),

                Extractor(label='series years',
                        path="//div[@id='tn15title']//span" \
                                "[starts-with(text(), 'TV series')]",
                        attrs=Attribute(key='series years',
                                        path="./text()",
                                        postprocess=lambda x: \
                                            x.replace('TV series', '').strip())),

                Extractor(label='number of episodes',
                        path="//a[@title='Full Episode List']",
                        attrs=Attribute(key='number of episodes',
                                        path="./text()",
                                        postprocess=lambda x: \
                                            _toInt(x, [(' Episodes', '')]))),

                Extractor(label='akas',
                        path="//i[@class='transl']",
                        attrs=Attribute(key='akas', multi=True, path='text()',
                                postprocess=lambda x:
                                    x.replace('  ', ' ').rstrip('-').replace('" - ',
                                        '"::', 1).strip('"').replace('  ', ' '))),

                Extractor(label='production notes/status',
                        path="//div[@class='info inprod']",
                        attrs=Attribute(key='production notes',
                                path=".//text()",
                                postprocess=lambda x: x.strip())),

                Extractor(label='blackcatheader',
                        group="//b[@class='blackcatheader']",
                        group_key="./text()",
                        group_key_normalize=lambda x: x.lower(),
                        path="../ul/li",
                        attrs=Attribute(key=None,
                                multi=True,
                                path={'name': "./a//text()",
                                      'comp-link': "./a/@href",
                                      'notes': "./text()"},
                                postprocess=lambda x: \
                                        Company(name=x.get('name') or u'',
                                    companyID=analyze_imdbid(x.get('comp-link')),
                                    notes=(x.get('notes') or u'').strip())
                                )),

                Extractor(label='rating',
                        path="//div[@class='starbar-meta']/b",
                        attrs=Attribute(key='rating',
                                        path=".//text()")),

                Extractor(label='votes',
                        path="//div[@class='starbar-meta']/a[@href]",
                        attrs=Attribute(key='votes',
                                        path=".//text()")),

                Extractor(label='cover url',
                        path="//a[@name='poster']",
                        attrs=Attribute(key='cover url',
                                        path="./img/@src"))
                ]

    preprocessors = [
        (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I),
            r'</div><div>\1'),
        ('<small>Full cast and crew for<br></small>', ''),
        ('<td> </td>', '<td>...</td>'),
        ('<span class="tv-extra">TV mini-series</span>',
            '<span class="tv-extra">(mini)</span>'),
        (_reRolesMovie, _manageRoles),
        (_reAkas, _replaceBR)]

    def preprocess_dom(self, dom):
        # Handle series information.
        xpath = self.xpath(dom, "//b[text()='Series Crew']")
        if xpath:
            b = xpath[-1] # In doubt, take the last one.
            for a in self.xpath(b, "./following::h5/a[@class='glossary']"):
                name = a.get('name')
                if name:
                    a.set('name', 'series %s' % name)
        # Remove links to IMDbPro.
        for proLink in self.xpath(dom, "//span[@class='pro-link']"):
            proLink.drop_tree()
        # Remove some 'more' links (keep others, like the one around
        # the number of votes).
        for tn15more in self.xpath(dom,
                    "//a[@class='tn15more'][starts-with(@href, '/title/')]"):
            tn15more.drop_tree()
        return dom

    re_space = re.compile(r'\s+')
    re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)

    def postprocess_data(self, data):
        # Convert section names.
        for sect in data.keys():
            if sect in _SECT_CONV:
                data[_SECT_CONV[sect]] = data[sect]
                del data[sect]
                sect = _SECT_CONV[sect]
        # Filter out fake values.
        for key in data:
            value = data[key]
            if isinstance(value, list) and value:
                if isinstance(value[0], Person):
                    data[key] = filter(lambda x: x.personID is not None,
                                        value)
                if isinstance(value[0], _Container):
                    for obj in data[key]:
                        obj.accessSystem = self._as
                        obj.modFunct = self._modFunct
        if 'akas' in data or 'other akas' in data:
            akas = data.get('akas') or []
            other_akas = data.get('other akas') or []
            akas += other_akas
            if 'akas' in data:
                del data['akas']
            if 'other akas' in data:
                del data['other akas']
            if akas:
                data['akas'] = akas
        if 'runtimes' in data:
            data['runtimes'] = [x.replace(' min', u'')
                                for x in data['runtimes']]
        if 'production notes' in data:
            pn = data['production notes'].replace('\n\nComments:',
                    '\nComments:').replace('\n\nNote:',
                    '\nNote:').replace('Note:\n\n', 'Note:\n').split('\n')
            for k, v in zip(pn[::2], pn[1::2]):
                v = v.strip()
                if not v:
                    continue
                k = k.lower().strip(':')
                if k == 'note':
                    k = 'status note'
                data[k] = v
            del data['production notes']
        if 'original air date' in data:
            oid = self.re_space.sub(' ', data['original air date']).strip()
            data['original air date'] = oid
            aid = self.re_airdate.findall(oid)
            if aid and len(aid[0]) == 3:
                date, season, episode = aid[0]
                date = date.strip()
                try: season = int(season)
                except ValueError: pass
                try: episode = int(episode)
                except ValueError: pass
                if date and date != '????':
                    data['original air date'] = date
                else:
                    del data['original air date']
                # Handle also "episode 0".
                if season or type(season) is type(0):
                    data['season'] = season
                if episode or type(episode) is type(0):
                    data['episode'] = episode
        for k in ('writer', 'director'):
            t_k = 'thin %s' % k
            if t_k not in data:
                continue
            if k not in data:
                data[k] = data[t_k]
            del data[t_k]
        if 'top/bottom rank' in data:
            tbVal = data['top/bottom rank'].lower()
            if tbVal.startswith('top'):
                tbKey = 'top 250 rank'
                tbVal = _toInt(tbVal, [('top 250: #', '')])
            else:
                tbKey = 'bottom 100 rank'
                tbVal = _toInt(tbVal, [('bottom 100: #', '')])
            if tbVal:
                data[tbKey] = tbVal
            del data['top/bottom rank']
        if 'year' in data and data['year'] == '????':
            del data['year']
        if 'tv series link' in data:
            if 'tv series title' in data:
                data['episode of'] = Movie(title=data['tv series title'],
                                            movieID=analyze_imdbid(
                                                    data['tv series link']),
                                            accessSystem=self._as,
                                            modFunct=self._modFunct)
                del data['tv series title']
            del data['tv series link']
        if 'rating' in data:
            try:
                data['rating'] = float(data['rating'].replace('/10', ''))
            except (TypeError, ValueError):
                pass
        if 'votes' in data:
            try:
                votes = data['votes'].replace(',', '').replace('votes', '')
                data['votes'] = int(votes)
            except (TypeError, ValueError):
                pass
        return data

def _process_plotsummary(x):
    """Process a plot (contributed by Rdian06)."""
    xauthor = x.get('author')
    if xauthor:
        xauthor = xauthor.replace('{', '<').replace('}', '>').replace('(',
                            '<').replace(')', '>').strip()
    xplot = x.get('plot', u'').strip()
    if xauthor:
        xplot += u'::%s' % xauthor
    return xplot
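
# Illustrative example (not in the original source):
#   _process_plotsummary({'plot': u'Eliot Ness takes on Al Capone.',
#                         'author': u'John Doe {jdoe@example.com}'})
#       ->  u'Eliot Ness takes on Al Capone.::John Doe <jdoe@example.com>'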

class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a 'plot' key, containing a list of strings
    with the structure: 'summary::summary_author <author@email>'.

    Example:
        pparser = DOMHTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    _defGetRefs = True

    # Notice that recently IMDb started to put the email of the
    # author only in the link, which we're not collecting here.
    extractors = [Extractor(label='plot',
                            path="//p[@class='plotpar']",
                            attrs=Attribute(key='plot',
                                            multi=True,
                                            path={'plot': './text()',
                                                  'author': './i/a/text()'},
                                            postprocess=_process_plotsummary))]

def _process_award(x):
    award = {}
    award['award'] = x.get('award').strip()
    if not award['award']:
        return {}
    award['year'] = x.get('year').strip()
    if award['year'] and award['year'].isdigit():
        award['year'] = int(award['year'])
    award['result'] = x.get('result').strip()
    category = x.get('category').strip()
    if category:
        award['category'] = category
    received_with = x.get('with')
    if received_with is not None:
        award['with'] = received_with.strip()
    notes = x.get('notes')
    if notes is not None:
        notes = notes.strip()
        if notes:
            award['notes'] = notes
    award['anchor'] = x.get('anchor')
    return award
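
# Illustrative output shape (not in the original source); 'anchor' keeps
# the raw cell text and is used later to match award and recipients:
#   {'award': 'Oscar', 'year': 1988, 'result': 'Won',
#    'category': 'Best Actor in a Supporting Role', 'anchor': '...'}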

class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        awparser = DOMHTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    subject = 'title'
    _containsObjects = True

    extractors = [
        Extractor(label='awards',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr/" \
                    "td[last()][not(@colspan)]",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'year': "../td[1]/a/text()",
                    'result': "../td[2]/b/text()",
                    'award': "../td[3]/text()",
                    'category': "./text()[1]",
                    # FIXME: takes only the first co-recipient
                    'with': "./small[starts-with(text()," \
                            " 'Shared with:')]/following-sibling::a[1]/text()",
                    'notes': "./small[last()]//text()",
                    'anchor': ".//text()"
                    },
                postprocess=_process_award
                )),
        Extractor(label='recipients',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr/" \
                    "td[last()]/small[1]/preceding-sibling::a",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'name': "./text()",
                    'link': "./@href",
                    'anchor': "..//text()"
                    }
                ))
        ]

    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
            r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
            r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
        ]

    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span-1]:
                # if not cloned, the child would be moved to the new parent
                clone = self.clone(col)
                # XXX: beware that here we don't use an "adapted" function,
                # because both BeautifulSoup and lxml use the same
                # "insert" method.
                tr.insert(position, clone)
        return dom
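
    # Illustrative effect of preprocess_dom (not in the original source):
    #   <tr><td rowspan="2">2001</td><td>Won</td></tr>
    #   <tr><td>Nominated</td></tr>
    # becomes
    #   <tr><td>2001</td><td>Won</td></tr>
    #   <tr><td>2001</td><td>Nominated</td></tr>
    # so every row can be parsed with the same positional XPaths.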

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = []
        for key in data.keys():
            dom = self.get_dom(key)
            assigner = self.xpath(dom, "//a/text()")[0]
            for entry in data[key]:
                if not entry.has_key('name'):
                    if not entry:
                        continue
                    # this is an award, not a recipient
                    entry['assigner'] = assigner.strip()
                    # find the recipients
                    matches = [p for p in data[key]
                               if p.has_key('name') and (entry['anchor'] ==
                                   p['anchor'])]
                    if self.subject == 'title':
                        recipients = [Person(name=recipient['name'],
                                    personID=analyze_imdbid(recipient['link']))
                                    for recipient in matches]
                        entry['to'] = recipients
                    elif self.subject == 'name':
                        recipients = [Movie(title=recipient['name'],
                                    movieID=analyze_imdbid(recipient['link']))
                                    for recipient in matches]
                        entry['for'] = recipients
                    nd.append(entry)
                del entry['anchor']
        return {'awards': nd}

class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    extractors = [Extractor(label='taglines',
                            path="//div[@id='tn15content']/p",
                            attrs=Attribute(key='taglines', multi=True,
                                            path="./text()"))]

class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    extractors = [Extractor(label='keywords',
                            path="//a[starts-with(@href, '/keyword/')]",
                            attrs=Attribute(key='keywords',
                                            path="./text()", multi=True,
                                            postprocess=lambda x: \
                                                x.lower().replace(' ', '-')))]

class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        avparser = DOMHTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    _defGetRefs = True
    kind = 'alternate versions'
    extractors = [Extractor(label='alternate versions',
                            path="//ul[@class='trivia']/li",
                            attrs=Attribute(key='self.kind',
                                            multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: x.strip()))]

class DOMHTMLTriviaParser(DOMParserBase):
    """Parser for the "trivia" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTriviaParser()
        result = tparser.parse(trivia_html_string)
    """
    _defGetRefs = True
    extractors = [Extractor(label='trivia',
                            path="//div[@class='sodatext']",
                            attrs=Attribute(key='trivia',
                                            multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: x.strip()))]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//span[@class='linksoda']"):
            qLink.drop_tree()
        return dom

class DOMHTMLSoundtrackParser(DOMHTMLAlternateVersionsParser):
    kind = 'soundtrack'

    preprocessors = [
        ('<br>', '\n')
        ]

    def postprocess_data(self, data):
        if 'soundtrack' in data:
            nd = []
            for x in data['soundtrack']:
                ds = x.split('\n')
                title = ds[0]
                if title[0] == '"' and title[-1] == '"':
                    title = title[1:-1]
                nds = []
                newData = {}
                for l in ds[1:]:
                    if ' with ' in l or ' by ' in l or ' from ' in l \
                            or ' of ' in l or l.startswith('From '):
                        nds.append(l)
                    else:
                        if nds:
                            nds[-1] += l
                        else:
                            nds.append(l)
                newData[title] = {}
                for l in nds:
                    skip = False
                    for sep in ('From ',):
                        if l.startswith(sep):
                            fdix = len(sep)
                            kind = l[:fdix].rstrip().lower()
                            info = l[fdix:].lstrip()
                            newData[title][kind] = info
                            skip = True
                    if not skip:
                        for sep in ' with ', ' by ', ' from ', ' of ':
                            fdix = l.find(sep)
                            if fdix != -1:
                                fdix = fdix + len(sep)
                                kind = l[:fdix].rstrip().lower()
                                info = l[fdix:].lstrip()
                                newData[title][kind] = info
                                break
                nd.append(newData)
            data['soundtrack'] = nd
        return data

class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='crazy credits', path="//ul/li/tt",
                            attrs=Attribute(key='crazy credits', multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: \
                                                x.replace('\n', ' ').replace('  ', ' ')))]

class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='goofs', path="//ul[@class='trivia']/li",
                        attrs=Attribute(key='goofs', multi=True,
                                        path=".//text()",
                                        postprocess=lambda x: (x or u'').strip()))]

class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='quotes',
            path="//div[@class='_imdbpy']",
            attrs=Attribute(key='quotes',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip().replace(' \n',
                            '::').replace('::\n', '::').replace('\n', ' ')))
        ]

    preprocessors = [
        (re.compile('(<a name="?qt[0-9]{7}"?></a>)', re.I),
            r'\1<div class="_imdbpy">'),
        (re.compile('<hr width="30%">', re.I), '</div>'),
        (re.compile('<hr/>', re.I), '</div>'),
        (re.compile('<script.*?</script>', re.I|re.S), ''),
        # For BeautifulSoup.
        (re.compile('<!-- sid: t-channel : MIDDLE_CENTER -->', re.I), '</div>')
        ]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//p[@class='linksoda']"):
            qLink.drop_tree()
        return dom

    def postprocess_data(self, data):
        if 'quotes' not in data:
            return {}
        for idx, quote in enumerate(data['quotes']):
            data['quotes'][idx] = quote.split('::')
        return data

class DOMHTMLReleaseinfoParser(DOMParserBase):
    """Parser for the "release dates" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rdparser = DOMHTMLReleaseinfoParser()
        result = rdparser.parse(releaseinfo_html_string)
    """
    extractors = [Extractor(label='release dates',
                    path="//th[@class='xxxx']/../../tr",
                    attrs=Attribute(key='release dates', multi=True,
                        path={'country': ".//td[1]//text()",
                              'date': ".//td[2]//text()",
                              'notes': ".//td[3]//text()"})),
                Extractor(label='akas',
                    path="//div[@class='_imdbpy_akas']/table/tr",
                    attrs=Attribute(key='akas', multi=True,
                        path={'title': "./td[1]/text()",
                              'countries': "./td[2]/text()"}))]

    preprocessors = [
        (re.compile('(<h5><a name="?akas"?.*</table>)', re.I | re.M | re.S),
            r'<div class="_imdbpy_akas">\1</div>')]

    def postprocess_data(self, data):
        if not ('release dates' in data or 'akas' in data): return data
        releases = data.get('release dates') or []
        rl = []
        for i in releases:
            country = i.get('country')
            date = i.get('date')
            if not (country and date): continue
            country = country.strip()
            date = date.strip()
            if not (country and date): continue
            notes = i['notes']
            info = u'%s::%s' % (country, date)
            if notes:
                info += notes
            rl.append(info)
        if releases:
            del data['release dates']
        if rl:
            data['release dates'] = rl
        akas = data.get('akas') or []
        nakas = []
        for aka in akas:
            title = aka.get('title', '').strip()
            if not title:
                continue
            countries = aka.get('countries', '').split('/')
            if not countries:
                nakas.append(title)
            else:
                for country in countries:
                    nakas.append('%s::%s' % (title, country.strip()))
        if akas:
            del data['akas']
        if nakas:
            data['akas from release info'] = nakas
        return data

class DOMHTMLRatingsParser(DOMParserBase):
    """Parser for the "user ratings" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRatingsParser()
        result = rparser.parse(userratings_html_string)
    """
    re_means = re.compile(r'mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])',
                          re.I)

    extractors = [
        Extractor(label='number of votes',
            path="//td[b='Percentage']/../../tr",
            attrs=[Attribute(key='votes',
                            multi=True,
                            path={
                                'votes': "td[1]//text()",
                                'ordinal': "td[3]//text()"
                                })]),
        Extractor(label='mean and median',
            path="//p[starts-with(text(), 'Arithmetic mean')]",
            attrs=Attribute(key='mean and median',
                            path="text()")),
        Extractor(label='rating',
            path="//a[starts-with(@href, '/search/title?user_rating=')]",
            attrs=Attribute(key='rating',
                            path="text()")),
        Extractor(label='demographic voters',
            path="//td[b='Average']/../../tr",
            attrs=Attribute(key='demographic voters',
                            multi=True,
                            path={
                                'voters': "td[1]//text()",
                                'votes': "td[2]//text()",
                                'average': "td[3]//text()"
                                })),
        Extractor(label='top 250',
            path="//a[text()='top 250']",
            attrs=Attribute(key='top 250',
                            path="./preceding-sibling::text()[1]"))
        ]

    def postprocess_data(self, data):
        nd = {}
        votes = data.get('votes', [])
        if votes:
            nd['number of votes'] = {}
            for i in xrange(1, 11):
                _ordinal = int(votes[i]['ordinal'])
                _strvts = votes[i]['votes'] or '0'
                nd['number of votes'][_ordinal] = \
                        int(_strvts.replace(',', ''))
        mean = data.get('mean and median', '')
        if mean:
            means = self.re_means.findall(mean)
            if means and len(means[0]) == 2:
                am, med = means[0]
                try: am = float(am)
                except (ValueError, OverflowError): pass
                if type(am) is type(1.0):
                    nd['arithmetic mean'] = am
                try: med = int(med)
                except (ValueError, OverflowError): pass
                if type(med) is type(0):
                    nd['median'] = med
        if 'rating' in data:
            nd['rating'] = float(data['rating'])
        dem_voters = data.get('demographic voters')
        if dem_voters:
            nd['demographic'] = {}
            for i in xrange(1, len(dem_voters)):
                if (dem_voters[i]['votes'] is not None) \
                        and (dem_voters[i]['votes'].strip()):
                    nd['demographic'][dem_voters[i]['voters'].strip().lower()] \
                            = (int(dem_voters[i]['votes'].replace(',', '')),
                               float(dem_voters[i]['average']))
        if 'imdb users' in nd.get('demographic', {}):
            nd['votes'] = nd['demographic']['imdb users'][0]
            nd['demographic']['all votes'] = nd['demographic']['imdb users']
            del nd['demographic']['imdb users']
        top250 = data.get('top 250')
        if top250:
            sd = top250[9:]
            i = sd.find(' ')
            if i != -1:
                sd = sd[:i]
            try: sd = int(sd)
            except (ValueError, OverflowError): pass
            if type(sd) is type(0):
                nd['top 250 rank'] = sd
        return nd

class DOMHTMLEpisodesRatings(DOMParserBase):
    """Parser for the "episode ratings ... by date" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        erparser = DOMHTMLEpisodesRatings()
        result = erparser.parse(eprating_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='title', path="//title",
                            attrs=Attribute(key='title', path="./text()")),
                Extractor(label='ep ratings',
                        path="//th/../..//tr",
                        attrs=Attribute(key='episodes', multi=True,
                                path={'nr': ".//td[1]/text()",
                                      'ep title': ".//td[2]//text()",
                                      'movieID': ".//td[2]/a/@href",
                                      'rating': ".//td[3]/text()",
                                      'votes': ".//td[4]/text()"}))]

    def postprocess_data(self, data):
        if 'title' not in data or 'episodes' not in data: return {}
        nd = []
        title = data['title']
        for i in data['episodes']:
            ept = i['ep title']
            movieID = analyze_imdbid(i['movieID'])
            votes = i['votes']
            rating = i['rating']
            if not (ept and movieID and votes and rating): continue
            try:
                votes = int(votes.replace(',', '').replace('.', ''))
            except (TypeError, ValueError):
                pass
            try:
                rating = float(rating)
            except (TypeError, ValueError):
                pass
            ept = ept.strip()
            ept = u'%s {%s' % (title, ept)
            nr = i['nr']
            if nr:
                ept += u' (#%s)' % nr.strip()
            ept += '}'
            if movieID is not None:
                movieID = str(movieID)
            m = Movie(title=ept, movieID=movieID, accessSystem=self._as,
                      modFunct=self._modFunct)
            epofdict = m.get('episode of')
            if epofdict is not None:
                m['episode of'] = Movie(data=epofdict, accessSystem=self._as,
                                        modFunct=self._modFunct)
            nd.append({'episode': m, 'votes': votes, 'rating': rating})
        return {'episodes rating': nd}

def _normalize_href(href):
    if (href is not None) and (not href.lower().startswith('http://')):
        if href.startswith('/'): href = href[1:]
        href = '%s%s' % (imdbURL_base, href)
    return href
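
# Illustrative usage (not in the original source; assumes imdbURL_base
# is 'http://akas.imdb.com/'):
#   _normalize_href('/title/tt0094226/')
#       ->  'http://akas.imdb.com/title/tt0094226/'
#   _normalize_href('http://example.com/')  ->  'http://example.com/'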

class DOMHTMLOfficialsitesParser(DOMParserBase):
    """Parser for the "official sites", "external reviews", "newsgroup
    reviews", "miscellaneous links", "sound clips", "video clips" and
    "photographs" pages of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        osparser = DOMHTMLOfficialsitesParser()
        result = osparser.parse(officialsites_html_string)
    """
    kind = 'official sites'

    extractors = [
        Extractor(label='site',
            path="//ol/li/a",
            attrs=Attribute(key='self.kind',
                multi=True,
                path={
                    'link': "./@href",
                    'info': "./text()"
                    },
                postprocess=lambda x: (x.get('info').strip(),
                            urllib.unquote(_normalize_href(x.get('link'))))))
        ]

class DOMHTMLConnectionParser(DOMParserBase):
    """Parser for the "connections" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        connparser = DOMHTMLConnectionParser()
        result = connparser.parse(connections_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='connection',
                    group="//div[@class='_imdbpy']",
                    group_key="./h5/text()",
                    group_key_normalize=lambda x: x.lower(),
                    path="./a",
                    attrs=Attribute(key=None,
                                    path={'title': "./text()",
                                          'movieID': "./@href"},
                                    multi=True))]

    preprocessors = [
        ('<h5>', '</div><div class="_imdbpy"><h5>'),
        # To get the movie's year.
        ('</a> (', ' ('),
        ('\n<br/>', '</a>'),
        ('<br/> - ', '::')
        ]

    def postprocess_data(self, data):
        for key in data.keys():
            nl = []
            for v in data[key]:
                title = v['title']
                ts = title.split('::', 1)
                title = ts[0].strip()
                notes = u''
                if len(ts) == 2:
                    notes = ts[1].strip()
                m = Movie(title=title,
                          movieID=analyze_imdbid(v['movieID']),
                          accessSystem=self._as, notes=notes,
                          modFunct=self._modFunct)
                nl.append(m)
            data[key] = nl
        if not data: return {}
        return {'connections': data}

class DOMHTMLLocationsParser(DOMParserBase):
    """Parser for the "locations" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        lparser = DOMHTMLLocationsParser()
        result = lparser.parse(locations_html_string)
    """
    extractors = [Extractor(label='locations', path="//dt",
                    attrs=Attribute(key='locations', multi=True,
                                path={'place': ".//text()",
                                      'note': "./following-sibling::dd[1]" \
                                              "//text()"},
                                postprocess=lambda x: (u'%s::%s' % (
                                    x['place'].strip(),
                                    (x['note'] or u'').strip())).strip(':')))]

class DOMHTMLTechParser(DOMParserBase):
    """Parser for the "technical", "business", "literature",
    "publicity" (for people) and "contacts" (for people) pages of
    a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTechParser()
        result = tparser.parse(technical_html_string)
    """
    kind = 'tech'

    extractors = [Extractor(label='tech',
                        group="//h5",
                        group_key="./text()",
                        group_key_normalize=lambda x: x.lower(),
                        path="./following-sibling::div[1]",
                        attrs=Attribute(key=None,
                                    path=".//text()",
                                    postprocess=lambda x: [t.strip()
                                        for t in x.split('\n') if t.strip()]))]

    preprocessors = [
        (re.compile('(<h5>.*?</h5>)', re.I), r'\1<div class="_imdbpy">'),
        (re.compile('((<br/>|</p>|</table>))\n?<br/>(?!<a)', re.I),
            r'\1</div>'),
        # the ones below are for the publicity parser
        (re.compile('<p>(.*?)</p>', re.I), r'\1<br/>'),
        (re.compile('(</td><td valign="top">)', re.I), r'\1::'),
        (re.compile('(</tr><tr>)', re.I), r'\n\1'),
        # this is for splitting individual entries
        (re.compile('<br/>', re.I), r'\n'),
        ]

    def postprocess_data(self, data):
        for key in data:
            data[key] = filter(None, data[key])
        if self.kind in ('literature', 'business', 'contacts') and data:
            if 'screenplay/teleplay' in data:
                data['screenplay-teleplay'] = data['screenplay/teleplay']
                del data['screenplay/teleplay']
            data = {self.kind: data}
        else:
            if self.kind == 'publicity':
                if 'biography (print)' in data:
                    data['biography-print'] = data['biography (print)']
                    del data['biography (print)']
            # Tech info.
            for key in data.keys():
                if key.startswith('film negative format'):
                    data['film negative format'] = data[key]
                    del data[key]
                elif key.startswith('film length'):
                    data['film length'] = data[key]
                    del data[key]
        return data

class DOMHTMLDvdParser(DOMParserBase):
    """Parser for the "dvd" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        dparser = DOMHTMLDvdParser()
        result = dparser.parse(dvd_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='dvd',
            path="//div[@class='base_layer']",
            attrs=[Attribute(key=None,
                multi=True,
                path={
                    'title': "../table[1]//h3/text()",
                    'cover': "../table[1]//img/@src",
                    'region': ".//p[b='Region:']/text()",
                    'asin': ".//p[b='ASIN:']/text()",
                    'upc': ".//p[b='UPC:']/text()",
                    'rating': ".//p/b[starts-with(text(), 'Rating:')]/../img/@alt",
                    'certificate': ".//p[b='Certificate:']/text()",
                    'runtime': ".//p[b='Runtime:']/text()",
                    'label': ".//p[b='Label:']/text()",
                    'studio': ".//p[b='Studio:']/text()",
                    'release date': ".//p[b='Release Date:']/text()",
                    'dvd format': ".//p[b='DVD Format:']/text()",
                    'dvd features': ".//p[b='DVD Features: ']//text()",
                    'supplements': "..//div[span='Supplements']" \
                            "/following-sibling::div[1]//text()",
                    'review': "..//div[span='Review']/following-sibling::div[1]//text()",
                    'titles': "..//div[starts-with(text(), 'Titles in this Product')]" \
                            "/..//text()",
                    },
                postprocess=lambda x: {
                    'title': (x.get('title') or u'').strip(),
                    'cover': (x.get('cover') or u'').strip(),
                    'region': (x.get('region') or u'').strip(),
                    'asin': (x.get('asin') or u'').strip(),
                    'upc': (x.get('upc') or u'').strip(),
                    'rating': (x.get('rating') or u'Not Rated').strip().replace('Rating: ', ''),
                    'certificate': (x.get('certificate') or u'').strip(),
                    'runtime': (x.get('runtime') or u'').strip(),
                    'label': (x.get('label') or u'').strip(),
                    'studio': (x.get('studio') or u'').strip(),
                    'release date': (x.get('release date') or u'').strip(),
                    'dvd format': (x.get('dvd format') or u'').strip(),
                    'dvd features': (x.get('dvd features') or u'').strip().replace('DVD Features: ', ''),
                    'supplements': (x.get('supplements') or u'').strip(),
                    'review': (x.get('review') or u'').strip(),
                    'titles in this product': (x.get('titles') or u'').strip().replace('Titles in this Product::', ''),
                    }
                )])]

    preprocessors = [
        (re.compile('<p>(<table class="dvd_section" .*)</p>\s*<hr\s*/>', re.I),
            r'<div class="_imdbpy">\1</div>'),
        (re.compile('<p>(<div class\s*=\s*"base_layer")', re.I), r'\1'),
        (re.compile('</p>\s*<p>(<div class="dvd_section")', re.I), r'\1'),
        (re.compile('</div><div class="dvd_row(_alt)?">', re.I), r'::')
        ]

    def postprocess_data(self, data):
        if not data:
            return data
        dvds = data['dvd']
        for dvd in dvds:
            if dvd['cover'].find('noposter') != -1:
                del dvd['cover']
            for key in dvd.keys():
                if not dvd[key]:
                    del dvd[key]
            if 'supplements' in dvd:
                dvd['supplements'] = dvd['supplements'].split('::')
        return data

class DOMHTMLRecParser(DOMParserBase):
    """Parser for the "recommendations" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRecParser()
        result = rparser.parse(recommendations_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='recommendations',
                    path="//td[@valign='middle'][1]",
                    attrs=Attribute(key='../../tr/td[1]//text()',
                            multi=True,
                            path={'title': ".//text()",
                                  'movieID': ".//a/@href"}))]

    def postprocess_data(self, data):
        for key in data.keys():
            n_key = key
            n_keyl = n_key.lower()
            if n_keyl == 'suggested by the database':
                n_key = 'database'
            elif n_keyl == 'imdb users recommend':
                n_key = 'users'
            data[n_key] = [Movie(title=x['title'],
                            movieID=analyze_imdbid(x['movieID']),
                            accessSystem=self._as, modFunct=self._modFunct)
                           for x in data[key]]
            if n_key != key:
                del data[key]
        if data: return {'recommendations': data}
        return data

class DOMHTMLNewsParser(DOMParserBase):
    """Parser for the "news" page of a given movie or person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        nwparser = DOMHTMLNewsParser()
        result = nwparser.parse(news_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='news',
            path="//h2",
            attrs=Attribute(key='news',
                multi=True,
                path={
                    'title': "./text()",
                    'fromdate': "../following-sibling::p[1]/small//text()",
                    # FIXME: sometimes (see The Matrix (1999)) <p> is found
                    # inside news text.
                    'body': "../following-sibling::p[2]//text()",
                    'link': "../..//a[text()='Permalink']/@href",
                    'fulllink': "../..//a[starts-with(text(), " \
                            "'See full article at')]/@href"
                    },
                postprocess=lambda x: {
                    'title': x.get('title').strip(),
                    'date': x.get('fromdate').split('|')[0].strip(),
                    'from': x.get('fromdate').split('|')[1].replace('From ',
                            '').strip(),
                    'body': (x.get('body') or u'').strip(),
                    'link': _normalize_href(x.get('link')),
                    'full article link': _normalize_href(x.get('fulllink'))
                    }))
        ]

    preprocessors = [
        (re.compile('(<a name=[^>]+><h2>)', re.I), r'<div class="_imdbpy">\1'),
        (re.compile('(<hr/>)', re.I), r'</div>\1'),
        (re.compile('<p></p>', re.I), r'')
        ]

    def postprocess_data(self, data):
        if not data.has_key('news'):
            return {}
        for news in data['news']:
            if news.has_key('full article link'):
                if news['full article link'] is None:
                    del news['full article link']
        return data

def _parse_review(x):
    result = {}
    title = x.get('title').strip()
    if title[-1] == ':': title = title[:-1]
    result['title'] = title
    result['link'] = _normalize_href(x.get('link'))
    kind = x.get('kind').strip()
    if kind[-1] == ':': kind = kind[:-1]
    result['review kind'] = kind
    text = x.get('review').replace('\n\n', '||').replace('\n', ' ').split('||')
    review = '\n'.join(text)
    if x.get('author') is not None:
        author = x.get('author').strip()
        review = review.split(author)[0].strip()
        result['review author'] = author[2:]
    if x.get('item') is not None:
        item = x.get('item').strip()
        review = review[len(item):].strip()
        review = "%s: %s" % (item, review)
    result['review'] = review
    return result
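
# Illustrative output shape (not in the original source); 'review author'
# strips the leading '--' of the author signature:
#   {'title': u'The Untouchables', 'review kind': u'Amazon.com',
#    'link': u'http://...', 'review': u'...', 'review author': u'John Doe'}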

class DOMHTMLAmazonReviewsParser(DOMParserBase):
    """Parser for the "amazon reviews" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        arparser = DOMHTMLAmazonReviewsParser()
        result = arparser.parse(amazonreviews_html_string)
    """
    extractors = [
        Extractor(label='amazon reviews',
            group="//h3",
            group_key="./a/text()",
            group_key_normalize=lambda x: x[:-1],
            path="./following-sibling::p[1]/span[@class='_review']",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'title': "../preceding-sibling::h3[1]/a[1]/text()",
                    'link': "../preceding-sibling::h3[1]/a[1]/@href",
                    'kind': "./preceding-sibling::b[1]/text()",
                    'item': "./i/b/text()",
                    'review': ".//text()",
                    'author': "./i[starts-with(text(), '--')]/text()"
                    },
                postprocess=_parse_review))
        ]

    preprocessors = [
        (re.compile('<p>\n(?!<b>)', re.I), r'\n'),
        (re.compile('(\n</b>\n)', re.I), r'\1<span class="_review">'),
        (re.compile('(</p>\n\n)', re.I), r'</span>\1'),
        (re.compile('(\s\n)(<i><b>)', re.I), r'</span>\1<span class="_review">\2')
        ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = []
        for item in data.keys():
            nd = nd + data[item]
        return {'amazon reviews': nd}

def _parse_merchandising_link(x):
    result = {}
    link = x.get('link')
    result['link'] = _normalize_href(link)
    text = x.get('text')
    if text is not None:
        result['link-text'] = text.strip()
    cover = x.get('cover')
    if cover is not None:
        result['cover'] = cover
    description = x.get('description')
    if description is not None:
        shop = x.get('shop')
        if shop is not None:
            result['description'] = u'%s::%s' % (shop, description.strip())
        else:
            result['description'] = description.strip()
    return result

class DOMHTMLSalesParser(DOMParserBase):
    """Parser for the "merchandising links" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSalesParser()
        result = sparser.parse(sales_html_string)
    """
    extractors = [
        Extractor(label='shops',
            group="//h5/a[@name]/..",
            group_key="./a[1]/text()",
            group_key_normalize=lambda x: x.lower(),
            path=".//following-sibling::table[1]/" \
                    "/td[@class='w_rowtable_colshop']//tr[1]",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'link': "./td[2]/a[1]/@href",
                    'text': "./td[1]/img[1]/@alt",
                    'cover': "./ancestor::td[1]/../td[1]" \
                            "/a[1]/img[1]/@src",
                    },
                postprocess=_parse_merchandising_link)),
        Extractor(label='others',
            group="//span[@class='_info']/..",
            group_key="./h5/a[1]/text()",
            group_key_normalize=lambda x: x.lower(),
            path="./span[@class='_info']",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'link': "./preceding-sibling::a[1]/@href",
                    'shop': "./preceding-sibling::a[1]/text()",
                    'description': ".//text()",
                    },
                postprocess=_parse_merchandising_link))
        ]

    preprocessors = [
        (re.compile('(<h5><a name=)', re.I), r'</div><div class="_imdbpy">\1'),
        (re.compile('(</h5>\n<br/>\n)</div>', re.I), r'\1'),
        (re.compile('(<br/><br/>\n)(\n)', re.I), r'\1</div>\2'),
        (re.compile('(\n)(Search.*?)(</a>)(\n)', re.I), r'\3\1\2\4'),
        (re.compile('(\n)(Search.*?)(\n)', re.I),
            r'\1<span class="_info">\2</span>\3')
        ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        return {'merchandising links': data}

def _build_episode(x):
    """Create a Movie object for a given series' episode."""
    episode_id = analyze_imdbid(x.get('link'))
    episode_title = x.get('title')
    e = Movie(movieID=episode_id, title=episode_title)
    e['kind'] = u'episode'
    oad = x.get('oad')
    if oad:
        e['original air date'] = oad.strip()
    year = x.get('year')
    if year is not None:
        year = year[5:]
        if year == 'unknown': year = u'????'
        if year and year.isdigit():
            year = int(year)
        e['year'] = year
    else:
        if oad and oad[-4:].isdigit():
            e['year'] = int(oad[-4:])
    epinfo = x.get('episode')
    if epinfo is not None:
        season, episode = epinfo.split(':')[0].split(',')
        e['season'] = int(season[7:])
        e['episode'] = int(episode[8:])
    else:
        e['season'] = 'unknown'
        e['episode'] = 'unknown'
    plot = x.get('plot')
    if plot:
        e['plot'] = plot.strip()
    return e
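
# Illustrative input/output (not in the original source): given the raw
# fields scraped for one episode, e.g.
#   {'link': '/title/tt0059255/', 'title': u'The Cage',
#    'year': 'year-1966', 'episode': u'Season 1, Episode 0: The Cage',
#    'oad': u'8 September 1966'}
# _build_episode() returns a Movie with kind 'episode', year 1966,
# season 1, episode 0 and the original air date set.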

class DOMHTMLEpisodesParser(DOMParserBase):
    """Parser for the "episode list" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        eparser = DOMHTMLEpisodesParser()
        result = eparser.parse(episodes_html_string)
    """
    _containsObjects = True

    kind = 'episodes list'
    _episodes_path = "..//h4"
    _oad_path = "./following-sibling::span/strong[1]/text()"

    def _init(self):
        self.extractors = [
            Extractor(label='series',
                      path="//html",
                      attrs=[Attribute(key='series title',
                                       path=".//title/text()"),
                             Attribute(key='series movieID',
                                       path=".//h1/a[@class='main']/@href",
                                       postprocess=analyze_imdbid)
                            ]),
            Extractor(label='episodes',
                      group="//div[@class='_imdbpy']/h3",
                      group_key="./a/@name",
                      path=self._episodes_path,
                      attrs=Attribute(key=None,
                                      multi=True,
                                      path={
                                          'link': "./a/@href",
                                          'title': "./a/text()",
                                          'year': "./preceding-sibling::a[1]/@name",
                                          'episode': "./text()[1]",
                                          'oad': self._oad_path,
                                          'plot': "./following-sibling::text()[1]"
                                      },
                                      postprocess=_build_episode))]
        if self.kind == 'episodes cast':
            self.extractors += [
                Extractor(label='cast',
                          group="//h4",
                          group_key="./text()[1]",
                          group_key_normalize=lambda x: x.strip(),
                          path="./following-sibling::table[1]//td[@class='nm']",
                          attrs=Attribute(key=None,
                                          multi=True,
                                          path={'person': "..//text()",
                                                'link': "./a/@href",
                                                'roleID': \
                                                    "../td[4]/div[@class='_imdbpyrole']/@roleid"},
                                          postprocess=lambda x: \
                                              build_person(x.get('person') or u'',
                                                  personID=analyze_imdbid(x.get('link')),
                                                  roleID=(x.get('roleID') or u'').split('/'),
                                                  accessSystem=self._as,
                                                  modFunct=self._modFunct)))
            ]

    preprocessors = [
        (re.compile('(<hr/>\n)(<h3>)', re.I),
         r'</div>\1<div class="_imdbpy">\2'),
        (re.compile('(</p>\n\n)</div>', re.I), r'\1'),
        (re.compile('<h3>(.*?)</h3>', re.I), r'<h4>\1</h4>'),
        (_reRolesMovie, _manageRoles),
        (re.compile('(<br/> <br/>\n)(<hr/>)', re.I), r'\1</div>\2')
    ]

    def postprocess_data(self, data):
        # A bit extreme?
        if not 'series title' in data: return {}
        if not 'series movieID' in data: return {}
        stitle = data['series title'].replace('- Episode list', '')
        stitle = stitle.replace('- Episodes list', '')
        stitle = stitle.replace('- Episode cast', '')
        stitle = stitle.replace('- Episodes cast', '')
        stitle = stitle.strip()
        if not stitle: return {}
        seriesID = data['series movieID']
        if seriesID is None: return {}
        series = Movie(title=stitle, movieID=str(seriesID),
                       accessSystem=self._as, modFunct=self._modFunct)
        nd = {}
        for key in data.keys():
            if key.startswith('season-'):
                season_key = key[7:]
                try: season_key = int(season_key)
                except: pass
                nd[season_key] = {}
                ep_counter = 1
                for episode in data[key]:
                    if not episode: continue
                    episode_key = episode.get('episode')
                    if episode_key is None: continue
                    if not isinstance(episode_key, int):
                        episode_key = ep_counter
                        ep_counter += 1
                    # Cast lists (collected by the 'episodes cast' subclass)
                    # are grouped under keys like 'Season 1, Episode 2:'.
                    cast_key = 'Season %s, Episode %s:' % (season_key,
                                                           episode_key)
                    if data.has_key(cast_key):
                        cast = data[cast_key]
                        for i in xrange(len(cast)):
                            cast[i].billingPos = i + 1
                        episode['cast'] = cast
                    episode['episode of'] = series
                    nd[season_key][episode_key] = episode
        if len(nd) == 0:
            return {}
        return {'episodes': nd}
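
# Hedged sketch (not in the original source) of the final structure: season
# numbers map to episode numbers, which map to Movie objects, roughly:
#
#   {'episodes': {1: {1: <Movie 'Pilot'>, 2: <Movie ...>},
#                 2: {...}}}
#
# A season key stays a string when it cannot be converted to int (e.g. the
# 'unknown' season produced by a 'season-unknown' anchor).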

class DOMHTMLEpisodesCastParser(DOMHTMLEpisodesParser):
    """Parser for the "episodes cast" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        eparser = DOMHTMLEpisodesCastParser()
        result = eparser.parse(episodes_html_string)
    """
    kind = 'episodes cast'
    _episodes_path = "..//h4"
    _oad_path = "./following-sibling::b[1]/text()"
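
# Note (added for clarity): this subclass only overrides the class attributes
# read by DOMHTMLEpisodesParser._init(); setting kind to 'episodes cast'
# activates the extra 'cast' extractor defined in the parent class.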

class DOMHTMLFaqsParser(DOMParserBase):
    """Parser for the "FAQ" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        fparser = DOMHTMLFaqsParser()
        result = fparser.parse(faqs_html_string)
    """
    _defGetRefs = True

    # XXX: bsoup and lxml don't match (looks like a minor issue, anyway).
    extractors = [
        Extractor(label='faqs',
                  path="//div[@class='section']",
                  attrs=Attribute(key='faqs',
                                  multi=True,
                                  path={
                                      'question': "./h3/a/span/text()",
                                      'answer': "../following-sibling::div[1]//text()"
                                  },
                                  postprocess=lambda x: u'%s::%s' % (
                                      x.get('question').strip(),
                                      '\n\n'.join(x.get('answer').replace(
                                          '\n\n', '\n').strip().split('||')))))
    ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||'),
        (re.compile('<h4>(.*?)</h4>\n', re.I), r'||\1--'),
        (re.compile('<span class="spoiler"><span>(.*?)</span></span>', re.I),
         r'[spoiler]\1[/spoiler]')
    ]
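
# Hedged sketch (not in the original source): each entry of result['faqs'] is
# a single unicode string with question and answer joined by '::', spoilers
# wrapped in [spoiler]...[/spoiler] markers; hypothetical content:
#
#   u'Is this based on a book?::Yes.\n\nIt adapts [spoiler]...[/spoiler]'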

class DOMHTMLAiringParser(DOMParserBase):
    """Parser for the "airing" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        aparser = DOMHTMLAiringParser()
        result = aparser.parse(airing_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(label='series title',
                  path="//title",
                  attrs=Attribute(key='series title', path="./text()",
                                  postprocess=lambda x: \
                                      x.replace(' - TV schedule', u''))),
        Extractor(label='series id',
                  path="//h1/a[@href]",
                  attrs=Attribute(key='series id', path="./@href")),
        Extractor(label='tv airings',
                  path="//tr[@class]",
                  attrs=Attribute(key='airing',
                                  multi=True,
                                  path={
                                      'date': "./td[1]//text()",
                                      'time': "./td[2]//text()",
                                      'channel': "./td[3]//text()",
                                      'link': "./td[4]/a[1]/@href",
                                      'title': "./td[4]//text()",
                                      'season': "./td[5]//text()",
                                  },
                                  postprocess=lambda x: {
                                      'date': x.get('date'),
                                      'time': x.get('time'),
                                      'channel': x.get('channel').strip(),
                                      'link': x.get('link'),
                                      'title': x.get('title'),
                                      'season': (x.get('season') or '').strip()
                                  }))
    ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        seriesTitle = data['series title']
        seriesID = analyze_imdbid(data['series id'])
        if data.has_key('airing'):
            for airing in data['airing']:
                title = airing.get('title', '').strip()
                if not title:
                    epsTitle = seriesTitle
                    if seriesID is None:
                        continue
                    epsID = seriesID
                else:
                    epsTitle = '%s {%s}' % (data['series title'],
                                            airing['title'])
                    epsID = analyze_imdbid(airing['link'])
                e = Movie(title=epsTitle, movieID=epsID)
                airing['episode'] = e
                del airing['link']
                del airing['title']
                if not airing['season']:
                    del airing['season']
        if 'series title' in data:
            del data['series title']
        if 'series id' in data:
            del data['series id']
        if 'airing' in data:
            data['airing'] = filter(None, data['airing'])
        if 'airing' not in data or not data['airing']:
            return {}
        return data
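
# Hedged sketch (not in the original source) of one entry of result['airing']
# after postprocessing; field values are hypothetical:
#
#   {'date': '25 December', 'time': '20:00', 'channel': 'BBC One',
#    'season': '2', 'episode': <Movie 'The Series {An Episode}'>}
#
# The 'link' and 'title' keys are consumed to build the Movie, and an empty
# 'season' is dropped.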

class DOMHTMLSynopsisParser(DOMParserBase):
    """Parser for the "synopsis" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSynopsisParser()
        result = sparser.parse(synopsis_html_string)
    """
    extractors = [
        Extractor(label='synopsis',
                  path="//div[@class='display'][not(@style)]",
                  attrs=Attribute(key='synopsis',
                                  path=".//text()",
                                  postprocess=lambda x: '\n\n'.join(
                                      x.strip().split('||'))))
    ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||')
    ]
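
# Note (added for clarity): the '||' marker injected by the preprocessor
# stands in for '<br/><br/>' paragraph breaks, so the postprocess lambda can
# restore them as blank lines in the plain-text synopsis.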

class DOMHTMLParentsGuideParser(DOMParserBase):
    """Parser for the "parents guide" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        pgparser = DOMHTMLParentsGuideParser()
        result = pgparser.parse(parentsguide_html_string)
    """
    extractors = [
        Extractor(label='parents guide',
                  group="//div[@class='section']",
                  group_key="./h3/a/span/text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../following-sibling::div[1]/p",
                  attrs=Attribute(key=None,
                                  path=".//text()",
                                  postprocess=lambda x: [
                                      t.strip().replace('\n', ' ')
                                      for t in x.split('||') if t.strip()]))
    ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||')
    ]

    def postprocess_data(self, data):
        data2 = {}
        for key in data:
            if data[key]:
                data2[key] = data[key]
        if not data2:
            return {}
        return {'parents guide': data2}
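
# Hedged sketch (not in the original source) of the final structure; the
# lower-cased section names and texts below are hypothetical:
#
#   {'parents guide': {'violence': ['Some fist fights.', ...],
#                      'profanity': [...]}}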

_OBJECTS = {
    'movie_parser': ((DOMHTMLMovieParser,), None),
    'plot_parser': ((DOMHTMLPlotParser,), None),
    'movie_awards_parser': ((DOMHTMLAwardsParser,), None),
    'taglines_parser': ((DOMHTMLTaglinesParser,), None),
    'keywords_parser': ((DOMHTMLKeywordsParser,), None),
    'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None),
    'goofs_parser': ((DOMHTMLGoofsParser,), None),
    'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None),
    'trivia_parser': ((DOMHTMLTriviaParser,), None),
    'soundtrack_parser': ((DOMHTMLSoundtrackParser,), {'kind': 'soundtrack'}),
    'quotes_parser': ((DOMHTMLQuotesParser,), None),
    'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
    'ratings_parser': ((DOMHTMLRatingsParser,), None),
    'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
    'externalrev_parser': ((DOMHTMLOfficialsitesParser,),
                           {'kind': 'external reviews'}),
    'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,),
                            {'kind': 'newsgroup reviews'}),
    'misclinks_parser': ((DOMHTMLOfficialsitesParser,),
                         {'kind': 'misc links'}),
    'soundclips_parser': ((DOMHTMLOfficialsitesParser,),
                          {'kind': 'sound clips'}),
    'videoclips_parser': ((DOMHTMLOfficialsitesParser,),
                          {'kind': 'video clips'}),
    'photosites_parser': ((DOMHTMLOfficialsitesParser,),
                          {'kind': 'photo sites'}),
    'connections_parser': ((DOMHTMLConnectionParser,), None),
    'tech_parser': ((DOMHTMLTechParser,), None),
    'business_parser': ((DOMHTMLTechParser,),
                        {'kind': 'business', '_defGetRefs': 1}),
    'literature_parser': ((DOMHTMLTechParser,), {'kind': 'literature'}),
    'locations_parser': ((DOMHTMLLocationsParser,), None),
    'dvd_parser': ((DOMHTMLDvdParser,), None),
    'rec_parser': ((DOMHTMLRecParser,), None),
    'news_parser': ((DOMHTMLNewsParser,), None),
    'amazonrev_parser': ((DOMHTMLAmazonReviewsParser,), None),
    'sales_parser': ((DOMHTMLSalesParser,), None),
    'episodes_parser': ((DOMHTMLEpisodesParser,), None),
    'episodes_cast_parser': ((DOMHTMLEpisodesCastParser,), None),
    'eprating_parser': ((DOMHTMLEpisodesRatings,), None),
    'movie_faqs_parser': ((DOMHTMLFaqsParser,), None),
    'airing_parser': ((DOMHTMLAiringParser,), None),
    'synopsis_parser': ((DOMHTMLSynopsisParser,), None),
    'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None)
}
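
# Hedged note (added for clarity, not in the original source): each _OBJECTS
# entry maps a parser name to a (classes, custom-attributes) pair.  The
# surrounding package presumably instantiates the first class and applies the
# optional attribute overrides, roughly along these lines:
#
#   classes, custom_attrs = _OBJECTS['business_parser']
#   parser = classes[0]()
#   for attr_name, value in (custom_attrs or {}).items():
#       setattr(parser, attr_name, value)  # e.g. kind='business'
#
# This consumption sketch is an assumption; see the http package for the
# actual mechanism.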