
/imdb/parser/http/movieParser.py

https://bitbucket.org/alberanid/imdbpy/
  1. """
  2. parser.http.movieParser module (imdb package).
  3. This module provides the classes (and the instances), used to parse the
  4. IMDb pages on the akas.imdb.com server about a movie.
  5. E.g., for Brian De Palma's "The Untouchables", the referred
  6. pages would be:
  7. combined details: http://akas.imdb.com/title/tt0094226/combined
  8. plot summary: http://akas.imdb.com/title/tt0094226/plotsummary
  9. ...and so on...
  10. Copyright 2004-2016 Davide Alberani <da@erlug.linux.it>
  11. 2008 H. Turgut Uyar <uyar@tekir.org>
  12. This program is free software; you can redistribute it and/or modify
  13. it under the terms of the GNU General Public License as published by
  14. the Free Software Foundation; either version 2 of the License, or
  15. (at your option) any later version.
  16. This program is distributed in the hope that it will be useful,
  17. but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. GNU General Public License for more details.
  20. You should have received a copy of the GNU General Public License
  21. along with this program; if not, write to the Free Software
  22. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  23. """
import re
import urllib

from imdb import imdbURL_base
from imdb.Person import Person
from imdb.Movie import Movie
from imdb.Company import Company
from imdb.utils import analyze_title, split_company_name_notes, _Container
from utils import build_person, DOMParserBase, Attribute, Extractor, \
        analyze_imdbid
# Dictionary used to convert some section's names.
_SECT_CONV = {
        'directed': 'director',
        'directed by': 'director',
        'directors': 'director',
        'editors': 'editor',
        'writing credits': 'writer',
        'writers': 'writer',
        'produced': 'producer',
        'cinematography': 'cinematographer',
        'film editing': 'editor',
        'casting': 'casting director',
        'costume design': 'costume designer',
        'makeup department': 'make up',
        'production management': 'production manager',
        'second unit director or assistant director': 'assistant director',
        'costume and wardrobe department': 'costume department',
        'sound department': 'sound crew',
        'stunts': 'stunt performer',
        'other crew': 'miscellaneous crew',
        'also known as': 'akas',
        'country': 'countries',
        'runtime': 'runtimes',
        'language': 'languages',
        'certification': 'certificates',
        'genre': 'genres',
        'created': 'creator',
        'creators': 'creator',
        'color': 'color info',
        'plot': 'plot outline',
        'seasons': 'number of seasons',
        'art directors': 'art direction',
        'assistant directors': 'assistant director',
        'set decorators': 'set decoration',
        'visual effects department': 'visual effects',
        'miscellaneous': 'miscellaneous crew',
        'make up department': 'make up',
        'plot summary': 'plot outline',
        'cinematographers': 'cinematographer',
        'camera department': 'camera and electrical department',
        'costume designers': 'costume designer',
        'production designers': 'production design',
        'production managers': 'production manager',
        'music original': 'original music',
        'casting directors': 'casting director',
        'other companies': 'miscellaneous companies',
        'producers': 'producer',
        'special effects by': 'special effects department',
        'special effects': 'special effects companies'
        }
def _manageRoles(mo):
    """Perform some transformation on the html, so that roleIDs can
    be easily retrieved."""
    firstHalf = mo.group(1)
    secondHalf = mo.group(2)
    newRoles = []
    roles = secondHalf.split(' / ')
    for role in roles:
        role = role.strip()
        if not role:
            continue
        roleID = analyze_imdbid(role)
        if roleID is None:
            roleID = u'/'
        else:
            roleID += u'/'
        newRoles.append(u'<div class="_imdbpyrole" roleid="%s">%s</div>' % \
                (roleID, role.strip()))
    return firstHalf + u' / '.join(newRoles) + mo.group(3)
_reRolesMovie = re.compile(r'(<td class="char">)(.*?)(</td>)',
                           re.I | re.M | re.S)
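
# Illustrative sketch of what the substitution does (input markup assumed,
# not taken from a real page):
#
#     _reRolesMovie.sub(_manageRoles,
#         u'<td class="char"><a href="/character/ch0000001/">Al Capone</a></td>')
#
# wraps the cell content in a div like
#     <div class="_imdbpyrole" roleid="0000001/">...</div>
# so that the cast extractor below can read the roleID back from the
# @roleid attribute.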
def _replaceBR(mo):
    """Replaces <br> tags with '::' (useful for some akas)"""
    txt = mo.group(0)
    return txt.replace('<br>', '::')

_reAkas = re.compile(r'<h5>also known as:</h5>.*?</div>', re.I | re.M | re.S)
def makeSplitter(lstrip=None, sep='|', comments=True,
                 origNotesSep=' (', newNotesSep='::(', strip=None):
    """Return a splitter function suitable for a given set of data."""
    def splitter(x):
        if not x: return x
        x = x.strip()
        if not x: return x
        if lstrip is not None:
            x = x.lstrip(lstrip).lstrip()
        lx = x.split(sep)
        lx[:] = filter(None, [j.strip() for j in lx])
        if comments:
            lx[:] = [j.replace(origNotesSep, newNotesSep, 1) for j in lx]
        if strip:
            lx[:] = [j.strip(strip) for j in lx]
        return lx
    return splitter
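
# A minimal usage sketch (illustrative values, not from a real page):
#
#     split_lang = makeSplitter('Language:')
#     split_lang(u'Language: English | French (dubbed)')
#     # -> [u'English', u'French::(dubbed)']
#
# The leading 'Language:' label is stripped, entries are split on '|' and
# the ' (' notes separator is rewritten to '::(' for later notes parsing.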
def _toInt(val, replace=()):
    """Return the value, converted to integer, or None; if present, 'replace'
    must be a list of tuples of values to replace."""
    for before, after in replace:
        val = val.replace(before, after)
    try:
        return int(val)
    except (TypeError, ValueError):
        return None
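
# For example (illustrative values):
#
#     _toInt(u'1,234 Episodes', [(',', ''), (' Episodes', '')])  # -> 1234
#     _toInt(u'N/A')                                             # -> None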
class DOMHTMLMovieParser(DOMParserBase):
    """Parser for the "combined details" (and if instance.mdparse is
    True also for the "main details") page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        mparser = DOMHTMLMovieParser()
        result = mparser.parse(combined_details_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='title',
                        path="//h1",
                        attrs=Attribute(key='title',
                                        path=".//text()",
                                        postprocess=analyze_title)),

                Extractor(label='glossarysections',
                        group="//a[@class='glossary']",
                        group_key="./@name",
                        group_key_normalize=lambda x: x.replace('_', ' '),
                        path="../../../..//tr",
                        attrs=Attribute(key=None,
                                        multi=True,
                                        path={'person': ".//text()",
                                              'link': "./td[1]/a[@href]/@href"},
                                        postprocess=lambda x: \
                                        build_person(x.get('person') or u'',
                                            personID=analyze_imdbid(x.get('link')))
                                        )),

                Extractor(label='cast',
                        path="//table[@class='cast']//tr",
                        attrs=Attribute(key="cast",
                                        multi=True,
                                        path={'person': ".//text()",
                                              'link': "td[2]/a/@href",
                                              'roleID': \
                                                  "td[4]/div[@class='_imdbpyrole']/@roleid"},
                                        postprocess=lambda x: \
                                        build_person(x.get('person') or u'',
                                            personID=analyze_imdbid(x.get('link')),
                                            roleID=(x.get('roleID') or u'').split('/'))
                                        )),

                Extractor(label='genres',
                        path="//div[@class='info']//a[starts-with(@href," \
                                " '/Sections/Genres')]",
                        attrs=Attribute(key="genres",
                                        multi=True,
                                        path="./text()")),

                Extractor(label='myrating',
                        path="//span[@id='voteuser']",
                        attrs=Attribute(key='myrating',
                                        path=".//text()")),

                Extractor(label='h5sections',
                        path="//div[@class='info']/h5/..",
                        attrs=[
                            Attribute(key="plot summary",
                                    path="./h5[starts-with(text(), " \
                                            "'Plot:')]/../div/text()",
                                    postprocess=lambda x: \
                                            x.strip().rstrip('|').rstrip()),
                            Attribute(key="aspect ratio",
                                    path="./h5[starts-with(text()," \
                                            " 'Aspect')]/../div/text()",
                                    postprocess=lambda x: x.strip()),
                            Attribute(key="mpaa",
                                    path="./h5/a[starts-with(text()," \
                                            " 'MPAA')]/../../div/text()",
                                    postprocess=lambda x: x.strip()),
                            Attribute(key="countries",
                                    path="./h5[starts-with(text(), " \
                                            "'Countr')]/../div[@class='info-content']//text()",
                                    postprocess=makeSplitter('|')),
                            Attribute(key="language",
                                    path="./h5[starts-with(text(), " \
                                            "'Language')]/..//text()",
                                    postprocess=makeSplitter('Language:')),
                            Attribute(key='color info',
                                    path="./h5[starts-with(text(), " \
                                            "'Color')]/..//text()",
                                    postprocess=makeSplitter('|')),
                            Attribute(key='sound mix',
                                    path="./h5[starts-with(text(), " \
                                            "'Sound Mix')]/..//text()",
                                    postprocess=makeSplitter('Sound Mix:')),
                            # Collects akas not enclosed in <i> tags.
                            Attribute(key='other akas',
                                    path="./h5[starts-with(text(), " \
                                            "'Also Known As')]/../div//text()",
                                    postprocess=makeSplitter(sep='::',
                                            origNotesSep='" - ',
                                            newNotesSep='::',
                                            strip='"')),
                            Attribute(key='runtimes',
                                    path="./h5[starts-with(text(), " \
                                            "'Runtime')]/../div/text()",
                                    postprocess=makeSplitter()),
                            Attribute(key='certificates',
                                    path="./h5[starts-with(text(), " \
                                            "'Certificat')]/..//text()",
                                    postprocess=makeSplitter('Certification:')),
                            Attribute(key='number of seasons',
                                    path="./h5[starts-with(text(), " \
                                            "'Seasons')]/..//text()",
                                    postprocess=lambda x: x.count('|') + 1),
                            Attribute(key='original air date',
                                    path="./h5[starts-with(text(), " \
                                            "'Original Air Date')]/../div/text()"),
                            Attribute(key='tv series link',
                                    path="./h5[starts-with(text(), " \
                                            "'TV Series')]/..//a/@href"),
                            Attribute(key='tv series title',
                                    path="./h5[starts-with(text(), " \
                                            "'TV Series')]/..//a/text()")
                            ]),

                Extractor(label='language codes',
                        path="//h5[starts-with(text(), 'Language')]/..//a[starts-with(@href, '/language/')]",
                        attrs=Attribute(key='language codes', multi=True,
                                        path="./@href",
                                        postprocess=lambda x: x.split('/')[2].strip()
                                        )),

                Extractor(label='country codes',
                        path="//h5[starts-with(text(), 'Country')]/..//a[starts-with(@href, '/country/')]",
                        attrs=Attribute(key='country codes', multi=True,
                                        path="./@href",
                                        postprocess=lambda x: x.split('/')[2].strip()
                                        )),

                Extractor(label='creator',
                        path="//h5[starts-with(text(), 'Creator')]/..//a",
                        attrs=Attribute(key='creator', multi=True,
                                        path={'name': "./text()",
                                              'link': "./@href"},
                                        postprocess=lambda x: \
                                        build_person(x.get('name') or u'',
                                            personID=analyze_imdbid(x.get('link')))
                                        )),

                Extractor(label='thin writer',
                        path="//h5[starts-with(text(), 'Writer')]/..//a",
                        attrs=Attribute(key='thin writer', multi=True,
                                        path={'name': "./text()",
                                              'link': "./@href"},
                                        postprocess=lambda x: \
                                        build_person(x.get('name') or u'',
                                            personID=analyze_imdbid(x.get('link')))
                                        )),

                Extractor(label='thin director',
                        path="//h5[starts-with(text(), 'Director')]/..//a",
                        attrs=Attribute(key='thin director', multi=True,
                                        path={'name': "./text()",
                                              'link': "@href"},
                                        postprocess=lambda x: \
                                        build_person(x.get('name') or u'',
                                            personID=analyze_imdbid(x.get('link')))
                                        )),

                Extractor(label='top 250/bottom 100',
                        path="//div[@class='starbar-special']/" \
                                "a[starts-with(@href, '/chart/')]",
                        attrs=Attribute(key='top/bottom rank',
                                        path="./text()")),

                Extractor(label='series years',
                        path="//div[@id='tn15title']//span" \
                                "[starts-with(text(), 'TV series')]",
                        attrs=Attribute(key='series years',
                                        path="./text()",
                                        postprocess=lambda x: \
                                        x.replace('TV series', '').strip())),

                Extractor(label='number of episodes',
                        path="//a[@title='Full Episode List']",
                        attrs=Attribute(key='number of episodes',
                                        path="./text()",
                                        postprocess=lambda x: \
                                        _toInt(x, [(' Episodes', '')]))),
                Extractor(label='akas',
                        path="//i[@class='transl']",
                        attrs=Attribute(key='akas', multi=True, path='text()',
                                        postprocess=lambda x:
                                        x.replace('  ', ' ').rstrip('-').replace('" - ',
                                            '"::', 1).strip('"').replace('  ', ' '))),
                Extractor(label='production notes/status',
                        path="//h5[starts-with(text(), 'Status:')]/..//div[@class='info-content']",
                        attrs=Attribute(key='production status',
                                        path=".//text()",
                                        postprocess=lambda x: x.strip().split('|')[0].strip().lower())),

                Extractor(label='production notes/status updated',
                        path="//h5[starts-with(text(), 'Status Updated:')]/..//div[@class='info-content']",
                        attrs=Attribute(key='production status updated',
                                        path=".//text()",
                                        postprocess=lambda x: x.strip())),

                Extractor(label='production notes/comments',
                        path="//h5[starts-with(text(), 'Comments:')]/..//div[@class='info-content']",
                        attrs=Attribute(key='production comments',
                                        path=".//text()",
                                        postprocess=lambda x: x.strip())),

                Extractor(label='production notes/note',
                        path="//h5[starts-with(text(), 'Note:')]/..//div[@class='info-content']",
                        attrs=Attribute(key='production note',
                                        path=".//text()",
                                        postprocess=lambda x: x.strip())),

                Extractor(label='blackcatheader',
                        group="//b[@class='blackcatheader']",
                        group_key="./text()",
                        group_key_normalize=lambda x: x.lower(),
                        path="../ul/li",
                        attrs=Attribute(key=None,
                                        multi=True,
                                        path={'name': "./a//text()",
                                              'comp-link': "./a/@href",
                                              'notes': "./text()"},
                                        postprocess=lambda x: \
                                        Company(name=x.get('name') or u'',
                                            companyID=analyze_imdbid(x.get('comp-link')),
                                            notes=(x.get('notes') or u'').strip())
                                        )),

                Extractor(label='rating',
                        path="//div[@class='starbar-meta']/b",
                        attrs=Attribute(key='rating',
                                        path=".//text()")),

                Extractor(label='votes',
                        path="//div[@class='starbar-meta']/a[@href]",
                        attrs=Attribute(key='votes',
                                        path=".//text()")),

                Extractor(label='cover url',
                        path="//a[@name='poster']",
                        attrs=Attribute(key='cover url',
                                        path="./img/@src"))
                ]
    preprocessors = [
        (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I),
         r'</div><div>\1'),
        ('<small>Full cast and crew for<br>', ''),
        ('<td> </td>', '<td>...</td>'),
        ('<span class="tv-extra">TV mini-series</span>',
         '<span class="tv-extra">(mini)</span>'),
        (_reRolesMovie, _manageRoles),
        (_reAkas, _replaceBR)
        ]
    def preprocess_dom(self, dom):
        # Handle series information.
        xpath = self.xpath(dom, "//b[text()='Series Crew']")
        if xpath:
            b = xpath[-1] # In doubt, take the last one.
            for a in self.xpath(b, "./following::h5/a[@class='glossary']"):
                name = a.get('name')
                if name:
                    a.set('name', 'series %s' % name)
        # Remove links to IMDbPro.
        for proLink in self.xpath(dom, "//span[@class='pro-link']"):
            proLink.drop_tree()
        # Remove some 'more' links (keep others, like the one around
        # the number of votes).
        for tn15more in self.xpath(dom,
                    "//a[@class='tn15more'][starts-with(@href, '/title/')]"):
            tn15more.drop_tree()
        return dom
    re_space = re.compile(r'\s+')
    re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)
    def postprocess_data(self, data):
        # Convert section names.
        for sect in data.keys():
            if sect in _SECT_CONV:
                data[_SECT_CONV[sect]] = data[sect]
                del data[sect]
                sect = _SECT_CONV[sect]
        # Filter out fake values.
        for key in data:
            value = data[key]
            if isinstance(value, list) and value:
                if isinstance(value[0], Person):
                    data[key] = filter(lambda x: x.personID is not None, value)
                if isinstance(value[0], _Container):
                    for obj in data[key]:
                        obj.accessSystem = self._as
                        obj.modFunct = self._modFunct
        if 'akas' in data or 'other akas' in data:
            akas = data.get('akas') or []
            other_akas = data.get('other akas') or []
            akas += other_akas
            nakas = []
            for aka in akas:
                aka = aka.strip()
                if aka.endswith('" -'):
                    aka = aka[:-3].rstrip()
                nakas.append(aka)
            if 'akas' in data:
                del data['akas']
            if 'other akas' in data:
                del data['other akas']
            if nakas:
                data['akas'] = nakas
        if 'color info' in data:
            data['color info'] = [x.replace('Color:', '', 1) for x in data['color info']]
        if 'runtimes' in data:
            data['runtimes'] = [x.replace(' min', u'')
                                for x in data['runtimes']]
        if 'original air date' in data:
            oid = self.re_space.sub(' ', data['original air date']).strip()
            data['original air date'] = oid
            aid = self.re_airdate.findall(oid)
            if aid and len(aid[0]) == 3:
                date, season, episode = aid[0]
                date = date.strip()
                try: season = int(season)
                except: pass
                try: episode = int(episode)
                except: pass
                if date and date != '????':
                    data['original air date'] = date
                else:
                    del data['original air date']
                # Handle also "episode 0".
                if season or type(season) is type(0):
                    data['season'] = season
                if episode or type(episode) is type(0):
                    data['episode'] = episode
        for k in ('writer', 'director'):
            t_k = 'thin %s' % k
            if t_k not in data:
                continue
            if k not in data:
                data[k] = data[t_k]
            del data[t_k]
        if 'top/bottom rank' in data:
            tbVal = data['top/bottom rank'].lower()
            if tbVal.startswith('top'):
                tbKey = 'top 250 rank'
                tbVal = _toInt(tbVal, [('top 250: #', '')])
            else:
                tbKey = 'bottom 100 rank'
                tbVal = _toInt(tbVal, [('bottom 100: #', '')])
            if tbVal:
                data[tbKey] = tbVal
            del data['top/bottom rank']
        if 'year' in data and data['year'] == '????':
            del data['year']
        if 'tv series link' in data:
            if 'tv series title' in data:
                data['episode of'] = Movie(title=data['tv series title'],
                                           movieID=analyze_imdbid(
                                               data['tv series link']),
                                           accessSystem=self._as,
                                           modFunct=self._modFunct)
                del data['tv series title']
            del data['tv series link']
        if 'rating' in data:
            try:
                data['rating'] = float(data['rating'].replace('/10', ''))
            except (TypeError, ValueError):
                pass
        if 'votes' in data:
            try:
                votes = data['votes'].replace(',', '').replace('votes', '')
                data['votes'] = int(votes)
            except (TypeError, ValueError):
                pass
        return data
def _process_plotsummary(x):
    """Process a plot (contributed by Rdian06)."""
    xauthor = x.get('author')
    xplot = x.get('plot', u'').strip()
    if xauthor:
        xplot += u'::%s' % xauthor
    return xplot
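
# For example (illustrative values):
#
#     _process_plotsummary({'plot': u'A treasury agent hunts Al Capone.',
#                           'author': u'Some Contributor'})
#     # -> u'A treasury agent hunts Al Capone.::Some Contributor'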
class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a 'plot' key, containing a list
    of strings with the structure: 'summary::summary_author <author@email>'.

    Example:
        pparser = DOMHTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    _defGetRefs = True

    # Notice that recently IMDb started to put the email of the
    # author only in the link, that we're not collecting, here.
    extractors = [Extractor(label='plot',
                            path="//ul[@class='zebraList']//p",
                            attrs=Attribute(key='plot',
                                            multi=True,
                                            path={'plot': './text()[1]',
                                                  'author': './span/em/a/text()'},
                                            postprocess=_process_plotsummary))]
def _process_award(x):
    award = {}
    _award = x.get('award')
    if _award is not None:
        _award = _award.strip()
    award['award'] = _award
    if not award['award']:
        return {}
    award['year'] = x.get('year').strip()
    if award['year'] and award['year'].isdigit():
        award['year'] = int(award['year'])
    award['result'] = x.get('result').strip()
    category = x.get('category').strip()
    if category:
        award['category'] = category
    received_with = x.get('with')
    if received_with is not None:
        award['with'] = received_with.strip()
    notes = x.get('notes')
    if notes is not None:
        notes = notes.strip()
        if notes:
            award['notes'] = notes
    award['anchor'] = x.get('anchor')
    return award
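
# For example (illustrative values):
#
#     _process_award({'award': u'Oscar', 'year': u'1988', 'result': u'Won',
#                     'category': u'Best Actor in a Supporting Role',
#                     'anchor': u'oscar'})
#     # -> {'award': u'Oscar', 'year': 1988, 'result': u'Won',
#     #     'category': u'Best Actor in a Supporting Role', 'anchor': u'oscar'}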
class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        awparser = DOMHTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    subject = 'title'
    _containsObjects = True

    extractors = [
        Extractor(label='awards',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr/" \
                    "td[last()][not(@colspan)]",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'year': "../td[1]/a/text()",
                    'result': "../td[2]/b/text()",
                    'award': "../td[3]/text()",
                    'category': "./text()[1]",
                    # FIXME: takes only the first co-recipient
                    'with': "./small[starts-with(text()," \
                            " 'Shared with:')]/following-sibling::a[1]/text()",
                    'notes': "./small[last()]//text()",
                    'anchor': ".//text()"
                    },
                postprocess=_process_award
                )),
        Extractor(label='recipients',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr/" \
                    "td[last()]/small[1]/preceding-sibling::a",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'name': "./text()",
                    'link': "./@href",
                    'anchor': "..//text()"
                    }
                ))
        ]

    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
         r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
         r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
        ]

    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span-1]:
                # if not cloned, child will be moved to new parent
                clone = self.clone(col)
                # XXX: beware that here we don't use an "adapted" function,
                #      because both BeautifulSoup and lxml use the same
                #      "insert" method.
                tr.insert(position, clone)
        return dom

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = []
        for key in data.keys():
            dom = self.get_dom(key)
            assigner = self.xpath(dom, "//a/text()")[0]
            for entry in data[key]:
                if not entry.has_key('name'):
                    if not entry:
                        continue
                    # this is an award, not a recipient
                    entry['assigner'] = assigner.strip()
                    # find the recipients
                    matches = [p for p in data[key]
                               if p.has_key('name') and (entry['anchor'] ==
                                   p['anchor'])]
                    if self.subject == 'title':
                        recipients = [Person(name=recipient['name'],
                                    personID=analyze_imdbid(recipient['link']))
                                    for recipient in matches]
                        entry['to'] = recipients
                    elif self.subject == 'name':
                        recipients = [Movie(title=recipient['name'],
                                    movieID=analyze_imdbid(recipient['link']))
                                    for recipient in matches]
                        entry['for'] = recipients
                    nd.append(entry)
                del entry['anchor']
        return {'awards': nd}
class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    extractors = [Extractor(label='taglines',
                            path='//*[contains(concat(" ", normalize-space(@class), " "), " soda ")]',
                            attrs=Attribute(key='taglines',
                                            multi=True,
                                            path="./text()"))]

    def postprocess_data(self, data):
        if 'taglines' in data:
            data['taglines'] = [tagline.strip() for tagline in data['taglines']]
        return data
class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    extractors = [Extractor(label='keywords',
                            path="//a[starts-with(@href, '/keyword/')]",
                            attrs=Attribute(key='keywords',
                                            path="./text()", multi=True,
                                            postprocess=lambda x: \
                                            x.lower().replace(' ', '-')))]
class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        avparser = DOMHTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='alternate versions',
                            path="//ul[@class='trivia']/li",
                            attrs=Attribute(key='alternate versions',
                                            multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: x.strip()))]
class DOMHTMLTriviaParser(DOMParserBase):
    """Parser for the "trivia" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTriviaParser()
        result = tparser.parse(trivia_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='trivia',
                            path="//div[@class='sodatext']",
                            attrs=Attribute(key='trivia',
                                            multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: x.strip()))]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//span[@class='linksoda']"):
            qLink.drop_tree()
        return dom
class DOMHTMLSoundtrackParser(DOMHTMLAlternateVersionsParser):
    kind = 'soundtrack'

    preprocessors = [
        ('<br>', '\n')
        ]

    def postprocess_data(self, data):
        if 'alternate versions' in data:
            nd = []
            for x in data['alternate versions']:
                ds = x.split('\n')
                title = ds[0]
                if title[0] == '"' and title[-1] == '"':
                    title = title[1:-1]
                nds = []
                newData = {}
                for l in ds[1:]:
                    if ' with ' in l or ' by ' in l or ' from ' in l \
                            or ' of ' in l or l.startswith('From '):
                        nds.append(l)
                    else:
                        if nds:
                            nds[-1] += l
                        else:
                            nds.append(l)
                newData[title] = {}
                for l in nds:
                    skip = False
                    for sep in ('From ',):
                        if l.startswith(sep):
                            fdix = len(sep)
                            kind = l[:fdix].rstrip().lower()
                            info = l[fdix:].lstrip()
                            newData[title][kind] = info
                            skip = True
                    if not skip:
                        for sep in ' with ', ' by ', ' from ', ' of ':
                            fdix = l.find(sep)
                            if fdix != -1:
                                fdix = fdix + len(sep)
                                kind = l[:fdix].rstrip().lower()
                                info = l[fdix:].lstrip()
                                newData[title][kind] = info
                                break
                nd.append(newData)
            data['soundtrack'] = nd
        return data
class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='crazy credits', path="//ul/li/tt",
                            attrs=Attribute(key='crazy credits', multi=True,
                                            path=".//text()",
                                            postprocess=lambda x: \
                                            x.replace('\n', ' ').replace('  ', ' ')))]
def _process_goof(x):
    if x['spoiler_category']:
        return x['spoiler_category'].strip() + ': SPOILER: ' + x['text'].strip()
    else:
        return x['category'].strip() + ': ' + x['text'].strip()
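
# For example (illustrative values):
#
#     _process_goof({'category': u'Continuity', 'spoiler_category': None,
#                    'text': u'The glass is full, then suddenly empty.'})
#     # -> u'Continuity: The glass is full, then suddenly empty.'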
class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='goofs', path="//div[@class='soda odd']",
                            attrs=Attribute(key='goofs', multi=True,
                                            path={
                                                'text': "./text()",
                                                'category': './preceding-sibling::h4[1]/text()',
                                                'spoiler_category': './h4/text()'
                                                },
                                            postprocess=_process_goof))]
class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='quotes_odd',
            path="//div[@class='quote soda odd']",
            attrs=Attribute(key='quotes_odd',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip().replace(' \n',
                            '::').replace('::\n', '::').replace('\n', ' '))),
        Extractor(label='quotes_even',
            path="//div[@class='quote soda even']",
            attrs=Attribute(key='quotes_even',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip().replace(' \n',
                            '::').replace('::\n', '::').replace('\n', ' ')))
        ]

    preprocessors = [
        (re.compile('<a href="#" class="hidesoda hidden">Hide options</a><br>',
                    re.I), '')
        ]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//span[@class='linksoda']"):
            qLink.drop_tree()
        for qLink in self.xpath(dom, "//div[@class='sharesoda_pre']"):
            qLink.drop_tree()
        return dom

    def postprocess_data(self, data):
        quotes = data.get('quotes_odd', []) + data.get('quotes_even', [])
        if not quotes:
            return {}
        quotes = [q.split('::') for q in quotes]
        return {'quotes': quotes}
class DOMHTMLReleaseinfoParser(DOMParserBase):
    """Parser for the "release dates" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rdparser = DOMHTMLReleaseinfoParser()
        result = rdparser.parse(releaseinfo_html_string)
    """
    extractors = [Extractor(label='release dates',
                            path="//table[@id='release_dates']//tr",
                            attrs=Attribute(key='release dates', multi=True,
                                            path={'country': ".//td[1]//text()",
                                                  'date': ".//td[2]//text()",
                                                  'notes': ".//td[3]//text()"})),
                  Extractor(label='akas',
                            path="//table[@id='akas']//tr",
                            attrs=Attribute(key='akas', multi=True,
                                            path={'title': "./td[1]/text()",
                                                  'countries': "./td[2]/text()"}))]

    preprocessors = [
        (re.compile('(<h5><a name="?akas"?.*</table>)', re.I | re.M | re.S),
         r'<div class="_imdbpy_akas">\1</div>')]

    def postprocess_data(self, data):
        if not ('release dates' in data or 'akas' in data): return data
        releases = data.get('release dates') or []
        rl = []
        for i in releases:
            country = i.get('country')
            date = i.get('date')
            if not (country and date): continue
            country = country.strip()
            date = date.strip()
            if not (country and date): continue
            notes = i['notes']
            info = u'%s::%s' % (country, date)
            if notes:
                info += notes
            rl.append(info)
        if releases:
            del data['release dates']
        if rl:
            data['release dates'] = rl
        akas = data.get('akas') or []
        nakas = []
        for aka in akas:
            title = (aka.get('title') or '').strip()
            if not title:
                continue
            countries = (aka.get('countries') or '').split(',')
            if not countries:
                nakas.append(title)
            else:
                for country in countries:
                    nakas.append('%s::%s' % (title, country.strip()))
        if akas:
            del data['akas']
        if nakas:
            data['akas from release info'] = nakas
        return data
class DOMHTMLRatingsParser(DOMParserBase):
    """Parser for the "user ratings" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRatingsParser()
        result = rparser.parse(userratings_html_string)
    """
    re_means = re.compile(r'mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])',
                          re.I)

    extractors = [
        Extractor(label='number of votes',
            path="//td[b='Percentage']/../../tr",
            attrs=[Attribute(key='votes',
                multi=True,
                path={
                    'votes': "td[1]//text()",
                    'ordinal': "td[3]//text()"
                    })]),
        Extractor(label='mean and median',
            path="//p[starts-with(text(), 'Arithmetic mean')]",
            attrs=Attribute(key='mean and median',
                path="text()")),
        Extractor(label='rating',
            path="//a[starts-with(@href, '/search/title?user_rating=')]",
            attrs=Attribute(key='rating',
                path="text()")),
        Extractor(label='demographic voters',
            path="//td[b='Average']/../../tr",
            attrs=Attribute(key='demographic voters',
                multi=True,
                path={
                    'voters': "td[1]//text()",
                    'votes': "td[2]//text()",
                    'average': "td[3]//text()"
                    })),
        Extractor(label='top 250',
            path="//a[text()='top 250']",
            attrs=Attribute(key='top 250',
                path="./preceding-sibling::text()[1]"))
        ]

    def postprocess_data(self, data):
        nd = {}
        votes = data.get('votes', [])
        if votes:
            nd['number of votes'] = {}
            for i in xrange(1, 11):
                _ordinal = int(votes[i]['ordinal'])
                _strvts = votes[i]['votes'] or '0'
                nd['number of votes'][_ordinal] = \
                        int(_strvts.replace(',', ''))
        mean = data.get('mean and median', '')
        if mean:
            means = self.re_means.findall(mean)
            if means and len(means[0]) == 2:
                am, med = means[0]
                try: am = float(am)
                except (ValueError, OverflowError): pass
                if type(am) is type(1.0):
                    nd['arithmetic mean'] = am
                try: med = int(med)
                except (ValueError, OverflowError): pass
                if type(med) is type(0):
                    nd['median'] = med
        if 'rating' in data:
            nd['rating'] = float(data['rating'])
        dem_voters = data.get('demographic voters')
        if dem_voters:
            nd['demographic'] = {}
            for i in xrange(1, len(dem_voters)):
                if (dem_voters[i]['votes'] is not None) \
                        and (dem_voters[i]['votes'].strip()):
                    nd['demographic'][dem_voters[i]['voters'].strip().lower()] \
                            = (int(dem_voters[i]['votes'].replace(',', '')),
                               float(dem_voters[i]['average']))
        if 'imdb users' in nd.get('demographic', {}):
            nd['votes'] = nd['demographic']['imdb users'][0]
            nd['demographic']['all votes'] = nd['demographic']['imdb users']
            del nd['demographic']['imdb users']
        top250 = data.get('top 250')
        if top250:
            sd = top250[9:]
            i = sd.find(' ')
            if i != -1:
                sd = sd[:i]
            try: sd = int(sd)
            except (ValueError, OverflowError): pass
            if type(sd) is type(0):
                nd['top 250 rank'] = sd
        return nd
class DOMHTMLEpisodesRatings(DOMParserBase):
    """Parser for the "episode ratings ... by date" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        erparser = DOMHTMLEpisodesRatings()
        result = erparser.parse(eprating_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='title', path="//title",
                            attrs=Attribute(key='title', path="./text()")),
                  Extractor(label='ep ratings',
                            path="//th/../..//tr",
                            attrs=Attribute(key='episodes', multi=True,
                                            path={'nr': ".//td[1]/text()",
                                                  'ep title': ".//td[2]//text()",
                                                  'movieID': ".//td[2]/a/@href",
                                                  'rating': ".//td[3]/text()",
                                                  'votes': ".//td[4]/text()"}))]

    def postprocess_data(self, data):
        if 'title' not in data or 'episodes' not in data: return {}
        nd = []
        title = data['title']
        for i in data['episodes']:
            ept = i['ep title']
            movieID = analyze_imdbid(i['movieID'])
            votes = i['votes']
            rating = i['rating']
            if not (ept and movieID and votes and rating): continue
            try:
                votes = int(votes.replace(',', '').replace('.', ''))
            except:
                pass
            try:
                rating = float(rating)
            except:
                pass
            ept = ept.strip()
            ept = u'%s {%s' % (title, ept)
            nr = i['nr']
            if nr:
                ept += u' (#%s)' % nr.strip()
            ept += '}'
            if movieID is not None:
                movieID = str(movieID)
            m = Movie(title=ept, movieID=movieID, accessSystem=self._as,
                      modFunct=self._modFunct)
            epofdict = m.get('episode of')
            if epofdict is not None:
                m['episode of'] = Movie(data=epofdict, accessSystem=self._as,
                                        modFunct=self._modFunct)
            nd.append({'episode': m, 'votes': votes, 'rating': rating})
        return {'episodes rating': nd}
def _normalize_href(href):
    if (href is not None) and (not href.lower().startswith('http://')):
        if href.startswith('/'): href = href[1:]
        # TODO: imdbURL_base may be set by the user!
        href = '%s%s' % (imdbURL_base, href)
    return href
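
# For example (illustrative value; assumes the default imdbURL_base,
# 'http://akas.imdb.com/'):
#
#     _normalize_href(u'/title/tt0094226/')
#     # -> u'http://akas.imdb.com/title/tt0094226/'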
class DOMHTMLCriticReviewsParser(DOMParserBase):
    """Parser for the "critic reviews" pages of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        crparser = DOMHTMLCriticReviewsParser()
        result = crparser.parse(criticreviews_html_string)
    """
    kind = 'critic reviews'

    extractors = [
        Extractor(label='metascore',
            path="//div[@class='metascore_wrap']/div/span",
            attrs=Attribute(key='metascore',
                path=".//text()")),
        Extractor(label='metacritic url',
            path="//div[@class='article']/div[@class='see-more']/a",
            attrs=Attribute(key='metacritic url',
                path="./@href"))
        ]
class DOMHTMLOfficialsitesParser(DOMParserBase):
    """Parser for the "official sites", "external reviews", "newsgroup
    reviews", "miscellaneous links", "sound clips", "video clips" and
    "photographs" pages of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        osparser = DOMHTMLOfficialsitesParser()
        result = osparser.parse(officialsites_html_string)
    """
    kind = 'official sites'

    extractors = [
        Extractor(label='site',
            path="//ol/li/a",
            attrs=Attribute(key='self.kind',
                multi=True,
                path={
                    'link': "./@href",
                    'info': "./text()"
                    },
                postprocess=lambda x: (x.get('info').strip(),
                            urllib.unquote(_normalize_href(x.get('link'))))))
        ]
class DOMHTMLConnectionParser(DOMParserBase):
    """Parser for the "connections" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        connparser = DOMHTMLConnectionParser()
        result = connparser.parse(connections_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='connection',
                            group="//div[@class='_imdbpy']",
                            group_key="./h5/text()",
                            group_key_normalize=lambda x: x.lower(),
                            path="./a",
                            attrs=Attribute(key=None,
                                            path={'title': "./text()",
                                                  'movieID': "./@href"},
                                            multi=True))]

    preprocessors = [
        ('<h5>', '</div><div class="_imdbpy"><h5>'),
        # To get the movie's year.
        ('</a> (', ' ('),
        ('\n<br/>', '</a>'),
        ('<br/> - ', '::')
        ]

    def postprocess_data(self, data):
        for key in data.keys():
            nl = []
            for v in data[key]:
                title = v['title']
                ts = title.split('::', 1)
                title = ts[0].strip()
                notes = u''
                if len(ts) == 2:
                    notes = ts[1].strip()
                m = Movie(title=title,
                          movieID=analyze_imdbid(v['movieID']),
                          accessSystem=self._as, notes=notes,
                          modFunct=self._modFunct)
                nl.append(m)
            data[key] = nl
        if not data: return {}
        return {'connections': data}
class DOMHTMLLocationsParser(DOMParserBase):
    """Parser for the "locations" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        lparser = DOMHTMLLocationsParser()
        result = lparser.parse(locations_html_string)
    """
    extractors = [Extractor(label='locations', path="//dt",
                            attrs=Attribute(key='locations', multi=True,
                                            path={'place': ".//text()",
                                                  'note': "./following-sibling::dd[1]" \
                                                          "//text()"},
                                            postprocess=lambda x: (u'%s::%s' % (
                                                x['place'].strip(),
                                                (x['note'] or u'').strip())).strip(':')))]
class DOMHTMLTechParser(DOMParserBase):
    """Parser for the "technical", "business", "literature",
    "publicity" (for people) and "contacts" (for people) pages of
    a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTechParser()
        result = tparser.parse(technical_html_string)
    """
    kind = 'tech'

    extractors = [Extractor(label='tech',
                            group="//h5",
                            group_key="./text()",
                            group_key_normalize=lambda x: x.lower(),
                            path="./following-sibling::div[1]",
                            attrs=Attribute(key=None,
                                            path=".//text()",
                                            postprocess=lambda x: [t.strip()
                                                for t in x.split('\n') if t.strip()]))]

    preprocessors = [
        (re.compile('(<h5>.*?</h5>)', re.I), r'</div>\1<div class="_imdbpy">'),
        (re.compile('((<br/>|</p>|</table>))\n?<br/>(?!<a)', re.I),
         r'\1</div>'),
        # the ones below are for the publicity parser
        (re.compile('<p>(.*?)</p>', re.I), r'\1<br/>'),
        (re.compile('(</td><td valign="top">)', re.I), r'\1::'),
        (re.compile('(</tr><tr>)', re.I), r'\n\1'),
        # this is for splitting individual entries
        (re.compile('<br/>', re.I), r'\n'),
        ]

    def postprocess_data(self, data):
        for key in data:
            data[key] = filter(None, data[key])
        if self.kind in ('literature', 'business', 'contacts') and data:
            if 'screenplay/teleplay' in data:
                data['screenplay-teleplay'] = data['screenplay/teleplay']
                del data['screenplay/teleplay']
            data = {self.kind: data}
        else:
            if self.kind == 'publicity':
                if 'biography (print)' in data:
                    data['biography-print'] = data['biography (print)']
                    del data['biography (print)']
            # Tech info.
            for key in data.keys():
                if key.startswith('film negative format'):
                    data['film negative format'] = data[key]
                    del data[key]
                elif key.startswith('film length'):
                    data['film length'] = data[key]
                    del data[key]
        return data
class DOMHTMLRecParser(DOMParserBase):
    """Parser for the "recommendations" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRecParser()
        result = rparser.parse(recommendations_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='recommendations',
                            path="//td[@valign='middle'][1]",
                            attrs=Attribute(key='../../tr/td[1]//text()',
                                            multi=True,
                                            path={'title': ".//text()",
                                                  'movieID': ".//a/@href"}))]

    def postprocess_data(self, data):
        for key in data.keys():
            n_key = key
            n_keyl = n_key.lower()
            if n_keyl == 'suggested by the database':
                n_key = 'database'
            elif n_keyl == 'imdb users recommend':
                n_key = 'users'
            data[n_key] = [Movie(title=x['title'],
                                 movieID=analyze_imdbid(x['movieID']),
                                 accessSystem=self._as, modFunct=self._modFunct)
                           for x in data[key]]
            del data[key]
        if data: return {'recommendations': data}
        return data
class DOMHTMLNewsParser(DOMParserBase):
    """Parser for the "news" page of a given movie or person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        nwparser = DOMHTMLNewsParser()
        result = nwparser.parse(news_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='news',
            path="//h2",
            attrs=Attribute(key='news',
                multi=True,
                path={
                    'title': "./text()",
                    'fromdate': "../following-sibling::p[1]/small//text()",
                    # FIXME: sometimes (see The Matrix (1999)) <p> is found
                    #        inside news text.
                    'body': "../following-sibling::p[2]//text()",
                    'link': "../..//a[text()='Permalink']/@href",
                    'fulllink': "../..//a[starts-with(text(), " \
                            "'See full article at')]/@href"
                    },
                postprocess=lambda x: {
                    'title': x.get('title').strip(),
                    'date': x.get('fromdate').split('|')[0].strip(),
                    'from': x.get('fromdate').split('|')[1].replace('From ',
                            '').strip(),
                    'body': (x.get('body') or u'').strip(),
                    'link': _normalize_href(x.get('link')),
                    'full article link': _normalize_href(x.get('fulllink'))
                    }))
        ]

    preprocessors = [
        (re.compile('(<a name=[^>]+><h2>)', re.I), r'<div class="_imdbpy">\1'),
        (re.compile('(<hr/>)', re.I), r'</div>\1'),
        (re.compile('<p></p>', re.I), r'')
        ]

    def postprocess_data(self, data):
        if not data.has_key('news'):
            return {}
        for news in data['news']:
            if news.has_key('full article link'):
                if news['full article link'] is None:
                    del news['full article link']
        return data
def _parse_review(x):
    result = {}
    title = x.get('title').strip()
    if title[-1] == ':': title = title[:-1]
    result['title'] = title
    result['link'] = _normalize_href(x.get('link'))
    kind = x.get('kind').strip()
    if kind[-1] == ':': kind = kind[:-1]
    result['review kind'] = kind
    text = x.get('review').replace('\n\n', '||').replace('\n', ' ').split('||')
    review = '\n'.join(text)
    if x.get('author') is not None:
        author = x.get('author').strip()
        review = review.split(author)[0].strip()
        result['review author'] = author[2:]
    if x.get('item') is not None:
        item = x.get('item').strip()
        review = review[len(item):].strip()
        review = "%s: %s" % (item, review)
    result['review'] = review
    return result
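
# For example (illustrative values):
#
#     _parse_review({'title': u'DVD review:', 'link': u'/reviews/r1',
#                    'kind': u'newspaper:', 'author': u'- John Doe',
#                    'review': u'A fine transfer.\n- John Doe', 'item': None})
#     # -> {'title': u'DVD review', 'review kind': u'newspaper',
#     #     'link': u'http://akas.imdb.com/reviews/r1',
#     #     'review author': u'John Doe', 'review': u'A fine transfer.'}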
class DOMHTMLSeasonEpisodesParser(DOMParserBase):
    """Parser for the "episode list" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSeasonEpisodesParser()
        result = sparser.parse(episodes_html_string)
    """
    extractors = [
        Extractor(label='series link',
            path="//div[@class='parent']",
            attrs=[Attribute(key='series link',
                path=".//a/@href")]
            ),
        Extractor(label='series title',
            path="//head/meta[@property='og:title']",
            attrs=[Attribute(key='series title',
                path="./@content")]
            ),
        Extractor(label='seasons list',
            path="//select[@id='bySeason']//option",
            attrs=[Attribute(key='_seasons',
                multi=True,
                path="./@value")]),
        Extractor(label='selected season',
            path="//select[@id='bySeason']//option[@selected]",
            attrs=[Attribute(key='_current_season',
                path='./@value')]),
        Extractor(label='episodes',
            path=".",
            group="//div[@class='info']",
            group_key=".//meta/@content",
            group_key_normalize=lambda x: 'episode %s' % x,
            attrs=[Attribute(key=None,
                multi=True,
                path={
                    "link": ".//strong//a[@href][1]/@href",
                    "original air date": ".//div[@class='airdate']/text()",
                    "title": ".//strong//text()",
                    "plot": ".//div[@class='item_description']//text()"
                    }
                )]
            )
        ]
    def postprocess_data(self, data):
        series_id = analyze_imdbid(data.get('series link'))
        series_title = data.get('series title', '').strip()
        selected_season = data.get('_current_season',
                                   'unknown season').strip()
        if not (series_id and series_title):
            return {}
        series = Movie(title=series_title, movieID=str(series_id),
                       accessSystem=self._as, modFunct=self._modFunct)
        if series.get('kind') == 'movie':
            series['kind'] = u'tv series'
        try: selected_season = int(selected_season)
        except: pass
        nd = {selected_season: {}}
        if 'episode -1' in data:
            counter = 1
            for episode in data['episode -1']:
                while 'episode %d' % counter in data:
                    counter += 1
                k = 'episode %d' % counter
                data[k] = [episode]
            del data['episode -1']
        for episode_nr, episode in data.iteritems():
            if not (episode and episode[0] and
                    episode_nr.startswith('episode ')):
                continue
            episode = episode[0]
            episode_nr = episode_nr[8:].rstrip()
            try: episode_nr = int(episode_nr)
            except: pass
            episode_id = analyze_imdbid(episode.get('link', ''))
            episode_air_date = episode.get('original air date',
                                           '').strip()
            episode_title = episode.get('title', '').strip()
            episode_plot = episode.get('plot', '')
            if not (episode_nr is not None and episode_id and episode_title):
                continue
            ep_obj = Movie(movieID=episode_id, title=episode_title,
                           accessSystem=self._as, modFunct=self._modFunct)
            ep_obj['kind'] = u'episode'
            ep_obj['episode of'] = series
            ep_obj['season'] = selected_season
            ep_obj['episode'] = episode_nr
            if episode_air_date:
                ep_obj['original air date'] = episode_air_date
                if episode_air_date[-4:].isdigit():
                    ep_obj['year'] = episode_air_date[-4:]
            if episode_plot:
                ep_obj['plot'] = episode_plot
            nd[selected_season][episode_nr] = ep_obj
        _seasons = data.get('_seasons') or []
        for idx, season in enumerate(_seasons):
            try: _seasons[idx] = int(season)
            except: pass
        return {'episodes': nd, '_seasons': _seasons,
                '_current_season': selected_season}
def _build_episode(x):
    """Create a Movie object for a given series' episode."""
    episode_id = analyze_imdbid(x.get('link'))
    episode_title = x.get('title')
    e = Movie(movieID=episode_id, title=episode_title)
    e['kind'] = u'episode'
    oad = x.get('oad')
    if oad:
        e['original air date'] = oad.strip()
    year = x.get('year')
    if year is not None:
        year = year[5:]
        if year == 'unknown': year = u'????'
        if year and year.isdigit():
            year = int(year)
        e['year'] = year
    else:
        if oad and oad[-4:].isdigit():
            e['year'] = int(oad[-4:])
    epinfo = x.get('episode')
    if epinfo is not None:
        season, episode = epinfo.split(':')[0].split(',')
        e['season'] = int(season[7:])
        e['episode'] = int(episode[8:])
    else:
        e['season'] = 'unknown'
        e['episode'] = 'unknown'
    plot = x.get('plot')
    if plot:
        e['plot'] = plot.strip()
    return e
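
# For example (illustrative values, shaped like the XPath results used by
# DOMHTMLEpisodesParser below):
#
#     _build_episode({'link': u'/title/tt0579539/', 'title': u'The Train Job',
#                     'year': u'year-2002', 'episode': u'Season 1, Episode 2:',
#                     'oad': u'20 September 2002'})
#
# returns a Movie with kind u'episode', season 1, episode 2, year 2002
# and 'original air date' u'20 September 2002'.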
  1441. class DOMHTMLEpisodesParser(DOMParserBase):
  1442. """Parser for the "episode list" page of a given movie.
  1443. The page should be provided as a string, as taken from
  1444. the akas.imdb.com server. The final result will be a
  1445. dictionary, with a key for every relevant section.
  1446. Example:
  1447. eparser = DOMHTMLEpisodesParser()
  1448. result = eparser.parse(episodes_html_string)
  1449. """
  1450. # XXX: no more used for the list of episodes parser,
  1451. # but only for the episodes cast parser (see below).
  1452. _containsObjects = True
  1453. kind = 'episodes list'
  1454. _episodes_path = "..//h4"
  1455. _oad_path = "./following-sibling::span/strong[1]/text()"
  1456. def _init(self):
  1457. self.extractors = [
  1458. Extractor(label='series',
  1459. path="//html",
  1460. attrs=[Attribute(key='series title',
  1461. path=".//title/text()"),
  1462. Attribute(key='series movieID',
  1463. path=".//h1/a[@class='main']/@href",
  1464. postprocess=analyze_imdbid)
  1465. ]),
  1466. Extractor(label='episodes',
  1467. group="//div[@class='_imdbpy']/h3",
  1468. group_key="./a/@name",
  1469. path=self._episodes_path,
  1470. attrs=Attribute(key=None,
  1471. multi=True,
  1472. path={
  1473. 'link': "./a/@href",
  1474. 'title': "./a/text()",
  1475. 'year': "./preceding-sibling::a[1]/@name",
  1476. 'episode': "./text()[1]",
  1477. 'oad': self._oad_path,
  1478. 'plot': "./following-sibling::text()[1]"
  1479. },
  1480. postprocess=_build_episode))]
  1481. if self.kind == 'episodes cast':
  1482. self.extractors += [
  1483. Extractor(label='cast',
  1484. group="//h4",
  1485. group_key="./text()[1]",
  1486. group_key_normalize=lambda x: x.strip(),
  1487. path="./following-sibling::table[1]//td[@class='nm']",
  1488. attrs=Attribute(key=None,
  1489. multi=True,
  1490. path={'person': "..//text()",
  1491. 'link': "./a/@href",
  1492. 'roleID': \
  1493. "../td[4]/div[@class='_imdbpyrole']/@roleid"},
  1494. postprocess=lambda x: \
  1495. build_person(x.get('person') or u'',
  1496. personID=analyze_imdbid(x.get('link')),
  1497. roleID=(x.get('roleID') or u'').split('/'),
  1498. accessSystem=self._as,
  1499. modFunct=self._modFunct)))
  1500. ]
  1501. preprocessors = [
  1502. (re.compile('(<hr/>\n)(<h3>)', re.I),
  1503. r'</div>\1<div class="_imdbpy">\2'),
  1504. (re.compile('(</p>\n\n)</div>', re.I), r'\1'),
  1505. (re.compile('<h3>(.*?)</h3>', re.I), r'<h4>\1</h4>'),
  1506. (_reRolesMovie, _manageRoles),
  1507. (re.compile('(<br/> <br/>\n)(<hr/>)', re.I), r'\1</div>\2')
  1508. ]
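# A hedged illustration of the first preprocessor above, on an assumed
# fragment of the raw page (the season title is hypothetical):
#   re.sub('(<hr/>\n)(<h3>)', r'</div>\1<div class="_imdbpy">\2',
#          '<hr/>\n<h3>Season 1</h3>')
#   -> '</div><hr/>\n<div class="_imdbpy"><h3>Season 1</h3>'
# i.e. every season block is wrapped in a div the extractors can address.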
  1509. def postprocess_data(self, data):
  1510. # A bit extreme?
1511. if 'series title' not in data: return {}
1512. if 'series movieID' not in data: return {}
  1513. stitle = data['series title'].replace('- Episode list', '')
  1514. stitle = stitle.replace('- Episodes list', '')
  1515. stitle = stitle.replace('- Episode cast', '')
  1516. stitle = stitle.replace('- Episodes cast', '')
  1517. stitle = stitle.strip()
  1518. if not stitle: return {}
  1519. seriesID = data['series movieID']
  1520. if seriesID is None: return {}
  1521. series = Movie(title=stitle, movieID=str(seriesID),
  1522. accessSystem=self._as, modFunct=self._modFunct)
  1523. nd = {}
  1524. for key in data.keys():
  1525. if key.startswith('filter-season-') or key.startswith('season-'):
  1526. season_key = key.replace('filter-season-', '').replace('season-', '')
1527. try: season_key = int(season_key)
1528. except ValueError: pass
  1529. nd[season_key] = {}
  1530. ep_counter = 1
  1531. for episode in data[key]:
  1532. if not episode: continue
  1533. episode_key = episode.get('episode')
  1534. if episode_key is None: continue
  1535. if not isinstance(episode_key, int):
  1536. episode_key = ep_counter
  1537. ep_counter += 1
  1538. cast_key = 'Season %s, Episode %s:' % (season_key,
  1539. episode_key)
1540. if cast_key in data:
  1541. cast = data[cast_key]
  1542. for i in xrange(len(cast)):
  1543. cast[i].billingPos = i + 1
  1544. episode['cast'] = cast
  1545. episode['episode of'] = series
  1546. nd[season_key][episode_key] = episode
  1547. if len(nd) == 0:
  1548. return {}
  1549. return {'episodes': nd}
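# Usage sketch, mirroring the class docstring ('episodes_html_string' is a
# hypothetical page taken from the akas.imdb.com server):
#   eparser = DOMHTMLEpisodesParser()
#   result = eparser.parse(episodes_html_string)
# when parsing succeeds, the data contain an 'episodes' dictionary shaped
# as {season: {episode: Movie}}, with 'episode of' pointing to the series.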
  1550. class DOMHTMLEpisodesCastParser(DOMHTMLEpisodesParser):
  1551. """Parser for the "episodes cast" page of a given movie.
  1552. The page should be provided as a string, as taken from
  1553. the akas.imdb.com server. The final result will be a
  1554. dictionary, with a key for every relevant section.
  1555. Example:
1556. eparser = DOMHTMLEpisodesCastParser()
  1557. result = eparser.parse(episodes_html_string)
  1558. """
  1559. kind = 'episodes cast'
  1560. _episodes_path = "..//h4"
  1561. _oad_path = "./following-sibling::b[1]/text()"
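# Note that the subclass above only swaps the 'kind' flag (which enables
# the extra 'cast' extractor in _init) and the XPath used to reach the
# original air date; everything else is inherited. A hedged usage sketch
# ('episodes_cast_html_string' is hypothetical):
#   ecparser = DOMHTMLEpisodesCastParser()
#   result = ecparser.parse(episodes_cast_html_string)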
  1562. class DOMHTMLFaqsParser(DOMParserBase):
  1563. """Parser for the "FAQ" page of a given movie.
  1564. The page should be provided as a string, as taken from
  1565. the akas.imdb.com server. The final result will be a
  1566. dictionary, with a key for every relevant section.
  1567. Example:
  1568. fparser = DOMHTMLFaqsParser()
  1569. result = fparser.parse(faqs_html_string)
  1570. """
  1571. _defGetRefs = True
1572. # XXX: the bsoup and lxml backends don't match here (looks like a minor issue, anyway).
  1573. extractors = [
  1574. Extractor(label='faqs',
  1575. path="//div[@class='section']",
  1576. attrs=Attribute(key='faqs',
  1577. multi=True,
  1578. path={
  1579. 'question': "./h3/a/span/text()",
  1580. 'answer': "../following-sibling::div[1]//text()"
  1581. },
  1582. postprocess=lambda x: u'%s::%s' % (x.get('question').strip(),
  1583. '\n\n'.join(x.get('answer').replace(
  1584. '\n\n', '\n').strip().split('||')))))
  1585. ]
  1586. preprocessors = [
  1587. (re.compile('<br/><br/>', re.I), r'||'),
  1588. (re.compile('<h4>(.*?)</h4>\n', re.I), r'||\1--'),
  1589. (re.compile('<span class="spoiler"><span>(.*?)</span></span>', re.I),
  1590. r'[spoiler]\1[/spoiler]')
  1591. ]
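# Editor's sketch of the FAQ pre/post processing on hypothetical extractor
# output: '<br/><br/>' runs are first turned into '||' markers by the
# preprocessors, and the 'faqs' postprocess rejoins the answer with blank
# lines:
#   x = {'question': u' Is it a remake? ', 'answer': u'No.||See the FAQ.'}
#   -> u'Is it a remake?::No.\n\nSee the FAQ.'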
  1592. class DOMHTMLAiringParser(DOMParserBase):
  1593. """Parser for the "airing" page of a given movie.
  1594. The page should be provided as a string, as taken from
  1595. the akas.imdb.com server. The final result will be a
  1596. dictionary, with a key for every relevant section.
  1597. Example:
  1598. aparser = DOMHTMLAiringParser()
  1599. result = aparser.parse(airing_html_string)
  1600. """
  1601. _containsObjects = True
  1602. extractors = [
  1603. Extractor(label='series title',
  1604. path="//title",
  1605. attrs=Attribute(key='series title', path="./text()",
  1606. postprocess=lambda x: \
  1607. x.replace(' - TV schedule', u''))),
  1608. Extractor(label='series id',
  1609. path="//h1/a[@href]",
  1610. attrs=Attribute(key='series id', path="./@href")),
  1611. Extractor(label='tv airings',
  1612. path="//tr[@class]",
  1613. attrs=Attribute(key='airing',
  1614. multi=True,
  1615. path={
  1616. 'date': "./td[1]//text()",
  1617. 'time': "./td[2]//text()",
  1618. 'channel': "./td[3]//text()",
  1619. 'link': "./td[4]/a[1]/@href",
  1620. 'title': "./td[4]//text()",
  1621. 'season': "./td[5]//text()",
  1622. },
  1623. postprocess=lambda x: {
  1624. 'date': x.get('date'),
  1625. 'time': x.get('time'),
  1626. 'channel': x.get('channel').strip(),
  1627. 'link': x.get('link'),
  1628. 'title': x.get('title'),
  1629. 'season': (x.get('season') or '').strip()
  1630. }
  1631. ))
  1632. ]
  1633. def postprocess_data(self, data):
  1634. if len(data) == 0:
  1635. return {}
  1636. seriesTitle = data['series title']
  1637. seriesID = analyze_imdbid(data['series id'])
1638. if 'airing' in data:
  1639. for airing in data['airing']:
  1640. title = airing.get('title', '').strip()
  1641. if not title:
  1642. epsTitle = seriesTitle
  1643. if seriesID is None:
  1644. continue
  1645. epsID = seriesID
  1646. else:
  1647. epsTitle = '%s {%s}' % (data['series title'],
  1648. airing['title'])
  1649. epsID = analyze_imdbid(airing['link'])
  1650. e = Movie(title=epsTitle, movieID=epsID)
  1651. airing['episode'] = e
  1652. del airing['link']
  1653. del airing['title']
  1654. if not airing['season']:
  1655. del airing['season']
  1656. if 'series title' in data:
  1657. del data['series title']
  1658. if 'series id' in data:
  1659. del data['series id']
  1660. if 'airing' in data:
  1661. data['airing'] = filter(None, data['airing'])
  1662. if 'airing' not in data or not data['airing']:
  1663. return {}
  1664. return data
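# A hedged sketch of a single entry of data['airing'] after the
# postprocess_data above (all values hypothetical):
#   {'date': u'Mon. Jan. 14', 'time': u'8:00 PM', 'channel': u'ABC',
#    'season': u'1', 'episode': <Movie 'Series Title {Pilot}'>}
# 'link' and 'title' are consumed to build the Movie object and removed;
# an empty 'season' is dropped as well.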
  1665. class DOMHTMLSynopsisParser(DOMParserBase):
  1666. """Parser for the "synopsis" page of a given movie.
  1667. The page should be provided as a string, as taken from
  1668. the akas.imdb.com server. The final result will be a
  1669. dictionary, with a key for every relevant section.
  1670. Example:
1671. sparser = DOMHTMLSynopsisParser()
  1672. result = sparser.parse(synopsis_html_string)
  1673. """
  1674. extractors = [
  1675. Extractor(label='synopsis',
  1676. path="//div[@class='display'][not(@style)]",
  1677. attrs=Attribute(key='synopsis',
  1678. path=".//text()",
  1679. postprocess=lambda x: '\n\n'.join(x.strip().split('||'))))
  1680. ]
  1681. preprocessors = [
  1682. (re.compile('<br/><br/>', re.I), r'||')
  1683. ]
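# Usage sketch, mirroring the class docstring ('synopsis_html_string' is a
# hypothetical page string):
#   sparser = DOMHTMLSynopsisParser()
#   result = sparser.parse(synopsis_html_string)
# paragraph breaks survive because '<br/><br/>' is first replaced with
# '||' and the extractor's postprocess rejoins the parts with blank lines.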
  1684. class DOMHTMLParentsGuideParser(DOMParserBase):
  1685. """Parser for the "parents guide" page of a given movie.
  1686. The page should be provided as a string, as taken from
  1687. the akas.imdb.com server. The final result will be a
  1688. dictionary, with a key for every relevant section.
  1689. Example:
1690. pgparser = DOMHTMLParentsGuideParser()
  1691. result = pgparser.parse(parentsguide_html_string)
  1692. """
  1693. extractors = [
  1694. Extractor(label='parents guide',
  1695. group="//div[@class='section']",
  1696. group_key="./h3/a/span/text()",
  1697. group_key_normalize=lambda x: x.lower(),
  1698. path="../following-sibling::div[1]/p",
  1699. attrs=Attribute(key=None,
  1700. path=".//text()",
  1701. postprocess=lambda x: [t.strip().replace('\n', ' ')
  1702. for t in x.split('||') if t.strip()]))
  1703. ]
  1704. preprocessors = [
  1705. (re.compile('<br/><br/>', re.I), r'||')
  1706. ]
  1707. def postprocess_data(self, data):
  1708. data2 = {}
  1709. for key in data:
  1710. if data[key]:
  1711. data2[key] = data[key]
  1712. if not data2:
  1713. return {}
  1714. return {'parents guide': data2}
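# A hedged sketch of the final dictionary (section names and texts are
# hypothetical; sections with no content are dropped by postprocess_data):
#   {'parents guide': {'violence': [u'Some fist fights.'],
#                      'profanity': [u'A few mild expletives.']}}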
  1715. _OBJECTS = {
  1716. 'movie_parser': ((DOMHTMLMovieParser,), None),
  1717. 'plot_parser': ((DOMHTMLPlotParser,), None),
  1718. 'movie_awards_parser': ((DOMHTMLAwardsParser,), None),
  1719. 'taglines_parser': ((DOMHTMLTaglinesParser,), None),
  1720. 'keywords_parser': ((DOMHTMLKeywordsParser,), None),
  1721. 'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None),
  1722. 'goofs_parser': ((DOMHTMLGoofsParser,), None),
  1723. 'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None),
  1724. 'trivia_parser': ((DOMHTMLTriviaParser,), None),
  1725. 'soundtrack_parser': ((DOMHTMLSoundtrackParser,), {'kind': 'soundtrack'}),
  1726. 'quotes_parser': ((DOMHTMLQuotesParser,), None),
  1727. 'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
  1728. 'ratings_parser': ((DOMHTMLRatingsParser,), None),
  1729. 'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
  1730. 'criticrev_parser': ((DOMHTMLCriticReviewsParser,),
  1731. {'kind': 'critic reviews'}),
  1732. 'externalrev_parser': ((DOMHTMLOfficialsitesParser,),
  1733. {'kind': 'external reviews'}),
  1734. 'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,),
  1735. {'kind': 'newsgroup reviews'}),
  1736. 'misclinks_parser': ((DOMHTMLOfficialsitesParser,),
  1737. {'kind': 'misc links'}),
  1738. 'soundclips_parser': ((DOMHTMLOfficialsitesParser,),
  1739. {'kind': 'sound clips'}),
  1740. 'videoclips_parser': ((DOMHTMLOfficialsitesParser,),
  1741. {'kind': 'video clips'}),
  1742. 'photosites_parser': ((DOMHTMLOfficialsitesParser,),
  1743. {'kind': 'photo sites'}),
  1744. 'connections_parser': ((DOMHTMLConnectionParser,), None),
  1745. 'tech_parser': ((DOMHTMLTechParser,), None),
  1746. 'business_parser': ((DOMHTMLTechParser,),
  1747. {'kind': 'business', '_defGetRefs': 1}),
  1748. 'literature_parser': ((DOMHTMLTechParser,), {'kind': 'literature'}),
  1749. 'locations_parser': ((DOMHTMLLocationsParser,), None),
  1750. 'rec_parser': ((DOMHTMLRecParser,), None),
  1751. 'news_parser': ((DOMHTMLNewsParser,), None),
  1752. 'episodes_parser': ((DOMHTMLEpisodesParser,), None),
  1753. 'season_episodes_parser': ((DOMHTMLSeasonEpisodesParser,), None),
  1754. 'episodes_cast_parser': ((DOMHTMLEpisodesCastParser,), None),
  1755. 'eprating_parser': ((DOMHTMLEpisodesRatings,), None),
  1756. 'movie_faqs_parser': ((DOMHTMLFaqsParser,), None),
  1757. 'airing_parser': ((DOMHTMLAiringParser,), None),
  1758. 'synopsis_parser': ((DOMHTMLSynopsisParser,), None),
  1759. 'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None)
  1760. }
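# Editor's sketch of how the _OBJECTS registry is meant to be consumed;
# the build_parser helper below is hypothetical and only illustrates the
# (classes, attributes) tuple layout: the first class is instantiated and
# the optional dictionary overrides instance attributes such as 'kind'.
#   def build_parser(name):
#       classes, attrs = _OBJECTS[name]
#       parser = classes[0]()
#       for key, value in (attrs or {}).items():
#           setattr(parser, key, value)
#       return parser
#   soundtrack = build_parser('soundtrack_parser')  # kind == 'soundtrack'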