
/dependencies/imdbpy/imdb/parser/http/movieParser.py

https://bitbucket.org/filmaster/filmaster-stable/
Python | 1923 lines | 1884 code | 13 blank | 26 comment | 23 complexity | 5dc80d4409a77ef3c03496d0d983b3aa MD5
Possible License(s): BSD-2-Clause, GPL-2.0, BSD-3-Clause, JSON
  1. """
  2. parser.http.movieParser module (imdb package).
  3. This module provides the classes (and the instances), used to parse the
  4. IMDb pages on the akas.imdb.com server about a movie.
  5. E.g., for Brian De Palma's "The Untouchables", the referred
  6. pages would be:
  7. combined details: http://akas.imdb.com/title/tt0094226/combined
  8. plot summary: http://akas.imdb.com/title/tt0094226/plotsummary
  9. ...and so on...
  10. Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
  11. 2008 H. Turgut Uyar <uyar@tekir.org>
  12. This program is free software; you can redistribute it and/or modify
  13. it under the terms of the GNU General Public License as published by
  14. the Free Software Foundation; either version 2 of the License, or
  15. (at your option) any later version.
  16. This program is distributed in the hope that it will be useful,
  17. but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. GNU General Public License for more details.
  20. You should have received a copy of the GNU General Public License
  21. along with this program; if not, write to the Free Software
  22. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  23. """
import re
import urllib

from imdb import imdbURL_base
from imdb.Person import Person
from imdb.Movie import Movie
from imdb.Company import Company
from imdb.utils import analyze_title, split_company_name_notes, _Container
from utils import build_person, DOMParserBase, Attribute, Extractor, \
        analyze_imdbid
# Dictionary used to convert some section's names.
_SECT_CONV = {
        'directed': 'director',
        'directed by': 'director',
        'directors': 'director',
        'editors': 'editor',
        'writing credits': 'writer',
        'writers': 'writer',
        'produced': 'producer',
        'cinematography': 'cinematographer',
        'film editing': 'editor',
        'casting': 'casting director',
        'costume design': 'costume designer',
        'makeup department': 'make up',
        'production management': 'production manager',
        'second unit director or assistant director': 'assistant director',
        'costume and wardrobe department': 'costume department',
        'sound department': 'sound crew',
        'stunts': 'stunt performer',
        'other crew': 'miscellaneous crew',
        'also known as': 'akas',
        'country': 'countries',
        'runtime': 'runtimes',
        'language': 'languages',
        'certification': 'certificates',
        'genre': 'genres',
        'created': 'creator',
        'creators': 'creator',
        'color': 'color info',
        'plot': 'plot outline',
        'seasons': 'number of seasons',
        'art directors': 'art direction',
        'assistant directors': 'assistant director',
        'set decorators': 'set decoration',
        'visual effects department': 'visual effects',
        'miscellaneous': 'miscellaneous crew',
        'make up department': 'make up',
        'plot summary': 'plot outline',
        'cinematographers': 'cinematographer',
        'camera department': 'camera and electrical department',
        'costume designers': 'costume designer',
        'production designers': 'production design',
        'production managers': 'production manager',
        'music original': 'original music',
        'casting directors': 'casting director',
        'other companies': 'miscellaneous companies',
        'producers': 'producer',
        'special effects by': 'special effects department',
        'special effects': 'special effects companies'
        }
def _manageRoles(mo):
    """Perform some transformation on the html, so that roleIDs can
    be easily retrieved."""
    firstHalf = mo.group(1)
    secondHalf = mo.group(2)
    newRoles = []
    roles = secondHalf.split(' / ')
    for role in roles:
        role = role.strip()
        if not role:
            continue
        roleID = analyze_imdbid(role)
        if roleID is None:
            roleID = u'/'
        else:
            roleID += u'/'
        newRoles.append(u'<div class="_imdbpyrole" roleid="%s">%s</div>' % \
                (roleID, role.strip()))
    return firstHalf + u' / '.join(newRoles) + mo.group(3)
_reRolesMovie = re.compile(r'(<td class="char">)(.*?)(</td>)',
                            re.I | re.M | re.S)
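
# Editor's sketch (not part of the original module): _reRolesMovie and
# _manageRoles together rewrite the content of a cast "char" cell such as
#   <td class="char"><a href="/character/ch0000985/">Eliot Ness</a></td>
# wrapping it in <div class="_imdbpyrole" roleid="...">...</div>, so that
# the 'cast' extractor of DOMHTMLMovieParser below can read the role ID
# straight from the roleid attribute (the exact ID value comes from
# analyze_imdbid; the sample markup is illustrative).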
def _replaceBR(mo):
    """Replaces <br> tags with '::' (useful for some akas)"""
    txt = mo.group(0)
    return txt.replace('<br>', '::')

_reAkas = re.compile(r'<h5>also known as:</h5>.*?</div>', re.I | re.M | re.S)
def makeSplitter(lstrip=None, sep='|', comments=True):
    """Return a splitter function suitable for a given set of data."""
    def splitter(x):
        if not x: return x
        x = x.strip()
        if not x: return x
        if lstrip is not None:
            x = x.lstrip(lstrip).lstrip()
        lx = x.split(sep)
        lx[:] = filter(None, [j.strip() for j in lx])
        if comments:
            lx[:] = [j.replace(' (', '::(', 1) for j in lx]
        return lx
    return splitter
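
# Editor's sketch of makeSplitter (not in the original module); the input
# below is illustrative, not taken from a real IMDb page:
# >>> split_lang = makeSplitter(lstrip='Language:')
# >>> split_lang('Language: Italian | English (dubbed)')
# ['Italian', 'English::(dubbed)']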
def _toInt(val, replace=()):
    """Return the value, converted to integer, or None; if present, 'replace'
    must be a list of tuples of values to replace."""
    for before, after in replace:
        val = val.replace(before, after)
    try:
        return int(val)
    except (TypeError, ValueError):
        return None
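
# Editor's sketch of _toInt (not in the original module):
# >>> _toInt('3,001', [(',', '')])
# 3001
# while an unparsable value such as 'N/A' returns None.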
class DOMHTMLMovieParser(DOMParserBase):
    """Parser for the "combined details" (and if instance.mdparse is
    True also for the "main details") page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        mparser = DOMHTMLMovieParser()
        result = mparser.parse(combined_details_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(label='title',
            path="//h1",
            attrs=Attribute(key='title',
                path=".//text()",
                postprocess=analyze_title)),

        Extractor(label='glossarysections',
            group="//a[@class='glossary']",
            group_key="./@name",
            group_key_normalize=lambda x: x.replace('_', ' '),
            path="../../../..//tr",
            attrs=Attribute(key=None,
                multi=True,
                path={'person': ".//text()",
                    'link': "./td[1]/a[@href]/@href"},
                postprocess=lambda x: \
                    build_person(x.get('person') or u'',
                        personID=analyze_imdbid(x.get('link'))))),

        Extractor(label='cast',
            path="//table[@class='cast']//tr",
            attrs=Attribute(key="cast",
                multi=True,
                path={'person': ".//text()",
                    'link': "td[2]/a/@href",
                    'roleID': "td[4]/div[@class='_imdbpyrole']/@roleid"},
                postprocess=lambda x: \
                    build_person(x.get('person') or u'',
                        personID=analyze_imdbid(x.get('link')),
                        roleID=(x.get('roleID') or u'').split('/')))),

        Extractor(label='genres',
            path="//div[@class='info']//a[starts-with(@href," \
                    " '/Sections/Genres')]",
            attrs=Attribute(key="genres",
                multi=True,
                path="./text()")),

        Extractor(label='h5sections',
            path="//div[@class='info']/h5/..",
            attrs=[
                Attribute(key="plot summary",
                    path="./h5[starts-with(text(), 'Plot:')]/../div/text()",
                    postprocess=lambda x: x.strip().rstrip('|').rstrip()),
                Attribute(key="aspect ratio",
                    path="./h5[starts-with(text(), 'Aspect')]/../div/text()",
                    postprocess=lambda x: x.strip()),
                Attribute(key="mpaa",
                    path="./h5/a[starts-with(text(), 'MPAA')]/../../div/text()",
                    postprocess=lambda x: x.strip()),
                # fix by Pawel Maczewski (email)
                Attribute(key="countries",
                    path="./h5[starts-with(text(), 'Countr')]/..//a/text()",
                    postprocess=makeSplitter(sep='|')),
                    # postprocess=makeSplitter(sep='\n')),
                Attribute(key="language",
                    path="./h5[starts-with(text(), 'Language')]/..//text()",
                    postprocess=makeSplitter('Language:')),
                Attribute(key='color info',
                    path="./h5[starts-with(text(), 'Color')]/..//text()",
                    postprocess=makeSplitter('Color:')),
                Attribute(key='sound mix',
                    path="./h5[starts-with(text(), 'Sound Mix')]/..//text()",
                    postprocess=makeSplitter('Sound Mix:')),
                # Collects akas not enclosed in <i> tags.
                Attribute(key='other akas',
                    path="./h5[starts-with(text(), 'Also Known As')]" \
                            "/../div//text()",
                    postprocess=makeSplitter(sep='::')),
                Attribute(key='runtimes',
                    path="./h5[starts-with(text(), 'Runtime')]/../div/text()",
                    postprocess=makeSplitter()),
                Attribute(key='certificates',
                    path="./h5[starts-with(text(), 'Certificat')]/..//text()",
                    postprocess=makeSplitter('Certification:')),
                Attribute(key='number of seasons',
                    path="./h5[starts-with(text(), 'Seasons')]/..//text()",
                    postprocess=lambda x: x.count('|') + 1),
                Attribute(key='original air date',
                    path="./h5[starts-with(text(), 'Original Air Date')]" \
                            "/../div/text()"),
                Attribute(key='tv series link',
                    path="./h5[starts-with(text(), 'TV Series')]/..//a/@href"),
                Attribute(key='tv series title',
                    path="./h5[starts-with(text(), 'TV Series')]/..//a/text()")
                ]),
        Extractor(label='creator',
            path="//h5[starts-with(text(), 'Creator')]/..//a",
            attrs=Attribute(key='creator', multi=True,
                path={'name': "./text()",
                    'link': "./@href"},
                postprocess=lambda x: \
                    build_person(x.get('name') or u'',
                        personID=analyze_imdbid(x.get('link'))))),

        Extractor(label='thin writer',
            path="//h5[starts-with(text(), 'Writer')]/..//a",
            attrs=Attribute(key='thin writer', multi=True,
                path={'name': "./text()",
                    'link': "./@href"},
                postprocess=lambda x: \
                    build_person(x.get('name') or u'',
                        personID=analyze_imdbid(x.get('link'))))),

        Extractor(label='thin director',
            path="//h5[starts-with(text(), 'Director')]/..//a",
            attrs=Attribute(key='thin director', multi=True,
                path={'name': "./text()",
                    'link': "@href"},
                postprocess=lambda x: \
                    build_person(x.get('name') or u'',
                        personID=analyze_imdbid(x.get('link'))))),

        Extractor(label='top 250/bottom 100',
            path="//div[@class='starbar-special']/" \
                    "a[starts-with(@href, '/chart/')]",
            attrs=Attribute(key='top/bottom rank',
                path="./text()")),

        Extractor(label='series years',
            path="//div[@id='tn15title']//span" \
                    "[starts-with(text(), 'TV series')]",
            attrs=Attribute(key='series years',
                path="./text()",
                postprocess=lambda x: x.replace('TV series', '').strip())),

        Extractor(label='number of episodes',
            path="//a[@title='Full Episode List']",
            attrs=Attribute(key='number of episodes',
                path="./text()",
                postprocess=lambda x: _toInt(x, [(' Episodes', '')]))),
        Extractor(label='akas',
            path="//i[@class='transl']",
            attrs=Attribute(key='akas', multi=True, path='text()',
                postprocess=lambda x:
                    x.replace('  ', ' ').replace(' (',
                        '::(', 1).replace('  ', ' '))),
        Extractor(label='production notes/status',
            path="//div[@class='info inprod']",
            attrs=Attribute(key='production notes',
                path=".//text()",
                postprocess=lambda x: x.strip())),

        Extractor(label='blackcatheader',
            group="//b[@class='blackcatheader']",
            group_key="./text()",
            group_key_normalize=lambda x: x.lower(),
            path="../ul/li",
            attrs=Attribute(key=None,
                multi=True,
                path={'name': "./a//text()",
                    'comp-link': "./a/@href",
                    'notes': "./text()"},
                postprocess=lambda x: \
                    Company(name=x.get('name') or u'',
                        companyID=analyze_imdbid(x.get('comp-link')),
                        notes=(x.get('notes') or u'').strip()))),

        Extractor(label='rating',
            path="//div[@class='starbar-meta']/b",
            attrs=Attribute(key='rating',
                path=".//text()")),

        Extractor(label='votes',
            path="//div[@class='starbar-meta']/a[@href]",
            attrs=Attribute(key='votes',
                path=".//text()")),

        Extractor(label='cover url',
            path="//a[@name='poster']",
            attrs=Attribute(key='cover url',
                path="./img/@src"))
        ]
    preprocessors = [
        (re.compile(r'(<b class="blackcatheader">.+?</b>)', re.I),
            r'</div><div>\1'),
        ('<small>Full cast and crew for<br></small>', ''),
        ('<td> </td>', '<td>...</td>'),
        ('<span class="tv-extra">TV mini-series</span>',
            '<span class="tv-extra">(mini)</span>'),
        (_reRolesMovie, _manageRoles),
        (_reAkas, _replaceBR)]
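
    # Editor's sketch (not part of the original module): each preprocessor
    # above is a (pattern, replacement) pair applied to the raw HTML before
    # the DOM is built; e.g. the first one closes and reopens a <div> around
    # every "blackcatheader" section title, so that the 'blackcatheader'
    # extractor above finds each company list in its own division.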
    def preprocess_dom(self, dom):
        # Handle series information.
        xpath = self.xpath(dom, "//b[text()='Series Crew']")
        if xpath:
            b = xpath[-1] # In doubt, take the last one.
            for a in self.xpath(b, "./following::h5/a[@class='glossary']"):
                name = a.get('name')
                if name:
                    a.set('name', 'series %s' % name)
        # Remove links to IMDbPro.
        for proLink in self.xpath(dom, "//span[@class='pro-link']"):
            proLink.drop_tree()
        # Remove some 'more' links (keep others, like the one around
        # the number of votes).
        for tn15more in self.xpath(dom,
                "//a[@class='tn15more'][starts-with(@href, '/title/')]"):
            tn15more.drop_tree()
        return dom
    re_space = re.compile(r'\s+')
    re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I)
    def postprocess_data(self, data):
        # Convert section names.
        for sect in data.keys():
            if sect in _SECT_CONV:
                data[_SECT_CONV[sect]] = data[sect]
                del data[sect]
                sect = _SECT_CONV[sect]
        # Filter out fake values.
        for key in data:
            value = data[key]
            if isinstance(value, list) and value:
                if isinstance(value[0], Person):
                    data[key] = filter(lambda x: x.personID is not None, value)
                if isinstance(value[0], _Container):
                    for obj in data[key]:
                        obj.accessSystem = self._as
                        obj.modFunct = self._modFunct
        if 'akas' in data or 'other akas' in data:
            akas = data.get('akas') or []
            akas += data.get('other akas') or []
            if 'akas' in data:
                del data['akas']
            if 'other akas' in data:
                del data['other akas']
            if akas:
                data['akas'] = akas
        if 'runtimes' in data:
            data['runtimes'] = [x.replace(' min', u'')
                    for x in data['runtimes']]
        if 'production notes' in data:
            pn = data['production notes'].replace('\n\nComments:',
                    '\nComments:').replace('\n\nNote:',
                    '\nNote:').replace('Note:\n\n',
                    'Note:\n').split('\n')
            for k, v in zip(pn[::2], pn[1::2]):
                v = v.strip()
                if not v:
                    continue
                k = k.lower().strip(':')
                if k == 'note':
                    k = 'status note'
                data[k] = v
            del data['production notes']
        if 'original air date' in data:
            oid = self.re_space.sub(' ', data['original air date']).strip()
            data['original air date'] = oid
            aid = self.re_airdate.findall(oid)
            if aid and len(aid[0]) == 3:
                date, season, episode = aid[0]
                date = date.strip()
                try: season = int(season)
                except: pass
                try: episode = int(episode)
                except: pass
                if date and date != '????':
                    data['original air date'] = date
                else:
                    del data['original air date']
                # Handle also "episode 0".
                if season or type(season) is type(0):
                    data['season'] = season
                if episode or type(episode) is type(0):
                    data['episode'] = episode
        for k in ('writer', 'director'):
            t_k = 'thin %s' % k
            if t_k not in data:
                continue
            if k not in data:
                data[k] = data[t_k]
            del data[t_k]
        if 'top/bottom rank' in data:
            tbVal = data['top/bottom rank'].lower()
            if tbVal.startswith('top'):
                tbKey = 'top 250 rank'
                tbVal = _toInt(tbVal, [('top 250: #', '')])
            else:
                tbKey = 'bottom 100 rank'
                tbVal = _toInt(tbVal, [('bottom 100: #', '')])
            if tbVal:
                data[tbKey] = tbVal
            del data['top/bottom rank']
        if 'year' in data and data['year'] == '????':
            del data['year']
        if 'tv series link' in data:
            if 'tv series title' in data:
                data['episode of'] = Movie(title=data['tv series title'],
                        movieID=analyze_imdbid(data['tv series link']),
                        accessSystem=self._as,
                        modFunct=self._modFunct)
                del data['tv series title']
            del data['tv series link']
        if 'rating' in data:
            try:
                data['rating'] = float(data['rating'].replace('/10', ''))
            except (TypeError, ValueError):
                pass
        if 'votes' in data:
            try:
                votes = data['votes'].replace(',', '').replace('votes', '')
                data['votes'] = int(votes)
            except (TypeError, ValueError):
                pass
        return data
def _process_plotsummary(x):
    """Process a plot (contributed by Rdian06)."""
    if x.get('author') is None:
        xauthor = u'Anonymous'
    else:
        xauthor = x.get('author').replace('{', '<').replace('}',
                '>').replace('(', '<').replace(')', '>')
    xplot = x.get('plot', '').strip()
    return u'%s::%s' % (xplot, xauthor)
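
# Editor's sketch (not in the original module): for an extracted dict like
# {'plot': 'A tale of two cities.', 'author': 'jdoe {jdoe@example.com}'}
# _process_plotsummary returns
# u'A tale of two cities.::jdoe <jdoe@example.com>'; with no author at all
# the result ends in '::Anonymous'.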
class DOMHTMLPlotParser(DOMParserBase):
    """Parser for the "plot summary" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a 'plot' key, containing a list
    of strings with the structure: 'summary::summary_author <author@email>'.

    Example:
        pparser = DOMHTMLPlotParser()
        result = pparser.parse(plot_summary_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='plot',
        path="//p[@class='plotpar']",
        attrs=Attribute(key='plot',
            multi=True,
            path={'plot': './text()',
                'author': './i/a/text()'},
            postprocess=_process_plotsummary))]
def _process_award(x):
    award = {}
    award['year'] = x.get('year').strip()
    if award['year'] and award['year'].isdigit():
        award['year'] = int(award['year'])
    award['result'] = x.get('result').strip()
    award['award'] = x.get('award').strip()
    category = x.get('category').strip()
    if category:
        award['category'] = category
    received_with = x.get('with')
    if received_with is not None:
        award['with'] = received_with.strip()
    notes = x.get('notes')
    if notes is not None:
        notes = notes.strip()
        if notes:
            award['notes'] = notes
    award['anchor'] = x.get('anchor')
    return award
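
# Editor's sketch (not in the original module): for an extracted dict like
# {'year': '1988', 'result': 'Won', 'award': 'Oscar',
#  'category': 'Best Cinematography', 'with': None, 'notes': None,
#  'anchor': '...'}, _process_award returns the same data cleaned up:
# 'year' becomes the int 1988 and the empty optional fields ('with',
# 'notes') are left out.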
class DOMHTMLAwardsParser(DOMParserBase):
    """Parser for the "awards" page of a given person or movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        awparser = DOMHTMLAwardsParser()
        result = awparser.parse(awards_html_string)
    """
    subject = 'title'
    _containsObjects = True

    extractors = [
        Extractor(label='awards',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr/" \
                    "td[last()][not(@colspan)]",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'year': "../td[1]/a/text()",
                    'result': "../td[2]/b/text()",
                    'award': "../td[3]/text()",
                    'category': "./text()[1]",
                    # FIXME: takes only the first co-recipient
                    'with': "./small[starts-with(text()," \
                            " 'Shared with:')]/following-sibling::a[1]/text()",
                    'notes': "./small[last()]//text()",
                    'anchor': ".//text()"
                    },
                postprocess=_process_award)),
        Extractor(label='recipients',
            group="//table//big",
            group_key="./a",
            path="./ancestor::tr[1]/following-sibling::tr/" \
                    "td[last()]/small[1]/preceding-sibling::a",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'name': "./text()",
                    'link': "./@href",
                    'anchor': "..//text()"
                    }))
        ]
    preprocessors = [
        (re.compile('(<tr><td[^>]*>.*?</td></tr>\n\n</table>)', re.I),
            r'\1</table>'),
        (re.compile('(<tr><td[^>]*>\n\n<big>.*?</big></td></tr>)', re.I),
            r'</table><table class="_imdbpy">\1'),
        (re.compile('(<table[^>]*>\n\n)</table>(<table)', re.I), r'\1\2'),
        (re.compile('(<small>.*?)<br>(.*?</small)', re.I), r'\1 \2'),
        (re.compile('(</tr>\n\n)(<td)', re.I), r'\1<tr>\2')
        ]
    def preprocess_dom(self, dom):
        """Repeat td elements according to their rowspan attributes
        in subsequent tr elements.
        """
        cols = self.xpath(dom, "//td[@rowspan]")
        for col in cols:
            span = int(col.get('rowspan'))
            del col.attrib['rowspan']
            position = len(self.xpath(col, "./preceding-sibling::td"))
            row = col.getparent()
            for tr in self.xpath(row, "./following-sibling::tr")[:span-1]:
                # if not cloned, child will be moved to new parent
                clone = self.clone(col)
                # XXX: beware that here we don't use an "adapted" function,
                # because both BeautifulSoup and lxml use the same
                # "insert" method.
                tr.insert(position, clone)
        return dom
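
    # Editor's sketch (not part of the original module): preprocess_dom above
    # turns
    #   <tr><td rowspan="2">1988</td><td>first award</td></tr>
    #   <tr><td>second award</td></tr>
    # into two rows that each carry their own copy of the year cell, which
    # keeps the column positions stable for the extractors.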
    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = []
        for key in data.keys():
            dom = self.get_dom(key)
            assigner = self.xpath(dom, "//a/text()")[0]
            for entry in data[key]:
                if not entry.has_key('name'):
                    # this is an award, not a recipient
                    entry['assigner'] = assigner.strip()
                    # find the recipients
                    matches = [p for p in data[key]
                            if p.has_key('name') and (entry['anchor'] ==
                                p['anchor'])]
                    if self.subject == 'title':
                        recipients = [Person(name=recipient['name'],
                                personID=analyze_imdbid(recipient['link']))
                                for recipient in matches]
                        entry['to'] = recipients
                    elif self.subject == 'name':
                        recipients = [Movie(title=recipient['name'],
                                movieID=analyze_imdbid(recipient['link']))
                                for recipient in matches]
                        entry['for'] = recipients
                    nd.append(entry)
                del entry['anchor']
        return {'awards': nd}
class DOMHTMLTaglinesParser(DOMParserBase):
    """Parser for the "taglines" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTaglinesParser()
        result = tparser.parse(taglines_html_string)
    """
    extractors = [Extractor(label='taglines',
        path="//div[@id='tn15content']/p",
        attrs=Attribute(key='taglines', multi=True,
            path="./text()"))]
class DOMHTMLKeywordsParser(DOMParserBase):
    """Parser for the "keywords" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        kwparser = DOMHTMLKeywordsParser()
        result = kwparser.parse(keywords_html_string)
    """
    extractors = [Extractor(label='keywords',
        path="//a[starts-with(@href, '/keyword/')]",
        attrs=Attribute(key='keywords',
            path="./text()", multi=True,
            postprocess=lambda x: x.lower().replace(' ', '-')))]
class DOMHTMLAlternateVersionsParser(DOMParserBase):
    """Parser for the "alternate versions" and "trivia" pages of a
    given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        avparser = DOMHTMLAlternateVersionsParser()
        result = avparser.parse(alternateversions_html_string)
    """
    _defGetRefs = True
    kind = 'alternate versions'
    extractors = [Extractor(label='alternate versions',
        path="//ul[@class='trivia']/li",
        attrs=Attribute(key='self.kind',
            multi=True,
            path=".//text()",
            postprocess=lambda x: x.strip()))]
class DOMHTMLSoundtrackParser(DOMHTMLAlternateVersionsParser):
    kind = 'soundtrack'

    preprocessors = [
        ('<br>', '\n')
        ]

    def postprocess_data(self, data):
        if 'soundtrack' in data:
            nd = []
            for x in data['soundtrack']:
                ds = x.split('\n')
                title = ds[0]
                if title[0] == '"' and title[-1] == '"':
                    title = title[1:-1]
                nds = []
                newData = {}
                for l in ds[1:]:
                    if ' with ' in l or ' by ' in l or ' from ' in l \
                            or ' of ' in l or l.startswith('From '):
                        nds.append(l)
                    else:
                        if nds:
                            nds[-1] += l
                        else:
                            nds.append(l)
                newData[title] = {}
                for l in nds:
                    skip = False
                    for sep in ('From ',):
                        if l.startswith(sep):
                            fdix = len(sep)
                            kind = l[:fdix].rstrip().lower()
                            info = l[fdix:].lstrip()
                            newData[title][kind] = info
                            skip = True
                    if not skip:
                        for sep in ' with ', ' by ', ' from ', ' of ':
                            fdix = l.find(sep)
                            if fdix != -1:
                                fdix = fdix + len(sep)
                                kind = l[:fdix].rstrip().lower()
                                info = l[fdix:].lstrip()
                                newData[title][kind] = info
                                break
                nd.append(newData)
            data['soundtrack'] = nd
        return data
class DOMHTMLCrazyCreditsParser(DOMParserBase):
    """Parser for the "crazy credits" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        ccparser = DOMHTMLCrazyCreditsParser()
        result = ccparser.parse(crazycredits_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='crazy credits', path="//ul/li/tt",
        attrs=Attribute(key='crazy credits', multi=True,
            path=".//text()",
            postprocess=lambda x: \
                x.replace('\n', ' ').replace('  ', ' ')))]
class DOMHTMLGoofsParser(DOMParserBase):
    """Parser for the "goofs" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLGoofsParser()
        result = gparser.parse(goofs_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='goofs', path="//ul[@class='trivia']/li",
        attrs=Attribute(key='goofs', multi=True, path=".//text()",
            postprocess=lambda x: (x or u'').strip()))]
class DOMHTMLQuotesParser(DOMParserBase):
    """Parser for the "memorable quotes" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLQuotesParser()
        result = qparser.parse(quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='quotes',
            path="//div[@class='_imdbpy']",
            attrs=Attribute(key='quotes',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip().replace(' \n',
                    '::').replace('::\n', '::').replace('\n', ' ')))
        ]

    preprocessors = [
        (re.compile('(<a name="?qt[0-9]{7}"?></a>)', re.I),
            r'\1<div class="_imdbpy">'),
        (re.compile('<hr width="30%">', re.I), '</div>'),
        (re.compile('<hr/>', re.I), '</div>'),
        (re.compile('<script.*?</script>', re.I|re.S), ''),
        # For BeautifulSoup.
        (re.compile('<!-- sid: t-channel : MIDDLE_CENTER -->', re.I), '</div>')
        ]

    def preprocess_dom(self, dom):
        # Remove "link this quote" links.
        for qLink in self.xpath(dom, "//p[@class='linksoda']"):
            qLink.drop_tree()
        return dom

    def postprocess_data(self, data):
        if 'quotes' not in data:
            return {}
        for idx, quote in enumerate(data['quotes']):
            data['quotes'][idx] = quote.split('::')
        return data
class DOMHTMLReleaseinfoParser(DOMParserBase):
    """Parser for the "release dates" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rdparser = DOMHTMLReleaseinfoParser()
        result = rdparser.parse(releaseinfo_html_string)
    """
    extractors = [Extractor(label='release dates',
        path="//th[@class='xxxx']/../../tr",
        attrs=Attribute(key='release dates', multi=True,
            path={'country': ".//td[1]//text()",
                'date': ".//td[2]//text()",
                'notes': ".//td[3]//text()"})),
        Extractor(label='akas',
            path="//div[@class='_imdbpy_akas']/table/tr",
            attrs=Attribute(key='akas', multi=True,
                path={'title': "./td[1]/text()",
                    'countries': "./td[2]/text()"}))]

    preprocessors = [
        (re.compile('(<h5><a name="?akas"?.*</table>)', re.I | re.M | re.S),
            r'<div class="_imdbpy_akas">\1</div>')]
    def postprocess_data(self, data):
        if not ('release dates' in data or 'akas' in data): return data
        releases = data.get('release dates') or []
        rl = []
        for i in releases:
            country = i.get('country')
            date = i.get('date')
            if not (country and date): continue
            country = country.strip()
            date = date.strip()
            if not (country and date): continue
            notes = i['notes']
            info = u'%s::%s' % (country, date)
            if notes:
                info += notes
            rl.append(info)
        if releases:
            del data['release dates']
        if rl:
            data['release dates'] = rl
        akas = data.get('akas') or []
        nakas = []
        for aka in akas:
            title = aka.get('title', '').strip()
            if not title:
                continue
            countries = aka.get('countries', '').split('/')
            if not countries:
                nakas.append(title)
            else:
                for country in countries:
                    nakas.append('%s::%s' % (title, country.strip()))
        if akas:
            del data['akas']
        if nakas:
            data['akas'] = nakas
        return data
class DOMHTMLRatingsParser(DOMParserBase):
    """Parser for the "user ratings" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRatingsParser()
        result = rparser.parse(userratings_html_string)
    """
    re_means = re.compile(r'mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])',
            re.I)

    extractors = [
        Extractor(label='number of votes',
            path="//td[b='Percentage']/../../tr",
            attrs=[Attribute(key='votes',
                multi=True,
                path={
                    'votes': "td[1]//text()",
                    'ordinal': "td[3]//text()"
                    })]),
        Extractor(label='mean and median',
            path="//p[starts-with(text(), 'Arithmetic mean')]",
            attrs=Attribute(key='mean and median',
                path="text()")),
        Extractor(label='rating',
            path="//a[starts-with(@href, '/search/title?user_rating=')]",
            attrs=Attribute(key='rating',
                path="text()")),
        Extractor(label='demographic voters',
            path="//td[b='Average']/../../tr",
            attrs=Attribute(key='demographic voters',
                multi=True,
                path={
                    'voters': "td[1]//text()",
                    'votes': "td[2]//text()",
                    'average': "td[3]//text()"
                    })),
        Extractor(label='top 250',
            path="//a[text()='top 250']",
            attrs=Attribute(key='top 250',
                path="./preceding-sibling::text()[1]"))
        ]
    def postprocess_data(self, data):
        nd = {}
        votes = data.get('votes', [])
        if votes:
            nd['number of votes'] = {}
            for i in xrange(1, 11):
                nd['number of votes'][int(votes[i]['ordinal'])] = \
                        int(votes[i]['votes'].replace(',', ''))
        mean = data.get('mean and median', '')
        if mean:
            means = self.re_means.findall(mean)
            if means and len(means[0]) == 2:
                am, med = means[0]
                try: am = float(am)
                except (ValueError, OverflowError): pass
                if type(am) is type(1.0):
                    nd['arithmetic mean'] = am
                try: med = int(med)
                except (ValueError, OverflowError): pass
                if type(med) is type(0):
                    nd['median'] = med
        if 'rating' in data:
            nd['rating'] = float(data['rating'])
        dem_voters = data.get('demographic voters')
        if dem_voters:
            nd['demographic'] = {}
            for i in xrange(1, len(dem_voters)):
                if (dem_voters[i]['votes'] is not None) \
                        and (dem_voters[i]['votes'].strip()):
                    nd['demographic'][dem_voters[i]['voters'].strip().lower()] \
                            = (int(dem_voters[i]['votes'].replace(',', '')),
                                float(dem_voters[i]['average']))
        if 'imdb users' in nd.get('demographic', {}):
            nd['votes'] = nd['demographic']['imdb users'][0]
            nd['demographic']['all votes'] = nd['demographic']['imdb users']
            del nd['demographic']['imdb users']
        top250 = data.get('top 250')
        if top250:
            sd = top250[9:]
            i = sd.find(' ')
            if i != -1:
                sd = sd[:i]
            try: sd = int(sd)
            except (ValueError, OverflowError): pass
            if type(sd) is type(0):
                nd['top 250 rank'] = sd
        return nd
class DOMHTMLEpisodesRatings(DOMParserBase):
    """Parser for the "episode ratings ... by date" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        erparser = DOMHTMLEpisodesRatings()
        result = erparser.parse(eprating_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='title', path="//title",
        attrs=Attribute(key='title', path="./text()")),
        Extractor(label='ep ratings',
            path="//th/../..//tr",
            attrs=Attribute(key='episodes', multi=True,
                path={'nr': ".//td[1]/text()",
                    'ep title': ".//td[2]//text()",
                    'movieID': ".//td[2]/a/@href",
                    'rating': ".//td[3]/text()",
                    'votes': ".//td[4]/text()"}))]

    def postprocess_data(self, data):
        if 'title' not in data or 'episodes' not in data: return {}
        nd = []
        title = data['title']
        for i in data['episodes']:
            ept = i['ep title']
            movieID = analyze_imdbid(i['movieID'])
            votes = i['votes']
            rating = i['rating']
            if not (ept and movieID and votes and rating): continue
            try:
                votes = int(votes.replace(',', '').replace('.', ''))
            except:
                pass
            try:
                rating = float(rating)
            except:
                pass
            ept = ept.strip()
            ept = u'%s {%s' % (title, ept)
            nr = i['nr']
            if nr:
                ept += u' (#%s)' % nr.strip()
            ept += '}'
            if movieID is not None:
                movieID = str(movieID)
            m = Movie(title=ept, movieID=movieID, accessSystem=self._as,
                    modFunct=self._modFunct)
            epofdict = m.get('episode of')
            if epofdict is not None:
                m['episode of'] = Movie(data=epofdict, accessSystem=self._as,
                        modFunct=self._modFunct)
            nd.append({'episode': m, 'votes': votes, 'rating': rating})
        return {'episodes rating': nd}
def _normalize_href(href):
    if (href is not None) and (not href.lower().startswith('http://')):
        if href.startswith('/'): href = href[1:]
        href = '%s%s' % (imdbURL_base, href)
    return href
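
# Editor's sketch (not in the original module): _normalize_href makes
# relative IMDb links absolute, e.g. _normalize_href('/title/tt0094226/')
# returns imdbURL_base + 'title/tt0094226/'; None and links already
# starting with 'http://' pass through unchanged.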
class DOMHTMLOfficialsitesParser(DOMParserBase):
    """Parser for the "official sites", "external reviews", "newsgroup
    reviews", "miscellaneous links", "sound clips", "video clips" and
    "photographs" pages of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        osparser = DOMHTMLOfficialsitesParser()
        result = osparser.parse(officialsites_html_string)
    """
    kind = 'official sites'

    extractors = [
        Extractor(label='site',
            path="//ol/li/a",
            attrs=Attribute(key='self.kind',
                multi=True,
                path={
                    'link': "./@href",
                    'info': "./text()"
                    },
                postprocess=lambda x: (x.get('info').strip(),
                    urllib.unquote(_normalize_href(x.get('link'))))))
        ]
class DOMHTMLConnectionParser(DOMParserBase):
    """Parser for the "connections" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        connparser = DOMHTMLConnectionParser()
        result = connparser.parse(connections_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='connection',
        group="//div[@class='_imdbpy']",
        group_key="./h5/text()",
        group_key_normalize=lambda x: x.lower(),
        path="./a",
        attrs=Attribute(key=None,
            path={'title': "./text()",
                'movieID': "./@href"},
            multi=True))]

    preprocessors = [
        ('<h5>', '</div><div class="_imdbpy"><h5>'),
        # To get the movie's year.
        ('</a> (', ' ('),
        ('\n<br/>', '</a>'),
        ('<br/> - ', '::')
        ]

    def postprocess_data(self, data):
        for key in data.keys():
            nl = []
            for v in data[key]:
                title = v['title']
                ts = title.split('::', 1)
                title = ts[0].strip()
                notes = u''
                if len(ts) == 2:
                    notes = ts[1].strip()
                m = Movie(title=title,
                        movieID=analyze_imdbid(v['movieID']),
                        accessSystem=self._as, notes=notes,
                        modFunct=self._modFunct)
                nl.append(m)
            data[key] = nl
        if not data: return {}
        return {'connections': data}
class DOMHTMLLocationsParser(DOMParserBase):
    """Parser for the "locations" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        lparser = DOMHTMLLocationsParser()
        result = lparser.parse(locations_html_string)
    """
    extractors = [Extractor(label='locations', path="//dt",
        attrs=Attribute(key='locations', multi=True,
            path={'place': ".//text()",
                'note': "./following-sibling::dd[1]//text()"},
            postprocess=lambda x: (u'%s::%s' % (
                x['place'].strip(),
                (x['note'] or u'').strip())).strip(':')))]
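
# Editor's sketch (not in the original module): for a pair like
#   <dt>Chicago, Illinois, USA</dt><dd>(lake scenes)</dd>
# the locations extractor above yields
# u'Chicago, Illinois, USA::(lake scenes)'; when the <dd> note is missing,
# the trailing '::' is stripped and only the place is kept.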
class DOMHTMLTechParser(DOMParserBase):
    """Parser for the "technical", "business", "literature",
    "publicity" (for people) and "contacts" (for people) pages of
    a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        tparser = DOMHTMLTechParser()
        result = tparser.parse(technical_html_string)
    """
    kind = 'tech'

    extractors = [Extractor(label='tech',
        group="//h5",
        group_key="./text()",
        group_key_normalize=lambda x: x.lower(),
        path="./following-sibling::div[1]",
        attrs=Attribute(key=None,
            path=".//text()",
            postprocess=lambda x: [t.strip()
                for t in x.split('\n') if t.strip()]))]

    preprocessors = [
        (re.compile('(<h5>.*?</h5>)', re.I), r'\1<div class="_imdbpy">'),
        (re.compile('((<br/>|</p>|</table>))\n?<br/>(?!<a)', re.I),
            r'\1</div>'),
        # the ones below are for the publicity parser
        (re.compile('<p>(.*?)</p>', re.I), r'\1<br/>'),
        (re.compile('(</td><td valign="top">)', re.I), r'\1::'),
        (re.compile('(</tr><tr>)', re.I), r'\n\1'),
        # this is for splitting individual entries
        (re.compile('<br/>', re.I), r'\n'),
        ]
    def postprocess_data(self, data):
        for key in data:
            data[key] = filter(None, data[key])
        if self.kind in ('literature', 'business', 'contacts') and data:
            if 'screenplay/teleplay' in data:
                data['screenplay-teleplay'] = data['screenplay/teleplay']
                del data['screenplay/teleplay']
            data = {self.kind: data}
        else:
            if self.kind == 'publicity':
                if 'biography (print)' in data:
                    data['biography-print'] = data['biography (print)']
                    del data['biography (print)']
            # Tech info.
            for key in data.keys():
                if key.startswith('film negative format'):
                    data['film negative format'] = data[key]
                    del data[key]
                elif key.startswith('film length'):
                    data['film length'] = data[key]
                    del data[key]
        return data
class DOMHTMLDvdParser(DOMParserBase):
    """Parser for the "dvd" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        dparser = DOMHTMLDvdParser()
        result = dparser.parse(dvd_html_string)
    """
    _defGetRefs = True

    extractors = [Extractor(label='dvd',
        path="//div[@class='base_layer']",
        attrs=[Attribute(key=None,
            multi=True,
            path={
                'title': "../table[1]//h3/text()",
                'cover': "../table[1]//img/@src",
                'region': ".//p[b='Region:']/text()",
                'asin': ".//p[b='ASIN:']/text()",
                'upc': ".//p[b='UPC:']/text()",
                'rating': ".//p/b[starts-with(text(), 'Rating:')]/../img/@alt",
                'certificate': ".//p[b='Certificate:']/text()",
                'runtime': ".//p[b='Runtime:']/text()",
                'label': ".//p[b='Label:']/text()",
                'studio': ".//p[b='Studio:']/text()",
                'release date': ".//p[b='Release Date:']/text()",
                'dvd format': ".//p[b='DVD Format:']/text()",
                'dvd features': ".//p[b='DVD Features: ']//text()",
                'supplements': "..//div[span='Supplements']" \
                        "/following-sibling::div[1]//text()",
                'review': "..//div[span='Review']" \
                        "/following-sibling::div[1]//text()",
                'titles': "..//div[starts-with(text(), " \
                        "'Titles in this Product')]/..//text()",
                },
            postprocess=lambda x: {
                'title': (x.get('title') or u'').strip(),
                'cover': (x.get('cover') or u'').strip(),
                'region': (x.get('region') or u'').strip(),
                'asin': (x.get('asin') or u'').strip(),
                'upc': (x.get('upc') or u'').strip(),
                'rating': (x.get('rating') or
                        u'Not Rated').strip().replace('Rating: ', ''),
                'certificate': (x.get('certificate') or u'').strip(),
                'runtime': (x.get('runtime') or u'').strip(),
                'label': (x.get('label') or u'').strip(),
                'studio': (x.get('studio') or u'').strip(),
                'release date': (x.get('release date') or u'').strip(),
                'dvd format': (x.get('dvd format') or u'').strip(),
                'dvd features': (x.get('dvd features') or
                        u'').strip().replace('DVD Features: ', ''),
                'supplements': (x.get('supplements') or u'').strip(),
                'review': (x.get('review') or u'').strip(),
                'titles in this product': (x.get('titles') or
                        u'').strip().replace('Titles in this Product::', ''),
                })])]

    preprocessors = [
        (re.compile('<p>(<table class="dvd_section" .*)</p>\s*<hr\s*/>', re.I),
            r'<div class="_imdbpy">\1</div>'),
        (re.compile('<p>(<div class\s*=\s*"base_layer")', re.I), r'\1'),
        (re.compile('</p>\s*<p>(<div class="dvd_section")', re.I), r'\1'),
        (re.compile('</div><div class="dvd_row(_alt)?">', re.I), r'::')
        ]

    def postprocess_data(self, data):
        if not data:
            return data
        dvds = data['dvd']
        for dvd in dvds:
            if dvd['cover'].find('noposter') != -1:
                del dvd['cover']
            for key in dvd.keys():
                if not dvd[key]:
                    del dvd[key]
            if 'supplements' in dvd:
                dvd['supplements'] = dvd['supplements'].split('::')
        return data
class DOMHTMLRecParser(DOMParserBase):
    """Parser for the "recommendations" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        rparser = DOMHTMLRecParser()
        result = rparser.parse(recommendations_html_string)
    """
    _containsObjects = True

    extractors = [Extractor(label='recommendations',
        path="//td[@valign='middle'][1]",
        attrs=Attribute(key='../../tr/td[1]//text()',
            multi=True,
            path={'title': ".//text()",
                'movieID': ".//a/@href"}))]

    def postprocess_data(self, data):
        for key in data.keys():
            n_key = key
            n_keyl = n_key.lower()
            if n_keyl == 'suggested by the database':
                n_key = 'database'
            elif n_keyl == 'imdb users recommend':
                n_key = 'users'
            data[n_key] = [Movie(title=x['title'],
                    movieID=analyze_imdbid(x['movieID']),
                    accessSystem=self._as, modFunct=self._modFunct)
                    for x in data[key]]
            if n_key != key:
                del data[key]
        if data: return {'recommendations': data}
        return data
class DOMHTMLNewsParser(DOMParserBase):
    """Parser for the "news" page of a given movie or person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        nwparser = DOMHTMLNewsParser()
        result = nwparser.parse(news_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='news',
            path="//h2",
            attrs=Attribute(key='news',
                multi=True,
                path={
                    'title': "./text()",
                    'fromdate': "../following-sibling::p[1]/small//text()",
                    # FIXME: sometimes (see The Matrix (1999)) <p> is found
                    # inside news text.
                    'body': "../following-sibling::p[2]//text()",
                    'link': "../..//a[text()='Permalink']/@href",
                    'fulllink': "../..//a[starts-with(text(), " \
                            "'See full article at')]/@href"
                    },
                postprocess=lambda x: {
                    'title': x.get('title').strip(),
                    'date': x.get('fromdate').split('|')[0].strip(),
                    'from': x.get('fromdate').split('|')[1].replace('From ',
                        '').strip(),
                    'body': (x.get('body') or u'').strip(),
                    'link': _normalize_href(x.get('link')),
                    'full article link': _normalize_href(x.get('fulllink'))
                    }))
        ]

    preprocessors = [
        (re.compile('(<a name=[^>]+><h2>)', re.I), r'<div class="_imdbpy">\1'),
        (re.compile('(<hr/>)', re.I), r'</div>\1'),
        (re.compile('<p></p>', re.I), r'')
        ]

    def postprocess_data(self, data):
        if not data.has_key('news'):
            return {}
        for news in data['news']:
            if news.has_key('full article link'):
                if news['full article link'] is None:
                    del news['full article link']
        return data
def _parse_review(x):
    result = {}
    title = x.get('title').strip()
    if title[-1] == ':': title = title[:-1]
    result['title'] = title
    result['link'] = _normalize_href(x.get('link'))
    kind = x.get('kind').strip()
    if kind[-1] == ':': kind = kind[:-1]
    result['review kind'] = kind
    text = x.get('review').replace('\n\n', '||').replace('\n', ' ').split('||')
    review = '\n'.join(text)
    if x.get('author') is not None:
        author = x.get('author').strip()
        review = review.split(author)[0].strip()
        result['review author'] = author[2:]
    if x.get('item') is not None:
        item = x.get('item').strip()
        review = review[len(item):].strip()
        review = "%s: %s" % (item, review)
    result['review'] = review
    return result
class DOMHTMLAmazonReviewsParser(DOMParserBase):
    """Parser for the "amazon reviews" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        arparser = DOMHTMLAmazonReviewsParser()
        result = arparser.parse(amazonreviews_html_string)
    """
    extractors = [
        Extractor(label='amazon reviews',
            group="//h3",
            group_key="./a/text()",
            group_key_normalize=lambda x: x[:-1],
            path="./following-sibling::p[1]/span[@class='_review']",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'title': "../preceding-sibling::h3[1]/a[1]/text()",
                    'link': "../preceding-sibling::h3[1]/a[1]/@href",
                    'kind': "./preceding-sibling::b[1]/text()",
                    'item': "./i/b/text()",
                    'review': ".//text()",
                    'author': "./i[starts-with(text(), '--')]/text()"
                    },
                postprocess=_parse_review))
        ]

    preprocessors = [
        (re.compile('<p>\n(?!<b>)', re.I), r'\n'),
        (re.compile('(\n</b>\n)', re.I), r'\1<span class="_review">'),
        (re.compile('(</p>\n\n)', re.I), r'</span>\1'),
        (re.compile('(\s\n)(<i><b>)', re.I), r'</span>\1<span class="_review">\2')
        ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        nd = []
        for item in data.keys():
            nd = nd + data[item]
        return {'amazon reviews': nd}
def _parse_merchandising_link(x):
    result = {}
    link = x.get('link')
    result['link'] = _normalize_href(link)
    text = x.get('text')
    if text is not None:
        result['link-text'] = text.strip()
    cover = x.get('cover')
    if cover is not None:
        result['cover'] = cover
    description = x.get('description')
    if description is not None:
        shop = x.get('shop')
        if shop is not None:
            result['description'] = u'%s::%s' % (shop, description.strip())
        else:
            result['description'] = description.strip()
    return result
class DOMHTMLSalesParser(DOMParserBase):
    """Parser for the "merchandising links" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSalesParser()
        result = sparser.parse(sales_html_string)
    """
    extractors = [
        Extractor(label='shops',
            group="//h5/a[@name]/..",
            group_key="./a[1]/text()",
            group_key_normalize=lambda x: x.lower(),
            path=".//following-sibling::table[1]" \
                    "//td[@class='w_rowtable_colshop']//tr[1]",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'link': "./td[2]/a[1]/@href",
                    'text': "./td[1]/img[1]/@alt",
                    'cover': "./ancestor::td[1]/../td[1]"\
                            "/a[1]/img[1]/@src",
                    },
                postprocess=_parse_merchandising_link)),
        Extractor(label='others',
            group="//span[@class='_info']/..",
            group_key="./h5/a[1]/text()",
            group_key_normalize=lambda x: x.lower(),
            path="./span[@class='_info']",
            attrs=Attribute(key=None,
                multi=True,
                path={
                    'link': "./preceding-sibling::a[1]/@href",
                    'shop': "./preceding-sibling::a[1]/text()",
                    'description': ".//text()",
                    },
                postprocess=_parse_merchandising_link))
        ]

    preprocessors = [
        (re.compile('(<h5><a name=)', re.I), r'</div><div class="_imdbpy">\1'),
        (re.compile('(</h5>\n<br/>\n)</div>', re.I), r'\1'),
        (re.compile('(<br/><br/>\n)(\n)', re.I), r'\1</div>\2'),
        (re.compile('(\n)(Search.*?)(</a>)(\n)', re.I), r'\3\1\2\4'),
        (re.compile('(\n)(Search.*?)(\n)', re.I),
            r'\1<span class="_info">\2</span>\3')
        ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        return {'merchandising links': data}
def _build_episode(x):
    """Create a Movie object for a given series' episode."""
    episode_id = analyze_imdbid(x.get('link'))
    episode_title = x.get('title')
    e = Movie(movieID=episode_id, title=episode_title)
    e['kind'] = u'episode'
    oad = x.get('oad')
    if oad:
        e['original air date'] = oad.strip()
    year = x.get('year')
    if year is not None:
        year = year[5:]
        if year == 'unknown': year = u'????'
        if year and year.isdigit():
            year = int(year)
        e['year'] = year
    else:
        if oad and oad[-4:].isdigit():
            e['year'] = int(oad[-4:])
    epinfo = x.get('episode')
    if epinfo is not None:
        season, episode = epinfo.split(':')[0].split(',')
        e['season'] = int(season[7:])
        e['episode'] = int(episode[8:])
    else:
        e['season'] = 'unknown'
        e['episode'] = 'unknown'
    plot = x.get('plot')
    if plot:
        e['plot'] = plot.strip()
    return e
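
# Editor's sketch (not in the original module; the values are illustrative):
# given an extracted dict like
#   {'link': '/title/tt0000001/', 'title': 'An Episode',
#    'year': 'year-2002', 'episode': 'Season 1, Episode 1:',
#    'oad': '20 September 2002'}
# _build_episode returns a Movie with kind u'episode', season 1, episode 1,
# year 2002 and the original air date set.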

class DOMHTMLEpisodesParser(DOMParserBase):
    """Parser for the "episode list" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        eparser = DOMHTMLEpisodesParser()
        result = eparser.parse(episodes_html_string)
    """
    _containsObjects = True

    kind = 'episodes list'
    _episodes_path = "..//h4"
    _oad_path = "./following-sibling::span/strong[1]/text()"

    def _init(self):
        self.extractors = [
            Extractor(label='series',
                path="//html",
                attrs=[Attribute(key='series title',
                                path=".//title/text()"),
                       Attribute(key='series movieID',
                                path=".//h1/a[@class='main']/@href",
                                postprocess=analyze_imdbid)
                    ]),
            Extractor(label='episodes',
                group="//div[@class='_imdbpy']/h3",
                group_key="./a/@name",
                path=self._episodes_path,
                attrs=Attribute(key=None,
                    multi=True,
                    path={
                        'link': "./a/@href",
                        'title': "./a/text()",
                        'year': "./preceding-sibling::a[1]/@name",
                        'episode': "./text()[1]",
                        'oad': self._oad_path,
                        'plot': "./following-sibling::text()[1]"
                        },
                    postprocess=_build_episode))]
        if self.kind == 'episodes cast':
            self.extractors += [
                Extractor(label='cast',
                    group="//h4",
                    group_key="./text()[1]",
                    group_key_normalize=lambda x: x.strip(),
                    path="./following-sibling::table[1]//td[@class='nm']",
                    attrs=Attribute(key=None,
                        multi=True,
                        path={'person': "..//text()",
                              'link': "./a/@href",
                              'roleID': \
                                "../td[4]/div[@class='_imdbpyrole']/@roleid"},
                        postprocess=lambda x: \
                                build_person(x.get('person') or u'',
                                    personID=analyze_imdbid(x.get('link')),
                                    roleID=(x.get('roleID') or u'').split('/'),
                                    accessSystem=self._as,
                                    modFunct=self._modFunct)))
                ]

    preprocessors = [
        (re.compile('(<hr/>\n)(<h3>)', re.I),
            r'</div>\1<div class="_imdbpy">\2'),
        (re.compile('(</p>\n\n)</div>', re.I), r'\1'),
        (re.compile('<h3>(.*?)</h3>', re.I), r'<h4>\1</h4>'),
        (_reRolesMovie, _manageRoles),
        (re.compile('(<br/> <br/>\n)(<hr/>)', re.I), r'\1</div>\2')
        ]

    def postprocess_data(self, data):
        # A bit extreme?
        if 'series title' not in data: return {}
        if 'series movieID' not in data: return {}
        stitle = data['series title'].replace('- Episode list', '')
        stitle = stitle.replace('- Episodes list', '')
        stitle = stitle.replace('- Episode cast', '')
        stitle = stitle.replace('- Episodes cast', '')
        stitle = stitle.strip()
        if not stitle: return {}
        seriesID = data['series movieID']
        if seriesID is None: return {}
        series = Movie(title=stitle, movieID=str(seriesID),
                        accessSystem=self._as, modFunct=self._modFunct)
        nd = {}
        for key in data.keys():
            if key.startswith('season-'):
                season_key = key[7:]
                # Season keys are usually numeric, but can also be
                # strings like 'unknown'; keep them as-is in that case.
                try: season_key = int(season_key)
                except ValueError: pass
                nd[season_key] = {}
                for episode in data[key]:
                    if not episode: continue
                    episode_key = episode.get('episode')
                    if episode_key is None: continue
                    cast_key = 'Season %s, Episode %s:' % (season_key,
                                                            episode_key)
                    if data.has_key(cast_key):
                        cast = data[cast_key]
                        for i in xrange(len(cast)):
                            cast[i].billingPos = i + 1
                        episode['cast'] = cast
                    episode['episode of'] = series
                    nd[season_key][episode_key] = episode
        if len(nd) == 0:
            return {}
        return {'episodes': nd}
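
# Shape of the value returned by postprocess_data above, sketched with
# hypothetical data (season numbers key the outer dict, episode numbers
# the inner one; each leaf is a Movie carrying an 'episode of' reference
# to the series):
#
#   {'episodes': {1: {1: <Movie 'Pilot'>,
#                     2: <Movie ...>},
#                 2: {...}}}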

class DOMHTMLEpisodesCastParser(DOMHTMLEpisodesParser):
    """Parser for the "episodes cast" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        eparser = DOMHTMLEpisodesCastParser()
        result = eparser.parse(episodescast_html_string)
    """
    kind = 'episodes cast'
    _episodes_path = "..//h4"
    _oad_path = "./following-sibling::b[1]/text()"
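
# Usage sketch (hypothetical; 'episodescast_html_string' is assumed to hold
# the raw HTML of the "episodes cast" page):
#
#   ecparser = DOMHTMLEpisodesCastParser()
#   result = ecparser.parse(episodescast_html_string)
#   # episodes gain a 'cast' key: a list of Person objects whose billingPos
#   # reflects their order in the page's cast table.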

class DOMHTMLFaqsParser(DOMParserBase):
    """Parser for the "FAQ" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        fparser = DOMHTMLFaqsParser()
        result = fparser.parse(faqs_html_string)
    """
    _defGetRefs = True

    # XXX: bsoup and lxml don't match (looks like a minor issue, anyway).

    extractors = [
        Extractor(label='faqs',
            path="//div[@class='section']",
            attrs=Attribute(key='faqs',
                multi=True,
                path={
                    'question': "./h3/a/span/text()",
                    'answer': "../following-sibling::div[1]//text()"
                    },
                postprocess=lambda x: u'%s::%s' % (x.get('question').strip(),
                                    '\n\n'.join(x.get('answer').replace(
                                        '\n\n', '\n').strip().split('||')))))
        ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||'),
        (re.compile('<h4>(.*?)</h4>\n', re.I), r'||\1--'),
        (re.compile('<span class="spoiler"><span>(.*?)</span></span>', re.I),
            r'[spoiler]\1[/spoiler]')
        ]
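
# Each extracted FAQ is flattened into a single u'question::answer' string,
# with the '||' markers injected by the preprocessors turned back into
# paragraph breaks.  Sketch with hypothetical data:
#
#   fparser = DOMHTMLFaqsParser()
#   result = fparser.parse(faqs_html_string)
#   # result.get('faqs') -> [u'Is it based on a book?::Yes; ...', ...]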

class DOMHTMLAiringParser(DOMParserBase):
    """Parser for the "airing" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        aparser = DOMHTMLAiringParser()
        result = aparser.parse(airing_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(label='series title',
            path="//title",
            attrs=Attribute(key='series title', path="./text()",
                            postprocess=lambda x: \
                                    x.replace(' - TV schedule', u''))),
        Extractor(label='series id',
            path="//h1/a[@href]",
            attrs=Attribute(key='series id', path="./@href")),
        Extractor(label='tv airings',
            path="//tr[@class]",
            attrs=Attribute(key='airing',
                multi=True,
                path={
                    'date': "./td[1]//text()",
                    'time': "./td[2]//text()",
                    'channel': "./td[3]//text()",
                    'link': "./td[4]/a[1]/@href",
                    'title': "./td[4]//text()",
                    'season': "./td[5]//text()",
                    },
                postprocess=lambda x: {
                    'date': x.get('date'),
                    'time': x.get('time'),
                    'channel': x.get('channel').strip(),
                    'link': x.get('link'),
                    'title': x.get('title'),
                    'season': (x.get('season') or '').strip()
                    }
                ))
        ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        seriesTitle = data['series title']
        seriesID = analyze_imdbid(data['series id'])
        if data.has_key('airing'):
            for airing in data['airing']:
                title = airing.get('title', '').strip()
                if not title:
                    epsTitle = seriesTitle
                    if seriesID is None:
                        continue
                    epsID = seriesID
                else:
                    epsTitle = '%s {%s}' % (data['series title'],
                                            airing['title'])
                    epsID = analyze_imdbid(airing['link'])
                e = Movie(title=epsTitle, movieID=epsID)
                airing['episode'] = e
                del airing['link']
                del airing['title']
                if not airing['season']:
                    del airing['season']
        if 'series title' in data:
            del data['series title']
        if 'series id' in data:
            del data['series id']
        if 'airing' in data:
            data['airing'] = filter(None, data['airing'])
        if 'airing' not in data or not data['airing']:
            return {}
        return data
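
# Usage sketch (hypothetical; 'airing_html_string' is assumed to hold the
# raw HTML of the TV schedule page):
#
#   aparser = DOMHTMLAiringParser()
#   result = aparser.parse(airing_html_string)
#   # result.get('airing') -> [{'date': ..., 'time': ..., 'channel': ...,
#   #                           'season': ..., 'episode': <Movie ...>}, ...]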

class DOMHTMLSynopsisParser(DOMParserBase):
    """Parser for the "synopsis" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSynopsisParser()
        result = sparser.parse(synopsis_html_string)
    """
    extractors = [
        Extractor(label='synopsis',
            path="//div[@class='display'][not(@style)]",
            attrs=Attribute(key='synopsis',
                path=".//text()",
                postprocess=lambda x: '\n\n'.join(x.strip().split('||'))))
        ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||')
        ]
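
# Sketch of the round trip performed above: the preprocessor rewrites
# '<br/><br/>' as '||', and the postprocess lambda turns those markers back
# into blank-line paragraph breaks (hypothetical data):
#
#   sparser = DOMHTMLSynopsisParser()
#   result = sparser.parse(synopsis_html_string)
#   # result.get('synopsis') -> u'First paragraph.\n\nSecond paragraph.'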

class DOMHTMLParentsGuideParser(DOMParserBase):
    """Parser for the "parents guide" page of a given movie.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        pgparser = DOMHTMLParentsGuideParser()
        result = pgparser.parse(parentsguide_html_string)
    """
    extractors = [
        Extractor(label='parents guide',
            group="//div[@class='section']",
            group_key="./h3/a/span/text()",
            group_key_normalize=lambda x: x.lower(),
            path="../following-sibling::div[1]/p",
            attrs=Attribute(key=None,
                path=".//text()",
                postprocess=lambda x: [t.strip().replace('\n', ' ')
                                        for t in x.split('||') if t.strip()]))
        ]

    preprocessors = [
        (re.compile('<br/><br/>', re.I), r'||')
        ]

    def postprocess_data(self, data):
        data2 = {}
        for key in data:
            if data[key]:
                data2[key] = data[key]
        if not data2:
            return {}
        return {'parents guide': data2}
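
# Usage sketch (hypothetical; section names are lowercased by
# group_key_normalize, so keys look like 'violence & gore'):
#
#   pgparser = DOMHTMLParentsGuideParser()
#   result = pgparser.parse(parentsguide_html_string)
#   # result -> {'parents guide': {'violence & gore': [...], ...}}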

_OBJECTS = {
    'movie_parser': ((DOMHTMLMovieParser,), None),
    'plot_parser': ((DOMHTMLPlotParser,), None),
    'movie_awards_parser': ((DOMHTMLAwardsParser,), None),
    'taglines_parser': ((DOMHTMLTaglinesParser,), None),
    'keywords_parser': ((DOMHTMLKeywordsParser,), None),
    'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None),
    'goofs_parser': ((DOMHTMLGoofsParser,), None),
    'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None),
    'trivia_parser': ((DOMHTMLAlternateVersionsParser,), {'kind': 'trivia'}),
    'soundtrack_parser': ((DOMHTMLSoundtrackParser,), {'kind': 'soundtrack'}),
    'quotes_parser': ((DOMHTMLQuotesParser,), None),
    'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
    'ratings_parser': ((DOMHTMLRatingsParser,), None),
    'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
    'externalrev_parser': ((DOMHTMLOfficialsitesParser,),
                            {'kind': 'external reviews'}),
    'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,),
                            {'kind': 'newsgroup reviews'}),
    'misclinks_parser': ((DOMHTMLOfficialsitesParser,),
                            {'kind': 'misc links'}),
    'soundclips_parser': ((DOMHTMLOfficialsitesParser,),
                            {'kind': 'sound clips'}),
    'videoclips_parser': ((DOMHTMLOfficialsitesParser,),
                            {'kind': 'video clips'}),
    'photosites_parser': ((DOMHTMLOfficialsitesParser,),
                            {'kind': 'photo sites'}),
    'connections_parser': ((DOMHTMLConnectionParser,), None),
    'tech_parser': ((DOMHTMLTechParser,), None),
    'business_parser': ((DOMHTMLTechParser,),
                            {'kind': 'business', '_defGetRefs': 1}),
    'literature_parser': ((DOMHTMLTechParser,), {'kind': 'literature'}),
    'locations_parser': ((DOMHTMLLocationsParser,), None),
    'dvd_parser': ((DOMHTMLDvdParser,), None),
    'rec_parser': ((DOMHTMLRecParser,), None),
    'news_parser': ((DOMHTMLNewsParser,), None),
    'amazonrev_parser': ((DOMHTMLAmazonReviewsParser,), None),
    'sales_parser': ((DOMHTMLSalesParser,), None),
    'episodes_parser': ((DOMHTMLEpisodesParser,), None),
    'episodes_cast_parser': ((DOMHTMLEpisodesCastParser,), None),
    'eprating_parser': ((DOMHTMLEpisodesRatings,), None),
    'movie_faqs_parser': ((DOMHTMLFaqsParser,), None),
    'airing_parser': ((DOMHTMLAiringParser,), None),
    'synopsis_parser': ((DOMHTMLSynopsisParser,), None),
    'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None)
}
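
# _OBJECTS maps parser names to a tuple of candidate classes plus optional
# attribute overrides; it is consumed elsewhere in the imdb.parser.http
# package.  A minimal sketch of that pattern (hypothetical, not the
# package's actual bootstrap code):
#
#   for name, (classes, kwds) in _OBJECTS.items():
#       parser = classes[0]()
#       for attr, value in (kwds or {}).items():
#           setattr(parser, attr, value)
#       # e.g. 'trivia_parser' becomes a DOMHTMLAlternateVersionsParser
#       # instance with its 'kind' attribute set to 'trivia'.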