PageRenderTime 29ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/imdb/parser/http/utils.py

https://gitlab.com/akila-33/Sick-Beard
Python | 876 lines | 802 code | 24 blank | 50 comment | 39 complexity | 43e89c03ef9b54a3d813ded2872a96d3 MD5 | raw file
  1. """
  2. parser.http.utils module (imdb package).
  3. This module provides miscellaneous utilities used by
  4. the imdb.parser.http classes.
  5. Copyright 2004-2012 Davide Alberani <da@erlug.linux.it>
  6. 2008 H. Turgut Uyar <uyar@tekir.org>
  7. This program is free software; you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation; either version 2 of the License, or
  10. (at your option) any later version.
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU General Public License for more details.
  15. You should have received a copy of the GNU General Public License
  16. along with this program; if not, write to the Free Software
  17. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. """
  19. import re
  20. import logging
  21. import warnings
  22. from imdb._exceptions import IMDbError
  23. from imdb.utils import flatten, _Container
  24. from imdb.Movie import Movie
  25. from imdb.Person import Person
  26. from imdb.Character import Character
# Year, imdbIndex and kind.
# Matches things like '(2004)', '(2004/II)', '(2004) (TV)', '(????) (mini)'.
re_yearKind_index = re.compile(r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)')
# Match imdb ids in href tags
# group(1) is the kind prefix (title/tt, name/nm, ...), group(2) the numeric ID.
re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)')
  31. def analyze_imdbid(href):
  32. """Return an imdbID from an URL."""
  33. if not href:
  34. return None
  35. match = re_imdbid.search(href)
  36. if not match:
  37. return None
  38. return str(match.group(2))
  39. _modify_keys = list(Movie.keys_tomodify_list) + list(Person.keys_tomodify_list)
  40. def _putRefs(d, re_titles, re_names, re_characters, lastKey=None):
  41. """Iterate over the strings inside list items or dictionary values,
  42. substitutes movie titles and person names with the (qv) references."""
  43. if isinstance(d, list):
  44. for i in xrange(len(d)):
  45. if isinstance(d[i], (unicode, str)):
  46. if lastKey in _modify_keys:
  47. if re_names:
  48. d[i] = re_names.sub(ur"'\1' (qv)", d[i])
  49. if re_titles:
  50. d[i] = re_titles.sub(ur'_\1_ (qv)', d[i])
  51. if re_characters:
  52. d[i] = re_characters.sub(ur'#\1# (qv)', d[i])
  53. elif isinstance(d[i], (list, dict)):
  54. _putRefs(d[i], re_titles, re_names, re_characters,
  55. lastKey=lastKey)
  56. elif isinstance(d, dict):
  57. for k, v in d.items():
  58. lastKey = k
  59. if isinstance(v, (unicode, str)):
  60. if lastKey in _modify_keys:
  61. if re_names:
  62. d[k] = re_names.sub(ur"'\1' (qv)", v)
  63. if re_titles:
  64. d[k] = re_titles.sub(ur'_\1_ (qv)', v)
  65. if re_characters:
  66. d[k] = re_characters.sub(ur'#\1# (qv)', v)
  67. elif isinstance(v, (list, dict)):
  68. _putRefs(d[k], re_titles, re_names, re_characters,
  69. lastKey=lastKey)
  70. # Handle HTML/XML/SGML entities.
  71. from htmlentitydefs import entitydefs
  72. entitydefs = entitydefs.copy()
  73. entitydefsget = entitydefs.get
  74. entitydefs['nbsp'] = ' '
  75. sgmlentity = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\'', 'ndash': '-'}
  76. sgmlentityget = sgmlentity.get
  77. _sgmlentkeys = sgmlentity.keys()
  78. entcharrefs = {}
  79. entcharrefsget = entcharrefs.get
  80. for _k, _v in entitydefs.items():
  81. if _k in _sgmlentkeys: continue
  82. if _v[0:2] == '&#':
  83. dec_code = _v[1:-1]
  84. _v = unichr(int(_v[2:-1]))
  85. entcharrefs[dec_code] = _v
  86. else:
  87. dec_code = '#' + str(ord(_v))
  88. _v = unicode(_v, 'latin_1', 'replace')
  89. entcharrefs[dec_code] = _v
  90. entcharrefs[_k] = _v
  91. del _sgmlentkeys, _k, _v
  92. entcharrefs['#160'] = u' '
  93. entcharrefs['#xA0'] = u' '
  94. entcharrefs['#xa0'] = u' '
  95. entcharrefs['#XA0'] = u' '
  96. entcharrefs['#x22'] = u'"'
  97. entcharrefs['#X22'] = u'"'
  98. # convert &x26; to &amp;, to make BeautifulSoup happy; beware that this
  99. # leaves lone '&' in the html broken, but I assume this is better than
  100. # the contrary...
  101. entcharrefs['#38'] = u'&amp;'
  102. entcharrefs['#x26'] = u'&amp;'
  103. entcharrefs['#x26'] = u'&amp;'
  104. re_entcharrefs = re.compile('&(%s|\#160|\#\d{1,5}|\#x[0-9a-f]{1,4});' %
  105. '|'.join(map(re.escape, entcharrefs)), re.I)
  106. re_entcharrefssub = re_entcharrefs.sub
  107. sgmlentity.update(dict([('#34', u'"'), ('#38', u'&'),
  108. ('#60', u'<'), ('#62', u'>'), ('#39', u"'")]))
  109. re_sgmlref = re.compile('&(%s);' % '|'.join(map(re.escape, sgmlentity)))
  110. re_sgmlrefsub = re_sgmlref.sub
  111. # Matches XML-only single tags, like <br/> ; they are invalid in HTML,
  112. # but widely used by IMDb web site. :-/
  113. re_xmltags = re.compile('<([a-zA-Z]+)/>')
  114. def _replXMLRef(match):
  115. """Replace the matched XML/HTML entities and references;
  116. replace everything except sgml entities like &lt;, &gt;, ..."""
  117. ref = match.group(1)
  118. value = entcharrefsget(ref)
  119. if value is None:
  120. if ref[0] == '#':
  121. ref_code = ref[1:]
  122. if ref_code in ('34', '38', '60', '62', '39'):
  123. return match.group(0)
  124. elif ref_code[0].lower() == 'x':
  125. #if ref[2:] == '26':
  126. # # Don't convert &x26; to &amp;, to make BeautifulSoup happy.
  127. # return '&amp;'
  128. return unichr(int(ref[2:], 16))
  129. else:
  130. return unichr(int(ref[1:]))
  131. else:
  132. return ref
  133. return value
  134. def subXMLRefs(s):
  135. """Return the given html string with entity and char references
  136. replaced."""
  137. return re_entcharrefssub(_replXMLRef, s)
  138. # XXX: no more used here; move it to mobile (they are imported by helpers, too)?
  139. def _replSGMLRefs(match):
  140. """Replace the matched SGML entity."""
  141. ref = match.group(1)
  142. return sgmlentityget(ref, ref)
  143. def subSGMLRefs(s):
  144. """Return the given html string with sgml entity and char references
  145. replaced."""
  146. return re_sgmlrefsub(_replSGMLRefs, s)
  147. _b_p_logger = logging.getLogger('imdbpy.parser.http.build_person')
  148. def build_person(txt, personID=None, billingPos=None,
  149. roleID=None, accessSystem='http', modFunct=None):
  150. """Return a Person instance from the tipical <tr>...</tr> strings
  151. found in the IMDb's web site."""
  152. #if personID is None
  153. # _b_p_logger.debug('empty name or personID for "%s"', txt)
  154. notes = u''
  155. role = u''
  156. # Search the (optional) separator between name and role/notes.
  157. if txt.find('....') != -1:
  158. sep = '....'
  159. elif txt.find('...') != -1:
  160. sep = '...'
  161. else:
  162. sep = '...'
  163. # Replace the first parenthesis, assuming there are only
  164. # notes, after.
  165. # Rationale: no imdbIndex is (ever?) showed on the web site.
  166. txt = txt.replace('(', '...(', 1)
  167. txt_split = txt.split(sep, 1)
  168. name = txt_split[0].strip()
  169. if len(txt_split) == 2:
  170. role_comment = txt_split[1].strip()
  171. # Strip common endings.
  172. if role_comment[-4:] == ' and':
  173. role_comment = role_comment[:-4].rstrip()
  174. elif role_comment[-2:] == ' &':
  175. role_comment = role_comment[:-2].rstrip()
  176. elif role_comment[-6:] == '& ....':
  177. role_comment = role_comment[:-6].rstrip()
  178. # Get the notes.
  179. if roleID is not None:
  180. if not isinstance(roleID, list):
  181. cmt_idx = role_comment.find('(')
  182. if cmt_idx != -1:
  183. role = role_comment[:cmt_idx].rstrip()
  184. notes = role_comment[cmt_idx:]
  185. else:
  186. # Just a role, without notes.
  187. role = role_comment
  188. else:
  189. role = role_comment
  190. else:
  191. # We're managing something that doesn't have a 'role', so
  192. # everything are notes.
  193. notes = role_comment
  194. if role == '....': role = u''
  195. roleNotes = []
  196. # Manages multiple roleIDs.
  197. if isinstance(roleID, list):
  198. rolesplit = role.split('/')
  199. role = []
  200. for r in rolesplit:
  201. nidx = r.find('(')
  202. if nidx != -1:
  203. role.append(r[:nidx].rstrip())
  204. roleNotes.append(r[nidx:])
  205. else:
  206. role.append(r)
  207. roleNotes.append(None)
  208. lr = len(role)
  209. lrid = len(roleID)
  210. if lr > lrid:
  211. roleID += [None] * (lrid - lr)
  212. elif lr < lrid:
  213. roleID = roleID[:lr]
  214. for i, rid in enumerate(roleID):
  215. if rid is not None:
  216. roleID[i] = str(rid)
  217. if lr == 1:
  218. role = role[0]
  219. roleID = roleID[0]
  220. notes = roleNotes[0] or u''
  221. elif roleID is not None:
  222. roleID = str(roleID)
  223. if personID is not None:
  224. personID = str(personID)
  225. if (not name) or (personID is None):
  226. # Set to 'debug', since build_person is expected to receive some crap.
  227. _b_p_logger.debug('empty name or personID for "%s"', txt)
  228. # XXX: return None if something strange is detected?
  229. person = Person(name=name, personID=personID, currentRole=role,
  230. roleID=roleID, notes=notes, billingPos=billingPos,
  231. modFunct=modFunct, accessSystem=accessSystem)
  232. if roleNotes and len(roleNotes) == len(roleID):
  233. for idx, role in enumerate(person.currentRole):
  234. if roleNotes[idx]:
  235. role.notes = roleNotes[idx]
  236. return person
# Matches a 7-digit characterID.
_re_chrIDs = re.compile('[0-9]{7}')
# Logger used by build_movie below.
_b_m_logger = logging.getLogger('imdbpy.parser.http.build_movie')
# To shrink spaces.
re_spaces = re.compile(r'\s+')
  241. def build_movie(txt, movieID=None, roleID=None, status=None,
  242. accessSystem='http', modFunct=None, _parsingCharacter=False,
  243. _parsingCompany=False, year=None, chrRoles=None,
  244. rolesNoChar=None, additionalNotes=None):
  245. """Given a string as normally seen on the "categorized" page of
  246. a person on the IMDb's web site, returns a Movie instance."""
  247. # FIXME: Oook, lets face it: build_movie and build_person are now
  248. # two horrible sets of patches to support the new IMDb design. They
  249. # must be rewritten from scratch.
  250. if _parsingCharacter:
  251. _defSep = ' Played by '
  252. elif _parsingCompany:
  253. _defSep = ' ... '
  254. else:
  255. _defSep = ' .... '
  256. title = re_spaces.sub(' ', txt).strip()
  257. # Split the role/notes from the movie title.
  258. tsplit = title.split(_defSep, 1)
  259. role = u''
  260. notes = u''
  261. roleNotes = []
  262. if len(tsplit) == 2:
  263. title = tsplit[0].rstrip()
  264. role = tsplit[1].lstrip()
  265. if title[-9:] == 'TV Series':
  266. title = title[:-9].rstrip()
  267. #elif title[-7:] == '(short)':
  268. # title = title[:-7].rstrip()
  269. #elif title[-11:] == '(TV series)':
  270. # title = title[:-11].rstrip()
  271. #elif title[-10:] == '(TV movie)':
  272. # title = title[:-10].rstrip()
  273. elif title[-14:] == 'TV mini-series':
  274. title = title[:-14] + ' (mini)'
  275. if title and title.endswith(_defSep.rstrip()):
  276. title = title[:-len(_defSep)+1]
  277. # Try to understand where the movie title ends.
  278. while True:
  279. if year:
  280. break
  281. if title[-1:] != ')':
  282. # Ignore the silly "TV Series" notice.
  283. if title[-9:] == 'TV Series':
  284. title = title[:-9].rstrip()
  285. continue
  286. else:
  287. # Just a title: stop here.
  288. break
  289. # Try to match paired parentheses; yes: sometimes there are
  290. # parentheses inside comments...
  291. nidx = title.rfind('(')
  292. while (nidx != -1 and \
  293. title[nidx:].count('(') != title[nidx:].count(')')):
  294. nidx = title[:nidx].rfind('(')
  295. # Unbalanced parentheses: stop here.
  296. if nidx == -1: break
  297. # The last item in parentheses seems to be a year: stop here.
  298. first4 = title[nidx+1:nidx+5]
  299. if (first4.isdigit() or first4 == '????') and \
  300. title[nidx+5:nidx+6] in (')', '/'): break
  301. # The last item in parentheses is a known kind: stop here.
  302. if title[nidx+1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie',
  303. 'TV series', 'short'): break
  304. # Else, in parentheses there are some notes.
  305. # XXX: should the notes in the role half be kept separated
  306. # from the notes in the movie title half?
  307. if notes: notes = '%s %s' % (title[nidx:], notes)
  308. else: notes = title[nidx:]
  309. title = title[:nidx].rstrip()
  310. if year:
  311. year = year.strip()
  312. if title[-1] == ')':
  313. fpIdx = title.rfind('(')
  314. if fpIdx != -1:
  315. if notes: notes = '%s %s' % (title[fpIdx:], notes)
  316. else: notes = title[fpIdx:]
  317. title = title[:fpIdx].rstrip()
  318. title = u'%s (%s)' % (title, year)
  319. if _parsingCharacter and roleID and not role:
  320. roleID = None
  321. if not roleID:
  322. roleID = None
  323. elif len(roleID) == 1:
  324. roleID = roleID[0]
  325. if not role and chrRoles and isinstance(roleID, (str, unicode)):
  326. roleID = _re_chrIDs.findall(roleID)
  327. role = ' / '.join(filter(None, chrRoles.split('@@')))
  328. # Manages multiple roleIDs.
  329. if isinstance(roleID, list):
  330. tmprole = role.split('/')
  331. role = []
  332. for r in tmprole:
  333. nidx = r.find('(')
  334. if nidx != -1:
  335. role.append(r[:nidx].rstrip())
  336. roleNotes.append(r[nidx:])
  337. else:
  338. role.append(r)
  339. roleNotes.append(None)
  340. lr = len(role)
  341. lrid = len(roleID)
  342. if lr > lrid:
  343. roleID += [None] * (lrid - lr)
  344. elif lr < lrid:
  345. roleID = roleID[:lr]
  346. for i, rid in enumerate(roleID):
  347. if rid is not None:
  348. roleID[i] = str(rid)
  349. if lr == 1:
  350. role = role[0]
  351. roleID = roleID[0]
  352. elif roleID is not None:
  353. roleID = str(roleID)
  354. if movieID is not None:
  355. movieID = str(movieID)
  356. if (not title) or (movieID is None):
  357. _b_m_logger.error('empty title or movieID for "%s"', txt)
  358. if rolesNoChar:
  359. rolesNoChar = filter(None, [x.strip() for x in rolesNoChar.split('/')])
  360. if not role:
  361. role = []
  362. elif not isinstance(role, list):
  363. role = [role]
  364. role += rolesNoChar
  365. notes = notes.strip()
  366. if additionalNotes:
  367. additionalNotes = re_spaces.sub(' ', additionalNotes).strip()
  368. if notes:
  369. notes += u' '
  370. notes += additionalNotes
  371. if role and isinstance(role, list) and notes.endswith(role[-1].replace('\n', ' ')):
  372. role = role[:-1]
  373. m = Movie(title=title, movieID=movieID, notes=notes, currentRole=role,
  374. roleID=roleID, roleIsPerson=_parsingCharacter,
  375. modFunct=modFunct, accessSystem=accessSystem)
  376. if roleNotes and len(roleNotes) == len(roleID):
  377. for idx, role in enumerate(m.currentRole):
  378. try:
  379. if roleNotes[idx]:
  380. role.notes = roleNotes[idx]
  381. except IndexError:
  382. break
  383. # Status can't be checked here, and must be detected by the parser.
  384. if status:
  385. m['status'] = status
  386. return m
  387. class DOMParserBase(object):
  388. """Base parser to handle HTML data from the IMDb's web server."""
  389. _defGetRefs = False
  390. _containsObjects = False
  391. preprocessors = []
  392. extractors = []
  393. usingModule = None
  394. _logger = logging.getLogger('imdbpy.parser.http.domparser')
  395. def __init__(self, useModule=None):
  396. """Initialize the parser. useModule can be used to force it
  397. to use 'BeautifulSoup' or 'lxml'; by default, it's auto-detected,
  398. using 'lxml' if available and falling back to 'BeautifulSoup'
  399. otherwise."""
  400. # Module to use.
  401. if useModule is None:
  402. useModule = ('lxml', 'BeautifulSoup')
  403. if not isinstance(useModule, (tuple, list)):
  404. useModule = [useModule]
  405. self._useModule = useModule
  406. nrMods = len(useModule)
  407. _gotError = False
  408. for idx, mod in enumerate(useModule):
  409. mod = mod.strip().lower()
  410. try:
  411. if mod == 'lxml':
  412. from lxml.html import fromstring
  413. from lxml.etree import tostring
  414. self._is_xml_unicode = False
  415. self.usingModule = 'lxml'
  416. elif mod == 'beautifulsoup':
  417. from bsouplxml.html import fromstring
  418. from bsouplxml.etree import tostring
  419. self._is_xml_unicode = True
  420. self.usingModule = 'beautifulsoup'
  421. else:
  422. self._logger.warn('unknown module "%s"' % mod)
  423. continue
  424. self.fromstring = fromstring
  425. self._tostring = tostring
  426. if _gotError:
  427. warnings.warn('falling back to "%s"' % mod)
  428. break
  429. except ImportError, e:
  430. if idx+1 >= nrMods:
  431. # Raise the exception, if we don't have any more
  432. # options to try.
  433. raise IMDbError('unable to use any parser in %s: %s' % \
  434. (str(useModule), str(e)))
  435. else:
  436. warnings.warn('unable to use "%s": %s' % (mod, str(e)))
  437. _gotError = True
  438. continue
  439. else:
  440. raise IMDbError('unable to use parsers in %s' % str(useModule))
  441. # Fall-back defaults.
  442. self._modFunct = None
  443. self._as = 'http'
  444. self._cname = self.__class__.__name__
  445. self._init()
  446. self.reset()
  447. def reset(self):
  448. """Reset the parser."""
  449. # Names and titles references.
  450. self._namesRefs = {}
  451. self._titlesRefs = {}
  452. self._charactersRefs = {}
  453. self._reset()
  454. def _init(self):
  455. """Subclasses can override this method, if needed."""
  456. pass
  457. def _reset(self):
  458. """Subclasses can override this method, if needed."""
  459. pass
  460. def parse(self, html_string, getRefs=None, **kwds):
  461. """Return the dictionary generated from the given html string;
  462. getRefs can be used to force the gathering of movies/persons/characters
  463. references."""
  464. self.reset()
  465. if getRefs is not None:
  466. self.getRefs = getRefs
  467. else:
  468. self.getRefs = self._defGetRefs
  469. # Useful only for the testsuite.
  470. if not isinstance(html_string, unicode):
  471. html_string = unicode(html_string, 'latin_1', 'replace')
  472. html_string = subXMLRefs(html_string)
  473. # Temporary fix: self.parse_dom must work even for empty strings.
  474. html_string = self.preprocess_string(html_string)
  475. html_string = html_string.strip()
  476. if self.usingModule == 'beautifulsoup':
  477. # tag attributes like title="&#x22;Family Guy&#x22;" will be
  478. # converted to title=""Family Guy"" and this confuses BeautifulSoup.
  479. html_string = html_string.replace('""', '"')
  480. # Browser-specific escapes create problems to BeautifulSoup.
  481. html_string = html_string.replace('<!--[if IE]>', '"')
  482. html_string = html_string.replace('<![endif]-->', '"')
  483. #print html_string.encode('utf8')
  484. if html_string:
  485. dom = self.get_dom(html_string)
  486. #print self.tostring(dom).encode('utf8')
  487. try:
  488. dom = self.preprocess_dom(dom)
  489. except Exception, e:
  490. self._logger.error('%s: caught exception preprocessing DOM',
  491. self._cname, exc_info=True)
  492. if self.getRefs:
  493. try:
  494. self.gather_refs(dom)
  495. except Exception, e:
  496. self._logger.warn('%s: unable to gather refs: %s',
  497. self._cname, exc_info=True)
  498. data = self.parse_dom(dom)
  499. else:
  500. data = {}
  501. try:
  502. data = self.postprocess_data(data)
  503. except Exception, e:
  504. self._logger.error('%s: caught exception postprocessing data',
  505. self._cname, exc_info=True)
  506. if self._containsObjects:
  507. self.set_objects_params(data)
  508. data = self.add_refs(data)
  509. return data
  510. def _build_empty_dom(self):
  511. from bsouplxml import _bsoup
  512. return _bsoup.BeautifulSoup('')
  513. def get_dom(self, html_string):
  514. """Return a dom object, from the given string."""
  515. try:
  516. dom = self.fromstring(html_string)
  517. if dom is None:
  518. dom = self._build_empty_dom()
  519. self._logger.error('%s: using a fake empty DOM', self._cname)
  520. return dom
  521. except Exception, e:
  522. self._logger.error('%s: caught exception parsing DOM',
  523. self._cname, exc_info=True)
  524. return self._build_empty_dom()
  525. def xpath(self, element, path):
  526. """Return elements matching the given XPath."""
  527. try:
  528. xpath_result = element.xpath(path)
  529. if self._is_xml_unicode:
  530. return xpath_result
  531. result = []
  532. for item in xpath_result:
  533. if isinstance(item, str):
  534. item = unicode(item)
  535. result.append(item)
  536. return result
  537. except Exception, e:
  538. self._logger.error('%s: caught exception extracting XPath "%s"',
  539. self._cname, path, exc_info=True)
  540. return []
  541. def tostring(self, element):
  542. """Convert the element to a string."""
  543. if isinstance(element, (unicode, str)):
  544. return unicode(element)
  545. else:
  546. try:
  547. return self._tostring(element, encoding=unicode)
  548. except Exception, e:
  549. self._logger.error('%s: unable to convert to string',
  550. self._cname, exc_info=True)
  551. return u''
  552. def clone(self, element):
  553. """Clone an element."""
  554. return self.fromstring(self.tostring(element))
  555. def preprocess_string(self, html_string):
  556. """Here we can modify the text, before it's parsed."""
  557. if not html_string:
  558. return html_string
  559. # Remove silly &nbsp;&raquo; and &ndash; chars.
  560. html_string = html_string.replace(u' \xbb', u'')
  561. html_string = html_string.replace(u'&ndash;', u'-')
  562. try:
  563. preprocessors = self.preprocessors
  564. except AttributeError:
  565. return html_string
  566. for src, sub in preprocessors:
  567. # re._pattern_type is present only since Python 2.5.
  568. if callable(getattr(src, 'sub', None)):
  569. html_string = src.sub(sub, html_string)
  570. elif isinstance(src, str):
  571. html_string = html_string.replace(src, sub)
  572. elif callable(src):
  573. try:
  574. html_string = src(html_string)
  575. except Exception, e:
  576. _msg = '%s: caught exception preprocessing html'
  577. self._logger.error(_msg, self._cname, exc_info=True)
  578. continue
  579. ##print html_string.encode('utf8')
  580. return html_string
  581. def gather_refs(self, dom):
  582. """Collect references."""
  583. grParser = GatherRefs(useModule=self._useModule)
  584. grParser._as = self._as
  585. grParser._modFunct = self._modFunct
  586. refs = grParser.parse_dom(dom)
  587. refs = grParser.postprocess_data(refs)
  588. self._namesRefs = refs['names refs']
  589. self._titlesRefs = refs['titles refs']
  590. self._charactersRefs = refs['characters refs']
  591. def preprocess_dom(self, dom):
  592. """Last chance to modify the dom, before the rules in self.extractors
  593. are applied by the parse_dom method."""
  594. return dom
  595. def parse_dom(self, dom):
  596. """Parse the given dom according to the rules specified
  597. in self.extractors."""
  598. result = {}
  599. for extractor in self.extractors:
  600. ##print extractor.label
  601. if extractor.group is None:
  602. elements = [(extractor.label, element)
  603. for element in self.xpath(dom, extractor.path)]
  604. else:
  605. groups = self.xpath(dom, extractor.group)
  606. elements = []
  607. for group in groups:
  608. group_key = self.xpath(group, extractor.group_key)
  609. if not group_key: continue
  610. group_key = group_key[0]
  611. # XXX: always tries the conversion to unicode:
  612. # BeautifulSoup.NavigableString is a subclass
  613. # of unicode, and so it's never converted.
  614. group_key = self.tostring(group_key)
  615. normalizer = extractor.group_key_normalize
  616. if normalizer is not None:
  617. if callable(normalizer):
  618. try:
  619. group_key = normalizer(group_key)
  620. except Exception, e:
  621. _m = '%s: unable to apply group_key normalizer'
  622. self._logger.error(_m, self._cname,
  623. exc_info=True)
  624. group_elements = self.xpath(group, extractor.path)
  625. elements.extend([(group_key, element)
  626. for element in group_elements])
  627. for group_key, element in elements:
  628. for attr in extractor.attrs:
  629. if isinstance(attr.path, dict):
  630. data = {}
  631. for field in attr.path.keys():
  632. path = attr.path[field]
  633. value = self.xpath(element, path)
  634. if not value:
  635. data[field] = None
  636. else:
  637. # XXX: use u'' , to join?
  638. data[field] = ''.join(value)
  639. else:
  640. data = self.xpath(element, attr.path)
  641. if not data:
  642. data = None
  643. else:
  644. data = attr.joiner.join(data)
  645. if not data:
  646. continue
  647. attr_postprocess = attr.postprocess
  648. if callable(attr_postprocess):
  649. try:
  650. data = attr_postprocess(data)
  651. except Exception, e:
  652. _m = '%s: unable to apply attr postprocess'
  653. self._logger.error(_m, self._cname, exc_info=True)
  654. key = attr.key
  655. if key is None:
  656. key = group_key
  657. elif key.startswith('.'):
  658. # assuming this is an xpath
  659. try:
  660. key = self.xpath(element, key)[0]
  661. except IndexError:
  662. self._logger.error('%s: XPath returned no items',
  663. self._cname, exc_info=True)
  664. elif key.startswith('self.'):
  665. key = getattr(self, key[5:])
  666. if attr.multi:
  667. if key not in result:
  668. result[key] = []
  669. result[key].append(data)
  670. else:
  671. if isinstance(data, dict):
  672. result.update(data)
  673. else:
  674. result[key] = data
  675. return result
  676. def postprocess_data(self, data):
  677. """Here we can modify the data."""
  678. return data
  679. def set_objects_params(self, data):
  680. """Set parameters of Movie/Person/... instances, since they are
  681. not always set in the parser's code."""
  682. for obj in flatten(data, yieldDictKeys=True, scalar=_Container):
  683. obj.accessSystem = self._as
  684. obj.modFunct = self._modFunct
  685. def add_refs(self, data):
  686. """Modify data according to the expected output."""
  687. if self.getRefs:
  688. titl_re = ur'(%s)' % '|'.join([re.escape(x) for x
  689. in self._titlesRefs.keys()])
  690. if titl_re != ur'()': re_titles = re.compile(titl_re, re.U)
  691. else: re_titles = None
  692. nam_re = ur'(%s)' % '|'.join([re.escape(x) for x
  693. in self._namesRefs.keys()])
  694. if nam_re != ur'()': re_names = re.compile(nam_re, re.U)
  695. else: re_names = None
  696. chr_re = ur'(%s)' % '|'.join([re.escape(x) for x
  697. in self._charactersRefs.keys()])
  698. if chr_re != ur'()': re_characters = re.compile(chr_re, re.U)
  699. else: re_characters = None
  700. _putRefs(data, re_titles, re_names, re_characters)
  701. return {'data': data, 'titlesRefs': self._titlesRefs,
  702. 'namesRefs': self._namesRefs,
  703. 'charactersRefs': self._charactersRefs}
  704. class Extractor(object):
  705. """Instruct the DOM parser about how to parse a document."""
  706. def __init__(self, label, path, attrs, group=None, group_key=None,
  707. group_key_normalize=None):
  708. """Initialize an Extractor object, used to instruct the DOM parser
  709. about how to parse a document."""
  710. # rarely (never?) used, mostly for debugging purposes.
  711. self.label = label
  712. self.group = group
  713. if group_key is None:
  714. self.group_key = ".//text()"
  715. else:
  716. self.group_key = group_key
  717. self.group_key_normalize = group_key_normalize
  718. self.path = path
  719. # A list of attributes to fetch.
  720. if isinstance(attrs, Attribute):
  721. attrs = [attrs]
  722. self.attrs = attrs
  723. def __repr__(self):
  724. """String representation of an Extractor object."""
  725. r = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, ' \
  726. 'group_key=%s group_key_normalize=%s)>' % (id(self),
  727. self.label, self.path, repr(self.attrs), self.group,
  728. self.group_key, self.group_key_normalize)
  729. return r
  730. class Attribute(object):
  731. """The attribute to consider, for a given node."""
  732. def __init__(self, key, multi=False, path=None, joiner=None,
  733. postprocess=None):
  734. """Initialize an Attribute object, used to specify the
  735. attribute to consider, for a given node."""
  736. # The key under which information will be saved; can be a string or an
  737. # XPath. If None, the label of the containing extractor will be used.
  738. self.key = key
  739. self.multi = multi
  740. self.path = path
  741. if joiner is None:
  742. joiner = ''
  743. self.joiner = joiner
  744. # Post-process this set of information.
  745. self.postprocess = postprocess
  746. def __repr__(self):
  747. """String representation of an Attribute object."""
  748. r = '<Attribute id:%s (key=%s, multi=%s, path=%s, joiner=%s, ' \
  749. 'postprocess=%s)>' % (id(self), self.key,
  750. self.multi, repr(self.path),
  751. self.joiner, repr(self.postprocess))
  752. return r
  753. def _parse_ref(text, link, info):
  754. """Manage links to references."""
  755. if link.find('/title/tt') != -1:
  756. yearK = re_yearKind_index.match(info)
  757. if yearK and yearK.start() == 0:
  758. text += ' %s' % info[:yearK.end()]
  759. return (text.replace('\n', ' '), link)
  760. class GatherRefs(DOMParserBase):
  761. """Parser used to gather references to movies, persons and characters."""
  762. _attrs = [Attribute(key=None, multi=True,
  763. path={
  764. 'text': './text()',
  765. 'link': './@href',
  766. 'info': './following::text()[1]'
  767. },
  768. postprocess=lambda x: _parse_ref(x.get('text') or u'', x.get('link') or '',
  769. (x.get('info') or u'').strip()))]
  770. extractors = [
  771. Extractor(label='names refs',
  772. path="//a[starts-with(@href, '/name/nm')][string-length(@href)=16]",
  773. attrs=_attrs),
  774. Extractor(label='titles refs',
  775. path="//a[starts-with(@href, '/title/tt')]" \
  776. "[string-length(@href)=17]",
  777. attrs=_attrs),
  778. Extractor(label='characters refs',
  779. path="//a[starts-with(@href, '/character/ch')]" \
  780. "[string-length(@href)=21]",
  781. attrs=_attrs),
  782. ]
  783. def postprocess_data(self, data):
  784. result = {}
  785. for item in ('names refs', 'titles refs', 'characters refs'):
  786. result[item] = {}
  787. for k, v in data.get(item, []):
  788. k = k.strip()
  789. v = v.strip()
  790. if not (k and v):
  791. continue
  792. if not v.endswith('/'): continue
  793. imdbID = analyze_imdbid(v)
  794. if item == 'names refs':
  795. obj = Person(personID=imdbID, name=k,
  796. accessSystem=self._as, modFunct=self._modFunct)
  797. elif item == 'titles refs':
  798. obj = Movie(movieID=imdbID, title=k,
  799. accessSystem=self._as, modFunct=self._modFunct)
  800. else:
  801. obj = Character(characterID=imdbID, name=k,
  802. accessSystem=self._as, modFunct=self._modFunct)
  803. # XXX: companies aren't handled: are they ever found in text,
  804. # as links to their page?
  805. result[item][k] = obj
  806. return result
  807. def add_refs(self, data):
  808. return data