PageRenderTime 927ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/library/imdb/parser/http/bsouplxml/bsoupxpath.py

https://github.com/jsmiller84/CouchPotato
Python | 394 lines | 333 code | 18 blank | 43 comment | 16 complexity | 618c47f363b9b991ec34890742f20a05 MD5 | raw file
  1. """
  2. parser.http.bsoupxpath module (imdb.parser.http package).
  3. This module provides XPath support for BeautifulSoup.
  4. Copyright 2008 H. Turgut Uyar <uyar@tekir.org>
  5. This program is free software; you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation; either version 2 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program; if not, write to the Free Software
  15. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  16. """
  17. __author__ = 'H. Turgut Uyar <uyar@tekir.org>'
  18. __docformat__ = 'restructuredtext'
  19. import re
  20. import string
  21. import _bsoup as BeautifulSoup
# XPath related enumerations and constants

# Axis names, spelled exactly as they appear in XPath expressions
# (the part before the '::' separator).
AXIS_ANCESTOR = 'ancestor'
AXIS_ATTRIBUTE = 'attribute'
AXIS_CHILD = 'child'
AXIS_DESCENDANT = 'descendant'
AXIS_FOLLOWING = 'following'
AXIS_FOLLOWING_SIBLING = 'following-sibling'
AXIS_PRECEDING_SIBLING = 'preceding-sibling'

# Every axis this module knows about; the names are joined with '|' to build
# the axis part of the location step regular expression.
AXES = (AXIS_ANCESTOR, AXIS_ATTRIBUTE, AXIS_CHILD, AXIS_DESCENDANT,
        AXIS_FOLLOWING, AXIS_FOLLOWING_SIBLING, AXIS_PRECEDING_SIBLING)

# XPath functions that are recognized inside predicates.
XPATH_FUNCTIONS = ('starts-with', 'string-length')
  33. def tokenize_path(path):
  34. """Tokenize a location path into location steps. Return the list of steps.
  35. If two steps are separated by a double slash, the double slashes are part of
  36. the second step. If they are separated by only one slash, the slash is not
  37. included in any of the steps.
  38. """
  39. # form a list of tuples that mark the start and end positions of steps
  40. separators = []
  41. last_position = 0
  42. i = -1
  43. in_string = False
  44. while i < len(path) - 1:
  45. i = i + 1
  46. if path[i] == "'":
  47. in_string = not in_string
  48. if in_string:
  49. # slashes within strings are not step separators
  50. continue
  51. if path[i] == '/':
  52. if i > 0:
  53. separators.append((last_position, i))
  54. if (path[i+1] == '/'):
  55. last_position = i
  56. i = i + 1
  57. else:
  58. last_position = i + 1
  59. separators.append((last_position, len(path)))
  60. steps = []
  61. for start, end in separators:
  62. steps.append(path[start:end])
  63. return steps
  64. class Path:
  65. """A location path.
  66. """
  67. def __init__(self, path, parse=True):
  68. self.path = path
  69. self.steps = []
  70. if parse:
  71. if (path[0] == '/') and (path[1] != '/'):
  72. # if not on the descendant axis, remove the leading slash
  73. path = path[1:]
  74. steps = tokenize_path(path)
  75. for step in steps:
  76. self.steps.append(PathStep(step))
  77. def apply(self, node):
  78. """Apply the path to a node. Return the resulting list of nodes.
  79. Apply the steps in the path sequentially by sending the output of each
  80. step as input to the next step.
  81. """
  82. # FIXME: this should return a node SET, not a node LIST
  83. # or at least a list with no duplicates
  84. if self.path[0] == '/':
  85. # for an absolute path, start from the root
  86. if not isinstance(node, BeautifulSoup.Tag) \
  87. or (node.name != '[document]'):
  88. node = node.findParent('[document]')
  89. nodes = [node]
  90. for step in self.steps:
  91. nodes = step.apply(nodes)
  92. return nodes
  93. class PathStep:
  94. """A location step in a location path.
  95. """
  96. AXIS_PATTERN = r"""(%s)::|@""" % '|'.join(AXES)
  97. NODE_TEST_PATTERN = r"""\w+(\(\))?"""
  98. PREDICATE_PATTERN = r"""\[(.*?)\]"""
  99. LOCATION_STEP_PATTERN = r"""(%s)?(%s)((%s)*)""" \
  100. % (AXIS_PATTERN, NODE_TEST_PATTERN, PREDICATE_PATTERN)
  101. _re_location_step = re.compile(LOCATION_STEP_PATTERN)
  102. PREDICATE_NOT_PATTERN = r"""not\((.*?)\)"""
  103. PREDICATE_AXIS_PATTERN = r"""(%s)?(%s)(='(.*?)')?""" \
  104. % (AXIS_PATTERN, NODE_TEST_PATTERN)
  105. PREDICATE_FUNCTION_PATTERN = r"""(%s)\(([^,]+(,\s*[^,]+)*)?\)(=(.*))?""" \
  106. % '|'.join(XPATH_FUNCTIONS)
  107. _re_predicate_not = re.compile(PREDICATE_NOT_PATTERN)
  108. _re_predicate_axis = re.compile(PREDICATE_AXIS_PATTERN)
  109. _re_predicate_function = re.compile(PREDICATE_FUNCTION_PATTERN)
  110. def __init__(self, step):
  111. self.step = step
  112. if (step == '.') or (step == '..'):
  113. return
  114. if step[:2] == '//':
  115. default_axis = AXIS_DESCENDANT
  116. step = step[2:]
  117. else:
  118. default_axis = AXIS_CHILD
  119. step_match = self._re_location_step.match(step)
  120. # determine the axis
  121. axis = step_match.group(1)
  122. if axis is None:
  123. self.axis = default_axis
  124. elif axis == '@':
  125. self.axis = AXIS_ATTRIBUTE
  126. else:
  127. self.axis = step_match.group(2)
  128. self.soup_args = {}
  129. self.index = None
  130. self.node_test = step_match.group(3)
  131. if self.node_test == 'text()':
  132. self.soup_args['text'] = True
  133. else:
  134. self.soup_args['name'] = self.node_test
  135. self.checkers = []
  136. predicates = step_match.group(5)
  137. if predicates is not None:
  138. predicates = [p for p in predicates[1:-1].split('][') if p]
  139. for predicate in predicates:
  140. checker = self.__parse_predicate(predicate)
  141. if checker is not None:
  142. self.checkers.append(checker)
  143. def __parse_predicate(self, predicate):
  144. """Parse the predicate. Return a callable that can be used to filter
  145. nodes. Update `self.soup_args` to take advantage of BeautifulSoup search
  146. features.
  147. """
  148. try:
  149. position = int(predicate)
  150. if self.axis == AXIS_DESCENDANT:
  151. return PredicateFilter('position', value=position)
  152. else:
  153. # use the search limit feature instead of a checker
  154. self.soup_args['limit'] = position
  155. self.index = position - 1
  156. return None
  157. except ValueError:
  158. pass
  159. if predicate == "last()":
  160. self.index = -1
  161. return None
  162. negate = self._re_predicate_not.match(predicate)
  163. if negate:
  164. predicate = negate.group(1)
  165. function_match = self._re_predicate_function.match(predicate)
  166. if function_match:
  167. name = function_match.group(1)
  168. arguments = function_match.group(2)
  169. value = function_match.group(4)
  170. if value is not None:
  171. value = function_match.group(5)
  172. return PredicateFilter(name, arguments, value)
  173. axis_match = self._re_predicate_axis.match(predicate)
  174. if axis_match:
  175. axis = axis_match.group(1)
  176. if axis is None:
  177. axis = AXIS_CHILD
  178. elif axis == '@':
  179. axis = AXIS_ATTRIBUTE
  180. if axis == AXIS_ATTRIBUTE:
  181. # use the attribute search feature instead of a checker
  182. attribute_name = axis_match.group(3)
  183. if axis_match.group(5) is not None:
  184. attribute_value = axis_match.group(6)
  185. elif not negate:
  186. attribute_value = True
  187. else:
  188. attribute_value = None
  189. if not self.soup_args.has_key('attrs'):
  190. self.soup_args['attrs'] = {}
  191. self.soup_args['attrs'][attribute_name] = attribute_value
  192. return None
  193. elif axis == AXIS_CHILD:
  194. node_test = axis_match.group(3)
  195. node_value = axis_match.group(6)
  196. return PredicateFilter('axis', node_test, value=node_value,
  197. negate=negate)
  198. raise NotImplementedError("This predicate is not implemented")
  199. def apply(self, nodes):
  200. """Apply the step to a list of nodes. Return the list of nodes for the
  201. next step.
  202. """
  203. if self.step == '.':
  204. return nodes
  205. elif self.step == '..':
  206. return [node.parent for node in nodes]
  207. result = []
  208. for node in nodes:
  209. if self.axis == AXIS_CHILD:
  210. found = node.findAll(recursive=False, **self.soup_args)
  211. elif self.axis == AXIS_DESCENDANT:
  212. found = node.findAll(recursive=True, **self.soup_args)
  213. elif self.axis == AXIS_ATTRIBUTE:
  214. try:
  215. found = [node[self.node_test]]
  216. except KeyError:
  217. found = []
  218. elif self.axis == AXIS_FOLLOWING_SIBLING:
  219. found = node.findNextSiblings(**self.soup_args)
  220. elif self.axis == AXIS_PRECEDING_SIBLING:
  221. # TODO: make sure that the result is reverse ordered
  222. found = node.findPreviousSiblings(**self.soup_args)
  223. elif self.axis == AXIS_FOLLOWING:
  224. # find the last descendant of this node
  225. last = node
  226. while (not isinstance(last, BeautifulSoup.NavigableString)) \
  227. and (len(last.contents) > 0):
  228. last = last.contents[-1]
  229. found = last.findAllNext(**self.soup_args)
  230. elif self.axis == AXIS_ANCESTOR:
  231. found = node.findParents(**self.soup_args)
  232. # this should only be active if there is a position predicate
  233. # and the axis is not 'descendant'
  234. if self.index is not None:
  235. if found:
  236. if len(found) > self.index:
  237. found = [found[self.index]]
  238. else:
  239. found = []
  240. if found:
  241. for checker in self.checkers:
  242. found = filter(checker, found)
  243. result.extend(found)
  244. return result
  245. class PredicateFilter:
  246. """A callable class for filtering nodes.
  247. """
  248. def __init__(self, name, arguments=None, value=None, negate=False):
  249. self.name = name
  250. self.arguments = arguments
  251. self.negate = negate
  252. if name == 'position':
  253. self.__filter = self.__position
  254. self.value = value
  255. elif name == 'axis':
  256. self.__filter = self.__axis
  257. self.node_test = arguments
  258. self.value = value
  259. elif name == 'starts-with':
  260. self.__filter = self.__starts_with
  261. args = map(string.strip, arguments.split(','))
  262. if args[0][0] == '@':
  263. self.arguments = (True, args[0][1:], args[1][1:-1])
  264. else:
  265. self.arguments = (False, args[0], args[1][1:-1])
  266. elif name == 'string-length':
  267. self.__filter = self.__string_length
  268. args = map(string.strip, arguments.split(','))
  269. if args[0][0] == '@':
  270. self.arguments = (True, args[0][1:])
  271. else:
  272. self.arguments = (False, args[0])
  273. self.value = int(value)
  274. else:
  275. raise NotImplementedError("This XPath function is not implemented")
  276. def __call__(self, node):
  277. if self.negate:
  278. return not self.__filter(node)
  279. else:
  280. return self.__filter(node)
  281. def __position(self, node):
  282. if isinstance(node, BeautifulSoup.NavigableString):
  283. actual_position = len(node.findPreviousSiblings(text=True)) + 1
  284. else:
  285. actual_position = len(node.findPreviousSiblings(node.name)) + 1
  286. return actual_position == self.value
  287. def __axis(self, node):
  288. if self.node_test == 'text()':
  289. return node.string == self.value
  290. else:
  291. children = node.findAll(self.node_test, recursive=False)
  292. if len(children) > 0 and self.value is None:
  293. return True
  294. for child in children:
  295. if child.string == self.value:
  296. return True
  297. return False
  298. def __starts_with(self, node):
  299. if self.arguments[0]:
  300. # this is an attribute
  301. attribute_name = self.arguments[1]
  302. if node.has_key(attribute_name):
  303. first = node[attribute_name]
  304. return first.startswith(self.arguments[2])
  305. elif self.arguments[1] == 'text()':
  306. first = node.contents[0]
  307. if isinstance(first, BeautifulSoup.NavigableString):
  308. return first.startswith(self.arguments[2])
  309. return False
  310. def __string_length(self, node):
  311. if self.arguments[0]:
  312. # this is an attribute
  313. attribute_name = self.arguments[1]
  314. if node.has_key(attribute_name):
  315. value = node[attribute_name]
  316. else:
  317. value = None
  318. elif self.arguments[1] == 'text()':
  319. value = node.string
  320. if value is not None:
  321. return len(value) == self.value
  322. return False
  323. _paths = {}
  324. _steps = {}
  325. def get_path(path):
  326. """Utility for eliminating repeated parsings of the same paths and steps.
  327. """
  328. if not _paths.has_key(path):
  329. p = Path(path, parse=False)
  330. steps = tokenize_path(path)
  331. for step in steps:
  332. if not _steps.has_key(step):
  333. _steps[step] = PathStep(step)
  334. p.steps.append(_steps[step])
  335. _paths[path] = p
  336. return _paths[path]