bsoupxpath.py - XPath related enumerations and constants

/library/imdb/parser/http/bsouplxml/bsoupxpath.py

https://github.com/jsmiller84/CouchPotato · Python · 394 lines · 280 code · 51 blank · 63 comment · 99 complexity · 618c47f363b9b991ec34890742f20a05 MD5 · raw file

"""
parser.http.bsoupxpath module (imdb.parser.http package).

This module provides XPath support for BeautifulSoup.

Copyright 2008 H. Turgut Uyar <uyar@tekir.org>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

__author__ = 'H. Turgut Uyar <uyar@tekir.org>'
__docformat__ = 'restructuredtext'


import re
import string
import _bsoup as BeautifulSoup


# XPath axis names and predicate function names understood by this module.

AXIS_ANCESTOR = 'ancestor'
AXIS_ATTRIBUTE = 'attribute'
AXIS_CHILD = 'child'
AXIS_DESCENDANT = 'descendant'
AXIS_FOLLOWING = 'following'
AXIS_FOLLOWING_SIBLING = 'following-sibling'
AXIS_PRECEDING_SIBLING = 'preceding-sibling'

# All supported axes. PathStep joins these with '|' to build its axis
# regular expression, so the order here is the order of alternation.
AXES = (
    AXIS_ANCESTOR,
    AXIS_ATTRIBUTE,
    AXIS_CHILD,
    AXIS_DESCENDANT,
    AXIS_FOLLOWING,
    AXIS_FOLLOWING_SIBLING,
    AXIS_PRECEDING_SIBLING,
)

# XPath functions that may be used inside a predicate.
XPATH_FUNCTIONS = ('starts-with', 'string-length')


def tokenize_path(path):
    """Tokenize a location path into location steps. Return the list of steps.

    If two steps are separated by a double slash, the double slashes are part of
    the second step. If they are separated by only one slash, the slash is not
    included in any of the steps.

    Slashes that appear inside single-quoted strings are not treated as
    separators. A trailing slash does not produce an empty final step (so
    ``'a/'`` gives ``['a']`` and ``'/'`` gives ``[]``); an empty path still
    yields a single empty step, ``['']``.
    """
    length = len(path)
    steps = []
    start = 0           # index where the step currently being scanned begins
    in_string = False
    i = 0
    while i < length:
        char = path[i]
        if char == "'":
            in_string = not in_string
        # slashes within strings are not step separators
        if not in_string and char == '/':
            if i > 0:
                steps.append(path[start:i])
            # Slicing instead of indexing: the lookahead is simply '' when the
            # slash is the last character, rather than raising IndexError.
            if path[i + 1:i + 2] == '/':
                # double slash: it belongs to the next step, skip its second half
                start = i
                i += 1
            else:
                start = i + 1
        i += 1

    # Emit whatever follows the last separator. Nothing follows a trailing
    # slash, so no step is added then; the empty path keeps its historical
    # result of one empty step.
    if start < length or not path:
        steps.append(path[start:])
    return steps


class Path:
    """A location path.

    Holds the original path string in `path` and the parsed location steps
    in `steps`. A path starting with a slash is absolute and is evaluated
    from the document root; any other path is evaluated from the node it is
    applied to.
    """

    def __init__(self, path, parse=True):
        """Store the path and, when `parse` is true, parse it into steps.

        With ``parse=False`` the step list is left empty so that the caller
        can fill it in.
        """
        self.path = path
        self.steps = []
        if parse:
            # startswith() rather than indexing, so that the bare root path
            # '/' (and an empty path) do not raise IndexError
            if path.startswith('/') and not path.startswith('//'):
                # if not on the descendant axis, remove the leading slash
                path = path[1:]
            for step in tokenize_path(path):
                # an empty step carries no node test (this is all that is
                # left of '/'), so there is nothing to build a PathStep from
                if step:
                    self.steps.append(PathStep(step))

    def apply(self, node):
        """Apply the path to a node. Return the resulting list of nodes.

        Apply the steps in the path sequentially by sending the output of each
        step as input to the next step. A path without steps returns a list
        containing only the starting node (the root for an absolute path).
        """
        # FIXME: this should return a node SET, not a node LIST
        # or at least a list with no duplicates
        if self.path.startswith('/'):
            # for an absolute path, start from the root
            at_root = isinstance(node, BeautifulSoup.Tag) \
                      and (node.name == '[document]')
            if not at_root:
                node = node.findParent('[document]')
        nodes = [node]
        for step in self.steps:
            nodes = step.apply(nodes)
        return nodes


class PathStep:
    """A location step in a location path.

    A step is parsed at construction time into an axis (`axis`), a node test
    (`node_test`), keyword arguments for the BeautifulSoup search methods
    (`soup_args`), an optional result index (`index`) and a list of predicate
    checkers (`checkers`). The abbreviated steps ``.`` and ``..`` are not
    parsed; `apply` handles them directly.
    """

    AXIS_PATTERN          = r"""(%s)::|@""" % '|'.join(AXES)
    NODE_TEST_PATTERN     = r"""\w+(\(\))?"""
    PREDICATE_PATTERN     = r"""\[(.*?)\]"""
    # groups: 1 = axis prefix ('name::' or '@'), 2 = axis name,
    #         3 = node test, 5 = all predicates with their brackets
    LOCATION_STEP_PATTERN = r"""(%s)?(%s)((%s)*)""" \
                          % (AXIS_PATTERN, NODE_TEST_PATTERN, PREDICATE_PATTERN)

    _re_location_step = re.compile(LOCATION_STEP_PATTERN)

    # Greedy on purpose: the negated expression may itself contain
    # parentheses, e.g. not(starts-with(@href, 'x')) or not(text()='x');
    # a lazy match would stop at the first closing parenthesis.
    PREDICATE_NOT_PATTERN = r"""not\((.*)\)"""
    # groups: 1 = axis prefix, 2 = axis name, 3 = node test,
    #         5 = "='value'" part, 6 = value
    PREDICATE_AXIS_PATTERN = r"""(%s)?(%s)(='(.*?)')?""" \
                           % (AXIS_PATTERN, NODE_TEST_PATTERN)
    # groups: 1 = function name, 2 = argument list, 5 = compared value
    PREDICATE_FUNCTION_PATTERN = r"""(%s)\(([^,]+(,\s*[^,]+)*)?\)(=(.*))?""" \
                               % '|'.join(XPATH_FUNCTIONS)

    _re_predicate_not = re.compile(PREDICATE_NOT_PATTERN)
    _re_predicate_axis = re.compile(PREDICATE_AXIS_PATTERN)
    _re_predicate_function = re.compile(PREDICATE_FUNCTION_PATTERN)

    def __init__(self, step):
        """Parse a single location step.

        A leading double slash puts the step on the descendant axis unless
        the step names an axis itself; otherwise the default axis is child.

        Raise NotImplementedError if the step, or one of its predicates,
        cannot be parsed.
        """
        self.step = step
        if (step == '.') or (step == '..'):
            return

        if step[:2] == '//':
            default_axis = AXIS_DESCENDANT
            step = step[2:]
        else:
            default_axis = AXIS_CHILD

        step_match = self._re_location_step.match(step)
        if step_match is None:
            # e.g. an empty step or a node test the pattern does not cover;
            # fail with a clear error instead of an AttributeError on None
            raise NotImplementedError("This location step is not implemented")

        # determine the axis
        axis = step_match.group(1)
        if axis is None:
            self.axis = default_axis
        elif axis == '@':
            self.axis = AXIS_ATTRIBUTE
        else:
            self.axis = step_match.group(2)

        self.soup_args = {}
        self.index = None

        self.node_test = step_match.group(3)
        if self.node_test == 'text()':
            self.soup_args['text'] = True
        else:
            self.soup_args['name'] = self.node_test

        self.checkers = []
        predicates = step_match.group(5)
        if predicates is not None:
            # "[a][b]" -> "a][b" -> ["a", "b"]
            predicates = [p for p in predicates[1:-1].split('][') if p]
            for predicate in predicates:
                checker = self.__parse_predicate(predicate)
                if checker is not None:
                    self.checkers.append(checker)

    def __parse_predicate(self, predicate):
        """Parse the predicate. Return a callable that can be used to filter
        nodes, or None when the predicate has been folded into
        `self.soup_args` / `self.index` instead. Update `self.soup_args` to
        take advantage of BeautifulSoup search features.

        Raise NotImplementedError for predicates that are not supported.
        """
        try:
            position = int(predicate)
        except ValueError:
            position = None
        if position is not None:
            if self.axis == AXIS_DESCENDANT:
                return PredicateFilter('position', value=position)
            # use the search limit feature instead of a checker
            self.soup_args['limit'] = position
            self.index = position - 1
            return None

        if predicate == "last()":
            self.index = -1
            return None

        not_match = self._re_predicate_not.match(predicate)
        negate = not_match is not None
        if negate:
            predicate = not_match.group(1)

        function_match = self._re_predicate_function.match(predicate)
        if function_match:
            name = function_match.group(1)
            arguments = function_match.group(2)
            # group 5 is None whenever there is no "=value" part
            value = function_match.group(5)
            return PredicateFilter(name, arguments, value, negate=negate)

        axis_match = self._re_predicate_axis.match(predicate)
        if axis_match:
            axis = axis_match.group(1)
            if axis is None:
                axis = AXIS_CHILD
            elif axis == '@':
                axis = AXIS_ATTRIBUTE
            else:
                # group 1 still carries the '::' suffix; group 2 is the
                # bare axis name that the AXIS_* constants compare against
                axis = axis_match.group(2)
            if axis == AXIS_ATTRIBUTE:
                # use the attribute search feature instead of a checker
                attribute_name = axis_match.group(3)
                if axis_match.group(5) is not None:
                    # NOTE(review): a negated comparison, not(@a='v'), ends
                    # up searching for a='v' -- the negation is not applied
                    # in this branch.
                    attribute_value = axis_match.group(6)
                elif not negate:
                    attribute_value = True
                else:
                    attribute_value = None
                attrs = self.soup_args.setdefault('attrs', {})
                attrs[attribute_name] = attribute_value
                return None
            elif axis == AXIS_CHILD:
                node_test = axis_match.group(3)
                node_value = axis_match.group(6)
                return PredicateFilter('axis', node_test, value=node_value,
                                       negate=negate)

        raise NotImplementedError("This predicate is not implemented")

    def apply(self, nodes):
        """Apply the step to a list of nodes. Return the list of nodes for the
        next step.

        Each input node is searched along the step's axis; the matches are
        then narrowed by the index (if any) and by every checker, and the
        survivors of all input nodes are concatenated.
        """
        if self.step == '.':
            return nodes
        elif self.step == '..':
            return [node.parent for node in nodes]

        result = []
        for node in nodes:
            if self.axis == AXIS_CHILD:
                found = node.findAll(recursive=False, **self.soup_args)
            elif self.axis == AXIS_DESCENDANT:
                found = node.findAll(recursive=True, **self.soup_args)
            elif self.axis == AXIS_ATTRIBUTE:
                try:
                    found = [node[self.node_test]]
                except KeyError:
                    found = []
            elif self.axis == AXIS_FOLLOWING_SIBLING:
                found = node.findNextSiblings(**self.soup_args)
            elif self.axis == AXIS_PRECEDING_SIBLING:
                # TODO: make sure that the result is reverse ordered
                found = node.findPreviousSiblings(**self.soup_args)
            elif self.axis == AXIS_FOLLOWING:
                # find the last descendant of this node
                last = node
                while (not isinstance(last, BeautifulSoup.NavigableString)) \
                      and (len(last.contents) > 0):
                    last = last.contents[-1]
                found = last.findAllNext(**self.soup_args)
            elif self.axis == AXIS_ANCESTOR:
                found = node.findParents(**self.soup_args)

            # this should only be active if there is a position predicate
            # and the axis is not 'descendant'
            if self.index is not None and found:
                if len(found) > self.index:
                    found = [found[self.index]]
                else:
                    found = []

            if found:
                for checker in self.checkers:
                    found = [item for item in found if checker(item)]
                result.extend(found)

        return result


class PredicateFilter:
    """A callable class for filtering nodes.

    An instance is built for one kind of predicate, selected by `name`:

    - ``'position'``: the node's position among its like-named (or text)
      preceding siblings equals `value`
    - ``'axis'``: `arguments` is a node test; the node has such a child
      (or text) and, if `value` is given, its string equals `value`
    - ``'starts-with'``: `arguments` is ``"<@attribute or text()>, '<prefix>'"``
    - ``'string-length'``: `arguments` is ``@attribute`` or ``text()`` and
      `value` is the expected length

    Calling the instance with a node returns the result of the check,
    inverted when `negate` is true. Any other `name` raises
    NotImplementedError.
    """

    def __init__(self, name, arguments=None, value=None, negate=False):
        self.name = name
        self.arguments = arguments
        self.negate = negate

        if name == 'position':
            self.__filter = self.__position
            self.value = value
        elif name == 'axis':
            self.__filter = self.__axis
            self.node_test = arguments
            self.value = value
        elif name == 'starts-with':
            self.__filter = self.__starts_with
            args = [arg.strip() for arg in arguments.split(',')]
            # the second argument is a quoted literal: drop the quotes
            if args[0][0] == '@':
                self.arguments = (True, args[0][1:], args[1][1:-1])
            else:
                self.arguments = (False, args[0], args[1][1:-1])
        elif name == 'string-length':
            self.__filter = self.__string_length
            args = [arg.strip() for arg in arguments.split(',')]
            if args[0][0] == '@':
                self.arguments = (True, args[0][1:])
            else:
                self.arguments = (False, args[0])
            self.value = int(value)
        else:
            raise NotImplementedError("This XPath function is not implemented")

    def __call__(self, node):
        """Return whether `node` passes the filter, honouring `negate`."""
        if self.negate:
            return not self.__filter(node)
        else:
            return self.__filter(node)

    def __position(self, node):
        """Check the 1-based position of the node among its preceding
        siblings of the same kind (text nodes, or tags of the same name).
        """
        if isinstance(node, BeautifulSoup.NavigableString):
            actual_position = len(node.findPreviousSiblings(text=True)) + 1
        else:
            actual_position = len(node.findPreviousSiblings(node.name)) + 1
        return actual_position == self.value

    def __axis(self, node):
        """Check the node's text, or its direct children matching the node
        test, against the expected value.
        """
        if self.node_test == 'text()':
            return node.string == self.value
        else:
            children = node.findAll(self.node_test, recursive=False)
            # without a value to compare, the mere existence of a child is enough
            if len(children) > 0 and self.value is None:
                return True
            for child in children:
                if child.string == self.value:
                    return True
            return False

    def __starts_with(self, node):
        """Check that the attribute, or the node's first child when it is a
        string, starts with the expected prefix.
        """
        if self.arguments[0]:
            # this is an attribute
            attribute_name = self.arguments[1]
            if node.has_key(attribute_name):
                first = node[attribute_name]
                return first.startswith(self.arguments[2])
        elif self.arguments[1] == 'text()':
            # a node without children has no text to test
            if node.contents:
                first = node.contents[0]
                if isinstance(first, BeautifulSoup.NavigableString):
                    return first.startswith(self.arguments[2])
        return False

    def __string_length(self, node):
        """Check that the attribute, or the node's string, has the expected
        length. A missing attribute or string never matches.
        """
        # stays None when the argument is neither an attribute nor text()
        value = None
        if self.arguments[0]:
            # this is an attribute
            attribute_name = self.arguments[1]
            if node.has_key(attribute_name):
                value = node[attribute_name]
        elif self.arguments[1] == 'text()':
            value = node.string
        if value is not None:
            return len(value) == self.value
        return False


# Module-level caches: parsed paths keyed by path string, and parsed steps
# keyed by step string. Entries are never evicted.
_paths = {}
_steps = {}

def get_path(path):
    """Utility for eliminating repeated parsings of the same paths and steps.

    Return the cached `Path` for the given path string, building it on first
    use. Steps are cached separately, so a step that occurs in several paths
    is parsed only once and its `PathStep` instance is shared between them.
    """
    if path not in _paths:
        # build the path without parsing so that cached steps can be reused
        p = Path(path, parse=False)
        for step in tokenize_path(path):
            if step not in _steps:
                _steps[step] = PathStep(step)
            p.steps.append(_steps[step])
        _paths[path] = p
    return _paths[path]
Tech Fingerprint

Standard Library: String & Text
Alerts (7)

'isinstance(' Overuse may indicate design issues; consider polymorphism
108 269 334 361
Complexity hotspot; lines 172 to 174 (total complexity: 4)
172 173 174