/web/lib/bs4/element.py
Python | 1334 lines | 1125 code | 59 blank | 150 comment | 79 complexity | 3dbd99492b35371eced27e87b881b421 MD5 | raw file
- import collections
- import re
- import sys
- import warnings
- from bs4.dammit import EntitySubstitution
# Encoding used by default when a tree is rendered to a bytestring.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# True when the interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches a run of one or more whitespace characters. Written as a raw
# string so that the backslash reaches the regex engine untouched instead
# of relying on an invalid string escape.
whitespace_re = re.compile(r"\s+")
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    Returns a property that reads from, and writes to, the attribute
    named `attr` on the same instance.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is called with the assigned value; forward it
        # to the real attribute. (Without the `value` parameter every
        # assignment through the alias raised TypeError.)
        setattr(self, attr, value)
    return alias
class NamespacedAttribute(unicode):
    """A unicode string that also remembers a prefix, name and namespace.

    The string value is "prefix:name" when both parts are given, and
    otherwise whichever single part is available.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        instance = unicode.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
class AttributeValueWithCharsetSubstitution(unicode):
    """Base class for attribute values that stand in for a character
    encoding named in HTML markup."""
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        instance = unicode.__new__(cls, original_value)
        # Keep the value as it appeared in the markup.
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the name of the target encoding itself, unchanged."""
        return encoding
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset=", group 3 is
    # the charset name. A raw string keeps the `\s` escape intact for the
    # regex engine rather than relying on an invalid string escape.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the stand-in, or a plain unicode string if the value
        mentions no charset."""
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return unicode.__new__(unicode, original_value)

        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset name replaced by
        `encoding`."""
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
class HTMLAwareEntitySubstitution(EntitySubstitution):
    """Entity substitution rules that know about a few HTML quirks.

    The contents of <script> and <style> tags must not undergo entity
    substitution, so an incoming NavigableString is checked to see
    whether it is the direct child of one of those tags.
    """

    cdata_containing_tags = set(("script", "style"))

    preformatted_tags = set(("pre",))

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Return `f(ns)`, or `ns` untouched when it is a NavigableString
        whose parent is one of the CDATA-containing tags."""
        inside_cdata_tag = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags)
        if inside_cdata_tag:
            # Leave the string exactly as it is.
            return ns
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        """Apply EntitySubstitution.substitute_html where appropriate."""
        substitute = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, substitute)

    @classmethod
    def substitute_xml(cls, ns):
        """Apply EntitySubstitution.substitute_xml where appropriate."""
        substitute = EntitySubstitution.substitute_xml
        return cls._substitute_if_appropriate(ns, substitute)
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    # There are four possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
    # A function - This function will be called on every string that
    #  needs to undergo entity substitution.
    #

    # In an HTML document, the default "html" and "minimal" functions
    # will leave the contents of <script> and <style> tags alone. For
    # an XML document, all tags will be given the same treatment.

    HTML_FORMATTERS = {
        "html" : HTMLAwareEntitySubstitution.substitute_html,
        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
        None : None
        }

    XML_FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml,
        None : None
        }

    def format_string(self, s, formatter='minimal'):
        """Format the given string using the given formatter.

        `formatter` is either a callable or a name that is resolved
        through _formatter_for_name(). When it resolves to None the
        string is returned unchanged.
        """
        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)
        if formatter is None:
            return s
        return formatter(s)

    @property
    def _is_xml(self):
        """Is this element part of an XML tree or an HTML tree?

        This is used when mapping a formatter name ("minimal") to an
        appropriate function (one that performs entity-substitution on
        the contents of <script> and <style> tags, or not). It's
        inefficient, but it should be called very rarely.
        """
        if self.parent is None:
            # This is the top-level object. It should have .is_xml set
            # from tree creation. If not, take a guess--BS is usually
            # used on HTML markup.
            return getattr(self, 'is_xml', False)
        return self.parent._is_xml

    def _formatter_for_name(self, name):
        "Look up a formatter function based on its name and the tree."
        # Unknown names fall back to the "minimal" behaviour of the
        # matching (XML or HTML) family.
        if self._is_xml:
            return self.XML_FORMATTERS.get(
                name, EntitySubstitution.substitute_xml)
        else:
            return self.HTML_FORMATTERS.get(
                name, HTMLAwareEntitySubstitution.substitute_xml)

    def setup(self, parent=None, previous_element=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self
        self.next_element = None
        self.previous_sibling = None
        self.next_sibling = None
        if self.parent is not None and self.parent.contents:
            # The parent's current last child becomes our previous sibling.
            self.previous_sibling = self.parent.contents[-1]
            self.previous_sibling.next_sibling = self

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
        """Put `replace_with` where this element currently sits in its
        parent, and return this (now extracted) element.

        Returns None without doing anything when asked to replace an
        element with itself. Raises ValueError when this element has no
        parent, or when `replace_with` is this element's parent.
        """
        if replace_with is self:
            return
        if self.parent is None:
            # Without a parent there is no position to put the
            # replacement in; say so instead of failing on None.index.
            raise ValueError(
                "Cannot replace one element with another when the "
                "element to be replaced is not part of a tree.")
        if replace_with is self.parent:
            raise ValueError("Cannot replace a Tag with its parent.")
        old_parent = self.parent
        my_index = self.parent.index(self)
        self.extract()
        old_parent.insert(my_index, replace_with)
        return self
    replaceWith = replace_with  # BS3

    def unwrap(self):
        """Replace this element with its own children and return this
        (now extracted) element.

        Raises ValueError when this element has no parent.
        """
        my_parent = self.parent
        if my_parent is None:
            raise ValueError(
                "Cannot replace an element with its contents when that "
                "element is not part of a tree.")
        my_index = my_parent.index(self)
        self.extract()
        # Insert the children back to front at the same index so they
        # end up in their original order.
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
    replace_with_children = unwrap
    replaceWithChildren = unwrap  # BS3

    def wrap(self, wrap_inside):
        """Replace this element with `wrap_inside`, append this element
        to it, and return `wrap_inside`."""
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if self.previous_element is not None:
            self.previous_element.next_element = next_element
        if next_element is not None:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if self.previous_sibling is not None:
            self.previous_sibling.next_sibling = self.next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

    def _last_descendant(self, is_initialized=True, accept_self=True):
        "Finds the last element beneath this object to be parsed."
        if is_initialized and self.next_sibling:
            last_child = self.next_sibling.previous_element
        else:
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
        # Identity, not equality: a different element that merely
        # compares equal to this one is still a real descendant.
        if not accept_self and last_child is self:
            last_child = None
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
        """Insert `new_child` into this object's contents at `position`.

        A plain string is converted to a NavigableString first. A
        position past the end is clamped to the end. Raises ValueError
        when asked to insert an element into itself.
        """
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, basestring)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        if position == 0:
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the ancestors until one of them has a next
            # sibling: that is the element that comes next in the
            # document. If none does, the new child's last element is
            # the last element in the document and gets None.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
            new_childs_last_element.next_element = parents_next_sibling
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def insert_before(self, predecessor):
        """Makes the given element the immediate predecessor of this one.

        The two elements will have the same parent, and the given element
        will be immediately before this one.
        """
        if self is predecessor:
            raise ValueError("Can't insert an element before itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'before' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        index = parent.index(self)
        parent.insert(index, predecessor)

    def insert_after(self, successor):
        """Makes the given element the immediate successor of this one.

        The two elements will have the same parent, and the given element
        will be immediately after this one.
        """
        if self is successor:
            raise ValueError("Can't insert an element after itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'after' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        index = parent.index(self)
        parent.insert(index+1, successor)

    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
    findNext = find_next  # BS3

    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.next_elements,
                             **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._find_one(self.find_next_siblings, name, attrs, text,
                             **kwargs)
    findNextSibling = find_next_sibling  # BS3

    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2

    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._find_one(
            self.find_all_previous, name, attrs, text, **kwargs)
    findPrevious = find_previous  # BS3

    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.previous_elements,
                           **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2

    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._find_one(self.find_previous_siblings, name, attrs, text,
                             **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3

    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

    def find_parent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        r = None
        l = self.find_parents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findParent = find_parent  # BS3

    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._find_all(name, attrs, None, limit, self.parents,
                             **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2

    @property
    def next(self):
        """The element parsed immediately after this one."""
        return self.next_element

    @property
    def previous(self):
        """The element parsed immediately before this one."""
        return self.previous_element

    #These methods do the real heavy lifting.

    def _find_one(self, method, name, attrs, text, **kwargs):
        """Call `method` with a limit of 1 and return its first result,
        or None when it found nothing."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)

        if text is None and not limit and not attrs and not kwargs:
            if name is True or name is None:
                # Optimization to find all tags.
                result = (element for element in generator
                          if isinstance(element, Tag))
                return ResultSet(strainer, result)
            elif isinstance(name, basestring):
                # Optimization to find all tags with a given name.
                result = (element for element in generator
                          if isinstance(element, Tag)
                            and element.name == name)
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    @property
    def next_elements(self):
        """Yield every element after this one, following next_element."""
        i = self.next_element
        while i is not None:
            yield i
            i = i.next_element

    @property
    def next_siblings(self):
        """Yield every sibling after this one, following next_sibling."""
        i = self.next_sibling
        while i is not None:
            yield i
            i = i.next_sibling

    @property
    def previous_elements(self):
        """Yield every element before this one, following
        previous_element."""
        i = self.previous_element
        while i is not None:
            yield i
            i = i.previous_element

    @property
    def previous_siblings(self):
        """Yield every sibling before this one, following
        previous_sibling."""
        i = self.previous_sibling
        while i is not None:
            yield i
            i = i.previous_sibling

    @property
    def parents(self):
        """Yield this element's parent, then that element's parent, and
        so on until an element with no parent is reached."""
        i = self.parent
        while i is not None:
            yield i
            i = i.parent

    # Methods for supporting CSS selectors.

    tag_name_re = re.compile('^[a-z0-9]+$')

    # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
    #   \---/  \---/\-------------/    \-------/
    #     |      |         |               |
    #     |      |         |           The value
    #     |      |    ~,|,^,$,* or =
    #     |   Attribute
    #    Tag
    attribselect_re = re.compile(
        r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )

    def _attr_value_as_string(self, value, default=None):
        """Force an attribute value into a string representation.

        `value` is the name of the attribute to look up with self.get().
        A multi-valued attribute will be converted into a
        space-separated string.
        """
        value = self.get(value, default)
        if isinstance(value, (list, tuple)):
            value = " ".join(value)
        return value

    def _tag_name_matches_and(self, function, tag_name):
        """Return a predicate that requires both a matching tag name and
        a true result from `function`.

        When `tag_name` is empty, `function` itself is returned.
        """
        if not tag_name:
            return function
        else:
            def _match(tag):
                return tag.name == tag_name and function(tag)
            return _match

    def _attribute_checker(self, operator, attribute, value=''):
        """Create a function that performs a CSS selector operation.

        Takes an operator, attribute and optional value. Returns a
        function that will return True for elements that match that
        combination.
        """
        if operator == '=':
            # string representation of `attribute` is equal to `value`
            return lambda el: el._attr_value_as_string(attribute) == value
        elif operator == '~':
            # space-separated list representation of `attribute`
            # contains `value`
            def _includes_value(element):
                attribute_value = element.get(attribute, [])
                if not isinstance(attribute_value, list):
                    attribute_value = attribute_value.split()
                return value in attribute_value
            return _includes_value
        elif operator == '^':
            # string representation of `attribute` starts with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').startswith(value)
        elif operator == '$':
            # string representation of `attribute` ends with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').endswith(value)
        elif operator == '*':
            # string representation of `attribute` contains `value`
            return lambda el: value in el._attr_value_as_string(attribute, '')
        elif operator == '|':
            # string representation of `attribute` is either exactly
            # `value` or starts with `value` and then a dash.
            def _is_or_starts_with_dash(element):
                attribute_value = element._attr_value_as_string(attribute, '')
                return (attribute_value == value or attribute_value.startswith(
                        value + '-'))
            return _is_or_starts_with_dash
        else:
            # Any other operator (including none): the attribute only
            # has to be present.
            return lambda el: el.has_attr(attribute)

    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
        return self.next_elements

    def nextSiblingGenerator(self):
        return self.next_siblings

    def previousGenerator(self):
        return self.previous_elements

    def previousSiblingGenerator(self):
        return self.previous_siblings

    def parentGenerator(self):
        return self.parents
class NavigableString(unicode, PageElement):
    """A unicode string that is also a PageElement."""

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if not isinstance(value, unicode):
            # Not unicode yet: tell the superclass how to decode it.
            return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        return unicode.__new__(cls, value)

    def __copy__(self):
        return self

    def __getnewargs__(self):
        plain = unicode(self)
        return (plain,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted string between PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        # A string never has a tag name.
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string will be passed into the formatter (to trigger side effects),
    but the return value will be ignored.
    """

    def output_ready(self, formatter="minimal"):
        """Run the formatter over the string, discard what it returns,
        and give back the raw string between PREFIX and SUFFIX."""
        # Called only for its side effects.
        self.format_string(self, formatter)
        wrapped = self.PREFIX + self + self.SUFFIX
        return wrapped
class CData(PreformattedString):
    """A preformatted string rendered between '<![CDATA[' and ']]>'."""

    PREFIX = u'<![CDATA['
    SUFFIX = u']]>'
class ProcessingInstruction(PreformattedString):
    """A preformatted string rendered between '<?' and '?>'."""

    PREFIX = u'<?'
    SUFFIX = u'?>'
class Comment(PreformattedString):
    """A preformatted string rendered between '<!--' and '-->'."""

    PREFIX = u'<!--'
    SUFFIX = u'-->'
class Declaration(PreformattedString):
    """A preformatted string rendered between '<!' and '!>'."""

    PREFIX = u'<!'
    SUFFIX = u'!>'
class Doctype(PreformattedString):
    """A preformatted string rendered between '<!DOCTYPE ' and '>' plus
    a newline."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name and optional public/system IDs.

        A public ID is written as PUBLIC "..." and may be followed by
        the quoted system ID; a system ID on its own is written as
        SYSTEM "...".
        """
        has_public = pub_id is not None
        has_system = system_id is not None

        doctype_text = name or ''
        if has_public:
            doctype_text += ' PUBLIC "%s"' % pub_id
        if has_system:
            if has_public:
                doctype_text += ' "%s"' % system_id
            else:
                doctype_text += ' SYSTEM "%s"' % system_id
        return Doctype(doctype_text)

    PREFIX = u'<!DOCTYPE '
    SUFFIX = u'>\n'
- class Tag(PageElement):
- """Represents a found HTML tag with its attributes and contents."""
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None):
    """Basic constructor.

    Only the parser's class is remembered, not the parser itself.
    `attrs` is copied into a new dict, or handed to the builder for
    replacement of its CDATA-list attribute values when a builder that
    defines such attributes is given. `parent` and `previous` are passed
    straight on to setup().

    Raises ValueError when `name` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix
    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        # The builder is optional (it defaults to None), so only consult
        # it when there is one.
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        attrs = dict(attrs)
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    # Set up any substitutions, such as the charset in a META tag.
    if builder is not None:
        builder.set_up_substitutions(self)
        self.can_be_empty_element = builder.can_be_empty_element(name)
    else:
        self.can_be_empty_element = False
- parserClass = _alias("parser_class") # BS3
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with contents is never an empty-element tag.

    For a tag without contents the answer is whatever
    can_be_empty_element says, and that flag is set from the builder
    used to create the tag. If the builder has a designated list of
    empty-element tags, only a tag whose name is on that list counts;
    if it has no such list, any tag with no contents counts.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
- isSelfClosing = is_empty_element # BS3
@property
def string(self):
    """Convenience property to get the single string within this tag.

    :Return: None unless this tag has exactly one child. If that child
     is a NavigableString it is returned; otherwise the child's own
     'string' attribute is returned, recursively.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if not isinstance(only_child, NavigableString):
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    # Drop the current children, then append a copy of the new string
    # built with the string's own class.
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    By default, yields only NavigableString and CData objects. So
    no comments, processing instructions, etc.

    With `types` set to None every NavigableString is accepted. With
    `strip` true each string is stripped and empty results are skipped.
    """
    for node in self.descendants:
        if types is None:
            wanted = isinstance(node, NavigableString)
        else:
            # Exact type match: subclasses must be listed explicitly.
            wanted = type(node) in types
        if not wanted:
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node
- strings = property(_all_strings)
@property
def stripped_strings(self):
    """Yield each string from _all_strings() with stripping enabled."""
    stripped = self._all_strings(True)
    for text in stripped:
        yield text
def get_text(self, separator=u"", strip=False,
             types=(NavigableString, CData)):
    """
    Get all child strings, concatenated using the given separator.

    `strip` and `types` are handed on to _all_strings().
    """
    pieces = list(self._all_strings(strip, types=types))
    return separator.join(pieces)
- getText = get_text
- text = property(get_text)
def decompose(self):
    """Recursively destroys the contents of this tree."""
    self.extract()
    node = self
    while node is not None:
        # Remember the successor first: clearing __dict__ wipes the link.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node = following
def clear(self, decompose=False):
    """
    Extract all children. If decompose is True, decompose instead.

    Only Tag children are decomposed; any other child is extracted even
    when decompose is True.
    """
    # Work on a copy: extracting a child changes self.contents.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
def index(self, element):
    """
    Find the index of a child by identity, not value. Avoids issues with
    tag.contents.index(element) getting the index of equal elements.

    Raises ValueError when `element` is not one of the children.
    """
    positions = (where for where, child in enumerate(self.contents)
                 if child is element)
    for where in positions:
        return where
    raise ValueError("Tag.index: element not in tag")
def get(self, key, default=None):
    """Look up the 'key' attribute of the tag, falling back to
    'default' when the tag has no such attribute."""
    attributes = self.attrs
    return attributes.get(key, default)
def has_attr(self, key):
    """Tell whether the tag has an attribute called 'key'."""
    present = key in self.attrs
    return present
def __hash__(self):
    """Hash the tag through its str() rendering."""
    rendered = str(self)
    return rendered.__hash__()
def __getitem__(self, key):
    """tag[key] gives the value of the tag's 'key' attribute; a missing
    attribute raises whatever the attrs mapping raises (KeyError for a
    dict)."""
    attributes = self.attrs
    return attributes[key]
def __iter__(self):
    "Iterating over a tag walks through its list of contents."
    children = self.contents
    return iter(children)
def __len__(self):
    "A tag is as long as its list of contents."
    children = self.contents
    return len(children)
def __contains__(self, x):
    "Membership is tested against the tag's list of contents."
    children = self.contents
    return x in children
def __nonzero__(self):
    "A tag is always truthy, even when it has no contents."
    return True
def __setitem__(self, key, value):
    """tag[key] = value stores 'value' as the tag's 'key' attribute."""
    attributes = self.attrs
    attributes[key] = value
def __delitem__(self, key):
    "del tag[key] removes the 'key' attribute; a missing key is ignored."
    attributes = self.attrs
    attributes.pop(key, None)
def __call__(self, *args, **kwargs):
    """Calling a tag is shorthand for calling its find_all() method
    with the same arguments, e.g. tag('a') gives what find_all('a')
    gives."""
    finder = self.find_all
    return finder(*args, **kwargs)
def __getattr__(self, tag):
    """Treat an unknown attribute as a search for a child tag.

    A name ending in 'Tag' (BS3 style, e.g. soup.aTag) triggers a
    deprecation warning and searches for the name without that suffix.
    Names starting with a double underscore, and the name "contents",
    raise AttributeError instead.
    """
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> "soup.find("a")
        tag_name = tag[:-3]
        message = '.%sTag is deprecated, use .find("%s") instead.' % (
            tag_name, tag_name)
        warnings.warn(message)
        return self.find(tag_name)
    # "contents" is special-cased to avoid recursion.
    if tag.startswith("__") or tag == "contents":
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, tag))
    return self.find(tag)
def __eq__(self, other):
    """Returns true iff this tag has the same name, the same attributes,
    and the same contents (recursively) as the given tag."""
    if self is other:
        return True
    # Anything lacking one of these attributes cannot be equal to a tag.
    for required in ('name', 'attrs', 'contents'):
        if not hasattr(other, required):
            return False
    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False
    for position, mine in enumerate(self.contents):
        if mine != other.contents[position]:
            return False
    return True
def __ne__(self, other):
    """Returns true iff this tag is not identical to the other tag,
    as defined in __eq__."""
    same = (self == other)
    return not same
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Renders this tag as a string by calling encode() with the given
    encoding."""
    rendered = self.encode(encoding)
    return rendered
def __unicode__(self):
    "The Unicode form of a tag is whatever decode() returns."
    rendered = self.decode()
    return rendered
def __str__(self):
    "The str() form of a tag is whatever encode() returns."
    rendered = self.encode()
    return rendered
- if PY3K:
- __str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render the tag with decode(), then encode that Unicode string.

    `encoding` is used both as decode()'s eventual encoding and for the
    final encoding step; `errors` is the error handler for that step.
    """
    # Turn the data structure into Unicode, then encode the
    # Unicode.
    return self.decode(indent_level, encoding, formatter).encode(
        encoding, errors)
def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?

    Never when no indent level is given. Otherwise yes, unless the tag
    is one of the preformatted tags, in which case only when the tree
    is XML.
    """
    if indent_level is None:
        return False
    if self.name not in HTMLAwareEntitySubstitution.preformatted_tags:
        return True
    return self._is_xml
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a Unicode representation of this tag and its contents.

    :param eventual_encoding: The tag is destined to be
       encoded into this encoding. This method is _not_
       responsible for performing that encoding. This information
       is passed in so that it can be substituted in if the
       document contains a <META> tag that mentions the document's
       encoding.
    """

    # Resolve a formatter name to a function once, up front, so the
    # lookup is not repeated for every string.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    # Render each attribute as key="value"; a value of None renders
    # as the bare key.
    rendered_attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                rendered = key
            else:
                if isinstance(val, (list, tuple)):
                    val = ' '.join(val)
                elif not isinstance(val, basestring):
                    val = unicode(val)
                elif (
                    isinstance(val, AttributeValueWithCharsetSubstitution)
                    and eventual_encoding is not None):
                    val = val.encode(eventual_encoding)

                formatted = self.format_string(val, formatter)
                quoted = EntitySubstitution.quoted_attribute_value(formatted)
                rendered = unicode(key) + '=' + quoted
            rendered_attrs.append(rendered)

    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    # An empty-element tag gets a trailing slash and no closing tag.
    if self.is_empty_element:
        close = '/'
        closeTag = ''
    else:
        close = ''
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    if indent_level is not None:
        indent_space = (' ' * (indent_level - 1))
    else:
        indent_space = ''
    if pretty_print:
        indent_contents = indent_level + 1
    else:
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object.
        return contents

    pieces = []
    attribute_string = ''
    if rendered_attrs:
        attribute_string = ' ' + ' '.join(rendered_attrs)
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    pieces.append('<%s%s%s%s>' % (
            prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        pieces.append("\n")
    if pretty_print and closeTag:
        # The closing tag lines up with the opening tag.
        pieces.append(indent_space)
    pieces.append(closeTag)
    if indent_level is not None and closeTag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)
def prettify(self, encoding=None, formatter="minimal"):
    """Render this tag with pretty-printing switched on.

    :param encoding: When None, the result of `self.decode` is
     returned; otherwise the result of `self.encode` for that
     encoding.
    :param formatter: Passed through unchanged to `decode` or
     `encode`.
    """
    # True is passed as the indent level; in the indentation
    # arithmetic it behaves as level 1.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: When not None, each piece of text is
     written on its own line, indented by `indent_level - 1`
     spaces (except inside a tag named 'pre'). Child tags receive
     the same value through their `decode` method.

    :param eventual_encoding: The tag is destined to be
     encoded into this encoding. This method is _not_
     responsible for performing that encoding. This information
     is passed in so that it can be substituted in if the
     document contains a <META> tag that mentions the document's
     encoding.

    :param formatter: Either a callable, or a name that is resolved
     through `self._formatter_for_name`.
    """
    # Resolve a formatter name to a function once, up front, so the
    # lookup is not repeated for every child.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    pretty_print = indent_level is not None
    pieces = []
    for child in self:
        if not isinstance(child, NavigableString):
            # Child tags render themselves; anything that is neither
            # a string nor a tag contributes nothing.
            if isinstance(child, Tag):
                pieces.append(child.decode(
                    indent_level, eventual_encoding, formatter))
            continue

        text = child.output_ready(formatter)
        if not text:
            continue
        # Text directly inside a tag named 'pre' is left untouched.
        reformat = not self.name == 'pre'
        # NOTE: stripping tests the truthiness of indent_level, so it
        # is skipped at level 0 even though the newline below is not.
        if indent_level and reformat:
            text = text.strip()
            if not text:
                continue
        if pretty_print and reformat:
            pieces.extend((" " * (indent_level - 1), text, "\n"))
        else:
            pieces.append(text)
    return ''.join(pieces)
def encode_contents(
    self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
    formatter="minimal"):
    """Renders the contents of this tag as a bytestring.

    The contents are first rendered by `decode_contents` (which is
    told about `encoding` as the eventual encoding) and the result
    is then encoded into `encoding`.
    """
    return self.decode_contents(
        indent_level, encoding, formatter).encode(encoding)
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Render the contents of this tag as a bytestring (BS3-style API).

    :param encoding: Encoding passed to `encode_contents`.
    :param prettyPrint: When false, `indentLevel` is ignored and no
     indent level is passed on.
    :param indentLevel: Indent level passed to `encode_contents`
     when `prettyPrint` is true.
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(
        indent_level=effective_indent, encoding=encoding)
#Soup methods

def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return only the first child of this Tag matching the given
    criteria, or None when nothing matches.

    The arguments are handed to `find_all` with a limit of 1.
    """
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    if not matches:
        return None
    return matches[0]
findChild = find
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Extracts a list of Tag objects that match the given
    criteria.  You can specify the name of the Tag and any
    attributes you want the Tag to have.

    The value of a key-value pair in the 'attrs' map can be a
    string, a list of strings, a regular expression object, or a
    callable that takes a string and returns whether or not the
    string matches for some custom definition of 'matches'. The
    same is true of the tag name.

    When `recursive` is true the search covers `self.descendants`;
    otherwise only `self.children`.
    """
    if recursive:
        candidates = self.descendants
    else:
        candidates = self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
#Generator methods
@property
def children(self):
    """Iterator over the entries of `self.contents`."""
    # Hand back an iterator rather than the list itself, to make the
    # purpose of the property clear.
    direct_children = self.contents
    return iter(direct_children)  # XXX This seems to be untested.
@property
def descendants(self):
    """Generator over everything below this tag.

    Starts at the first entry of `self.contents` and follows the
    `next_element` links, stopping just before the element that
    comes after `self._last_descendant()`. Yields nothing when
    `self.contents` is empty.
    """
    if len(self.contents) != 0:
        # The element right after our last descendant marks the
        # point where the walk has left this subtree.
        stop_node = self._last_descendant().next_element
        node = self.contents[0]
        while node is not stop_node:
            yield node
            node = node.next_element
# CSS selector code

# Combinator tokens recognized by select(). A selector may not end
# with one of these, and the token that follows a combinator is
# consumed together with it.
_selector_combinators = ['>', '+', '~']
# When true, select() prints a trace of the tokens it considers.
_select_debug = False
- def select(self, selector, _candidate_generator=None):
- """Perform a CSS selection operation on the current element."""
- tokens = selector.split()
- current_context = [self]
- if tokens[-1] in self._selector_combinators:
- raise ValueError(
- 'Final combinator "%s" is missing an argument.' % tokens[-1])
- if self._select_debug:
- print 'Running CSS selector "%s"' % selector
- for index, token in enumerate(tokens):
- if self._select_debug:
- print ' Considering token "%s"' % token
- recursive_candidate_generator = None
- tag_name = None
- if tokens[index-1] in self._selector_combinators:
- # This token was consumed by the previous combinator. Skip it.
- if self._select_debug:
- print ' Token was consumed by the previous combinator.'
- continue
- # Each operation corresponds to a checker function, a rule
- # for determining whether a candidate matches the
- # selector. Candidates are generated by the active
- # iterator.
- checker = None
- m = self.attribselect_re.match(token)
- if m is not None:
- # Attribute selector
- tag_name, attribute, operator, value = m.groups()
- checker = self._attribute_checker(operator, attribute, value)
- elif '#' in token:
- # ID selector
- tag_name, tag_id = token.split('#', 1)
- def id_matches(tag):
- return tag.get('id', None) == tag_id
- checker = id_matches
- elif '.' in token:
- # Class selector
- tag_name, klass = token.split('.', 1)
- classes = set(klass.split('.'))
- def classes_match(candidate):
- return classes.issubset(candidate.get('class', []))
- checker = classes_match
- elif ':' in token:
- # Pseudo-class
- tag_name, pseudo = token.split(':', 1)
- if tag_name == '':
- raise ValueError(
- "A pseudo-class must be prefixed with a tag name.")
- pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
- found = []
- if pseudo_attributes is not None:
- pseudo_type, pseudo_value = pseudo_attributes.groups()
- if pseudo_type == 'nth-of-type':
- try:
- pseudo_value = int(pseudo_value)
- except:
- raise NotImplementedError(
- 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
- if pseudo_value < 1:
- raise ValueError(
- 'nth-of-type pseudo-class value must be at least 1.')
- class Counter(object):
- def __init__(self, destination):
- self.count = 0
- self.destination = destination
- def nth_child_of_type(self, tag):
- self.count += 1
- if self.count == self.destination:
- return True
- if self.count > self.destination:
- # Stop the generator that's sending us
- # these things.
- raise StopIteration()
- return False
- checker = Counter(pseudo_value).nth_child_of_type
- else:
- raise NotImplementedError(
- 'Only the following pseudo-classes are implemented: nth-of-type.')
- elif token == '*':
- # Star selector -- matches everything
- pass
- elif token == '>':
- # Run the next token as a CSS selector against the
- # direct children of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.children
- elif token == '~':
- # Run the next token as a CSS selector against the
- # siblings of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.next_siblings
- elif token == '+':
- # For each tag in the current context, run the next
- # token as a CSS selector against the tag's next
- # sibling that's a tag.
- def next_tag_sibling(tag):
- yield tag.find_next_sibling(True)
- recursive_candidate_generator = next_tag_sibling
- elif self.tag_name_re.match(token):
- # Just a tag name.
- tag_name = token
- else:
- raise ValueError(
- 'Unsupported or invalid CSS selector: "%s"' % token)
- if recursive_candidate_generator:
- # This happens when the selector looks like "> foo".
- #
- # The generator calls select() recursively on every
- # member of the current context, passing in a different
- # candidate generator and a different selector.
- #
- # In the case of "> foo", the candidate generator is
- # one that yields a tag's direct children (">"), and
- # the selector is "foo".
- next_token = tokens[index+1]
- def recursive_select(tag):
- if self._select_debug:
- print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
- print '-' * 40
- for i in tag.select(next_token, recursive_candidate_generator):
- if self._select_debug:
- print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
- yield i
-