PageRenderTime 142ms CodeModel.GetById 73ms app.highlight 57ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/microbes/BeautifulSoup.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 1812 lines | 1652 code | 76 blank | 84 comment | 97 complexity | c5294ca3eff11c63274843ef7a55d150 MD5 | raw file

Large files files are truncated, but you can click here to view the full file

   1"""Beautiful Soup
   2Elixir and Tonic
   3"The Screen-Scraper's Friend"
   4http://www.crummy.com/software/BeautifulSoup/
   5
   6Beautiful Soup parses a (possibly invalid) XML or HTML document into a
   7tree representation. It provides methods and Pythonic idioms that make
   8it easy to navigate, search, and modify the tree.
   9
  10A well-structured XML/HTML document yields a well-behaved data
  11structure. An ill-structured XML/HTML document yields a
  12correspondingly ill-behaved data structure. If your document is only
  13locally well-structured, you can use this library to find and process
  14the well-structured part of it.
  15
  16Beautiful Soup works with Python 2.2 and up. It has no external
  17dependencies, but you'll have more success at converting data to UTF-8
  18if you also install these three packages:
  19
  20* chardet, for auto-detecting character encodings
  21  http://chardet.feedparser.org/
  22* cjkcodecs and iconv_codec, which add more encodings to the ones supported
  23  by stock Python.
  24  http://cjkpython.i18n.org/
  25
  26Beautiful Soup defines classes for two main parsing strategies:
  27    
  28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
  29   language that kind of looks like XML.
  30
  31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
  32   or invalid. This class has web browser-like heuristics for
  33   obtaining a sensible parse tree in the face of common HTML errors.
  34
  35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
  36the encoding of an HTML or XML document, and converting it to
  37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed
  38Parser.
  39
  40For more than you ever wanted to know about Beautiful Soup, see the
  41documentation:
  42http://www.crummy.com/software/BeautifulSoup/documentation.html
  43"""
  44from __future__ import generators
  45
  46__author__ = "Leonard Richardson (crummy.com)"
  47__contributors__ = ["Sam Ruby (intertwingly.net)",
  48                    "the unwitting Mark Pilgrim (diveintomark.org)",
  49                    "http://www.crummy.com/software/BeautifulSoup/AUTHORS.html"]
  50__version__ = "3.0.3"
  51__copyright__ = "Copyright (c) 2004-2006 Leonard Richardson"
  52__license__ = "PSF"
  53
  54from sgmllib import SGMLParser, SGMLParseError
  55import codecs
  56import types
  57import re
  58import sgmllib
  59from htmlentitydefs import name2codepoint
  60
  61# This RE makes Beautiful Soup able to parse XML with namespaces.
  62sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
  63
  64# This RE makes Beautiful Soup capable of recognizing numeric character
  65# references that use hexadecimal.
  66sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
  67
  68DEFAULT_OUTPUT_ENCODING = "utf-8"
  69
  70# First, the classes that represent markup elements.
  71
  72class PageElement:
  73    """Contains the navigational information for some part of the page
  74    (either a tag or a piece of text)"""
  75
  76    def setup(self, parent=None, previous=None):
  77        """Sets up the initial relations between this element and
  78        other elements."""        
  79        self.parent = parent
  80        self.previous = previous
  81        self.next = None
  82        self.previousSibling = None
  83        self.nextSibling = None
  84        if self.parent and self.parent.contents:
  85            self.previousSibling = self.parent.contents[-1]
  86            self.previousSibling.nextSibling = self
  87
  88    def replaceWith(self, replaceWith):        
  89        oldParent = self.parent
  90        myIndex = self.parent.contents.index(self)
  91        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
  92            # We're replacing this element with one of its siblings.
  93            index = self.parent.contents.index(replaceWith)
  94            if index and index < myIndex:
  95                # Furthermore, it comes before this element. That
  96                # means that when we extract it, the index of this
  97                # element will change.
  98                myIndex = myIndex - 1
  99        self.extract()        
 100        oldParent.insert(myIndex, replaceWith)
 101        
 102    def extract(self):
 103        """Destructively rips this element out of the tree."""        
 104        if self.parent:
 105            try:
 106                self.parent.contents.remove(self)
 107            except ValueError:
 108                pass
 109
 110        #Find the two elements that would be next to each other if
 111        #this element (and any children) hadn't been parsed. Connect
 112        #the two.        
 113        lastChild = self._lastRecursiveChild()
 114        nextElement = lastChild.next
 115
 116        if self.previous:
 117            self.previous.next = nextElement
 118        if nextElement:
 119            nextElement.previous = self.previous
 120        self.previous = None
 121        lastChild.next = None
 122
 123        self.parent = None        
 124        if self.previousSibling:
 125            self.previousSibling.nextSibling = self.nextSibling
 126        if self.nextSibling:
 127            self.nextSibling.previousSibling = self.previousSibling
 128        self.previousSibling = self.nextSibling = None       
 129
 130    def _lastRecursiveChild(self):
 131        "Finds the last element beneath this object to be parsed."
 132        lastChild = self
 133        while hasattr(lastChild, 'contents') and lastChild.contents:
 134            lastChild = lastChild.contents[-1]
 135        return lastChild
 136
 137    def insert(self, position, newChild):
 138        if (isinstance(newChild, basestring)
 139            or isinstance(newChild, unicode)) \
 140            and not isinstance(newChild, NavigableString):
 141            newChild = NavigableString(newChild)        
 142
 143        position =  min(position, len(self.contents))
 144        if hasattr(newChild, 'parent') and newChild.parent != None:
 145            # We're 'inserting' an element that's already one
 146            # of this object's children. 
 147            if newChild.parent == self:
 148                index = self.find(newChild)
 149                if index and index < position:
 150                    # Furthermore we're moving it further down the
 151                    # list of this object's children. That means that
 152                    # when we extract this element, our target index
 153                    # will jump down one.
 154                    position = position - 1
 155            newChild.extract()
 156            
 157        newChild.parent = self
 158        previousChild = None
 159        if position == 0:
 160            newChild.previousSibling = None
 161            newChild.previous = self
 162        else:
 163            previousChild = self.contents[position-1]
 164            newChild.previousSibling = previousChild
 165            newChild.previousSibling.nextSibling = newChild
 166            newChild.previous = previousChild._lastRecursiveChild()
 167        if newChild.previous:
 168            newChild.previous.next = newChild        
 169
 170        newChildsLastElement = newChild._lastRecursiveChild()
 171
 172        if position >= len(self.contents):
 173            newChild.nextSibling = None
 174            
 175            parent = self
 176            parentsNextSibling = None
 177            while not parentsNextSibling:
 178                parentsNextSibling = parent.nextSibling
 179                parent = parent.parent
 180                if not parent: # This is the last element in the document.
 181                    break
 182            if parentsNextSibling:
 183                newChildsLastElement.next = parentsNextSibling
 184            else:
 185                newChildsLastElement.next = None
 186        else:
 187            nextChild = self.contents[position]            
 188            newChild.nextSibling = nextChild            
 189            if newChild.nextSibling:
 190                newChild.nextSibling.previousSibling = newChild
 191            newChildsLastElement.next = nextChild
 192
 193        if newChildsLastElement.next:
 194            newChildsLastElement.next.previous = newChildsLastElement
 195        self.contents.insert(position, newChild)
 196
 197    def findNext(self, name=None, attrs={}, text=None, **kwargs):
 198        """Returns the first item that matches the given criteria and
 199        appears after this Tag in the document."""
 200        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
 201
 202    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
 203                    **kwargs):
 204        """Returns all items that match the given criteria and appear
 205        before after Tag in the document."""
 206        return self._findAll(name, attrs, text, limit, self.nextGenerator)
 207
 208    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
 209        """Returns the closest sibling to this Tag that matches the
 210        given criteria and appears after this Tag in the document."""
 211        return self._findOne(self.findNextSiblings, name, attrs, text,
 212                             **kwargs)
 213
 214    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
 215                         **kwargs):
 216        """Returns the siblings of this Tag that match the given
 217        criteria and appear after this Tag in the document."""
 218        return self._findAll(name, attrs, text, limit,
 219                             self.nextSiblingGenerator, **kwargs)
 220    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
 221
 222    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
 223        """Returns the first item that matches the given criteria and
 224        appears before this Tag in the document."""
 225        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
 226
 227    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
 228                        **kwargs):
 229        """Returns all items that match the given criteria and appear
 230        before this Tag in the document."""
 231        return self._findAll(name, attrs, text, limit, self.previousGenerator,
 232                           **kwargs)
 233    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
 234
 235    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
 236        """Returns the closest sibling to this Tag that matches the
 237        given criteria and appears before this Tag in the document."""
 238        return self._findOne(self.findPreviousSiblings, name, attrs, text,
 239                             **kwargs)
 240
 241    def findPreviousSiblings(self, name=None, attrs={}, text=None,
 242                             limit=None, **kwargs):
 243        """Returns the siblings of this Tag that match the given
 244        criteria and appear before this Tag in the document."""
 245        return self._findAll(name, attrs, text, limit,
 246                             self.previousSiblingGenerator, **kwargs)
 247    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
 248
 249    def findParent(self, name=None, attrs={}, **kwargs):
 250        """Returns the closest parent of this Tag that matches the given
 251        criteria."""
 252        # NOTE: We can't use _findOne because findParents takes a different
 253        # set of arguments.
 254        r = None
 255        l = self.findParents(name, attrs, 1)
 256        if l:
 257            r = l[0]
 258        return r
 259
 260    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
 261        """Returns the parents of this Tag that match the given
 262        criteria."""
 263
 264        return self._findAll(name, attrs, None, limit, self.parentGenerator,
 265                             **kwargs)
 266    fetchParents = findParents # Compatibility with pre-3.x
 267
 268    #These methods do the real heavy lifting.
 269
 270    def _findOne(self, method, name, attrs, text, **kwargs):
 271        r = None
 272        l = method(name, attrs, text, 1, **kwargs)
 273        if l:
 274            r = l[0]
 275        return r
 276    
 277    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
 278        "Iterates over a generator looking for things that match."
 279
 280        if isinstance(name, SoupStrainer):
 281            strainer = name
 282        else:
 283            # Build a SoupStrainer
 284            strainer = SoupStrainer(name, attrs, text, **kwargs)
 285        results = ResultSet(strainer)
 286        g = generator()
 287        while True:
 288            try:
 289                i = g.next()
 290            except StopIteration:
 291                break
 292            if i:
 293                found = strainer.search(i)
 294                if found:
 295                    results.append(found)
 296                    if limit and len(results) >= limit:
 297                        break
 298        return results
 299
 300    #These Generators can be used to navigate starting from both
 301    #NavigableStrings and Tags.                
 302    def nextGenerator(self):
 303        i = self
 304        while i:
 305            i = i.next
 306            yield i
 307
 308    def nextSiblingGenerator(self):
 309        i = self
 310        while i:
 311            i = i.nextSibling
 312            yield i
 313
 314    def previousGenerator(self):
 315        i = self
 316        while i:
 317            i = i.previous
 318            yield i
 319
 320    def previousSiblingGenerator(self):
 321        i = self
 322        while i:
 323            i = i.previousSibling
 324            yield i
 325
 326    def parentGenerator(self):
 327        i = self
 328        while i:
 329            i = i.parent
 330            yield i
 331
 332    # Utility methods
 333    def substituteEncoding(self, str, encoding=None):
 334        encoding = encoding or "utf-8"
 335        return str.replace("%SOUP-ENCODING%", encoding)    
 336
 337    def toEncoding(self, s, encoding=None):
 338        """Encodes an object to a string in some encoding, or to Unicode.
 339        ."""
 340        if isinstance(s, unicode):
 341            if encoding:
 342                s = s.encode(encoding)
 343        elif isinstance(s, str):
 344            if encoding:
 345                s = s.encode(encoding)
 346            else:
 347                s = unicode(s)
 348        else:
 349            if encoding:
 350                s  = self.toEncoding(str(s), encoding)
 351            else:
 352                s = unicode(s)
 353        return s
 354
 355class NavigableString(unicode, PageElement):
 356
 357    def __getattr__(self, attr):
 358        """text.string gives you text. This is for backwards
 359        compatibility for Navigable*String, but for CData* it lets you
 360        get the string without the CData wrapper."""
 361        if attr == 'string':
 362            return self
 363        else:
 364            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 365
 366    def __unicode__(self):
 367        return __str__(self, None)
 368
 369    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 370        if encoding:
 371            return self.encode(encoding)
 372        else:
 373            return self
 374        
 375class CData(NavigableString):
 376
 377    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 378        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
 379
 380class ProcessingInstruction(NavigableString):
 381    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 382        output = self
 383        if "%SOUP-ENCODING%" in output:
 384            output = self.substituteEncoding(output, encoding)
 385        return "<?%s?>" % self.toEncoding(output, encoding)
 386
 387class Comment(NavigableString):
 388    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 389        return "<!--%s-->" % NavigableString.__str__(self, encoding)    
 390
 391class Declaration(NavigableString):
 392    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 393        return "<!%s>" % NavigableString.__str__(self, encoding)        
 394
 395class Tag(PageElement):
 396    """Represents a found HTML tag with its attributes and contents."""
 397
 398    XML_ENTITIES_TO_CHARS = { 'apos' : "'",
 399                              "quot" : '"',
 400                              "amp" : "&",
 401                              "lt" : "<",
 402                              "gt" : ">"
 403                              }
 404    # An RE for finding ampersands that aren't the start of of a
 405    # numeric entity.
 406    BARE_AMPERSAND = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
 407
 408    def __init__(self, parser, name, attrs=None, parent=None,
 409                 previous=None):
 410        "Basic constructor."
 411
 412        # We don't actually store the parser object: that lets extracted
 413        # chunks be garbage-collected
 414        self.parserClass = parser.__class__
 415        self.isSelfClosing = parser.isSelfClosingTag(name)
 416        self.convertHTMLEntities = parser.convertHTMLEntities
 417        self.name = name
 418        if attrs == None:
 419            attrs = []
 420        self.attrs = attrs
 421        self.contents = []
 422        self.setup(parent, previous)
 423        self.hidden = False
 424        self.containsSubstitutions = False
 425
 426    def get(self, key, default=None):
 427        """Returns the value of the 'key' attribute for the tag, or
 428        the value given for 'default' if it doesn't have that
 429        attribute."""
 430        return self._getAttrMap().get(key, default)    
 431
 432    def has_key(self, key):
 433        return self._getAttrMap().has_key(key)
 434
 435    def __getitem__(self, key):
 436        """tag[key] returns the value of the 'key' attribute for the tag,
 437        and throws an exception if it's not there."""
 438        return self._getAttrMap()[key]
 439
 440    def __iter__(self):
 441        "Iterating over a tag iterates over its contents."
 442        return iter(self.contents)
 443
 444    def __len__(self):
 445        "The length of a tag is the length of its list of contents."
 446        return len(self.contents)
 447
 448    def __contains__(self, x):
 449        return x in self.contents
 450
 451    def __nonzero__(self):
 452        "A tag is non-None even if it has no contents."
 453        return True
 454
 455    def __setitem__(self, key, value):        
 456        """Setting tag[key] sets the value of the 'key' attribute for the
 457        tag."""
 458        self._getAttrMap()
 459        self.attrMap[key] = value
 460        found = False
 461        for i in range(0, len(self.attrs)):
 462            if self.attrs[i][0] == key:
 463                self.attrs[i] = (key, value)
 464                found = True
 465        if not found:
 466            self.attrs.append((key, value))
 467        self._getAttrMap()[key] = value
 468
 469    def __delitem__(self, key):
 470        "Deleting tag[key] deletes all 'key' attributes for the tag."
 471        for item in self.attrs:
 472            if item[0] == key:
 473                self.attrs.remove(item)
 474                #We don't break because bad HTML can define the same
 475                #attribute multiple times.
 476            self._getAttrMap()
 477            if self.attrMap.has_key(key):
 478                del self.attrMap[key]
 479
 480    def __call__(self, *args, **kwargs):
 481        """Calling a tag like a function is the same as calling its
 482        findAll() method. Eg. tag('a') returns a list of all the A tags
 483        found within this tag."""
 484        return apply(self.findAll, args, kwargs)
 485
 486    def __getattr__(self, tag):
 487        #print "Getattr %s.%s" % (self.__class__, tag)
 488        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
 489            return self.find(tag[:-3])
 490        elif tag.find('__') != 0:
 491            return self.find(tag)
 492
 493    def __eq__(self, other):
 494        """Returns true iff this tag has the same name, the same attributes,
 495        and the same contents (recursively) as the given tag.
 496
 497        NOTE: right now this will return false if two tags have the
 498        same attributes in a different order. Should this be fixed?"""
 499        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
 500            return False
 501        for i in range(0, len(self.contents)):
 502            if self.contents[i] != other.contents[i]:
 503                return False
 504        return True
 505
 506    def __ne__(self, other):
 507        """Returns true iff this tag is not identical to the other tag,
 508        as defined in __eq__."""
 509        return not self == other
 510
 511    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 512        """Renders this tag as a string."""
 513        return self.__str__(encoding)
 514
 515    def __unicode__(self):
 516        return self.__str__(None)
 517
 518    def _convertEntities(self, match):
 519        x = match.group(1)
 520        if x in name2codepoint:
 521            return unichr(name2codepoint[x])            
 522        elif "&" + x + ";" in self.XML_ENTITIES_TO_CHARS:
 523            return '&%s;' % x
 524        else:
 525            return '&amp;%s;' % x
 526
 527    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
 528                prettyPrint=False, indentLevel=0):
 529        """Returns a string or Unicode representation of this tag and
 530        its contents. To get Unicode, pass None for encoding.
 531
 532        NOTE: since Python's HTML parser consumes whitespace, this
 533        method is not certain to reproduce the whitespace present in
 534        the original string."""
 535
 536        encodedName = self.toEncoding(self.name, encoding)
 537
 538        attrs = []
 539        if self.attrs:
 540            for key, val in self.attrs:
 541                fmt = '%s="%s"'
 542                if isString(val):                    
 543                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
 544                        val = self.substituteEncoding(val, encoding)
 545
 546                    # The attribute value either:
 547                    #
 548                    # * Contains no embedded double quotes or single quotes.
 549                    #   No problem: we enclose it in double quotes.
 550                    # * Contains embedded single quotes. No problem:
 551                    #   double quotes work here too.
 552                    # * Contains embedded double quotes. No problem:
 553                    #   we enclose it in single quotes.
 554                    # * Embeds both single _and_ double quotes. This
 555                    #   can't happen naturally, but it can happen if
 556                    #   you modify an attribute value after parsing
 557                    #   the document. Now we have a bit of a
 558                    #   problem. We solve it by enclosing the
 559                    #   attribute in single quotes, and escaping any
 560                    #   embedded single quotes to XML entities.
 561                    if '"' in val:
 562                        # This can't happen naturally, but it can happen
 563                        # if you modify an attribute value after parsing.
 564                        if "'" in val:
 565                            val = val.replace('"', "&quot;")
 566                        else:
 567                            fmt = "%s='%s'"
 568
 569                    # Optionally convert any HTML entities
 570                    if self.convertHTMLEntities:
 571                        val = re.sub("&(\w+);", self._convertEntities, val)
 572
 573                    # Now we're okay w/r/t quotes. But the attribute
 574                    # value might also contain angle brackets, or
 575                    # ampersands that aren't part of entities. We need
 576                    # to escape those to XML entities too.
 577                    val = val.replace("<", "&lt;").replace(">", "&gt;")
 578                    val = self.BARE_AMPERSAND.sub("&amp;", val)
 579
 580                                      
 581                attrs.append(fmt % (self.toEncoding(key, encoding),
 582                                    self.toEncoding(val, encoding)))
 583        close = ''
 584        closeTag = ''
 585        if self.isSelfClosing:
 586            close = ' /'
 587        else:
 588            closeTag = '</%s>' % encodedName
 589
 590        indentTag, indentContents = 0, 0
 591        if prettyPrint:
 592            indentTag = indentLevel
 593            space = (' ' * (indentTag-1))
 594            indentContents = indentTag + 1
 595        contents = self.renderContents(encoding, prettyPrint, indentContents)
 596        if self.hidden:
 597            s = contents
 598        else:
 599            s = []
 600            attributeString = ''
 601            if attrs:
 602                attributeString = ' ' + ' '.join(attrs)            
 603            if prettyPrint:
 604                s.append(space)
 605            s.append('<%s%s%s>' % (encodedName, attributeString, close))
 606            if prettyPrint:
 607                s.append("\n")
 608            s.append(contents)
 609            if prettyPrint and contents and contents[-1] != "\n":
 610                s.append("\n")
 611            if prettyPrint and closeTag:
 612                s.append(space)
 613            s.append(closeTag)
 614            if prettyPrint and closeTag and self.nextSibling:
 615                s.append("\n")
 616            s = ''.join(s)
 617        return s
 618
 619    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
 620        return self.__str__(encoding, True)
 621
 622    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
 623                       prettyPrint=False, indentLevel=0):
 624        """Renders the contents of this tag as a string in the given
 625        encoding. If encoding is None, returns a Unicode string.."""
 626        s=[]
 627        for c in self:
 628            text = None
 629            if isinstance(c, NavigableString):
 630                text = c.__str__(encoding)
 631            elif isinstance(c, Tag):
 632                s.append(c.__str__(encoding, prettyPrint, indentLevel))
 633            if text and prettyPrint:
 634                text = text.strip()              
 635            if text:
 636                if prettyPrint:
 637                    s.append(" " * (indentLevel-1))
 638                s.append(text)
 639                if prettyPrint:
 640                    s.append("\n")
 641        return ''.join(s)    
 642
 643    #Soup methods
 644
 645    def find(self, name=None, attrs={}, recursive=True, text=None,
 646             **kwargs):
 647        """Return only the first child of this Tag matching the given
 648        criteria."""
 649        r = None
 650        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
 651        if l:
 652            r = l[0]
 653        return r
 654    findChild = find
 655
 656    def findAll(self, name=None, attrs={}, recursive=True, text=None,
 657                limit=None, **kwargs):
 658        """Extracts a list of Tag objects that match the given
 659        criteria.  You can specify the name of the Tag and any
 660        attributes you want the Tag to have.
 661
 662        The value of a key-value pair in the 'attrs' map can be a
 663        string, a list of strings, a regular expression object, or a
 664        callable that takes a string and returns whether or not the
 665        string matches for some custom definition of 'matches'. The
 666        same is true of the tag name."""
 667        generator = self.recursiveChildGenerator
 668        if not recursive:
 669            generator = self.childGenerator
 670        return self._findAll(name, attrs, text, limit, generator, **kwargs)
 671    findChildren = findAll
 672
 673    # Pre-3.x compatibility methods
 674    first = find
 675    fetch = findAll
 676    
 677    def fetchText(self, text=None, recursive=True, limit=None):
 678        return self.findAll(text=text, recursive=recursive, limit=limit)
 679
 680    def firstText(self, text=None, recursive=True):
 681        return self.find(text=text, recursive=recursive)
 682    
 683    #Utility methods
 684
 685    def append(self, tag):
 686        """Appends the given tag to the contents of this tag."""
 687        self.contents.append(tag)
 688
 689    #Private methods
 690
 691    def _getAttrMap(self):
 692        """Initializes a map representation of this tag's attributes,
 693        if not already initialized."""
 694        if not getattr(self, 'attrMap'):
 695            self.attrMap = {}
 696            for (key, value) in self.attrs:
 697                self.attrMap[key] = value 
 698        return self.attrMap
 699
 700    #Generator methods
 701    def childGenerator(self):
 702        for i in range(0, len(self.contents)):
 703            yield self.contents[i]
 704        raise StopIteration
 705    
 706    def recursiveChildGenerator(self):
 707        stack = [(self, 0)]
 708        while stack:
 709            tag, start = stack.pop()
 710            if isinstance(tag, Tag):            
 711                for i in range(start, len(tag.contents)):
 712                    a = tag.contents[i]
 713                    yield a
 714                    if isinstance(a, Tag) and tag.contents:
 715                        if i < len(tag.contents) - 1:
 716                            stack.append((tag, i+1))
 717                        stack.append((a, 0))
 718                        break
 719        raise StopIteration
 720
 721# Next, a couple classes to represent queries and their results.
 722class SoupStrainer:
 723    """Encapsulates a number of ways of matching a markup element (tag or
 724    text)."""
 725
 726    def __init__(self, name=None, attrs={}, text=None, **kwargs):
 727        self.name = name
 728        if isString(attrs):
 729            kwargs['class'] = attrs
 730            attrs = None
 731        if kwargs:
 732            if attrs:
 733                attrs = attrs.copy()
 734                attrs.update(kwargs)
 735            else:
 736                attrs = kwargs
 737        self.attrs = attrs
 738        self.text = text
 739
 740    def __str__(self):
 741        if self.text:
 742            return self.text
 743        else:
 744            return "%s|%s" % (self.name, self.attrs)
 745    
 746    def searchTag(self, markupName=None, markupAttrs={}):
 747        found = None
 748        markup = None
 749        if isinstance(markupName, Tag):
 750            markup = markupName
 751            markupAttrs = markup
 752        callFunctionWithTagData = callable(self.name) \
 753                                and not isinstance(markupName, Tag)
 754
 755        if (not self.name) \
 756               or callFunctionWithTagData \
 757               or (markup and self._matches(markup, self.name)) \
 758               or (not markup and self._matches(markupName, self.name)):
 759            if callFunctionWithTagData:
 760                match = self.name(markupName, markupAttrs)
 761            else:
 762                match = True            
 763                markupAttrMap = None
 764                for attr, matchAgainst in self.attrs.items():
 765                    if not markupAttrMap:
 766                         if hasattr(markupAttrs, 'get'):
 767                            markupAttrMap = markupAttrs
 768                         else:
 769                            markupAttrMap = {}
 770                            for k,v in markupAttrs:
 771                                markupAttrMap[k] = v
 772                    attrValue = markupAttrMap.get(attr)
 773                    if not self._matches(attrValue, matchAgainst):
 774                        match = False
 775                        break
 776            if match:
 777                if markup:
 778                    found = markup
 779                else:
 780                    found = markupName
 781        return found
 782
 783    def search(self, markup):
 784        #print 'looking for %s in %s' % (self, markup)
 785        found = None
 786        # If given a list of items, scan it for a text element that
 787        # matches.        
 788        if isList(markup) and not isinstance(markup, Tag):
 789            for element in markup:
 790                if isinstance(element, NavigableString) \
 791                       and self.search(element):
 792                    found = element
 793                    break
 794        # If it's a Tag, make sure its name or attributes match.
 795        # Don't bother with Tags if we're searching for text.
 796        elif isinstance(markup, Tag):
 797            if not self.text:
 798                found = self.searchTag(markup)
 799        # If it's text, make sure the text matches.
 800        elif isinstance(markup, NavigableString) or \
 801                 isString(markup):
 802            if self._matches(markup, self.text):
 803                found = markup
 804        else:
 805            raise Exception, "I don't know how to match against a %s" \
 806                  % markup.__class__
 807        return found
 808        
 809    def _matches(self, markup, matchAgainst):    
 810        #print "Matching %s against %s" % (markup, matchAgainst)
 811        result = False
 812        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
 813            result = markup != None
 814        elif callable(matchAgainst):
 815            result = matchAgainst(markup)
 816        else:
 817            #Custom match methods take the tag as an argument, but all
 818            #other ways of matching match the tag name as a string.
 819            if isinstance(markup, Tag):
 820                markup = markup.name
 821            if markup and not isString(markup):
 822                markup = unicode(markup)
 823            #Now we know that chunk is either a string, or None.
 824            if hasattr(matchAgainst, 'match'):
 825                # It's a regexp object.
 826                result = markup and matchAgainst.search(markup)
 827            elif isList(matchAgainst):
 828                result = markup in matchAgainst
 829            elif hasattr(matchAgainst, 'items'):
 830                result = markup.has_key(matchAgainst)
 831            elif matchAgainst and isString(markup):
 832                if isinstance(markup, unicode):
 833                    matchAgainst = unicode(matchAgainst)
 834                else:
 835                    matchAgainst = str(matchAgainst)
 836
 837            if not result:
 838                result = matchAgainst == markup
 839        return result
 840
 841class ResultSet(list):
 842    """A ResultSet is just a list that keeps track of the SoupStrainer
 843    that created it."""
 844    def __init__(self, source):
 845        list.__init__([])
 846        self.source = source
 847
 848# Now, some helper functions.
 849
 850def isList(l):
 851    """Convenience method that works with all 2.x versions of Python
 852    to determine whether or not something is listlike."""
 853    return hasattr(l, '__iter__') \
 854           or (type(l) in (types.ListType, types.TupleType))
 855
 856def isString(s):
 857    """Convenience method that works with all 2.x versions of Python
 858    to determine whether or not something is stringlike."""
 859    try:
 860        return isinstance(s, unicode) or isintance(s, basestring) 
 861    except NameError:
 862        return isinstance(s, str)
 863
 864def buildTagMap(default, *args):
 865    """Turns a list of maps, lists, or scalars into a single map.
 866    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
 867    NESTING_RESET_TAGS maps out of lists and partial maps."""
 868    built = {}
 869    for portion in args:
 870        if hasattr(portion, 'items'):
 871            #It's a map. Merge it.
 872            for k,v in portion.items():
 873                built[k] = v
 874        elif isList(portion):
 875            #It's a list. Map each item to the default.
 876            for k in portion:
 877                built[k] = default
 878        else:
 879            #It's a scalar. Map it to the default.
 880            built[portion] = default
 881    return built
 882
 883# Now, the parser classes.
 884
 885class BeautifulStoneSoup(Tag, SGMLParser):
 886
 887    """This class contains the basic parser and search code. It defines
 888    a parser that knows nothing about tag behavior except for the
 889    following:
 890   
 891      You can't close a tag without closing all the tags it encloses.
 892      That is, "<foo><bar></foo>" actually means
 893      "<foo><bar></bar></foo>".
 894
 895    [Another possible explanation is "<foo><bar /></foo>", but since
 896    this class defines no SELF_CLOSING_TAGS, it will never use that
 897    explanation.]
 898
 899    This class is useful for parsing XML or made-up markup languages,
 900    or when BeautifulSoup makes an assumption counter to what you were
 901    expecting."""
 902
 903    SELF_CLOSING_TAGS = {}
 904    NESTABLE_TAGS = {}
 905    RESET_NESTING_TAGS = {}
 906    QUOTE_TAGS = {}
 907
 908    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
 909                       lambda x: x.group(1) + ' />'),
 910                      (re.compile('<!\s+([^<>]*)>'),
 911                       lambda x: '<!' + x.group(1) + '>')
 912                      ]
 913
 914    ROOT_TAG_NAME = u'[document]'
 915
 916    HTML_ENTITIES = "html"
 917    XML_ENTITIES = "xml"
 918    ALL_ENTITIES = [HTML_ENTITIES, XML_ENTITIES]
 919
 920    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
 921                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
 922                 convertEntities=None, selfClosingTags=None):
 923        """The Soup object is initialized as the 'root tag', and the
 924        provided markup (which can be a string or a file-like object)
 925        is fed into the underlying parser. 
 926
 927        sgmllib will process most bad HTML, and the BeautifulSoup
 928        class has some tricks for dealing with some HTML that kills
 929        sgmllib, but Beautiful Soup can nonetheless choke or lose data
 930        if your data uses self-closing tags or declarations
 931        incorrectly.
 932
 933        By default, Beautiful Soup uses regexes to sanitize input,
 934        avoiding the vast majority of these problems. If the problems
 935        don't apply to you, pass in False for markupMassage, and
 936        you'll get better performance.
 937
 938        The default parser massage techniques fix the two most common
 939        instances of invalid HTML that choke sgmllib:
 940
 941         <br/> (No space between name of closing tag and tag close)
 942         <! --Comment--> (Extraneous whitespace in declaration)
 943
 944        You can pass in a custom list of (RE object, replace method)
 945        tuples to get Beautiful Soup to scrub your input the way you
 946        want."""
 947
 948        self.parseOnlyThese = parseOnlyThese
 949        self.fromEncoding = fromEncoding
 950        self.smartQuotesTo = smartQuotesTo
 951
 952        if convertEntities:
 953            # It doesn't make sense to convert encoded characters to
 954            # entities even while you're converting entities to Unicode.
 955            # Just convert it all to Unicode.
 956            self.smartQuotesTo = None
 957
 958        if isList(convertEntities):
 959            self.convertHTMLEntities = self.HTML_ENTITIES in convertEntities
 960            self.convertXMLEntities = self.XML_ENTITIES in convertEntities
 961        else:
 962            self.convertHTMLEntities = self.HTML_ENTITIES == convertEntities
 963            self.convertXMLEntities = self.XML_ENTITIES == convertEntities
 964
 965        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
 966        SGMLParser.__init__(self)
 967            
 968        if hasattr(markup, 'read'):        # It's a file-type object.
 969            markup = markup.read()
 970        self.markup = markup
 971        self.markupMassage = markupMassage
 972        try:
 973            self._feed()
 974        except StopParsing:
 975            pass
 976        self.markup = None                 # The markup can now be GCed
 977
 978    def _feed(self, inDocumentEncoding=None):
 979        # Convert the document to Unicode.
 980        markup = self.markup
 981        if isinstance(markup, unicode):
 982            if not hasattr(self, 'originalEncoding'):
 983                self.originalEncoding = None
 984        else:
 985            dammit = UnicodeDammit\
 986                     (markup, [self.fromEncoding, inDocumentEncoding],
 987                      smartQuotesTo=self.smartQuotesTo)
 988            markup = dammit.unicode
 989            self.originalEncoding = dammit.originalEncoding
 990        if markup:
 991            if self.markupMassage:
 992                if not isList(self.markupMassage):
 993                    self.markupMassage = self.MARKUP_MASSAGE            
 994                for fix, m in self.markupMassage:
 995                    markup = fix.sub(m, markup)
 996        self.reset()
 997
 998        SGMLParser.feed(self, markup or "")
 999        SGMLParser.close(self)
1000        # Close out any unfinished strings and close all the open tags.
1001        self.endData()
1002        while self.currentTag.name != self.ROOT_TAG_NAME:
1003            self.popTag()
1004
1005    def __getattr__(self, methodName):
1006        """This method routes method call requests to either the SGMLParser
1007        superclass or the Tag superclass, depending on the method name."""
1008        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1009
1010        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1011               or methodName.find('do_') == 0:
1012            return SGMLParser.__getattr__(self, methodName)
1013        elif methodName.find('__') != 0:
1014            return Tag.__getattr__(self, methodName)
1015        else:
1016            raise AttributeError
1017
1018    def isSelfClosingTag(self, name):
1019        """Returns true iff the given string is the name of a
1020        self-closing tag according to this parser."""
1021        return self.SELF_CLOSING_TAGS.has_key(name) \
1022               or self.instanceSelfClosingTags.has_key(name)
1023            
1024    def reset(self):
1025        Tag.__init__(self, self, self.ROOT_TAG_NAME)
1026        self.hidden = 1
1027        SGMLParser.reset(self)
1028        self.currentData = []
1029        self.currentTag = None
1030        self.tagStack = []
1031        self.quoteStack = []
1032        self.pushTag(self)
1033    
1034    def popTag(self):
1035        tag = self.tagStack.pop()
1036        # Tags with just one string-owning child get the child as a
1037        # 'string' property, so that soup.tag.string is shorthand for
1038        # soup.tag.contents[0]
1039        if len(self.currentTag.contents) == 1 and \
1040           isinstance(self.currentTag.contents[0], NavigableString):
1041            self.currentTag.string = self.currentTag.contents[0]
1042
1043        #print "Pop", tag.name
1044        if self.tagStack:
1045            self.currentTag = self.tagStack[-1]
1046        return self.currentTag
1047
1048    def pushTag(self, tag):
1049        #print "Push", tag.name
1050        if self.currentTag:
1051            self.currentTag.append(tag)
1052        self.tagStack.append(tag)
1053        self.currentTag = self.tagStack[-1]
1054
1055    def endData(self, containerClass=NavigableString):
1056        if self.currentData:
1057            currentData = ''.join(self.currentData)
1058            if currentData.endswith('<') and self.convertHTMLEntities:
1059                currentData = currentData[:-1] + '&lt;'
1060            if not currentData.strip():
1061                if '\n' in currentData:
1062                    currentData = '\n'
1063                else:
1064                    currentData = ' '
1065            self.currentData = []
1066            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1067                   (not self.parseOnlyThese.text or \
1068                    not self.parseOnlyThese.search(currentData)):
1069                return
1070            o = containerClass(currentData)
1071            o.setup(self.currentTag, self.previous)
1072            if self.previous:
1073                self.previous.next = o
1074            self.previous = o
1075            self.currentTag.contents.append(o)
1076
1077
1078    def _popToTag(self, name, inclusivePop=True):
1079        """Pops the tag stack up to and including the most recent
1080        instance of the given tag. If inclusivePop is false, pops the tag
1081        stack up to but *not* including the most recent instqance of
1082        the given tag."""
1083        #print "Popping to %s" % name
1084        if name == self.ROOT_TAG_NAME:
1085            return            
1086
1087        numPops = 0
1088        mostRecentTag = None
1089        for i in range(len(self.tagStack)-1, 0, -1):
1090            if name == self.tagStack[i].name:
1091                numPops = len(self.tagStack)-i
1092                break
1093        if not inclusivePop:
1094            numPops = numPops - 1
1095
1096        for i in range(0, numPops):
1097            mostRecentTag = self.popTag()
1098        return mostRecentTag    
1099
1100    def _smartPop(self, name):
1101
1102        """We need to pop up to the previous tag of this type, unless
1103        one of this tag's nesting reset triggers comes between this
1104        tag and the previous tag of this type, OR unless this tag is a
1105        generic nesting trigger and another generic nesting trigger
1106        comes between this tag and the previous tag of this type.
1107
1108        Examples:
1109         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1110         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1111         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1112         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1113
1114         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1115         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1116         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1117        """
1118
1119        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1120        isNestable = nestingResetTriggers != None
1121        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1122        popTo = None
1123        inclusive = True
1124        for i in range(len(self.tagStack)-1, 0, -1):
1125            p = self.tagStack[i]
1126            if (not p or p.name == name) and not isNestable:
1127                #Non-nestable tags get popped to the top or to their
1128                #last occurance.
1129                popTo = name
1130                break
1131            if (nestingResetTriggers != None
1132                and p.name in nestingResetTriggers) \
1133                or (nestingResetTriggers == None and isResetNesting
1134                    and self.RESET_NESTING_TAGS.has_key(p.name)):
1135                
1136                #If we encounter one of the nesting reset triggers
1137                #peculiar to this tag, or we encounter another tag
1138                #that causes nesting to reset, pop up to but not
1139                #including that tag.
1140                popTo = p.name
1141                inclusive = False
1142                break
1143            p = p.parent
1144        if popTo:
1145            self._popToTag(popTo, inclusive)
1146
1147    def unknown_starttag(self, name, attrs, selfClosing=0):
1148        #print "Start tag %s: %s" % (name, attrs)
1149        if self.quoteStack:
1150            #This is not a real tag.
1151            #print "<%s> is not real!" % name
1152            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1153            self.currentData.append('<%s%s>' % (name, attrs))
1154            return        
1155        self.endData()
1156
1157        if not self.isSelfClosingTag(name) and not selfClosing:
1158            self._smartPop(name)
1159
1160        if self.parseOnlyThese and len(self.tagStack) <= 1 \
1161               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1162            return
1163
1164        tag = Tag(self, name, attrs, self.currentTag, self.previous)
1165        if self.previous:
1166            self.previous.next = tag
1167        self.previous = tag
1168        self.pushTag(tag)
1169        if selfClosing or self.isSelfClosingTag(name):
1170            self.popTag()                
1171        if name in self.QUOTE_TAGS:
1172            #print "Beginning quote (%s)" % name
1173            self.quoteStack.append(name)
1174            self.literal = 1
1175        return tag
1176
1177    def unknown_endtag(self, name):
1178        #print "End tag %s" % name
1179        if self.quoteStack and self.quoteStack[-1] != name:
1180            #This is not a real end tag.
1181            #print "</%s> is not real!" % name
1182            self.currentData.append('</%s>' % name)
1183            return
1184        self.endData()
1185        self._popToTag(name)
1186        if self.quoteStack and self.quoteStack[-1] == name:
1187            self.quoteStack.pop()
1188            self.literal = (len(self.quoteStack) > 0)
1189
1190    def handle_data(self, data):
1191        if self.convertHTMLEntities:
1192            if data[0] == '&':
1193                data = self.BARE_AMPERSAND.sub("&amp;",data)
1194            else:
1195                data = data.replace('&','&amp;') \
1196                           .replace('<','&lt;') \
1197                           .replace('>','&gt;')
1198        self.currentData.append(data)
1199
1200    def _toStringSubclass(self, text, subclass):
1201        """Adds a certain piece of text to the tree as a NavigableString
1202        subclass."""
1203        self.endData()
1204        self.handle_data(text)
1205        self.endData(subclass)
1206
1207    def handle_pi(self, text):
1208        """Handle a processing instruction as a ProcessingInstruction
1209        object, possibly one with a %SOUP-ENCODING% slot into which an
1210        encoding will be plugged later."""
1211        if text[:3] == "xml":
1212            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
1213        self._toStringSubclass(text, ProcessingInstruction)
1214
1215    def handle_comment(self, text):
1216        "Handle comments as Comment objects."
1217        self._toStringSubclass(text, Comment)
1218
1219    def handle_charref(self, ref):
1220        "Handle character references as data."
1221        if ref[0] == 'x':
1222            data = unichr(int(ref[1:],16))
1223        else:
1224            data = unichr(int(ref))
1225        
1226        if u'\x80' <= data <= u'\x9F':
1227            data = UnicodeDammit.subMSChar(chr(ord(data)), self.smartQuotesTo)
1228        elif not self.convertHTMLEntities and not self.convertXMLEntities:
1229            data = '&#%s;' % ref
1230
1231        self.handle_data(data)
1232
1233    def handle_entityref(self, ref):
1234        """Handle entity references as data, possibly converting known
1235        HTML entity references to the corresponding Unicode
1236        characters."""
1237        replaceWithXMLEntity = self.convertXMLEntities and \
1238                               self.XML_ENTITIES_TO_CHARS.has_key(ref)
1239        if self.convertHTMLEntities or replaceWithXMLEntity:
1240            try:
1241                data = unichr(name2codepoint[ref])
1242            except KeyError:
1243                if replaceWithXMLEntity:
1244                    data = self.XML_ENTITIES_TO_CHARS.get(ref)
1245                else:
1246                    data="&amp;%s" % ref
1247        else:
1248            data = '&%s;' % ref
1249        self.handle_data(data)
1250        
1251    def handle_decl(self, data):
1252        "Handle DOCTYPEs and the like as Declaration objects."
1253        self._toStringSubclass(data, Declaration)
1254
1255    def parse_declaration(self, i):
1256        """Treat a bogus SGML declaration as raw data. Treat a CDATA
1257        declaration as a CData object."""
1258        j = None
1259        if self.rawdata[i:i+9] == '<![CDATA[':
1260             k = self.rawdata.find(']]>', i)
1261             if k == -1:
1262                 k = len(self.rawdata)
1263             data = self.rawdata[i+9:k]
1264             j = k+3
1265             self._toStringSubclass(data, CData)
1266        else:
1267            try:
1268                j = SGMLParser.parse_declaration(self, i)
1269            except SGMLParseError:
1270                toHandle = self.rawdata[i:]
1271                self.handle_data(toHandle)
1272                j = i + len(toHandle)
1273        return j
1274
1275class BeautifulSoup(BeautifulStoneSoup):
1276
1277    """This parser knows the following facts about HTML:
1278
1279    * Some tags have no closing tag and should be interpreted as being
1280      closed as soon as they are encountered.
1281
1282    * The text inside some tags (ie. 'script') may contain tags which
1283      are not really part of the document and which should be parsed
1284      as text, not tags. If you want to parse the text as tags, you can
1285      always fetch it and parse it explicitly.
1286
1287    * Tag nesting rules:
1288
1289      Most tags can't be nested at all. For instance, the occurance of
1290      a <p> tag should implicitly close the previous <p> tag.
1291
1292       <p>Para1<p>Para2
1293        should be transformed into:
1294       <p>Para1</p><p>Para2
1295
1296      Some tags can be nested arbitrarily. For instance, the occurance
1297      of a <blockquote> tag should _not_ implicitly close the previous
1298      <blockquote> tag.
1299
1300       Alice said: <blockquote>Bob said: <blockquote>Blah
1301        should NOT be transformed into:
1302       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1303
1304      Some tags can be nested, but the nesting is reset by the
1305      interposition of …

Large files files are truncated, but you can click here to view the full file