PageRenderTime 35ms CodeModel.GetById 13ms app.highlight 18ms RepoModel.GetById 1ms app.codeStats 0ms

/lib-python/2.7/HTMLParser.py

https://bitbucket.org/evelyn559/pypy
Python | 393 lines | 355 code | 16 blank | 22 comment | 43 complexity | be37e4a371c6dc5700ea5be7dbcf72c8 MD5 | raw file
  1"""A parser for HTML and XHTML."""
  2
  3# This file is based on sgmllib.py, but the API is slightly different.
  4
  5# XXX There should be a way to distinguish between PCDATA (parsed
  6# character data -- the normal case), RCDATA (replaceable character
  7# data -- only char and entity references and end tags are special)
  8# and CDATA (character data -- only end tags are special).
  9
 10
 11import markupbase
 12import re
 13
 14# Regular expressions used for parsing
 15
 16interesting_normal = re.compile('[&<]')
 17interesting_cdata = re.compile(r'<(/|\Z)')
 18incomplete = re.compile('&[a-zA-Z#]')
 19
 20entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 21charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
 22
 23starttagopen = re.compile('<[a-zA-Z]')
 24piclose = re.compile('>')
 25commentclose = re.compile(r'--\s*>')
 26tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 27attrfind = re.compile(
 28    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
 29    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 30
 31locatestarttagend = re.compile(r"""
 32  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
 33  (?:\s+                             # whitespace before attribute name
 34    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
 35      (?:\s*=\s*                     # value indicator
 36        (?:'[^']*'                   # LITA-enclosed value
 37          |\"[^\"]*\"                # LIT-enclosed value
 38          |[^'\">\s]+                # bare value
 39         )
 40       )?
 41     )
 42   )*
 43  \s*                                # trailing whitespace
 44""", re.VERBOSE)
 45endendtag = re.compile('>')
 46endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 47
 48
 49class HTMLParseError(Exception):
 50    """Exception raised for all parse errors."""
 51
 52    def __init__(self, msg, position=(None, None)):
 53        assert msg
 54        self.msg = msg
 55        self.lineno = position[0]
 56        self.offset = position[1]
 57
 58    def __str__(self):
 59        result = self.msg
 60        if self.lineno is not None:
 61            result = result + ", at line %d" % self.lineno
 62        if self.offset is not None:
 63            result = result + ", column %d" % (self.offset + 1)
 64        return result
 65
 66
 67class HTMLParser(markupbase.ParserBase):
 68    """Find tags and other markup and call handler functions.
 69
 70    Usage:
 71        p = HTMLParser()
 72        p.feed(data)
 73        ...
 74        p.close()
 75
 76    Start tags are handled by calling self.handle_starttag() or
 77    self.handle_startendtag(); end tags by self.handle_endtag().  The
 78    data between tags is passed from the parser to the derived class
 79    by calling self.handle_data() with the data as argument (the data
 80    may be split up in arbitrary chunks).  Entity references are
 81    passed by calling self.handle_entityref() with the entity
 82    reference as the argument.  Numeric character references are
 83    passed to self.handle_charref() with the string containing the
 84    reference as the argument.
 85    """
 86
 87    CDATA_CONTENT_ELEMENTS = ("script", "style")
 88
 89
 90    def __init__(self):
 91        """Initialize and reset this instance."""
 92        self.reset()
 93
 94    def reset(self):
 95        """Reset this instance.  Loses all unprocessed data."""
 96        self.rawdata = ''
 97        self.lasttag = '???'
 98        self.interesting = interesting_normal
 99        markupbase.ParserBase.reset(self)
100
101    def feed(self, data):
102        r"""Feed data to the parser.
103
104        Call this as often as you want, with as little or as much text
105        as you want (may include '\n').
106        """
107        self.rawdata = self.rawdata + data
108        self.goahead(0)
109
110    def close(self):
111        """Handle any buffered data."""
112        self.goahead(1)
113
114    def error(self, message):
115        raise HTMLParseError(message, self.getpos())
116
117    __starttag_text = None
118
119    def get_starttag_text(self):
120        """Return full source of start tag: '<...>'."""
121        return self.__starttag_text
122
123    def set_cdata_mode(self):
124        self.interesting = interesting_cdata
125
126    def clear_cdata_mode(self):
127        self.interesting = interesting_normal
128
129    # Internal -- handle data as far as reasonable.  May leave state
130    # and data to be processed by a subsequent call.  If 'end' is
131    # true, force handling all data as if followed by EOF marker.
132    def goahead(self, end):
133        rawdata = self.rawdata
134        i = 0
135        n = len(rawdata)
136        while i < n:
137            match = self.interesting.search(rawdata, i) # < or &
138            if match:
139                j = match.start()
140            else:
141                j = n
142            if i < j: self.handle_data(rawdata[i:j])
143            i = self.updatepos(i, j)
144            if i == n: break
145            startswith = rawdata.startswith
146            if startswith('<', i):
147                if starttagopen.match(rawdata, i): # < + letter
148                    k = self.parse_starttag(i)
149                elif startswith("</", i):
150                    k = self.parse_endtag(i)
151                elif startswith("<!--", i):
152                    k = self.parse_comment(i)
153                elif startswith("<?", i):
154                    k = self.parse_pi(i)
155                elif startswith("<!", i):
156                    k = self.parse_declaration(i)
157                elif (i + 1) < n:
158                    self.handle_data("<")
159                    k = i + 1
160                else:
161                    break
162                if k < 0:
163                    if end:
164                        self.error("EOF in middle of construct")
165                    break
166                i = self.updatepos(i, k)
167            elif startswith("&#", i):
168                match = charref.match(rawdata, i)
169                if match:
170                    name = match.group()[2:-1]
171                    self.handle_charref(name)
172                    k = match.end()
173                    if not startswith(';', k-1):
174                        k = k - 1
175                    i = self.updatepos(i, k)
176                    continue
177                else:
178                    if ";" in rawdata[i:]: #bail by consuming &#
179                        self.handle_data(rawdata[0:2])
180                        i = self.updatepos(i, 2)
181                    break
182            elif startswith('&', i):
183                match = entityref.match(rawdata, i)
184                if match:
185                    name = match.group(1)
186                    self.handle_entityref(name)
187                    k = match.end()
188                    if not startswith(';', k-1):
189                        k = k - 1
190                    i = self.updatepos(i, k)
191                    continue
192                match = incomplete.match(rawdata, i)
193                if match:
194                    # match.group() will contain at least 2 chars
195                    if end and match.group() == rawdata[i:]:
196                        self.error("EOF in middle of entity or char ref")
197                    # incomplete
198                    break
199                elif (i + 1) < n:
200                    # not the end of the buffer, and can't be confused
201                    # with some other construct
202                    self.handle_data("&")
203                    i = self.updatepos(i, i + 1)
204                else:
205                    break
206            else:
207                assert 0, "interesting.search() lied"
208        # end while
209        if end and i < n:
210            self.handle_data(rawdata[i:n])
211            i = self.updatepos(i, n)
212        self.rawdata = rawdata[i:]
213
214    # Internal -- parse processing instr, return end or -1 if not terminated
215    def parse_pi(self, i):
216        rawdata = self.rawdata
217        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
218        match = piclose.search(rawdata, i+2) # >
219        if not match:
220            return -1
221        j = match.start()
222        self.handle_pi(rawdata[i+2: j])
223        j = match.end()
224        return j
225
226    # Internal -- handle starttag, return end or -1 if not terminated
227    def parse_starttag(self, i):
228        self.__starttag_text = None
229        endpos = self.check_for_whole_start_tag(i)
230        if endpos < 0:
231            return endpos
232        rawdata = self.rawdata
233        self.__starttag_text = rawdata[i:endpos]
234
235        # Now parse the data between i+1 and j into a tag and attrs
236        attrs = []
237        match = tagfind.match(rawdata, i+1)
238        assert match, 'unexpected call to parse_starttag()'
239        k = match.end()
240        self.lasttag = tag = rawdata[i+1:k].lower()
241
242        while k < endpos:
243            m = attrfind.match(rawdata, k)
244            if not m:
245                break
246            attrname, rest, attrvalue = m.group(1, 2, 3)
247            if not rest:
248                attrvalue = None
249            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
250                 attrvalue[:1] == '"' == attrvalue[-1:]:
251                attrvalue = attrvalue[1:-1]
252                attrvalue = self.unescape(attrvalue)
253            attrs.append((attrname.lower(), attrvalue))
254            k = m.end()
255
256        end = rawdata[k:endpos].strip()
257        if end not in (">", "/>"):
258            lineno, offset = self.getpos()
259            if "\n" in self.__starttag_text:
260                lineno = lineno + self.__starttag_text.count("\n")
261                offset = len(self.__starttag_text) \
262                         - self.__starttag_text.rfind("\n")
263            else:
264                offset = offset + len(self.__starttag_text)
265            self.error("junk characters in start tag: %r"
266                       % (rawdata[k:endpos][:20],))
267        if end.endswith('/>'):
268            # XHTML-style empty tag: <span attr="value" />
269            self.handle_startendtag(tag, attrs)
270        else:
271            self.handle_starttag(tag, attrs)
272            if tag in self.CDATA_CONTENT_ELEMENTS:
273                self.set_cdata_mode()
274        return endpos
275
276    # Internal -- check to see if we have a complete starttag; return end
277    # or -1 if incomplete.
278    def check_for_whole_start_tag(self, i):
279        rawdata = self.rawdata
280        m = locatestarttagend.match(rawdata, i)
281        if m:
282            j = m.end()
283            next = rawdata[j:j+1]
284            if next == ">":
285                return j + 1
286            if next == "/":
287                if rawdata.startswith("/>", j):
288                    return j + 2
289                if rawdata.startswith("/", j):
290                    # buffer boundary
291                    return -1
292                # else bogus input
293                self.updatepos(i, j + 1)
294                self.error("malformed empty start tag")
295            if next == "":
296                # end of input
297                return -1
298            if next in ("abcdefghijklmnopqrstuvwxyz=/"
299                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
300                # end of input in or before attribute value, or we have the
301                # '/' from a '/>' ending
302                return -1
303            self.updatepos(i, j)
304            self.error("malformed start tag")
305        raise AssertionError("we should not get here!")
306
307    # Internal -- parse endtag, return end or -1 if incomplete
308    def parse_endtag(self, i):
309        rawdata = self.rawdata
310        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
311        match = endendtag.search(rawdata, i+1) # >
312        if not match:
313            return -1
314        j = match.end()
315        match = endtagfind.match(rawdata, i) # </ + tag + >
316        if not match:
317            self.error("bad end tag: %r" % (rawdata[i:j],))
318        tag = match.group(1)
319        self.handle_endtag(tag.lower())
320        self.clear_cdata_mode()
321        return j
322
323    # Overridable -- finish processing of start+end tag: <tag.../>
324    def handle_startendtag(self, tag, attrs):
325        self.handle_starttag(tag, attrs)
326        self.handle_endtag(tag)
327
328    # Overridable -- handle start tag
329    def handle_starttag(self, tag, attrs):
330        pass
331
332    # Overridable -- handle end tag
333    def handle_endtag(self, tag):
334        pass
335
336    # Overridable -- handle character reference
337    def handle_charref(self, name):
338        pass
339
340    # Overridable -- handle entity reference
341    def handle_entityref(self, name):
342        pass
343
344    # Overridable -- handle data
345    def handle_data(self, data):
346        pass
347
348    # Overridable -- handle comment
349    def handle_comment(self, data):
350        pass
351
352    # Overridable -- handle declaration
353    def handle_decl(self, decl):
354        pass
355
356    # Overridable -- handle processing instruction
357    def handle_pi(self, data):
358        pass
359
360    def unknown_decl(self, data):
361        self.error("unknown declaration: %r" % (data,))
362
363    # Internal -- helper to remove special character quoting
364    entitydefs = None
365    def unescape(self, s):
366        if '&' not in s:
367            return s
368        def replaceEntities(s):
369            s = s.groups()[0]
370            try:
371                if s[0] == "#":
372                    s = s[1:]
373                    if s[0] in ['x','X']:
374                        c = int(s[1:], 16)
375                    else:
376                        c = int(s)
377                    return unichr(c)
378            except ValueError:
379                return '&#'+s+';'
380            else:
381                # Cannot use name2codepoint directly, because HTMLParser supports apos,
382                # which is not part of HTML 4
383                import htmlentitydefs
384                if HTMLParser.entitydefs is None:
385                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
386                    for k, v in htmlentitydefs.name2codepoint.iteritems():
387                        entitydefs[k] = unichr(v)
388                try:
389                    return self.entitydefs[s]
390                except KeyError:
391                    return '&'+s+';'
392
393        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)