PageRenderTime 43ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/django/utils/html_parser.py

https://github.com/sesostris/django
Python | 114 lines | 99 code | 5 blank | 10 comment | 5 complexity | 00ff9b5dafcade64f0f60e10c11f0659 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. from django.utils.six.moves import html_parser as _html_parser
  2. import re
  3. import sys
  4. current_version = sys.version_info
  5. use_workaround = (
  6. (current_version < (2, 6, 8)) or
  7. (current_version >= (2, 7) and current_version < (2, 7, 3)) or
  8. (current_version >= (3, 0) and current_version < (3, 2, 3))
  9. )
  10. HTMLParseError = _html_parser.HTMLParseError
  11. if not use_workaround:
  12. HTMLParser = _html_parser.HTMLParser
  13. else:
  14. tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
  15. class HTMLParser(_html_parser.HTMLParser):
  16. """
  17. Patched version of stdlib's HTMLParser with patch from:
  18. http://bugs.python.org/issue670664
  19. """
  20. def __init__(self):
  21. _html_parser.HTMLParser.__init__(self)
  22. self.cdata_tag = None
  23. def set_cdata_mode(self, tag):
  24. try:
  25. self.interesting = _html_parser.interesting_cdata
  26. except AttributeError:
  27. self.interesting = re.compile(r'</\s*%s\s*>' % tag.lower(), re.I)
  28. self.cdata_tag = tag.lower()
  29. def clear_cdata_mode(self):
  30. self.interesting = _html_parser.interesting_normal
  31. self.cdata_tag = None
  32. # Internal -- handle starttag, return end or -1 if not terminated
  33. def parse_starttag(self, i):
  34. self.__starttag_text = None
  35. endpos = self.check_for_whole_start_tag(i)
  36. if endpos < 0:
  37. return endpos
  38. rawdata = self.rawdata
  39. self.__starttag_text = rawdata[i:endpos]
  40. # Now parse the data between i+1 and j into a tag and attrs
  41. attrs = []
  42. match = tagfind.match(rawdata, i + 1)
  43. assert match, 'unexpected call to parse_starttag()'
  44. k = match.end()
  45. self.lasttag = tag = match.group(1).lower()
  46. while k < endpos:
  47. m = _html_parser.attrfind.match(rawdata, k)
  48. if not m:
  49. break
  50. attrname, rest, attrvalue = m.group(1, 2, 3)
  51. if not rest:
  52. attrvalue = None
  53. elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
  54. attrvalue[:1] == '"' == attrvalue[-1:]:
  55. attrvalue = attrvalue[1:-1]
  56. if attrvalue:
  57. attrvalue = self.unescape(attrvalue)
  58. attrs.append((attrname.lower(), attrvalue))
  59. k = m.end()
  60. end = rawdata[k:endpos].strip()
  61. if end not in (">", "/>"):
  62. lineno, offset = self.getpos()
  63. if "\n" in self.__starttag_text:
  64. lineno = lineno + self.__starttag_text.count("\n")
  65. offset = len(self.__starttag_text) \
  66. - self.__starttag_text.rfind("\n")
  67. else:
  68. offset = offset + len(self.__starttag_text)
  69. self.error("junk characters in start tag: %r"
  70. % (rawdata[k:endpos][:20],))
  71. if end.endswith('/>'):
  72. # XHTML-style empty tag: <span attr="value" />
  73. self.handle_startendtag(tag, attrs)
  74. else:
  75. self.handle_starttag(tag, attrs)
  76. if tag in self.CDATA_CONTENT_ELEMENTS:
  77. self.set_cdata_mode(tag) # <--------------------------- Changed
  78. return endpos
  79. # Internal -- parse endtag, return end or -1 if incomplete
  80. def parse_endtag(self, i):
  81. rawdata = self.rawdata
  82. assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
  83. match = _html_parser.endendtag.search(rawdata, i + 1) # >
  84. if not match:
  85. return -1
  86. j = match.end()
  87. match = _html_parser.endtagfind.match(rawdata, i) # </ + tag + >
  88. if not match:
  89. if self.cdata_tag is not None: # *** add ***
  90. self.handle_data(rawdata[i:j]) # *** add ***
  91. return j # *** add ***
  92. self.error("bad end tag: %r" % (rawdata[i:j],))
  93. # --- changed start ---------------------------------------------------
  94. tag = match.group(1).strip()
  95. if self.cdata_tag is not None:
  96. if tag.lower() != self.cdata_tag:
  97. self.handle_data(rawdata[i:j])
  98. return j
  99. # --- changed end -----------------------------------------------------
  100. self.handle_endtag(tag.lower())
  101. self.clear_cdata_mode()
  102. return j