PageRenderTime 44ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/django/utils/html_parser.py

https://github.com/andnils/django
Python | 124 lines | 103 code | 4 blank | 17 comment | 4 complexity | 3cefb38d4730a3ce4fcc4a403f3a5a63 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. from django.utils.six.moves import html_parser as _html_parser
  2. import re
  3. import sys
  # Interpreter version, captured once so every check below uses the same value.
  current_version = sys.version_info

  # True on interpreters older than 2.7.3, and on 3.x releases older than 3.2.3.
  # The conditional further down installs a patched HTMLParser for those versions.
  use_workaround = (
      current_version < (2, 7, 3) or
      (3, 0) <= current_version < (3, 2, 3)
  )

  try:
      HTMLParseError = _html_parser.HTMLParseError
  except AttributeError:
      # Python 3.5 removed HTMLParseError from the standard-library parser
      # module. Keep the name importable through a stand-in so that importing
      # this module (and any ``except HTMLParseError`` clause) keeps working.
      class HTMLParseError(Exception):
          """Stand-in for the parser error class missing from this Python."""
  if not use_workaround:
      if current_version >= (3, 4):
          class HTMLParser(_html_parser.HTMLParser):
              """Explicitly set convert_charrefs to be False.

              This silences a deprecation warning on Python 3.4, but we can't do
              it at call time because Python 2.7 does not have the keyword
              argument.
              """

              def __init__(self, convert_charrefs=False, **kwargs):
                  _html_parser.HTMLParser.__init__(
                      self, convert_charrefs=convert_charrefs, **kwargs)
      else:
          # Versions before 3.4 that do not need the workaround use the stock
          # parser unchanged.
          HTMLParser = _html_parser.HTMLParser
  else:
      # Raw string: the pattern contains ``\s``, which is not a valid escape in
      # a plain string literal and triggers a compile-time warning on newer
      # interpreters. The compiled pattern is unchanged.
      tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')

      class HTMLParser(_html_parser.HTMLParser):
          """
          Patched version of stdlib's HTMLParser with patch from:
          http://bugs.python.org/issue670664

          NOTE(review): the methods below use ``self.__starttag_text``, which
          Python name-mangles with this class's name. Presumably that has to
          line up with the private attribute of the stdlib base class of the
          same name -- confirm before renaming this class.
          """

          def __init__(self):
              _html_parser.HTMLParser.__init__(self)
              # Lower-cased name of the CDATA element currently being parsed,
              # or None when the parser is not in CDATA mode.
              self.cdata_tag = None

          def set_cdata_mode(self, tag):
              """Enter CDATA mode for ``tag`` and remember its lower-cased name."""
              try:
                  self.interesting = _html_parser.interesting_cdata
              except AttributeError:
                  # The parser module has no ``interesting_cdata``: build a
                  # case-insensitive pattern for this tag's closing form.
                  self.interesting = re.compile(
                      r'</\s*%s\s*>' % tag.lower(), re.I)
              self.cdata_tag = tag.lower()

          def clear_cdata_mode(self):
              """Leave CDATA mode and forget the remembered tag name."""
              self.interesting = _html_parser.interesting_normal
              self.cdata_tag = None

          def parse_starttag(self, i):
              """Internal -- handle starttag, return end or -1 if not terminated."""
              self.__starttag_text = None
              endpos = self.check_for_whole_start_tag(i)
              if endpos < 0:
                  return endpos
              rawdata = self.rawdata
              self.__starttag_text = rawdata[i:endpos]

              # Now parse the data between i+1 and endpos into a tag and attrs.
              attrs = []
              match = tagfind.match(rawdata, i + 1)
              assert match, 'unexpected call to parse_starttag()'
              k = match.end()
              self.lasttag = tag = match.group(1).lower()

              while k < endpos:
                  m = _html_parser.attrfind.match(rawdata, k)
                  if not m:
                      break
                  attrname, rest, attrvalue = m.group(1, 2, 3)
                  if not rest:
                      # Bare attribute with no "=value" part.
                      attrvalue = None
                  elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                          attrvalue[:1] == '"' == attrvalue[-1:]):
                      # Strip one pair of matching surrounding quotes.
                      attrvalue = attrvalue[1:-1]
                  if attrvalue:
                      attrvalue = self.unescape(attrvalue)
                  attrs.append((attrname.lower(), attrvalue))
                  k = m.end()

              end = rawdata[k:endpos].strip()
              if end not in (">", "/>"):
                  # The original also computed a line/offset pair here that was
                  # never used; only the error report itself is kept.
                  # NOTE(review): self.error() presumably raises -- nothing
                  # below guards against it returning.
                  self.error("junk characters in start tag: %r"
                             % (rawdata[k:endpos][:20],))
              if end.endswith('/>'):
                  # XHTML-style empty tag: <span attr="value" />
                  self.handle_startendtag(tag, attrs)
              else:
                  self.handle_starttag(tag, attrs)
                  if tag in self.CDATA_CONTENT_ELEMENTS:
                      # Marked "Changed" in the patch: the tag is passed along
                      # so CDATA mode can remember which element opened it.
                      self.set_cdata_mode(tag)
              return endpos

          def parse_endtag(self, i):
              """Internal -- parse endtag, return end or -1 if incomplete."""
              rawdata = self.rawdata
              assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
              match = _html_parser.endendtag.search(rawdata, i + 1)  # >
              if not match:
                  return -1
              j = match.end()
              match = _html_parser.endtagfind.match(rawdata, i)  # </ + tag + >
              if not match:
                  if self.cdata_tag is not None:
                      # Added by the patch: while in CDATA mode, text that is
                      # not a well-formed end tag is handed over as data.
                      self.handle_data(rawdata[i:j])
                      return j
                  self.error("bad end tag: %r" % (rawdata[i:j],))

              # Changed by the patch: inside a CDATA element, only the end tag
              # of that same element closes it; any other end tag is data.
              tag = match.group(1).strip()
              if self.cdata_tag is not None and tag.lower() != self.cdata_tag:
                  self.handle_data(rawdata[i:j])
                  return j

              self.handle_endtag(tag.lower())
              self.clear_cdata_mode()
              return j