PageRenderTime 57ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/mediacore/lib/xhtml/__init__.py

https://bitbucket.org/mediadrop/simplestation-code-dumps
Python | 213 lines | 173 code | 5 blank | 35 comment | 4 complexity | 6781890a7e41a3d120f30455af90c7a7 MD5 | raw file
  1. # This file is a part of MediaCore, Copyright 2010 Simple Station Inc.
  2. #
  3. # MediaCore is free software: you can redistribute it and/or modify
  4. # it under the terms of the GNU General Public License as published by
  5. # the Free Software Foundation, either version 3 of the License, or
  6. # (at your option) any later version.
  7. #
  8. # MediaCore is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. """
  16. Our own XHTML sanitation helpers
  17. """
  18. import re
  19. from BeautifulSoup import BeautifulSoup
  20. from webhelpers import text
  21. from mediacore.lib.xhtml.htmlsanitizer import (Cleaner,
  22. entities_to_unicode as decode_entities,
  23. encode_xhtml_entities as encode_entities)
# Explicit public API for ``from mediacore.lib.xhtml import *``.
__all__ = [
    'clean_xhtml',
    'decode_entities',
    'encode_entities',
    'excerpt_xhtml',
    'line_break_xhtml',
    'list_acceptable_xhtml',
    'strip_xhtml',
    'truncate_xhtml',
]
  34. # Configuration for HTML sanitization
  35. blank_line = re.compile("\s*\n\s*\n\s*", re.M)
  36. block_tags = 'p br pre blockquote div h1 h2 h3 h4 h5 h6 hr ul ol li form table tr td tbody thead'.split()
  37. block_spaces = re.compile("\s*(</{0,1}(" + "|".join(block_tags) + ")>)\s*", re.M)
  38. block_close = re.compile("(</(" + "|".join(block_tags) + ")>)", re.M)
  39. valid_tags = dict.fromkeys('p i em strong b u a br pre abbr ol ul li sub sup ins del blockquote cite'.split())
  40. valid_attrs = dict.fromkeys('href title'.split())
  41. elem_map = {'b': 'strong', 'i': 'em'}
  42. truncate_filters = ['strip_empty_tags']
  43. cleaner_filters = [
  44. 'add_nofollow', 'br_to_p', 'clean_whitespace', 'encode_xml_specials',
  45. 'make_links', 'rename_tags', 'strip_attrs', 'strip_cdata',
  46. 'strip_comments', 'strip_empty_tags', 'strip_schemes', 'strip_tags'
  47. ]
  48. # Map all invalid block elements to be paragraphs.
  49. for t in block_tags:
  50. if t not in valid_tags:
  51. elem_map[t] = 'p'
  52. cleaner_settings = dict(
  53. convert_entities = BeautifulSoup.ALL_ENTITIES,
  54. valid_tags = valid_tags,
  55. valid_attrs = valid_attrs,
  56. elem_map = elem_map,
  57. filters = cleaner_filters
  58. )
  59. def clean_xhtml(string, p_wrap=True, _cleaner_settings=None):
  60. """Convert the given plain text or HTML into valid XHTML.
  61. If there is no markup in the string, apply paragraph formatting.
  62. :param string: XHTML input string
  63. :type string: unicode
  64. :param p_wrap: Wrap the output in <p></p> tags?
  65. :type p_wrap: bool
  66. :param _cleaner_settings: Constructor kwargs for
  67. :class:`mediacore.lib.htmlsanitizer.Cleaner`
  68. :type _cleaner_settings: dict
  69. :returns: XHTML
  70. :rtype: unicode
  71. """
  72. if not string or not string.strip():
  73. # If the string is none, or empty, or whitespace
  74. return u""
  75. if _cleaner_settings is None:
  76. _cleaner_settings = cleaner_settings
  77. # remove carriage return chars; FIXME: is this necessary?
  78. string = string.replace(u"\r", u"")
  79. # remove non-breaking-space characters. FIXME: is this necessary?
  80. string = string.replace(u"\xa0", u" ")
  81. string = string.replace(u"&nbsp;", u" ")
  82. # replace all blank lines with <br> tags
  83. string = blank_line.sub(u"<br/>", string)
  84. # initialize and run the cleaner
  85. string = Cleaner(string, **_cleaner_settings)()
  86. # FIXME: It's possible that the rename_tags operation creates
  87. # some invalid nesting. e.g.
  88. # >>> c = Cleaner("", "rename_tags", elem_map={'h2': 'p'})
  89. # >>> c('<p><h2>head</h2></p>')
  90. # u'<p><p>head</p></p>'
  91. # This is undesirable, so here we... just re-parse the markup.
  92. # But this ... could be pretty slow.
  93. cleaner = Cleaner(string, **_cleaner_settings)
  94. string = cleaner()
  95. # Wrap in a <p> tag when no tags are used, and there are no blank
  96. # lines to trigger automatic <p> creation
  97. # FIXME: This should trigger any time we don't have a wrapping block tag
  98. # FIXME: This doesn't wrap orphaned text when it follows a <p> tag, for ex
  99. if p_wrap \
  100. and len(cleaner.root.contents) == 1 \
  101. and isinstance(cleaner.root.contents[0], basestring):
  102. string = u"<p>%s</p>" % string.strip()
  103. # strip all whitespace from immediately before/after block-level elements
  104. string = block_spaces.sub(u"\\1", string)
  105. return string.strip()
  106. def truncate_xhtml(string, size, _strip_xhtml=False, _decode_entities=False):
  107. """Truncate a XHTML string to roughly a given size (full words).
  108. :param string: XHTML
  109. :type string: unicode
  110. :param size: Max length
  111. :param _strip_xhtml: Flag to strip out all XHTML
  112. :param _decode_entities: Flag to convert XHTML entities to unicode chars
  113. :rtype: unicode
  114. """
  115. if not string:
  116. return u''
  117. if _strip_xhtml:
  118. # Insert whitespace after block elements.
  119. # So they are separated when we strip the xhtml.
  120. string = block_spaces.sub(u"\\1 ", string)
  121. string = strip_xhtml(string)
  122. string = decode_entities(string)
  123. if len(string) > size:
  124. string = text.truncate(string, length=size, whole_word=True)
  125. if _strip_xhtml:
  126. if not _decode_entities:
  127. # re-encode the entities, if we have to.
  128. string = encode_entities(string)
  129. else:
  130. if _decode_entities:
  131. string = Cleaner(string,
  132. *truncate_filters, **cleaner_settings)()
  133. else:
  134. # re-encode the entities, if we have to.
  135. string = Cleaner(string, 'encode_xml_specials',
  136. *truncate_filters, **cleaner_settings)()
  137. return string.strip()
  138. def excerpt_xhtml(string, size, buffer=60):
  139. """Return an excerpt for the given string.
  140. Truncate to the given size iff we are removing more than the buffer size.
  141. :param string: A XHTML string
  142. :param size: The desired length
  143. :type size: int
  144. :param buffer: How much more than the desired length we can go to
  145. avoid truncating just a couple words etc.
  146. :type buffer: int
  147. :returns: XHTML
  148. """
  149. if not string:
  150. return u''
  151. new_str = decode_entities(string)
  152. if len(new_str) <= size + buffer:
  153. return string
  154. return truncate_xhtml(new_str, size)
  155. def strip_xhtml(string, _decode_entities=False):
  156. """Strip out xhtml and optionally convert HTML entities to unicode.
  157. :rtype: unicode
  158. """
  159. if not string:
  160. return u''
  161. string = ''.join(BeautifulSoup(string).findAll(text=True))
  162. if _decode_entities:
  163. string = decode_entities(string)
  164. return string
  165. def line_break_xhtml(string):
  166. """Add a linebreak after block-level tags are closed.
  167. :type string: unicode
  168. :rtype: unicode
  169. """
  170. if string:
  171. string = block_close.sub(u"\\1\n", string).rstrip()
  172. return string
  173. def list_acceptable_xhtml():
  174. return dict(
  175. tags = ", ".join(sorted(valid_tags)),
  176. attrs = ", ".join(sorted(valid_attrs)),
  177. map = ", ".join(["%s -> %s" % (t, elem_map[t]) for t in elem_map])
  178. )