PageRenderTime 223ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/product/PortalTransforms/libtransforms/utils.py

https://github.com/smetsjp/erp5
Python | 236 lines | 230 code | 4 blank | 2 comment | 2 complexity | 6f353bc2436f92aa2c34be94aac53294 MD5 | raw file
  1. import re
  2. import os
  3. import sys
  4. from sgmllib import SGMLParser, SGMLParseError
  5. try:
  6. # Need to be imported before win32api to avoid dll loading
  7. # problems.
  8. import pywintypes
  9. import pythoncom
  10. import win32api
  11. WIN32 = True
  12. except ImportError:
  13. WIN32 = False
  14. class MissingBinary(Exception): pass
  15. envPath = os.getenv('PATH', '')
  16. bin_search_path = [path for path in envPath.split(os.pathsep)
  17. if os.path.isdir(path)]
  18. cygwin = 'c:/cygwin'
  19. # cygwin support
  20. if sys.platform == 'win32' and os.path.isdir(cygwin):
  21. for p in ['/bin', '/usr/bin', '/usr/local/bin' ]:
  22. p = os.path.join(cygwin, p)
  23. if os.path.isdir(p):
  24. bin_search_path.append(p)
  25. if sys.platform == 'win32':
  26. extensions = ('.exe', '.com', '.bat', )
  27. else:
  28. extensions = ()
  29. def bin_search(binary):
  30. """search the bin_search_path for a given binary returning its fullname or
  31. raises MissingBinary"""
  32. mode = os.R_OK | os.X_OK
  33. for path in bin_search_path:
  34. for ext in ('', ) + extensions:
  35. pathbin = os.path.join(path, binary) + ext
  36. if os.access(pathbin, mode) == 1:
  37. return pathbin
  38. raise MissingBinary('Unable to find binary "%s" in %s' %
  39. (binary, os.pathsep.join(bin_search_path)))
  40. def getShortPathName(binary):
  41. if WIN32:
  42. try:
  43. binary = win32api.GetShortPathName(binary)
  44. except win32api.error:
  45. log("Failed to GetShortPathName for '%s'" % binary)
  46. return binary
  47. def sansext(path):
  48. return os.path.splitext(os.path.basename(path))[0]
  49. ##########################################################################
  50. # The code below is taken from CMFDefault.utils to remove
  51. # dependencies for Python-only installations
  52. ##########################################################################
  53. def bodyfinder(text):
  54. """ Return body or unchanged text if no body tags found.
  55. Always use html_headcheck() first.
  56. """
  57. lowertext = text.lower()
  58. bodystart = lowertext.find('<body')
  59. if bodystart == -1:
  60. return text
  61. bodystart = lowertext.find('>', bodystart) + 1
  62. if bodystart == 0:
  63. return text
  64. bodyend = lowertext.rfind('</body>', bodystart)
  65. if bodyend == -1:
  66. return text
  67. return text[bodystart:bodyend]
  68. #
  69. # HTML cleaning code
  70. #
  71. # These are the HTML tags that we will leave intact
  72. VALID_TAGS = { 'a' : 1
  73. , 'b' : 1
  74. , 'base' : 0
  75. , 'blockquote' : 1
  76. , 'body' : 1
  77. , 'br' : 0
  78. , 'caption' : 1
  79. , 'cite' : 1
  80. , 'code' : 1
  81. , 'div' : 1
  82. , 'dl' : 1
  83. , 'dt' : 1
  84. , 'dd' : 1
  85. , 'em' : 1
  86. , 'h1' : 1
  87. , 'h2' : 1
  88. , 'h3' : 1
  89. , 'h4' : 1
  90. , 'h5' : 1
  91. , 'h6' : 1
  92. , 'head' : 1
  93. , 'hr' : 0
  94. , 'html' : 1
  95. , 'i' : 1
  96. , 'img' : 0
  97. , 'kbd' : 1
  98. , 'li' : 1
  99. , 'meta' : 0
  100. , 'ol' : 1
  101. , 'p' : 1
  102. , 'pre' : 1
  103. , 'span' : 1
  104. , 'strong' : 1
  105. , 'strike' : 1
  106. , 'table' : 1
  107. , 'tbody' : 1
  108. , 'thead' : 1
  109. , 'td' : 1
  110. , 'th' : 1
  111. , 'title' : 1
  112. , 'tr' : 1
  113. , 'tt' : 1
  114. , 'u' : 1
  115. , 'ul' : 1
  116. }
  117. NASTY_TAGS = { 'script' : 1
  118. , 'object' : 1
  119. , 'embed' : 1
  120. , 'applet' : 1
  121. }
  122. class IllegalHTML( ValueError ):
  123. pass
  124. class StrippingParser( SGMLParser ):
  125. """ Pass only allowed tags; raise exception for known-bad. """
  126. from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
  127. def __init__( self ):
  128. SGMLParser.__init__( self )
  129. self.result = ""
  130. def handle_data( self, data ):
  131. if data:
  132. self.result = self.result + data
  133. def handle_charref( self, name ):
  134. self.result = "%s&#%s;" % ( self.result, name )
  135. def handle_entityref(self, name):
  136. if self.entitydefs.has_key(name):
  137. x = ';'
  138. else:
  139. # this breaks unstandard entities that end with ';'
  140. x = ''
  141. self.result = "%s&%s%s" % (self.result, name, x)
  142. def unknown_starttag(self, tag, attrs):
  143. """ Delete all tags except for legal ones.
  144. """
  145. if VALID_TAGS.has_key(tag):
  146. self.result = self.result + '<' + tag
  147. for k, v in attrs:
  148. if k.lower().startswith( 'on' ):
  149. raise IllegalHTML, 'Javascipt event "%s" not allowed.' % k
  150. if v.lower().startswith( 'javascript:' ):
  151. raise IllegalHTML, 'Javascipt URI "%s" not allowed.' % v
  152. self.result = '%s %s="%s"' % (self.result, k, v)
  153. endTag = '</%s>' % tag
  154. if VALID_TAGS.get(tag):
  155. self.result = self.result + '>'
  156. else:
  157. self.result = self.result + ' />'
  158. elif NASTY_TAGS.get( tag ):
  159. raise IllegalHTML, 'Dynamic tag "%s" not allowed.' % tag
  160. else:
  161. pass # omit tag
  162. def unknown_endtag(self, tag):
  163. if VALID_TAGS.get( tag ):
  164. self.result = "%s</%s>" % (self.result, tag)
  165. remTag = '</%s>' % tag
  166. def parse_declaration(self, i):
  167. """Fix handling of CDATA sections. Code borrowed from BeautifulSoup.
  168. """
  169. j = None
  170. if self.rawdata[i:i+9] == '<![CDATA[':
  171. k = self.rawdata.find(']]>', i)
  172. if k == -1:
  173. k = len(self.rawdata)
  174. data = self.rawdata[i+9:k]
  175. j = k+3
  176. self.result.append("<![CDATA[%s]]>" % data)
  177. else:
  178. try:
  179. j = SGMLParser.parse_declaration(self, i)
  180. except SGMLParseError:
  181. toHandle = self.rawdata[i:]
  182. self.result.append(toHandle)
  183. j = i + len(toHandle)
  184. return j
  185. def scrubHTML( html ):
  186. """ Strip illegal HTML tags from string text. """
  187. parser = StrippingParser()
  188. parser.feed( html )
  189. parser.close()
  190. return parser.result