PageRenderTime 48ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 1ms

/vendor/cache/ruby/2.7.0/gems/loofah-2.6.0/lib/loofah/scrubbers.rb

https://gitlab.com/gitnyasha/zimcreative
Ruby | 298 lines | 112 code | 17 blank | 169 comment | 21 complexity | 1d79db56846006966e44fe59789b1c61 MD5 | raw file
  1. # frozen_string_literal: true
  2. module Loofah
  3. #
  4. # Loofah provides some built-in scrubbers for sanitizing with
  5. # HTML5lib's safelist and for accomplishing some common
  6. # transformation tasks.
  7. #
  8. #
  9. # === Loofah::Scrubbers::Strip / scrub!(:strip)
  10. #
  11. # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
  12. #
  13. # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  14. # Loofah.fragment(unsafe_html).scrub!(:strip)
  15. # => "ohai! <div>div is safe</div> but foo is <b>not</b>"
  16. #
  17. #
  18. # === Loofah::Scrubbers::Prune / scrub!(:prune)
  19. #
  20. # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
  21. #
  22. # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  23. # Loofah.fragment(unsafe_html).scrub!(:prune)
  24. # => "ohai! <div>div is safe</div> "
  25. #
  26. #
  27. # === Loofah::Scrubbers::Escape / scrub!(:escape)
  28. #
  29. # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
  30. #
  31. # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  32. # Loofah.fragment(unsafe_html).scrub!(:escape)
  33. # => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
  34. #
  35. #
  36. # === Loofah::Scrubbers::Whitewash / scrub!(:whitewash)
  37. #
  38. # +:whitewash+ removes all comments, styling and attributes in
  39. # addition to doing markup-fixer-uppery and pruning unsafe tags. I
  40. # like to call this "whitewashing", since it's like putting a new
  41. # layer of paint on top of the HTML input to make it look nice.
  42. #
  43. # messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
  44. # Loofah.fragment(messy_markup).scrub!(:whitewash)
  45. # => "ohai! <div>div with attributes</div>"
  46. #
  47. # One use case for this scrubber is to clean up HTML that was
  48. # cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
  49. # rich text editor. Microsoft's software is famous for injecting
  50. # all kinds of cruft into its HTML output. Who needs that crap?
  51. # Certainly not me.
  52. #
  53. #
  54. # === Loofah::Scrubbers::NoFollow / scrub!(:nofollow)
  55. #
  56. # +:nofollow+ adds a rel="nofollow" attribute to all links
  57. #
  58. # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
  59. # Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
  60. # => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
  61. #
  62. #
  63. # === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
  64. #
  65. # +:noopener+ adds a rel="noopener" attribute to all links
  66. #
  67. # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
  68. # Loofah.fragment(link_farmers_markup).scrub!(:noopener)
  69. # => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
  70. #
  71. #
  72. # === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
  73. #
  74. # +:unprintable+ removes unprintable Unicode characters.
  75. #
  76. # markup = "<p>Some text with an unprintable character at the end\u2028</p>"
  77. # Loofah.fragment(markup).scrub!(:unprintable)
  78. # => "<p>Some text with an unprintable character at the end</p>"
  79. #
  80. # You may not be able to see the unprintable character in the above example, but there is a
  81. # U+2028 character right before the closing </p> tag. These characters can cause issues if
  82. # the content is ever parsed by JavaScript - more information here:
  83. #
  84. # http://timelessrepo.com/json-isnt-a-javascript-subset
  85. #
  86. module Scrubbers
  87. #
  88. # === scrub!(:strip)
  89. #
  90. # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
  91. #
  92. # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  93. # Loofah.fragment(unsafe_html).scrub!(:strip)
  94. # => "ohai! <div>div is safe</div> but foo is <b>not</b>"
  95. #
  class Strip < Scrubber
    # Sets the traversal direction for this scrubber to +:bottom_up+.
    def initialize
      @direction = :bottom_up
    end

    # Replaces +node+ with its own contents unless html5lib_sanitize
    # reports CONTINUE for it.
    #
    # Returns CONTINUE when the sanitizer reports CONTINUE; otherwise
    # returns whatever +node.remove+ returns.
    def scrub(node)
      return CONTINUE if html5lib_sanitize(node) == CONTINUE

      kids = node.children
      replacement =
        if kids.length == 1 && kids.first.cdata?
          # A lone CDATA child is re-parsed as a fragment and stripped on
          # its own; the resulting markup is then inserted as a text node.
          cleaned = Loofah.fragment(kids.first.to_html).scrub!(:strip).to_html
          Nokogiri::XML::Text.new(cleaned, node.document)
        else
          kids
        end

      node.before(replacement)
      node.remove
    end
  end
  111. #
  112. # === scrub!(:prune)
  113. #
  114. # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
  115. #
  116. # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  117. # Loofah.fragment(unsafe_html).scrub!(:prune)
  118. # => "ohai! <div>div is safe</div> "
  119. #
  class Prune < Scrubber
    # Sets the traversal direction for this scrubber to +:top_down+.
    def initialize
      @direction = :top_down
    end

    # Removes +node+ from its document unless html5lib_sanitize reports
    # CONTINUE for it.
    #
    # Returns CONTINUE when the sanitizer reports CONTINUE, otherwise
    # STOP after the node has been removed.
    def scrub(node)
      verdict = html5lib_sanitize(node)
      return CONTINUE if verdict == CONTINUE

      node.remove
      STOP
    end
  end
  130. #
  131. # === scrub!(:escape)
  132. #
  133. # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
  134. #
  135. # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  136. # Loofah.fragment(unsafe_html).scrub!(:escape)
  137. # => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
  138. #
  class Escape < Scrubber
    # Sets the traversal direction for this scrubber to +:top_down+.
    def initialize
      @direction = :top_down
    end

    # Swaps +node+ for a text node holding the node's serialized markup,
    # unless html5lib_sanitize reports CONTINUE for it.
    #
    # Returns CONTINUE when the sanitizer reports CONTINUE, otherwise
    # STOP after the replacement has been made.
    def scrub(node)
      return CONTINUE if html5lib_sanitize(node) == CONTINUE

      # The text node is inserted right after the original, which is
      # then dropped from the tree.
      markup_as_text = Nokogiri::XML::Text.new(node.to_s, node.document)
      node.add_next_sibling(markup_as_text)
      node.remove
      STOP
    end
  end
  150. #
  151. # === scrub!(:whitewash)
  152. #
  153. # +:whitewash+ removes all comments, styling and attributes in
  154. # addition to doing markup-fixer-uppery and pruning unsafe tags. I
  155. # like to call this "whitewashing", since it's like putting a new
  156. # layer of paint on top of the HTML input to make it look nice.
  157. #
  158. # messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
  159. # Loofah.fragment(messy_markup).scrub!(:whitewash)
  160. # => "ohai! <div>div with attributes</div>"
  161. #
  162. # One use case for this scrubber is to clean up HTML that was
  163. # cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
  164. # rich text editor. Microsoft's software is famous for injecting
  165. # all kinds of cruft into its HTML output. Who needs that crap?
  166. # Certainly not me.
  167. #
  class Whitewash < Scrubber
    # Sets the traversal direction for this scrubber to +:top_down+.
    def initialize
      @direction = :top_down
    end

    # Keeps text and CDATA nodes untouched, strips every attribute from
    # allowed elements, and removes everything else.
    #
    # Returns CONTINUE for text/CDATA nodes and for allowed elements
    # that declare no namespaces; returns STOP after removing any
    # other node.
    def scrub(node)
      kind = node.type
      if kind == Nokogiri::XML::Node::TEXT_NODE || kind == Nokogiri::XML::Node::CDATA_SECTION_NODE
        return CONTINUE
      end

      if kind == Nokogiri::XML::Node::ELEMENT_NODE && HTML5::Scrub.allowed_element?(node.name)
        node.attributes.each_key { |attr_name| node.remove_attribute(attr_name) }
        # An allowed element that still carries namespace declarations
        # falls through and is removed below.
        return CONTINUE if node.namespaces.empty?
      end

      node.remove
      STOP
    end
  end
  186. #
  187. # === scrub!(:nofollow)
  188. #
  189. # +:nofollow+ adds a rel="nofollow" attribute to all links
  190. #
  191. # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
  192. # Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
  193. # => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
  194. #
  class NoFollow < Scrubber
    # Sets the traversal direction for this scrubber to +:top_down+.
    def initialize
      @direction = :top_down
    end

    # Appends "nofollow" to the +rel+ attribute of +a+ elements.
    #
    # Returns CONTINUE for any node that is not an +a+ element, and
    # STOP once the attribute has been appended to a link.
    def scrub(node)
      anchor = (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
      return CONTINUE unless anchor

      append_attribute(node, "rel", "nofollow")
      STOP
    end
  end
  205. #
  206. # === scrub!(:noopener)
  207. #
  208. # +:noopener+ adds a rel="noopener" attribute to all links
  209. #
  210. # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
  211. # Loofah.fragment(link_farmers_markup).scrub!(:noopener)
  212. # => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
  213. #
  class NoOpener < Scrubber
    # Sets the traversal direction for this scrubber to +:top_down+.
    def initialize
      @direction = :top_down
    end

    # Appends "noopener" to the +rel+ attribute of +a+ elements.
    #
    # Returns STOP once the attribute has been appended to a link, and
    # CONTINUE for any node that is not an +a+ element.
    def scrub(node)
      if (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
        append_attribute(node, "rel", "noopener")
        STOP
      else
        CONTINUE
      end
    end
  end
  # This class probably isn't useful publicly, but is used for #to_text's current implementation
  class NewlineBlockElements < Scrubber # :nodoc:
    # Sets the traversal direction for this scrubber to +:bottom_up+.
    def initialize
      @direction = :bottom_up
    end

    # Replaces a block-level element with a text node containing the
    # element's content wrapped in newlines.
    #
    # Returns CONTINUE when the node's name is not listed in
    # Loofah::Elements::BLOCK_LEVEL; otherwise returns whatever
    # +node.remove+ returns.
    def scrub(node)
      block_level = Loofah::Elements::BLOCK_LEVEL.include?(node.name)
      return CONTINUE unless block_level

      padded_text = Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
      node.add_next_sibling(padded_text)
      node.remove
    end
  end
  235. #
  236. # === scrub!(:unprintable)
  237. #
  238. # +:unprintable+ removes unprintable Unicode characters.
  239. #
  240. # markup = "<p>Some text with an unprintable character at the end\u2028</p>"
  241. # Loofah.fragment(markup).scrub!(:unprintable)
  242. # => "<p>Some text with an unprintable character at the end</p>"
  243. #
  244. # You may not be able to see the unprintable character in the above example, but there is a
  245. # U+2028 character right before the closing </p> tag. These characters can cause issues if
  246. # the content is ever parsed by JavaScript - more information here:
  247. #
  248. # http://timelessrepo.com/json-isnt-a-javascript-subset
  249. #
  class Unprintable < Scrubber
    # Sets the traversal direction for this scrubber to +:top_down+.
    def initialize
      @direction = :top_down
    end

    # Deletes U+2028 and U+2029 characters from the content of text and
    # CDATA nodes; other node types are left as they are.
    #
    # Always returns CONTINUE.
    def scrub(node)
      case node.type
      when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
        cleaned = node.content.gsub(/\u2028|\u2029/, "")
        node.content = cleaned
      end

      CONTINUE
    end
  end
  261. #
  262. # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
  263. #
  # Lookup table from scrubber symbol to the scrubber class defined in
  # this module. Key order here is the order scrubber_symbols reports.
  MAP = {
    escape: Escape,
    prune: Prune,
    whitewash: Whitewash,
    strip: Strip,
    nofollow: NoFollow,
    noopener: NoOpener,
    newline_block_elements: NewlineBlockElements,
    unprintable: Unprintable,
  }
  274. #
  275. # Returns an array of symbols representing the built-in scrubbers
  276. #
  # Lists the symbols that have an entry in MAP, in MAP's key order.
  #
  # Returns a new Array of Symbols.
  def self.scrubber_symbols
    MAP.keys
  end
  280. end
  281. end