PageRenderTime 40ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/dryopteris/sanitize.rb

https://github.com/quake/dryopteris
Ruby | 135 lines | 110 code | 19 blank | 6 comment | 20 complexity | 68cce335d3b05059de82548e7a726764 MD5 | raw file
  1. require 'rubygems'
  2. gem 'nokogiri', '>=1.0.5'
  3. require 'nokogiri'
  4. require 'cgi'
  5. require "dryopteris/whitelist"
  6. module Dryopteris
  7. class << self
  # Removes all markup from the input and returns the remaining body text,
  # HTML-escaped with CGI.escapeHTML.
  #
  # string_or_io - HTML as a String, or an IO-like object; it is handed to
  #                Nokogiri::HTML.parse unchanged.
  # encoding     - optional encoding, forwarded to Nokogiri::HTML.parse.
  #
  # Returns nil for nil input, "" for a blank string or when the parsed
  # document has no /html/body element, otherwise the escaped inner text of
  # the body.
  def strip_tags(string_or_io, encoding=nil)
    return nil if string_or_io.nil?
    # Only objects that respond to #strip can be blank-checked up front.
    # IO-like inputs go straight to the parser instead of raising
    # NoMethodError here.
    return "" if string_or_io.respond_to?(:strip) && string_or_io.strip.empty?

    doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
    body_element = doc.at("/html/body")
    return "" if body_element.nil?

    CGI.escapeHTML(body_element.inner_text)
  end
  # Sanitizes an HTML fragment.
  #
  # The fragment is wrapped in <html><body>…</body></html>, parsed with
  # Nokogiri::HTML.parse, and every child of the body is walked with
  # traverse_conditionally_top_down using :sanitize_node as the visitor.
  #
  # string   - the HTML fragment (must respond to #strip and String#+).
  # encoding - optional encoding, forwarded to Nokogiri::HTML.parse.
  #
  # Returns nil for nil input, "" for blank input or when no body element is
  # found, otherwise the body's children serialized with #to_xml and joined.
  def sanitize(string, encoding=nil)
    return nil if string.nil?
    return "" if string.strip.empty?

    wrapped = "<html><body>" + string + "</body></html>"
    body = Nokogiri::HTML.parse(wrapped, nil, encoding).xpath("/html/body").first
    return "" if body.nil?

    body.children.each { |child| traverse_conditionally_top_down(child, :sanitize_node) }
    # Re-read the children: the visitor may have swapped nodes for text.
    body.children.map(&:to_xml).join
  end
  # Sanitizes a complete HTML document.
  #
  # The input is parsed with Nokogiri::HTML.parse and every element matched
  # by /html/head/* and /html/body/* is walked with
  # traverse_conditionally_top_down using :sanitize_node as the visitor.
  #
  # string_or_io - HTML as a String, or an IO-like object; it is handed to
  #                Nokogiri::HTML.parse unchanged.
  # encoding     - optional encoding, forwarded to Nokogiri::HTML.parse.
  #
  # Returns nil for nil input, "" for a blank string or when no head/body
  # children are found, otherwise the document root serialized with #to_xml.
  def sanitize_document(string_or_io, encoding=nil)
    return nil if string_or_io.nil?
    # Only objects that respond to #strip can be blank-checked up front.
    # IO-like inputs go straight to the parser instead of raising
    # NoMethodError here.
    return "" if string_or_io.respond_to?(:strip) && string_or_io.strip.empty?

    doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
    elements = doc.xpath("/html/head/*", "/html/body/*")
    return "" if elements.nil? || elements.empty?

    elements.each do |node|
      traverse_conditionally_top_down(node, :sanitize_node)
    end
    doc.root.to_xml
  end
  39. private
  # Pre-order walk over +node+ and its descendants.
  #
  # The method named by +method_name+ is invoked (via #send) with each node.
  # A truthy result means the visitor has dealt with that node, so its
  # children are not visited; a falsy result continues into node.children.
  def traverse_conditionally_top_down(node, method_name)
    handled = send(method_name, node)
    return if handled

    node.children.each do |child|
      traverse_conditionally_top_down(child, method_name)
    end
  end
  # Replaces +node+ with a text node holding node.text: the new
  # Nokogiri::XML::Text is inserted as the next sibling and the original
  # node is then removed.
  #
  # Always returns true.
  def remove_tags_from_node(node)
    text_only = Nokogiri::XML::Text.new(node.text, node.document)
    node.add_next_sibling(text_only)
    node.remove
    true
  end
  # Visitor used with traverse_conditionally_top_down: sanitizes one node in
  # place against the HashedWhiteList tables.
  #
  # * Element (type 1) whose name is in ALLOWED_ELEMENTS: attributes not in
  #   ALLOWED_ATTRIBUTES are removed; attributes listed in ATTR_VAL_IS_URI
  #   are removed when their value names a protocol missing from
  #   ALLOWED_PROTOCOLS; a "style" attribute is rewritten through
  #   sanitize_css. Returns false so the walk continues into the children.
  # * Text (type 3) and CDATA (type 4) nodes are left alone; returns false.
  # * Anything else (including non-whitelisted elements) is replaced by a
  #   text node containing node.to_s. Returns true, so the walk does not
  #   descend into the removed node.
  def sanitize_node(node)
    case node.type
    when 1 # Nokogiri::XML::Node::ELEMENT_NODE
      if HashedWhiteList::ALLOWED_ELEMENTS[node.name]
        node.attributes.each do |name, value|
          if !HashedWhiteList::ALLOWED_ATTRIBUTES[name]
            node.remove_attribute(name)
          elsif HashedWhiteList::ATTR_VAL_IS_URI[name]
            # this block lifted nearly verbatim from HTML5 sanitization
            #
            # Drop backticks, whitespace and U+0000-U+0020 / U+007F-U+00A0
            # before looking for a protocol, so "java\tscript:" and similar
            # obfuscations are still recognised. The characters are written
            # as \u code points because byte-wise escapes such as \302 are
            # rejected in regexp literals by Ruby 1.9 and later.
            val_unescaped = CGI.unescapeHTML(value.to_s).gsub(/[`\u0000-\u0020\u007f-\u00a0\s]+/, '').downcase
            if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ && HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':')[0]].nil?
              node.remove_attribute(name)
            end
          end
        end
        if node.attributes['style']
          node['style'] = sanitize_css(node.attributes['style'])
        end
        return false
      end
    when 3 # Nokogiri::XML::Node::TEXT_NODE
      return false
    when 4 # Nokogiri::XML::Node::CDATA_SECTION_NODE
      return false
    end

    # Not whitelisted: swap the node for a text node holding its serialized
    # form.
    replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
    node.add_next_sibling(replacement_killer)
    node.remove
    true
  end
  # Reduces an inline style value to the declarations permitted by the
  # HashedWhiteList tables. (Lifted nearly verbatim from the HTML5
  # sanitizer.)
  #
  # style - the style value; converted with #to_s before processing.
  #
  # Returns a String of "prop: value;" declarations joined by single spaces,
  # or '' when the input fails the structural checks or nothing survives.
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet: the WHOLE value must consist of harmless tokens and look like
    # a declaration list. \A/\z are used rather than ^/$, which match per
    # line and would let any value containing a newline through.
    return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
    return '' unless style =~ /\A\s*([-\w]+\s*:[^:;]*(;\s*|\z))*\z/

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      next if val.empty?
      prop = prop.downcase
      if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
        clean << "#{prop}: #{val};"
      elsif %w[background border margin padding].include?(prop.split('-')[0])
        # Shorthand families: keep the declaration only if every
        # whitespace-separated token is a whitelisted keyword, a colour, or a
        # short length/percentage.
        rejected = val.split.any? do |keyword|
          HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? &&
            keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        end
        clean << "#{prop}: #{val};" unless rejected
      elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
        clean << "#{prop}: #{val};"
      end
    end

    clean.join(' ')
  end
  105. end # self
  # Hash-based mirrors of the WhiteList arrays, for faster lookup.
  #
  # For every constant of WhiteList whose value is an Array, a constant with
  # the same name is defined here as a Hash mapping each entry, and its
  # downcased form, to true. Non-Array constants are skipped.
  module HashedWhiteList
    WhiteList.constants.each do |constant|
      entries = WhiteList.const_get(constant)
      next unless entries.is_a?(Array)

      lookup = {}
      entries.each do |entry|
        lookup[entry] = true
        lookup[entry.downcase] = true
      end
      # const_set replaces the original string eval; the resulting constants
      # are the same.
      const_set(constant, lookup)
    end
  end
  116. end