PageRenderTime 42ms CodeModel.GetById 8ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/ruby/1.9/rexml/text.rb

https://bitbucket.org/nicksieger/jruby
Ruby | 404 lines | 275 code | 35 blank | 94 comment | 30 complexity | 57d360a52af2df62eda0b98025668e3a MD5 | raw file
Possible License(s): GPL-3.0, JSON
  1. require 'rexml/entity'
  2. require 'rexml/doctype'
  3. require 'rexml/child'
  4. require 'rexml/doctype'
  5. require 'rexml/parseexception'
  6. module REXML
  7. # Represents text nodes in an XML document
  8. class Text < Child
  9. include Comparable
  10. # The order in which the substitutions occur
  # Patterns for the characters that are escaped on output, in the order in
  # which the substitutions are applied.  The ampersand pattern only matches
  # an ampersand that does not already begin something shaped like a
  # reference (+&name;+ or +&#123;+).
  SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
  # Replacement text for the SPECIALS pattern at the same index.
  SUBSTITUTES = %w[ &amp; &lt; &gt; &quot; &apos; &#13; ]
  # The reverse tables (the names are SPECIALS and SUBSTITUTES spelled
  # backwards): SETUTITSBUS[i] matches the escaped form of the literal
  # character SLAICEPS[i].
  SLAICEPS = %w[ < > " ' & ]
  SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]
  # If +raw+ is true, then REXML leaves the value alone
  attr_reader :raw
  attr_writer :raw
  # Default pattern used to look for markup that is illegal in raw text: a
  # "<", or an "&" optionally followed by a named or numeric reference.
  NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
  # A numeric character reference; group 1 is the decimal digits, or "x"
  # followed by hexadecimal digits, with leading zeros left outside the group.
  NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
  # Code points that may appear in XML text: tab, line feed, carriage return
  # and the three ranges below.  Every entry is an Integer or a Range, so the
  # array can be splatted into a +when+ clause.
  VALID_CHAR = [
    0x9, 0xA, 0xD,
    (0x20..0xD7FF),
    (0xE000..0xFFFD),
    (0x10000..0x10FFFF)
  ]
  # VALID_XML_CHARS matches a string only when *every* character in it is a
  # valid XML character.  The pattern is anchored with \A and \z rather than
  # ^ and $: line feed is itself a valid character, and line anchors would
  # let a multi-line string match as soon as any single line was clean.
  if String.method_defined? :encode
    # Encoding-aware Ruby: build a UTF-8 character class from VALID_CHAR.
    VALID_XML_CHARS = Regexp.new('\A[' +
      VALID_CHAR.map { |item|
        case item
        when Integer
          # Integer (not Fixnum) so this keeps working where Fixnum is gone.
          [item].pack('U').force_encoding('utf-8')
        when Range
          [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
        end
      }.join +
      ']*\z')
  else
    # No String#encode: match the raw UTF-8 byte sequences instead.
    VALID_XML_CHARS = /\A(
      [\x09\x0A\x0D\x20-\x7E] # ASCII
      | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
      | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
      | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte
      | \xEF[\x80-\xBE]{2} #
      | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff
      | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
      | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
      | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
      | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
    )*\z/nx
  end
  51. # Constructor
  52. # +arg+ if a String, the content is set to the String. If a Text,
  53. # the object is shallowly cloned.
  54. #
  55. # +respect_whitespace+ (boolean, false) if true, whitespace is
  56. # respected
  57. #
  58. # +parent+ (nil) if this is a Parent object, the parent
  59. # will be set to this.
  60. #
  61. # +raw+ (nil) This argument can be given three values.
  # If true, then the value used to construct this object is expected to
  63. # contain no unescaped XML markup, and REXML will not change the text. If
  64. # this value is false, the string may contain any characters, and REXML will
  65. # escape any and all defined entities whose values are contained in the
  66. # text. If this value is nil (the default), then the raw value of the
  67. # parent will be used as the raw value for this node. If there is no raw
  68. # value for the parent, and no value is supplied, the default is false.
  69. # Use this field if you have entities defined for some text, and you don't
  70. # want REXML to escape that text in output.
  71. # Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
  72. # Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
  73. # Text.new( "<&", false, nil, true ) #-> Parse exception
  74. # Text.new( "&lt;&amp;", false, nil, true ) #-> "&lt;&amp;"
  75. # # Assume that the entity "s" is defined to be "sean"
  76. # # and that the entity "r" is defined to be "russell"
  77. # Text.new( "sean russell" ) #-> "&s; &r;"
  78. # Text.new( "sean russell", false, nil, true ) #-> "sean russell"
  79. #
  80. # +entity_filter+ (nil) This can be an array of entities to match in the
  81. # supplied text. This argument is only useful if +raw+ is set to false.
  82. # Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
  83. # Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
  84. # In the last example, the +entity_filter+ argument is ignored.
  85. #
  86. # +illegal+ INTERNAL USE ONLY
  # Raises a RuntimeError if +arg+ is neither a String nor a Text, and
  # whatever Text.check raises when the resulting node is raw.
  def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
    entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )
    @raw = false
    @parent = nil
    if parent
      super( parent )
      # Inherit the parent's raw setting unless one is given explicitly below.
      @raw = parent.raw
    end
    @raw = raw unless raw.nil?
    @entity_filter = entity_filter
    @normalized = @unnormalized = nil

    if arg.kind_of? String
      # dup (not clone) so that a frozen argument can still be edited in place.
      @string = arg.dup
      @string.squeeze!(" \n\t") unless respect_whitespace
    elsif arg.kind_of? Text
      # Copy the text so that later in-place edits of this node (gsub!, <<)
      # cannot reach into the source node's string or its cached to_s value.
      @string = arg.to_s.dup
      @raw = arg.raw
    else
      raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
    end

    # Normalise line endings, as is done for every string stored in a node.
    @string.gsub!( /\r\n?/, "\n" )
    Text.check(@string, illegal, doctype) if @raw
  end
  # Attaches this node to +parent+ and, for raw text that now has a parent,
  # re-validates the stored string with the default pattern.
  def parent=(parent)
    super(parent)
    return unless @raw && @parent
    Text.check(@string, NEEDS_A_SECOND_CHECK, doctype)
  end
  114. # check for illegal characters
  # Validates +string+ as raw XML text and raises a RuntimeError describing
  # the first problem found.
  #
  # +string+:: the text to validate
  # +pattern+:: regular expression scanned for markup; its capture groups
  #             are read through $1 and $5
  # +doctype+:: accepted, but only referred to by the disabled code below
  def Text.check string, pattern, doctype
    # illegal anywhere
    unless string =~ VALID_XML_CHARS
      # Something is invalid; walk the characters to report which one.
      if String.method_defined? :encode
        string.chars.each do |char|
          code = char.ord
          next if VALID_CHAR.any? { |allowed| allowed === code }
          raise "Illegal character #{char.inspect} in raw string \"#{string}\""
        end
      else
        string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |bytes|
          unpacked = bytes.unpack('U')
          next if VALID_CHAR.any? { |allowed| allowed === unpacked }
          raise "Illegal character #{bytes.inspect} in raw string \"#{string}\""
        end
      end
    end

    # context sensitive
    string.scan(pattern) do
      matched, reference = $1, $5
      if matched[-1] != ?;
        # A bare "<" or an "&" that is not a complete reference.
        raise "Illegal character '#{matched}' in raw string \"#{string}\""
      elsif matched[0] == ?& and reference and reference[0] == ?#
        # Numeric character reference: it must name a valid XML character.
        code = if reference[1] == ?x
                 reference[2..-1].to_i(16)
               else
                 reference[1..-1].to_i
               end
        unless VALID_CHAR.any? { |allowed| allowed === code }
          raise "Illegal character '#{matched}' in raw string \"#{string}\""
        end
        # FIXME: below can't work but this needs API change.
        # elsif @parent and $3 and !SUBSTITUTES.include?($1)
        #   if !doctype or !doctype.entities.has_key?($3)
        #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
        #   end
      end
    end
  end
  # Identifies the kind of node; always the Symbol +:text+.
  def node_type
    return :text
  end
  # True when the stored string has no characters.
  def empty?
    @string.empty?
  end
  # Builds a new Text from this one through the Text-argument form of the
  # constructor.
  def clone
    Text.new(self)
  end
  165. # Appends text to this text node. The text is appended in the +raw+ mode
  166. # of this text node.
  # +to_append+:: a String; line endings in it are normalised to "\n" first.
  # Returns the node's updated internal string.
  def <<( to_append )
    @string << to_append.gsub( /\r\n?/, "\n" )
    # The memoised results of to_s / value were computed from the old text.
    @normalized = @unnormalized = nil
    @string
  end
  170. # +other+ a String or a Text
  171. # +returns+ the result of (to_s <=> arg.to_s)
  # Orders this node against +other+ by comparing their string forms.
  def <=>( other )
    mine = to_s
    theirs = other.to_s
    mine <=> theirs
  end
  # Looks up the doctype of the document this node belongs to.  Returns nil
  # when the node has no parent or the parent reports no document.
  def doctype
    return nil unless @parent
    document = @parent.document
    document ? document.doctype : nil
  end
  181. REFERENCE = /#{Entity::REFERENCE}/
  182. # Returns the string value of this text node. This string is always
  183. # escaped, meaning that it is a valid XML text node string, and all
  184. # entities that can be escaped, have been inserted. This method respects
  185. # the entity filter set in the constructor.
  186. #
  187. # # Assume that the entity "s" is defined to be "sean", and that the
  188. # # entity "r" is defined to be "russell"
  189. # t = Text.new( "< & sean russell", false, nil, false, ['s'] )
  190. # t.to_s #-> "&lt; &amp; &s; russell"
  191. # t = Text.new( "< & &s; russell", false, nil, false )
  192. # t.to_s #-> "&lt; &amp; &s; russell"
  193. # u = Text.new( "sean russell", false, nil, true )
  194. # u.to_s #-> "sean russell"
  195. def to_s
  196. return @string if @raw
  197. return @normalized if @normalized
  198. @normalized = Text::normalize( @string, doctype, @entity_filter )
  199. end
  200. def inspect
  201. @string.inspect
  202. end
  203. # Returns the string value of this text. This is the text without
  204. # entities, as it might be used programmatically, or printed to the
  205. # console. This ignores the 'raw' attribute setting, and any
  206. # entity_filter.
  207. #
  208. # # Assume that the entity "s" is defined to be "sean", and that the
  209. # # entity "r" is defined to be "russell"
  210. # t = Text.new( "< & sean russell", false, nil, false, ['s'] )
  211. # t.value #-> "< & sean russell"
  212. # t = Text.new( "< & &s; russell", false, nil, false )
  213. # t.value #-> "< & sean russell"
  214. # u = Text.new( "sean russell", false, nil, true )
  215. # u.value #-> "sean russell"
  def value
    # Expand entities once and memoise the result.
    @unnormalized ||= Text::unnormalize( @string, doctype )
  end
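  # Sets the contents of this text node. This expects the text to be
  # unnormalized. Cached normalized/unnormalized values are discarded and the
  # node is marked as not raw. Note that it does not return self: used with
  # assignment syntax, the expression evaluates to the assigned value.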
  222. #
  223. # e = Element.new( "a" )
  224. # e.add_text( "foo" ) # <a>foo</a>
  225. # e[0].value = "bar" # <a>bar</a>
  226. # e[0].value = "<a>" # <a>&lt;a&gt;</a>
  def value=( val )
    # Store the new text with normalised line endings ...
    @string = val.gsub( /\r\n?/, "\n" )
    # ... drop both memoised forms, and treat the node as non-raw from now on.
    @normalized = @unnormalized = nil
    @raw = false
  end
  # Recursively wraps +string+ so that lines break at spaces, no later than
  # +width+ where possible.
  #
  # +string+:: the text to wrap
  # +width+:: the column at or before which a line should be broken
  # +addnewline+:: if true, a newline is put in front of the wrapped text
  #                (only when wrapping actually takes place)
  #
  # Returns +string+ itself when it already fits or contains no space to
  # break on; otherwise a new String.
  def wrap(string, width, addnewline=false)
    return string if string.length <= width
    # Prefer the last space at or before the cutoff.  A leading word longer
    # than +width+ has none, so fall back to the first space after it.
    place = string.rindex(' ', width) || string.index(' ', width)
    # No space anywhere: there is nothing to break on.
    return string unless place
    wrapped = string[0,place] + "\n" + wrap(string[place+1..-1], width)
    addnewline ? "\n" + wrapped : wrapped
  end
  # Indents every line of +string+.
  #
  # +string+:: the text to indent
  # +level+:: how many times +style+ is repeated in front of each line; a
  #           negative level returns +string+ unchanged
  # +style+:: the string used for one level of indentation
  # +indentfirstline+:: if false, leading whitespace (including the
  #                     indentation of the first line) is stripped
  #
  # Trailing whitespace is removed from each line, but the line terminators
  # themselves are kept so the result has as many lines as the input.
  def indent_text(string, level=1, style="\t", indentfirstline=true)
    return string if level < 0
    indent_string = style * level
    pieces = []
    string.each_line { |line|
      content = line.chomp
      # Whatever chomp removed is this line's terminator ("" on a last line
      # without one); put it back after stripping trailing whitespace.
      terminator = line[content.length..-1]
      pieces << ((indent_string + content).sub(/\s+\z/, '') + terminator)
    }
    new_string = pieces.join
    new_string.lstrip! unless indentfirstline
    return new_string
  end
  254. # == DEPRECATED
  255. # See REXML::Formatters
  256. #
  def write( writer, indent=-1, transitive=false, ie_hack=false )
    Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters")
    # A non-negative indent selects the pretty printer; +transitive+ and
    # +ie_hack+ are accepted but not used here.
    formatter = indent > -1 ? REXML::Formatters::Pretty.new( indent ) : REXML::Formatters::Default.new
    formatter.write( self, writer )
  end
  266. # FIXME
  267. # This probably won't work properly
  # Builds this node's path by appending "/text()" to the parent's xpath.
  def xpath
    @parent.xpath + "/text()"
  end
  273. # Writes out text, substituting special characters beforehand.
  274. # +out+ A String, IO, or any other object supporting <<( String )
  # +input+ the text to substitute and then write out
  276. #
  277. # z=utf8.unpack("U*")
  278. # ascOut=""
  279. # z.each{|r|
  280. # if r < 0x100
  281. # ascOut.concat(r.chr)
  282. # else
  283. # ascOut.concat(sprintf("&#x%x;", r))
  284. # end
  285. # }
  286. # puts ascOut
  def write_with_substitution(out, input)
    escaped = input.clone
    # Kept unrolled on purpose: this is faster than looping over the tables.
    escaped.gsub!(SPECIALS[0], SUBSTITUTES[0])
    escaped.gsub!(SPECIALS[1], SUBSTITUTES[1])
    escaped.gsub!(SPECIALS[2], SUBSTITUTES[2])
    escaped.gsub!(SPECIALS[3], SUBSTITUTES[3])
    escaped.gsub!(SPECIALS[4], SUBSTITUTES[4])
    escaped.gsub!(SPECIALS[5], SUBSTITUTES[5])
    out << escaped
  end
  298. # Reads text, substituting entities
  # Returns a copy of +input+ with line endings normalised to "\n" and the
  # five predefined entities and numeric character references decoded.
  #
  # +input+:: the String to read; it is not modified
  # +illegal+:: optional pattern; if it matches, a ParseException is raised
  #
  # All references are decoded in a single pass, so text produced by one
  # substitution is never decoded a second time ("&amp;lt;" yields "&lt;").
  # Hexadecimal references are accepted in either case.  Other entity
  # references are left as they are.
  def self.read_with_substitution( input, illegal=nil )
    # dup rather than clone so that a frozen input can be edited in place.
    copy = input.dup
    if illegal and copy =~ illegal
      raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
    end
    copy.gsub!( /\r\n?/, "\n" )
    if copy.include? ?&
      copy.gsub!( /&(?:(lt|gt|quot|apos|amp)|#0*((?:\d+)|(?:x[a-fA-F0-9]+)));/ ) {
        name, number = $1, $2
        case name
        when 'lt' then '<'
        when 'gt' then '>'
        when 'quot' then '"'
        when 'apos' then "'"
        when 'amp' then '&'
        else
          # Numeric reference: "x41" becomes "0x41" so Integer() reads hex.
          number = "0#{number}" if number[0, 1] == 'x'
          [Integer(number)].pack('U*')
        end
      }
    end
    copy
  end
  320. EREFERENCE = /&(?!#{Entity::NAME};)/
  321. # Escapes all possible entities
  # Returns an escaped copy of +input+: every "&" becomes "&amp;" and every
  # occurrence of an entity's value is replaced by a reference to it.
  #
  # +input+:: the text to escape (converted with to_s)
  # +doctype+:: if given, its entities are used; otherwise
  #             DocType::DEFAULT_ENTITIES
  # +entity_filter+:: optional collection of entity names that are left
  #                   alone; entity objects are accepted as well.  It is
  #                   only consulted when +doctype+ is given.
  def self.normalize( input, doctype=nil, entity_filter=nil )
    copy = input.to_s
    # Escape ampersands first so the references inserted below stay intact.
    copy = copy.gsub( "&", "&amp;" )
    if doctype
      doctype.entities.each_value do |entity|
        next unless entity.value
        # The filter holds names (that is how unnormalize uses it), so compare
        # by name; matching the object itself is kept for older callers.
        next if entity_filter and
          ( entity_filter.include?(entity.name) or entity_filter.include?(entity) )
        copy = copy.gsub( entity.value, "&#{entity.name};" )
      end
    else
      DocType::DEFAULT_ENTITIES.each_value do |entity|
        copy = copy.gsub(entity.value, "&#{entity.name};" )
      end
    end
    copy
  end
  342. # Unescapes all possible entities
  # Returns a copy of +string+ with line endings normalised to "\n" and
  # references expanded.
  #
  # +string+:: the escaped text
  # +doctype+:: if given, named entities are looked up through
  #             doctype.entity; otherwise in DocType::DEFAULT_ENTITIES
  # +filter+:: optional collection of entity names to leave unexpanded
  # +illegal+:: accepted but not used
  def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
    string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) do |ref|
      name = ref[1...-1]
      if ref[1] == ?#
        # Numeric character reference, hexadecimal when it starts with "x".
        code = ref[2] == ?x ? ref[3...-1].to_i(16) : ref[2...-1].to_i
        [code].pack('U*')
      elsif ref == '&amp;'
        '&'
      elsif filter and filter.include?( name )
        ref
      elsif doctype
        # Unknown entities are left as written.
        doctype.entity( name ) || ref
      else
        known = DocType::DEFAULT_ENTITIES[ name ]
        known ? known.value : ref
      end
    end
  end
  364. end
  365. end