PageRenderTime 50ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/ruby/1.8/rexml/text.rb

https://bitbucket.org/nicksieger/jruby
Ruby | 344 lines | 226 code | 31 blank | 87 comment | 25 complexity | 1c4f8c56ccf5291e580f2e3a64bd0d93 MD5 | raw file
Possible License(s): GPL-3.0, JSON
  1. require 'rexml/entity'
  2. require 'rexml/doctype'
  3. require 'rexml/child'
  4. require 'rexml/doctype'
  5. require 'rexml/parseexception'
  6. module REXML
  7. # Represents text nodes in an XML document
  8. class Text < Child
  9. include Comparable
  # Patterns for the characters that are escaped when text is written,
  # listed in the order in which the substitutions occur. Each pattern in
  # SPECIALS corresponds index-for-index to its replacement in SUBSTITUTES.
  # The first pattern only matches an ampersand that is not already the
  # start of something shaped like an entity or character reference.
  SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
  SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
  # Characters which are substituted in written strings, and the escaped
  # forms that stand for them. Each pattern in SETUTITSBUS corresponds
  # index-for-index to the character in SLAICEPS. (The names are SPECIALS
  # and SUBSTITUTES spelled backwards.)
  SLAICEPS = [ '<', '>', '"', "'", '&' ]
  SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]
  # If +raw+ is true, then REXML leaves the value alone
  attr_accessor :raw
  # Default value of the constructor's +illegal+ argument. Matches a '<',
  # or an '&' that is followed neither by Entity::NAME nor by a numeric
  # character reference terminated with ';'. Capture 1 holds the offending
  # character.
  ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
  # Matches a decimal (&#65;) or hexadecimal (&#x41;) character reference.
  # Capture 1 holds the number with leading zeros removed; hexadecimal
  # numbers keep their 'x' prefix.
  NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
  20. # Constructor
  21. # +arg+ if a String, the content is set to the String. If a Text,
  22. # the object is shallowly cloned.
  23. #
  24. # +respect_whitespace+ (boolean, false) if true, whitespace is
  25. # respected
  26. #
  27. # +parent+ (nil) if this is a Parent object, the parent
  28. # will be set to this.
  29. #
  30. # +raw+ (nil) This argument can be given three values.
  31. # If true, then the value used to construct this object is expected to
  32. # contain no unescaped XML markup, and REXML will not change the text. If
  33. # this value is false, the string may contain any characters, and REXML will
  34. # escape any and all defined entities whose values are contained in the
  35. # text. If this value is nil (the default), then the raw value of the
  36. # parent will be used as the raw value for this node. If there is no raw
  37. # value for the parent, and no value is supplied, the default is false.
  38. # Use this field if you have entities defined for some text, and you don't
  39. # want REXML to escape that text in output.
  40. # Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
  41. # Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
  42. # Text.new( "<&", false, nil, true ) #-> Parse exception
  43. # Text.new( "&lt;&amp;", false, nil, true ) #-> "&lt;&amp;"
  44. # # Assume that the entity "s" is defined to be "sean"
  45. # # and that the entity "r" is defined to be "russell"
  46. # Text.new( "sean russell" ) #-> "&s; &r;"
  47. # Text.new( "sean russell", false, nil, true ) #-> "sean russell"
  48. #
  49. # +entity_filter+ (nil) This can be an array of entities to match in the
  50. # supplied text. This argument is only useful if +raw+ is set to false.
  51. # Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
  52. # Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
  53. # In the last example, the +entity_filter+ argument is ignored.
  54. #
  55. # +illegal+ INTERNAL USE ONLY
  # Builds a text node from +arg+, which must be a String or another Text.
  #
  # Raises a RuntimeError when +arg+ is neither a String nor a Text, or
  # when the node is raw and its content matches +illegal+.
  def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
    entity_filter=nil, illegal=ILLEGAL )
    @raw = false
    if parent
      super( parent )
      @raw = parent.raw
    else
      @parent = nil
    end
    # An explicit +raw+ argument overrides whatever the parent supplied.
    @raw = raw unless raw.nil?
    @entity_filter = entity_filter
    @normalized = @unnormalized = nil

    if arg.kind_of? String
      # dup rather than clone: clone keeps the frozen state, which would
      # make the in-place edits below fail for a frozen argument.
      @string = arg.dup
      @string.squeeze!(" \n\t") unless respect_whitespace
    elsif arg.kind_of? Text
      # Copy the stored content itself. Going through arg.to_s would store
      # the already-escaped form, which is escaped a second time when the
      # copy is written; for raw nodes to_s hands back the very same String
      # object, so the two nodes would share (and mutate) one buffer.
      @string = arg.instance_variable_get( :@string ).dup
      @raw = arg.raw
    else
      raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
    end

    # Normalize line endings to a single "\n".
    @string.gsub!( /\r\n?/, "\n" )

    # Raw text is left alone on output, so check it for illegal characters
    # up front.
    if @raw and @string =~ illegal
      raise "Illegal character '#{$1}' in raw string \"#{@string}\""
    end
  end
  # Identifies the kind of node this object is.
  # Returns the Symbol :text.
  def node_type
    return :text
  end
  # Returns true when the stored string has no characters.
  def empty?
    @string.empty?
  end
  # Returns a new Text built by passing this node to the constructor.
  def clone
    Text.new( self )
  end
  94. # Appends text to this text node. The text is appended in the +raw+ mode
  95. # of this text node.
  # +to_append+ a String; its line endings are converted to "\n" before it
  # is added to the stored content.
  # +returns+ the stored String, after the append
  def <<( to_append )
    @string << to_append.gsub( /\r\n?/, "\n" )
    # The cached results of to_s and value were computed from the old
    # content; discard them so the next call recomputes.
    @normalized = @unnormalized = nil
    @string
  end
  99. # +other+ a String or a Text
  100. # +returns+ the result of (to_s <=> arg.to_s)
  # Orders this node against +other+ by comparing the to_s form of each.
  def <=>( other )
    mine = to_s
    theirs = other.to_s
    mine <=> theirs
  end
  104. REFERENCE = /#{Entity::REFERENCE}/
  105. # Returns the string value of this text node. This string is always
  106. # escaped, meaning that it is a valid XML text node string, and all
  107. # entities that can be escaped, have been inserted. This method respects
  108. # the entity filter set in the constructor.
  109. #
  110. # # Assume that the entity "s" is defined to be "sean", and that the
  111. # # entity "r" is defined to be "russell"
  112. # t = Text.new( "< & sean russell", false, nil, false, ['s'] )
  113. # t.to_s #-> "&lt; &amp; &s; russell"
  114. # t = Text.new( "< & &s; russell", false, nil, false )
  115. # t.to_s #-> "&lt; &amp; &s; russell"
  116. # u = Text.new( "sean russell", false, nil, true )
  117. # u.to_s #-> "sean russell"
  # Raw nodes hand back the stored string untouched. Otherwise the escaped
  # form is computed once with Text::normalize, using the doctype of the
  # parent's document when there is one, and cached in @normalized.
  def to_s
    return @string if @raw
    @normalized ||= begin
      doc = @parent ? @parent.document : nil
      doctype = doc ? doc.doctype : nil
      Text::normalize( @string, doctype, @entity_filter )
    end
  end
  # Returns the inspect form of the stored string (quoted, with escapes),
  # not of the node object.
  def inspect
    return @string.inspect
  end
  131. # Returns the string value of this text. This is the text without
  132. # entities, as it might be used programmatically, or printed to the
  133. # console. This ignores the 'raw' attribute setting, and any
  134. # entity_filter.
  135. #
  136. # # Assume that the entity "s" is defined to be "sean", and that the
  137. # # entity "r" is defined to be "russell"
  138. # t = Text.new( "< & sean russell", false, nil, false, ['s'] )
  139. # t.value #-> "< & sean russell"
  140. # t = Text.new( "< & &s; russell", false, nil, false )
  141. # t.value #-> "< & sean russell"
  142. # u = Text.new( "sean russell", false, nil, true )
  143. # u.value #-> "sean russell"
  # The result of Text::unnormalize is cached in @unnormalized and reused
  # on later calls; value= clears the cache when the content is replaced.
  # The doctype of the parent's document, when there is one, is passed
  # along so that its entities can be expanded.
  def value
    # Without the explicit return the cached value was evaluated and then
    # thrown away, so every call recomputed the result.
    return @unnormalized if @unnormalized
    doctype = nil
    if @parent
      doc = @parent.document
      doctype = doc.doctype if doc
    end
    @unnormalized = Text::unnormalize( @string, doctype )
  end
  153. # Sets the contents of this text node. This expects the text to be
  154. # unnormalized. It returns self.
  155. #
  156. # e = Element.new( "a" )
  157. # e.add_text( "foo" ) # <a>foo</a>
  158. # e[0].value = "bar" # <a>bar</a>
  159. # e[0].value = "<a>" # <a>&lt;a&gt;</a>
  # Stores +val+ with its line endings converted to "\n", discards both
  # cached string forms, and switches the node out of raw mode.
  def value=( val )
    @string = val.gsub( /\r\n?/, "\n" )
    @normalized = @unnormalized = nil
    @raw = false
  end
  # Breaks +string+ into lines by replacing spaces with newlines.
  #
  # +string+ the text to wrap
  # +width+ the preferred maximum line length; each break is made at the
  # last space at or before this position
  # +addnewline+ if true, and +string+ is longer than +width+, the result
  # starts with a newline
  #
  # A word longer than +width+ cannot be split: the line is then broken at
  # the first space after it, or left as is when there is none. (This case
  # used to raise a TypeError because rindex returned nil.)
  def wrap(string, width, addnewline=false)
    return string if string.length <= width
    lines = []
    rest = string
    while rest.length > width
      # Last space before the cutoff, else the first one after it.
      place = rest.rindex(' ', width) || rest.index(' ', width)
      break unless place
      lines << rest[0, place]
      rest = rest[place+1..-1]
    end
    lines << rest
    wrapped = lines.join("\n")
    addnewline ? "\n" + wrapped : wrapped
  end
  # Prefixes every line of +string+ with +style+ repeated +level+ times and
  # removes trailing whitespace from each line.
  #
  # +string+ the text to indent; returned unchanged when +level+ is negative
  # +level+ how many copies of +style+ make up the indent
  # +style+ the indent unit
  # +indentfirstline+ if false, leading and trailing whitespace is stripped
  # from the result, which removes the indent of the first line
  #
  # NOTE(review): the trailing-whitespace removal also removes each line's
  # newline, so the lines of a multi-line string end up joined with only
  # the indent between them. That behavior is kept as it was; confirm
  # whether callers rely on it.
  def indent_text(string, level=1, style="\t", indentfirstline=true)
    return string if level < 0
    indent_string = style * level
    new_string = String.new
    # each_line rather than each: String#each does not exist in Ruby 1.9
    # and later, while each_line is available in 1.8 as well.
    string.each_line { |line|
      new_string << (indent_string + line).sub(/[\s]+$/,'')
    }
    new_string.strip! unless indentfirstline
    new_string
  end
  187. # == DEPRECATED
  188. # See REXML::Formatters
  189. #
  # Emits a deprecation warning, then writes this node to +writer+ through
  # a formatter: REXML::Formatters::Pretty when +indent+ is greater than
  # -1, REXML::Formatters::Default otherwise. +transitive+ and +ie_hack+
  # are accepted but not used here.
  def write( writer, indent=-1, transitive=false, ie_hack=false )
    Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters")
    if indent > -1
      formatter = REXML::Formatters::Pretty.new( indent )
    else
      formatter = REXML::Formatters::Default.new
    end
    formatter.write( self, writer )
  end
  199. # FIXME
  200. # This probably won't work properly
  # Returns the parent's xpath with "/text()" appended. The parent must be
  # set; there is no handling for a node without one.
  def xpath
    @parent.xpath + "/text()"
  end
  206. # Writes out text, substituting special characters beforehand.
  207. # +out+ A String, IO, or any other object supporting <<( String )
  208. # +input+ the text to substitute and then write out
  209. #
  210. # z=utf8.unpack("U*")
  211. # ascOut=""
  212. # z.each{|r|
  213. # if r < 0x100
  214. # ascOut.concat(r.chr)
  215. # else
  216. # ascOut.concat(sprintf("&#x%x;", r))
  217. # end
  218. # }
  219. # puts ascOut
  # Appends an escaped copy of +input+ to +out+. The copy has every
  # SPECIALS pattern replaced, in order, by the SUBSTITUTES entry at the
  # same index; +input+ itself is not modified.
  def write_with_substitution out, input
    copy = input.clone
    SPECIALS.each_with_index do |pattern, index|
      copy.gsub!( pattern, SUBSTITUTES[index] )
    end
    out << copy
  end
  231. # Reads text, substituting entities
  # Returns a copy of +input+ with line endings converted to "\n" and with
  # the five predefined entities (&lt; &gt; &quot; &apos; &amp;) and all
  # numeric character references replaced by the characters they stand for.
  #
  # +input+ the escaped text; it is not modified
  # +illegal+ (nil) an optional pattern; if it matches the text, a
  # ParseException is raised
  def Text::read_with_substitution( input, illegal=nil )
    # dup rather than clone, so a frozen +input+ still gives a mutable copy
    copy = input.dup
    if illegal and copy =~ illegal
      raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
    end
    copy.gsub!( /\r\n?/, "\n" )
    if copy.include?( "&" )
      # All references are expanded in one pass so that the output of one
      # replacement is never read as a new reference: "&amp;#60;" must
      # become "&#60;", not "<". Hexadecimal digits are accepted in either
      # case, as in NUMERICENTITY.
      copy.gsub!( /&(?:(lt|gt|quot|apos|amp)|#0*((?:\d+)|(?:x[a-fA-F0-9]+)));/ ) {
        name, number = $1, $2
        if name
          case name
          when 'lt' then '<'
          when 'gt' then '>'
          when 'quot' then '"'
          when 'apos' then "'"
          else '&'
          end
        else
          # Integer() needs the "0x" form to read hexadecimal digits.
          number = "0#{number}" if number[0, 1] == 'x'
          [Integer(number)].pack('U*')
        end
      }
    end
    copy
  end
  253. EREFERENCE = /&(?!#{Entity::NAME};)/
  254. # Escapes all possible entities
  # Returns an escaped copy of +input+: every '&' becomes "&amp;", then
  # each occurrence of an entity's value is replaced by a reference to
  # that entity.
  #
  # +input+ the text to escape (converted with to_s)
  # +doctype+ (nil) if given, its entities are used; otherwise
  # DocType::DEFAULT_ENTITIES are used
  # +entity_filter+ (nil) an array of entity names; entities of +doctype+
  # whose name is listed are NOT substituted, matching how
  # Text::unnormalize treats its filter.
  #
  # NOTE(review): the constructor documentation describes the filter as
  # the entities to match, which is the opposite of what this code and
  # Text::unnormalize do; confirm which is intended.
  def Text::normalize( input, doctype=nil, entity_filter=nil )
    copy = input.to_s
    #copy = copy.gsub( EREFERENCE, '&amp;' )
    copy = copy.gsub( "&", "&amp;" )
    if doctype
      doctype.entities.each_value do |entity|
        next unless entity.value
        # The filter holds names, so it must be compared with the entity's
        # name; comparing with the entity object itself never matched.
        next if entity_filter and entity_filter.include?( entity.name )
        copy = copy.gsub( entity.value, "&#{entity.name};" )
      end
    else
      DocType::DEFAULT_ENTITIES.each_value do |entity|
        copy = copy.gsub( entity.value, "&#{entity.name};" )
      end
    end
    copy
  end
  275. # Unescapes all possible entities
  # Returns a copy of +string+ with line endings converted to "\n", numeric
  # character references decoded, and entity references replaced by their
  # values.
  #
  # +string+ the escaped text; it is not modified
  # +doctype+ (nil) if given, entity values are looked up with
  # doctype.entity; otherwise DocType::DEFAULT_ENTITIES is used
  # +filter+ (nil) an array of entity names that are left unexpanded
  # +illegal+ (nil) accepted but not used by this method
  def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
    # dup rather than clone, so a frozen +string+ still gives a mutable copy
    rv = string.dup
    rv.gsub!( /\r\n?/, "\n" )
    matches = rv.scan( REFERENCE )
    return rv if matches.size == 0
    rv.gsub!( NUMERICENTITY ) {
      number = $1
      # Integer() needs the "0x" form to read hexadecimal digits.
      number = "0#{number}" if number[0, 1] == 'x'
      [Integer(number)].pack('U*')
    }
    # presumably capture 1 of REFERENCE is the entity name and is nil for
    # numeric references -- Entity::REFERENCE is defined elsewhere; verify
    names = matches.collect { |match| match[0] }.compact
    if names.size > 0
      names.each do |name|
        next if filter and filter.include?( name )
        if doctype
          replacement = doctype.entity( name )
        else
          entity = DocType::DEFAULT_ENTITIES[ name ]
          replacement = entity && entity.value
        end
        # A String pattern is matched literally, so characters such as '.'
        # in a name are not treated as regexp syntax; the block form keeps
        # backslashes in the value from being read as back-references.
        rv.gsub!( "&#{name};" ) { replacement } if replacement
      end
      # &amp; is expanded last so that the ampersand it produces cannot
      # combine with following text into a new reference.
      rv.gsub!( /&amp;/, '&' )
    end
    rv
  end
  309. end
  310. end