PageRenderTime 62ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/vendor/jruby-1.1.6RC1/lib/ruby/1.8/rexml/parsers/baseparser.rb

https://bitbucket.org/nicksieger/advent-jruby
Ruby | 463 lines | 361 code | 27 blank | 75 comment | 61 complexity | cd07efdf9f78cbda176c968e65b6b53a MD5 | raw file
Possible License(s): CPL-1.0, AGPL-1.0, LGPL-2.1, JSON
  1. require 'rexml/parseexception'
  2. require 'rexml/source'
  3. module REXML
  4. module Parsers
  5. # = Using the Pull Parser
  6. # <em>This API is experimental, and subject to change.</em>
  7. # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
  8. # while parser.has_next?
  9. # res = parser.next
  10. # puts res[1]['att'] if res.start_tag? and res[0] == 'b'
  11. # end
  12. # See the PullEvent class for information on the content of the results.
  13. # The data is identical to the arguments passed for the various events to
  14. # the StreamListener API.
  15. #
  16. # Notice that:
  17. # parser = PullParser.new( "<a>BAD DOCUMENT" )
  18. # while parser.has_next?
  19. # res = parser.next
  20. # raise res[1] if res.error?
  21. # end
  22. #
  23. # Nat Price gave me some good ideas for the API.
  24. class BaseParser
  # ------------------------------------------------------------------
  # Lexical building blocks. These are plain strings that get
  # interpolated into the regular expressions further down.
  # ------------------------------------------------------------------
  NCNAME_STR= '[\w:][\-\w\d.]*'
  NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
  NAMECHAR = '[\-\w\d\.:]'
  NAME = "([\\w:]#{NAMECHAR}*)"
  NMTOKEN = "(?:#{NAMECHAR})+"
  NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
  # A general entity reference (&name;) or a decimal/hex character reference.
  REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
  REFERENCE_RE = /#{REFERENCE}/

  # ------------------------------------------------------------------
  # Markup recognisers. The *_START patterns classify what is at the
  # head of the buffer; the *_PATTERN patterns capture the content.
  # ------------------------------------------------------------------
  DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
  DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
  ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um
  COMMENT_START = /\A<!--/u
  COMMENT_PATTERN = /<!--(.*?)-->/um
  CDATA_START = /\A<!\[CDATA\[/u
  CDATA_END = /^\s*\]\s*>/um
  CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
  XMLDECL_START = /\A<\?xml\s/u
  XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
  INSTRUCTION_START = /\A<\?/u
  INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
  TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um
  CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um

  # Pseudo-attributes of the XML declaration. Each one allows optional
  # whitespace on both sides of the '='.
  VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
  ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
  # The whitespace after '=' is optional (\s*); a mandatory \s here would
  # make the common form standalone="yes" go unrecognised.
  STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

  # ------------------------------------------------------------------
  # DOCTYPE internal subset declarations.
  # ------------------------------------------------------------------
  ENTITY_START = /^\s*<!ENTITY/
  IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
  ELEMENTDECL_START = /^\s*<!ELEMENT/um
  ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
  SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
  ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
  NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
  ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
  ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
  ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
  DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
  ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
  ATTDEF_RE = /#{ATTDEF}/
  ATTLISTDECL_START = /^\s*<!ATTLIST/um
  ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
  NOTATIONDECL_START = /^\s*<!NOTATION/um
  PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
  SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um

  # Character data up to (not including) the next '<'.
  TEXT_PATTERN = /\A([^<]*)/um

  # Entity constants
  PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
  SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
  PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
  EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
  NDATADECL = "\\s+NDATA\\s+#{NAME}"
  PEREFERENCE = "%#{NAME};"
  # NOTE(review): the single-quoted alternative uses a capturing group where
  # the double-quoted one does not; the group layout is left untouched
  # because pull post-processes the captures of ENTITYDECL positionally.
  ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
  PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
  ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um

  # An '&' that does not start a well-formed named reference.
  EREFERENCE = /&(?!#{NAME};)/

  # name => [ pattern for the reference, reference text,
  #           literal character, pattern for the literal character ]
  DEFAULT_ENTITIES = {
    'gt' => [/&gt;/, '&gt;', '>', />/],
    'lt' => [/&lt;/, '&lt;', '<', /</],
    'quot' => [/&quot;/, '&quot;', '"', /"/],
    "apos" => [/&apos;/, "&apos;", "'", /'/]
  }

  ######################################################################
  # These are patterns to identify common markup errors, to make the
  # error messages more informative.
  ######################################################################
  MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
  # Builds a parser over +source+. All per-document state is set up by
  # the stream= writer, which this constructor delegates to.
  def initialize(source)
    self.stream = source
  end
  # Registers +listener+; every event subsequently returned by pull is
  # first handed to each registered listener via listener.receive(event).
  #
  # The first registration wraps this instance's pull: the current pull
  # is aliased to _old_pull and a replacement that notifies listeners is
  # evaluated on this object. Later registrations only append to the list.
  def add_listener( listener )
    already_hooked = defined?(@listeners) && @listeners
    unless already_hooked
      @listeners = []
      instance_eval <<-EOL
        alias :_old_pull :pull
        def pull
          event = _old_pull
          @listeners.each do |listener|
            listener.receive event
          end
          event
        end
      EOL
    end
    @listeners << listener
  end
  113. attr_reader :source
  # Points the parser at a new input and resets all parsing state.
  # +source+ is handed to SourceFactory.create_from to build the
  # object stored in @source.
  def stream=( source )
    @source = SourceFactory.create_from( source )
    @closed = @document_status = nil
    @tags, @stack = [], []
    @entities = []
  end
  # Reports the source's position when the source exposes one; sources
  # without a +position+ method yield 0.
  def position
    # FIXME
    return 0 unless @source.respond_to? :position
    @source.position
  end
  # True when nothing is left to deliver: the source is exhausted and no
  # events are waiting on the push-back stack.
  def empty?
    @source.empty? && @stack.empty?
  end
  # True while either the source still has input or pushed-back events
  # remain on the stack. Synonymous with !empty?
  def has_next?
    !@source.empty? || !@stack.empty?
  end
  # Puts +token+ back at the front of the pending-event stack so that it
  # is the next event delivered. Any number of events may be pushed back.
  def unshift token
    @stack.insert(0, token)
  end
  # Looks ahead at the event +depth+ positions from the head of the
  # stream without removing it; depth 0 is the next event. A depth of -1
  # pulls events until the parser reports empty and returns the last
  # event buffered. Anything below -1 is rejected.
  #
  # Looking ahead parses the input up to the requested event and keeps
  # all of those events on the internal stack, so a deep peek (or -1)
  # buffers that much of the document in memory.
  def peek depth=0
    raise %Q[Illegal argument "#{depth}"] if depth < -1
    fetched = []
    if depth == -1
      fetched.push(pull()) until empty?
    else
      wanted = depth + 1
      fetched.push(pull()) while @stack.size + fetched.size < wanted
    end
    @stack = @stack + fetched unless fetched.empty?
    @stack[depth]
  end
  # Returns the next event as an Array whose first element is a Symbol
  # naming the event (:start_element, :end_element, :text, :comment,
  # :cdata, :processing_instruction, :xmldecl, :start_doctype,
  # :end_doctype, :elementdecl, :entitydecl, :attlistdecl, :notationdecl,
  # :externalentity, :end_document) followed by the event's data.
  #
  # The method is a small state machine driven by @document_status:
  #   nil            - prolog: nothing but comments, the XML declaration,
  #                    processing instructions or a DOCTYPE seen so far
  #   :in_doctype    - inside the [...] internal subset of a DOCTYPE
  #   :after_doctype - regular document content
  #
  # Raises REXML::ParseException on malformed input; other exceptions
  # raised while parsing content are wrapped in a ParseException.
  #
  # NOTE(review): calls of the form @source.match(pattern, true) are
  # assumed to consume the matched text from the source while the
  # one-argument form only looks ahead - confirm against the Source class.
  def pull
    # A self-closing tag (<a/>) was returned as :start_element by the
    # previous call; emit its matching :end_element now.
    if @closed
      x, @closed = @closed, nil
      return [ :end_element, x ]
    end
    return [ :end_document ] if empty?
    # Events pushed back with unshift (or buffered by peek) go first.
    return @stack.shift if @stack.size > 0
    @source.read if @source.buffer.size<2
    #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
    if @document_status == nil
      # Prolog handling: classify the leading whitespace run or the
      # leading <...> construct without consuming it.
      #@source.consume( /^\s*/um )
      word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
      word = word[1] unless word.nil?
      #STDERR.puts "WORD = #{word.inspect}"
      case word
      when COMMENT_START
        return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
      when XMLDECL_START
        #STDERR.puts "XMLDECL"
        results = @source.match( XMLDECL_PATTERN, true )[1]
        # Each pseudo-attribute is optional; a missing one is reported as nil.
        version = VERSION.match( results )
        version = version[1] unless version.nil?
        encoding = ENCODING.match(results)
        encoding = encoding[1] unless encoding.nil?
        # The declared encoding (possibly nil) is passed on to the source.
        @source.encoding = encoding
        standalone = STANDALONE.match(results)
        standalone = standalone[1] unless standalone.nil?
        return [ :xmldecl, version, encoding, standalone ]
      when INSTRUCTION_START
        return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
      when DOCTYPE_START
        md = @source.match( DOCTYPE_PATTERN, true )
        identity = md[1]
        # close is "[" when an internal subset follows, ">" otherwise.
        close = md[2]
        identity =~ IDENTITY
        name = $1
        raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
        pub_sys = $2.nil? ? nil : $2.strip
        long_name = $4.nil? ? nil : $4.strip
        uri = $6.nil? ? nil : $6.strip
        args = [ :start_doctype, name, pub_sys, long_name, uri ]
        if close == ">"
          # No internal subset: queue the :end_doctype event so that it
          # is delivered by the next call.
          @document_status = :after_doctype
          @source.read if @source.buffer.size<2
          md = @source.match(/^\s*/um, true)
          @stack << [ :end_doctype ]
        else
          @document_status = :in_doctype
        end
        return args
      when /^\s+/
        # Leading whitespace: no action here; control falls through to
        # the content handling below with the status still nil.
      else
        # Anything else ends the prolog.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        md = @source.match(/\s*/um, true)
      end
    end
    if @document_status == :in_doctype
      # Look ahead to the next '>' to decide which declaration follows.
      md = @source.match(/\s*(.*?>)/um)
      case md[1]
      when SYSTEMENTITY
        match = @source.match( SYSTEMENTITY, true )[1]
        return [ :externalentity, match ]
      when ELEMENTDECL_START
        return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
      when ENTITY_START
        # to_a.compact leaves only the groups that participated in the
        # match; slot 0 (the whole match) is replaced by the event name.
        match = @source.match( ENTITYDECL, true ).to_a.compact
        match[0] = :entitydecl
        ref = false
        # A leading '%' marks a parameter entity; it is removed here and
        # re-appended at the end of the event.
        if match[1] == '%'
          ref = true
          match.delete_at 1
        end
        # Now we have to sort out what kind of entity reference this is
        if match[2] == 'SYSTEM'
          # External reference
          # [1..-2] strips the surrounding quote characters.
          match[3] = match[3][1..-2] # PUBID
          match.delete_at(4) if match.size > 4 # Chop out NDATA decl
          # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
        elsif match[2] == 'PUBLIC'
          # External reference
          match[3] = match[3][1..-2] # PUBID
          match[4] = match[4][1..-2] # HREF
          # match is [ :entity, name, PUBLIC, pubid, href ]
        else
          # Internal entity: strip the quotes around the value.
          match[2] = match[2][1..-2]
          match.pop if match.size == 4
          # match is [ :entity, name, value ]
        end
        match << '%' if ref
        return match
      when ATTLISTDECL_START
        md = @source.match( ATTLISTDECL_PATTERN, true )
        raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
        element = md[1]
        contents = md[0]
        # Build attribute-name => default-value pairs, skipping
        # attributes declared #IMPLIED.
        pairs = {}
        values = md[0].scan( ATTDEF_RE )
        values.each do |attdef|
          unless attdef[3] == "#IMPLIED"
            attdef.compact!
            val = attdef[3]
            # For "#FIXED value" the value itself is in the next slot.
            val = attdef[4] if val == "#FIXED "
            pairs[attdef[0]] = val
          end
        end
        return [ :attlistdecl, element, pairs, contents ]
      when NOTATIONDECL_START
        md = nil
        # Try the PUBLIC form first, then the SYSTEM form.
        if @source.match( PUBLIC )
          md = @source.match( PUBLIC, true )
          vals = [md[1],md[2],md[4],md[6]]
        elsif @source.match( SYSTEM )
          md = @source.match( SYSTEM, true )
          vals = [md[1],md[2],nil,md[4]]
        else
          raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
        end
        return [ :notationdecl, *vals ]
      when CDATA_END
        # "]>" closes the internal subset (the constant's name notwithstanding).
        @document_status = :after_doctype
        @source.match( CDATA_END, true )
        return [ :end_doctype ]
      end
    end
    begin
      # Document content: dispatch on the first two buffer characters.
      if @source.buffer[0] == ?<
        if @source.buffer[1] == ?/
          # Closing tag: it must match the most recently opened element.
          last_tag = @tags.pop
          #md = @source.match_to_consume( '>', CLOSE_MATCH)
          md = @source.match( CLOSE_MATCH, true )
          raise REXML::ParseException.new( "Missing end tag for "+
            "'#{last_tag}' (got \"#{md[1]}\")",
            @source) unless last_tag == md[1]
          return [ :end_element, last_tag ]
        elsif @source.buffer[1] == ?!
          # "<!" introduces either a comment or a CDATA section here.
          md = @source.match(/\A(\s*[^>]*>)/um)
          #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
          raise REXML::ParseException.new("Malformed node", @source) unless md
          # The third character distinguishes "<!-" from everything else.
          if md[0][2] == ?-
            md = @source.match( COMMENT_PATTERN, true )
            return [ :comment, md[1] ] if md
          else
            md = @source.match( CDATA_PATTERN, true )
            return [ :cdata, md[1] ] if md
          end
          raise REXML::ParseException.new( "Declarations can only occur "+
            "in the doctype declaration.", @source)
        elsif @source.buffer[1] == ??
          md = @source.match( INSTRUCTION_PATTERN, true )
          return [ :processing_instruction, md[1], md[2] ] if md
          raise REXML::ParseException.new( "Bad instruction declaration",
            @source)
        else
          # Get the next tag
          md = @source.match(TAG_MATCH, true)
          unless md
            # Check for missing attribute quotes
            raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
            raise REXML::ParseException.new("malformed XML: missing tag start", @source)
          end
          attrs = []
          if md[2].size > 0
            attrs = md[2].scan( ATTRIBUTE_PATTERN )
            raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
          end
          if md[4]
            # Self-closing tag: remember the name so that the next call
            # emits the :end_element event.
            @closed = md[1]
          else
            @tags.push( md[1] )
          end
          # Each scan result is [ name, quote character, value ].
          attributes = {}
          attrs.each { |a,b,c| attributes[a] = c }
          return [ :start_element, md[1], attributes ]
        end
      else
        # Character data up to the next '<'.
        md = @source.match( TEXT_PATTERN, true )
        if md[0].length == 0
          @source.match( /(\s+)/, true )
        end
        #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
        #return [ :text, "" ] if md[0].length == 0
        # unnormalized = Text::unnormalize( md[1], self )
        # return PullEvent.new( :text, md[1], unnormalized )
        return [ :text, md[1] ]
      end
    rescue REXML::ParseException
      # Parse errors raised above propagate unchanged.
      raise
    rescue Exception, NameError => error
      # Everything else is wrapped in a ParseException. NameError is
      # redundant in this list because it descends from Exception.
      raise REXML::ParseException.new( "Exception parsing",
        @source, self, (error ? error : $!) )
    end
    # NOTE(review): appears unreachable - every branch of the begin block
    # above either returns or raises.
    return [ :dummy ]
  end
  # Resolves the entity named +reference+ to its replacement text.
  #
  # The caller-supplied +entities+ table (may be nil) is consulted first;
  # if it has no value for the name, the literal character from
  # DEFAULT_ENTITIES is used. The value found is passed through
  # unnormalize before being returned. Returns nil for an unknown name.
  def entity( reference, entities )
    value = entities ? entities[ reference ] : nil
    unless value
      builtin = DEFAULT_ENTITIES[ reference ]
      value = builtin ? builtin[2] : builtin
    end
    return nil unless value
    unnormalize( value, entities )
  end
  # Escapes all possible entities
  #
  # input::         the String to escape; it is left unmodified.
  # entities::      optional collection yielding (key, value) pairs; every
  #                 occurrence of value in the text is replaced by "&key;".
  # entity_filter:: optional collection of keys from +entities+ that must
  #                 not be substituted.
  #
  # Returns a new String in which bare '&' characters, the +entities+
  # values and the characters > < " ' have been replaced by references.
  def normalize( input, entities=nil, entity_filter=nil )
    # dup (not clone) so that a frozen input still yields a mutable copy.
    copy = input.dup
    # Doing it like this rather than in a loop improves the speed
    copy.gsub!( EREFERENCE, '&amp;' )
    if entities
      entities.each do |key, value|
        # The filter is keyed by entity name, i.e. the key being iterated.
        next if entity_filter and entity_filter.include?(key)
        copy.gsub!( value, "&#{key};" )
      end
    end
    # Second pass, after the substitutions above have inserted references.
    copy.gsub!( EREFERENCE, '&amp;' )
    DEFAULT_ENTITIES.each_value do |forms|
      # forms[3] matches the literal character, forms[1] is its reference.
      copy.gsub!( forms[3], forms[1] )
    end
    copy
  end
  # Unescapes all possible entities
  #
  # Works on a copy of +string+: line endings (\r\n and bare \r) become
  # \n, numeric character references are replaced by the character they
  # denote, and named references are expanded first through entity
  # (using +entities+) and then through DEFAULT_ENTITIES. Names listed in
  # +filter+ are left as they are. When at least one named reference was
  # present, &amp; is finally turned back into '&'.
  def unnormalize( string, entities=nil, filter=nil )
    text = string.clone
    text.gsub!( /\r\n?/, "\n" )
    found = text.scan( REFERENCE_RE )
    return text if found.empty?

    # Decimal (&#65;) and hexadecimal (&#x41;) character references.
    # Leading zeros are dropped by the pattern; hex digits get a "0"
    # prefix so that Integer() reads them as 0x...
    text.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) do
      digits = $1
      digits = "0#{digits}" if digits[0] == ?x
      [Integer(digits)].pack('U*')
    end

    # Only named references capture a name; numeric ones yield nil.
    names = found.map { |groups| groups[0] }.compact
    unless names.empty?
      names.each do |name|
        next if filter and filter.include?(name)
        replacement = entity( name, entities )
        next unless replacement
        text.gsub!( /&#{name};/, replacement )
      end
      names.each do |name|
        next if filter and filter.include?(name)
        forms = DEFAULT_ENTITIES[name]
        text.gsub!( forms[0], forms[2] ) if forms
      end
      text.gsub!( /&amp;/, '&' )
    end
    text
  end
  413. end
  414. end
  415. end
  416. =begin
  417. case event[0]
  418. when :start_element
  419. when :text
  420. when :end_element
  421. when :processing_instruction
  422. when :cdata
  423. when :comment
  424. when :xmldecl
  425. when :start_doctype
  426. when :end_doctype
  427. when :externalentity
  428. when :elementdecl
  429. when :entity
  430. when :attlistdecl
  431. when :notationdecl
  432. when :end_doctype
  433. end
  434. =end