PageRenderTime 53ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/rexml/parsers/baseparser.rb

https://github.com/EarthJem/ruby
Ruby | 530 lines | 417 code | 36 blank | 77 comment | 78 complexity | e50984ae5d13728dec4eb27dcddc6d63 MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause
  1. require 'rexml/parseexception'
  2. require 'rexml/undefinednamespaceexception'
  3. require 'rexml/source'
  4. require 'set'
  5. module REXML
  6. module Parsers
  7. # = Using the Pull Parser
  8. # <em>This API is experimental, and subject to change.</em>
  9. # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
  10. # while parser.has_next?
  11. # res = parser.next
  12. # puts res[1]['att'] if res.start_tag? and res[0] == 'b'
  13. # end
  14. # See the PullEvent class for information on the content of the results.
  15. # The data is identical to the arguments passed for the various events to
  16. # the StreamListener API.
  17. #
  18. # Notice that:
  19. # parser = PullParser.new( "<a>BAD DOCUMENT" )
  20. # while parser.has_next?
  21. # res = parser.next
  22. # raise res[1] if res.error?
  23. # end
  24. #
  25. # Nat Price gave me some good ideas for the API.
  26. class BaseParser
  # ------------------------------------------------------------------
  # Name building blocks. These are String fragments that get
  # interpolated into the Regexp constants further down.
  # ------------------------------------------------------------------
  LETTER = '[:alpha:]'
  DIGIT = '[:digit:]'

  COMBININGCHAR = '' # TODO
  EXTENDER = '' # TODO

  NCNAME_STR= "[#{LETTER}_:][-[:alnum:]._:#{COMBININGCHAR}#{EXTENDER}]*"
  # Captures an optional prefix (group 1) and the local part (group 2).
  NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
  # Same shape as NAME_STR but without capture groups.
  UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

  NAMECHAR = '[\-\w\.:]'
  NAME = "([\\w:]#{NAMECHAR}*)"
  NMTOKEN = "(?:#{NAMECHAR})+"
  NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
  # Named (&name;), decimal (&#NN;) or hexadecimal (&#xHH;) reference.
  REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
  REFERENCE_RE = /#{REFERENCE}/

  # ------------------------------------------------------------------
  # Markup patterns. The *_START patterns are used to classify what
  # comes next; the *_PATTERN patterns extract the content.
  # ------------------------------------------------------------------
  DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
  DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
  ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
  COMMENT_START = /\A<!--/u
  COMMENT_PATTERN = /<!--(.*?)-->/um
  CDATA_START = /\A<!\[CDATA\[/u
  CDATA_END = /^\s*\]\s*>/um
  CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
  XMLDECL_START = /\A<\?xml\s/u
  XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
  INSTRUCTION_START = /\A<\?/u
  INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
  # Groups: 1 = qualified name, 2 = prefix, 3 = local part,
  # 4 = raw attribute text, 5 = attribute quote, 6 = "/" when self-closing.
  TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
  CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um

  # Pseudo-attributes of the XML declaration.
  VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
  ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
  STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

  # ------------------------------------------------------------------
  # DOCTYPE internal-subset declarations.
  # ------------------------------------------------------------------
  ENTITY_START = /^\s*<!ENTITY/
  IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
  ELEMENTDECL_START = /^\s*<!ELEMENT/um
  ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
  SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
  ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
  NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
  ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
  ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
  ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
  DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
  ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
  ATTDEF_RE = /#{ATTDEF}/
  ATTLISTDECL_START = /^\s*<!ATTLIST/um
  ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
  NOTATIONDECL_START = /^\s*<!NOTATION/um
  PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
  SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um

  # Character data up to (not including) the next "<".
  TEXT_PATTERN = /\A([^<]*)/um

  # Entity constants
  PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
  SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
  PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
  EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
  NDATADECL = "\\s+NDATA\\s+#{NAME}"
  PEREFERENCE = "%#{NAME};"
  # NOTE(review): the single-quoted alternative uses a capturing group where
  # the double-quoted one is non-capturing; pull_event pops the extra capture.
  ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
  PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
  ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
  PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
  GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
  ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um

  # An ampersand that does not start a named reference.
  EREFERENCE = /&(?!#{NAME};)/

  # name => [ reference regexp, reference text, literal character,
  #           regexp matching the literal character ]
  DEFAULT_ENTITIES = {
    'gt' => [/&gt;/, '&gt;', '>', />/],
    'lt' => [/&lt;/, '&lt;', '<', /</],
    'quot' => [/&quot;/, '&quot;', '"', /"/],
    "apos" => [/&apos;/, "&apos;", "'", /'/]
  }

  ######################################################################
  # These are patterns to identify common markup errors, to make the
  # error messages more informative.
  ######################################################################
  MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
  # Creates a parser reading from +source+.
  #
  # +source+ is passed to #stream=, which wraps it via
  # SourceFactory.create_from and resets the parsing state. The listener
  # list starts out empty; see #add_listener.
  def initialize(source)
    self.stream = source
    @listeners = []
  end
  # Registers +listener+ so that #pull passes it every event it returns
  # (by calling <tt>listener.receive(event)</tt>).
  #
  # Returns the listener list.
  def add_listener(listener)
    @listeners << listener
  end
  # The source object created by SourceFactory.create_from in #stream=.
  attr_reader :source
  # Points the parser at a new input and resets all parsing state.
  #
  # +source+ is wrapped with SourceFactory.create_from. The pending
  # self-closing tag, the document status, the open-tag list, the event
  # stack, the entity list and the namespace stack are all cleared.
  def stream=(source)
    @source = SourceFactory.create_from(source)
    @closed = nil
    @document_status = nil
    @tags = []
    @stack = []
    @entities = []
    @nsstack = []
  end
  # Returns the source's current position when the source exposes a
  # +position+ method, and 0 otherwise.
  def position
    return @source.position if @source.respond_to?(:position)

    # FIXME: sources that cannot report a position are reported as 0.
    0
  end
  # Returns true if there are no more events: the source is exhausted and
  # no events are waiting on the internal stack.
  def empty?
    @source.empty? && @stack.empty?
  end
  # Returns true if there are more events. Synonymous with !empty?
  def has_next?
    !(@source.empty? && @stack.empty?)
  end
  # Push an event back on the head of the stream, so that it is the next
  # event returned. This method has (theoretically) infinite depth.
  def unshift(token)
    @stack.unshift(token)
  end
  # Peek at the +depth+ event in the stack. The first element on the stack
  # is at depth 0. If +depth+ is -1, will parse to the end of the input
  # stream and return the last event on the stack.
  #
  # Be aware that this causes the stream to be parsed up to the +depth+
  # event, so you can effectively pre-parse the entire document (pull the
  # entire thing into memory) using this method.
  #
  # Raises a RuntimeError when +depth+ is less than -1.
  def peek(depth = 0)
    raise %Q[Illegal argument "#{depth}"] if depth < -1

    pulled = []
    if depth == -1
      pulled.push(pull) until empty?
    else
      # Pull just enough events for index +depth+ to exist.
      pulled.push(pull) while @stack.size + pulled.size < depth + 1
    end
    @stack += pulled unless pulled.empty?
    @stack[depth]
  end
  # Returns the next event, as produced by +pull_event+.
  #
  # Before returning, the event is handed to every registered listener via
  # <tt>listener.receive(event)</tt>, in registration order.
  def pull
    event = pull_event
    @listeners.each { |listener| listener.receive(event) }
    event
  end
  # Parses and returns the next raw event as an Array whose first element
  # names the event (:start_element, :end_element, :text, :cdata, :comment,
  # :processing_instruction, :xmldecl, :start_doctype, :end_doctype,
  # :externalentity, :elementdecl, :entitydecl, :attlistdecl, :notationdecl
  # or :end_document).
  #
  # The method works in three phases, tracked by @document_status:
  # nil (prolog), :in_doctype (internal DTD subset) and :after_doctype
  # (document content).
  #
  # Raises REXML::ParseException (or REXML::UndefinedNamespaceException) on
  # malformed input. StandardErrors raised while parsing document content
  # are wrapped in a REXML::ParseException; other exceptions (Interrupt,
  # SystemExit, ...) propagate unchanged.
  def pull_event
    # A self-closing tag was reported as :start_element by the previous
    # call; report its :end_element now.
    if @closed
      x, @closed = @closed, nil
      return [ :end_element, x ]
    end
    return [ :end_document ] if empty?
    # Events pushed back (or queued by the parser itself) take priority.
    return @stack.shift if @stack.size > 0
    @source.read if @source.buffer.size<2

    # ---- Prolog: everything before the DOCTYPE / root element ----------
    if @document_status == nil
      word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
      word = word[1] unless word.nil?
      case word
      when COMMENT_START
        return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
      when XMLDECL_START
        results = @source.match( XMLDECL_PATTERN, true )[1]
        version = VERSION.match( results )
        version = version[1] unless version.nil?
        encoding = ENCODING.match(results)
        encoding = encoding[1] unless encoding.nil?
        if need_source_encoding_update?(encoding)
          @source.encoding = encoding
        end
        if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
          encoding = "UTF-16"
        end
        standalone = STANDALONE.match(results)
        standalone = standalone[1] unless standalone.nil?
        return [ :xmldecl, version, encoding, standalone ]
      when INSTRUCTION_START
        return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
      when DOCTYPE_START
        md = @source.match( DOCTYPE_PATTERN, true )
        @nsstack.unshift(Set.new)
        identity = md[1]
        close = md[2]
        identity =~ IDENTITY
        name = $1
        raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
        pub_sys = $2.nil? ? nil : $2.strip
        long_name = $4.nil? ? nil : $4.strip
        uri = $6.nil? ? nil : $6.strip
        args = [ :start_doctype, name, pub_sys, long_name, uri ]
        if close == ">"
          # No internal subset: queue the matching :end_doctype right away.
          @document_status = :after_doctype
          @source.read if @source.buffer.size<2
          @source.match(/^\s*/um, true)
          @stack << [ :end_doctype ]
        else
          @document_status = :in_doctype
        end
        return args
      when /^\s+/
        # Leading whitespace: nothing to do here, fall through.
      else
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        @source.match(/\s*/um, true)
        if @source.encoding == "UTF-8"
          @source.buffer.force_encoding(::Encoding::UTF_8)
        end
      end
    end

    # ---- Internal DTD subset ---------------------------------------------
    if @document_status == :in_doctype
      md = @source.match(/\s*(.*?>)/um)
      # Without this guard a truncated DOCTYPE crashed with NoMethodError.
      raise REXML::ParseException.new( "Malformed DOCTYPE: unterminated declaration", @source ) if md.nil?
      case md[1]
      when SYSTEMENTITY
        match = @source.match( SYSTEMENTITY, true )[1]
        return [ :externalentity, match ]
      when ELEMENTDECL_START
        return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
      when ENTITY_START
        entity_md = @source.match( ENTITYDECL, true )
        # Previously a non-matching declaration failed later with
        # NoMethodError on nil.
        raise REXML::ParseException.new( "Malformed entity declaration", @source ) if entity_md.nil?
        match = entity_md.to_a.compact
        match[0] = :entitydecl
        ref = false
        if match[1] == '%'
          ref = true
          match.delete_at 1
        end
        # Now we have to sort out what kind of entity reference this is
        if match[2] == 'SYSTEM'
          # External reference
          match[3] = match[3][1..-2] # PUBID
          match.delete_at(4) if match.size > 4 # Chop out NDATA decl
          # match is [ :entitydecl, name, SYSTEM, pubid(, ndata)? ]
        elsif match[2] == 'PUBLIC'
          # External reference
          match[3] = match[3][1..-2] # PUBID
          match[4] = match[4][1..-2] # HREF
          # match is [ :entitydecl, name, PUBLIC, pubid, href ]
        else
          match[2] = match[2][1..-2]
          # Drops the extra capture produced by single-quoted values.
          match.pop if match.size == 4
          # match is [ :entitydecl, name, value ]
        end
        match << '%' if ref
        return match
      when ATTLISTDECL_START
        md = @source.match( ATTLISTDECL_PATTERN, true )
        raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
        element = md[1]
        contents = md[0]
        pairs = {}
        values = md[0].scan( ATTDEF_RE )
        values.each do |attdef|
          # NOTE(review): this index is checked before compact!, so it may
          # not be looking at the default declaration -- confirm intent.
          unless attdef[3] == "#IMPLIED"
            attdef.compact!
            val = attdef[3]
            val = attdef[4] if val == "#FIXED "
            pairs[attdef[0]] = val
            if attdef[0] =~ /^xmlns:(.*)/
              @nsstack[0] << $1
            end
          end
        end
        return [ :attlistdecl, element, pairs, contents ]
      when NOTATIONDECL_START
        md = nil
        if @source.match( PUBLIC )
          md = @source.match( PUBLIC, true )
          vals = [md[1],md[2],md[4],md[6]]
        elsif @source.match( SYSTEM )
          md = @source.match( SYSTEM, true )
          vals = [md[1],md[2],nil,md[4]]
        else
          raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
        end
        return [ :notationdecl, *vals ]
      when CDATA_END
        # "]>" closes the internal subset.
        @document_status = :after_doctype
        @source.match( CDATA_END, true )
        return [ :end_doctype ]
      end
    end

    # ---- Document content ---------------------------------------------------
    begin
      if @source.buffer[0] == ?<
        if @source.buffer[1] == ?/
          @nsstack.shift
          last_tag = @tags.pop
          md = @source.match( CLOSE_MATCH, true )
          # A close tag that does not parse used to surface as a wrapped
          # NoMethodError ("Exception parsing").
          if md.nil?
            raise REXML::ParseException.new( "Missing end tag for "+
              "'#{last_tag}'", @source)
          end
          raise REXML::ParseException.new( "Missing end tag for "+
            "'#{last_tag}' (got \"#{md[1]}\")",
            @source) unless last_tag == md[1]
          return [ :end_element, last_tag ]
        elsif @source.buffer[1] == ?!
          md = @source.match(/\A(\s*[^>]*>)/um)
          raise REXML::ParseException.new("Malformed node", @source) unless md
          if md[0][2] == ?-
            md = @source.match( COMMENT_PATTERN, true )
            raise REXML::ParseException.new("Malformed or unclosed comment", @source) unless md
            case md[1]
            when /--/, /-\z/
              raise REXML::ParseException.new("Malformed comment", @source)
            end
            return [ :comment, md[1] ]
          else
            md = @source.match( CDATA_PATTERN, true )
            return [ :cdata, md[1] ] if md
          end
          raise REXML::ParseException.new( "Declarations can only occur "+
            "in the doctype declaration.", @source)
        elsif @source.buffer[1] == ??
          md = @source.match( INSTRUCTION_PATTERN, true )
          return [ :processing_instruction, md[1], md[2] ] if md
          raise REXML::ParseException.new( "Bad instruction declaration",
            @source)
        else
          # Get the next tag
          md = @source.match(TAG_MATCH, true)
          unless md
            # Check for missing attribute quotes
            raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
            raise REXML::ParseException.new("malformed XML: missing tag start", @source)
          end
          attributes = {}
          prefixes = Set.new
          prefixes << md[2] if md[2]
          @nsstack.unshift(curr_ns=Set.new)
          if md[4].size > 0
            attrs = md[4].scan( ATTRIBUTE_PATTERN )
            # $' is what the last ATTRIBUTE_PATTERN match left unconsumed.
            raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
            attrs.each do |attr_name, prefix, local_part, _quote, value|
              if prefix == "xmlns"
                if local_part == "xml"
                  if value != "http://www.w3.org/XML/1998/namespace"
                    msg = "The 'xml' prefix must not be bound to any other namespace "+
                    "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                    raise REXML::ParseException.new( msg, @source, self )
                  end
                elsif local_part == "xmlns"
                  msg = "The 'xmlns' prefix must not be declared "+
                  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                  raise REXML::ParseException.new( msg, @source, self)
                end
                curr_ns << local_part
              elsif prefix
                prefixes << prefix unless prefix == "xml"
              end

              if attributes.has_key?(attr_name)
                msg = "Duplicate attribute #{attr_name.inspect}"
                raise REXML::ParseException.new(msg, @source, self)
              end

              attributes[attr_name] = value
            end
          end

          # Verify that all of the prefixes have been defined
          prefixes.each do |prefix|
            unless @nsstack.find{|k| k.member?(prefix)}
              raise UndefinedNamespaceException.new(prefix,@source,self)
            end
          end

          if md[6]
            # Self-closing tag: remember it so the next call emits
            # :end_element, and drop the namespace scope just pushed.
            @closed = md[1]
            @nsstack.shift
          else
            @tags.push( md[1] )
          end
          return [ :start_element, md[1], attributes ]
        end
      else
        md = @source.match( TEXT_PATTERN, true )
        if md[0].length == 0
          @source.match( /(\s+)/, true )
        end
        return [ :text, md[1] ]
      end
    rescue REXML::UndefinedNamespaceException
      raise
    rescue REXML::ParseException
      raise
    rescue StandardError => error
      # Only StandardError is wrapped: rescuing Exception would turn
      # Interrupt, SystemExit and similar into bogus parse errors.
      raise REXML::ParseException.new( "Exception parsing",
        @source, self, error )
    end
    return [ :dummy ]
  end
  private :pull_event
  # Looks up the replacement text for the entity named +reference+.
  #
  # +entities+ (a Hash-like object, or nil) is consulted first; when it has
  # no value for the name, the literal character from DEFAULT_ENTITIES is
  # used. The value found is passed through #unnormalize before being
  # returned. Returns nil when the entity is unknown.
  def entity(reference, entities)
    value = entities[reference] if entities
    unless value
      default = DEFAULT_ENTITIES[reference]
      value = default[2] if default
    end
    unnormalize(value, entities) if value
  end
  # Escapes all possible entities.
  #
  # input::         the text to escape; it is not modified (a copy is
  #                 escaped and returned, so frozen strings are accepted).
  # entities::      optional name => value pairs; each occurrence of a value
  #                 is replaced by the reference <tt>&name;</tt>.
  # entity_filter:: optional collection of entity names that must NOT be
  #                 substituted.
  #
  # Bare ampersands become <tt>&amp;</tt>, and the characters listed in
  # DEFAULT_ENTITIES are replaced by their references.
  def normalize(input, entities = nil, entity_filter = nil)
    copy = input.dup
    # Doing it like this rather than in a loop improves the speed
    copy.gsub!(EREFERENCE, '&amp;')
    if entities
      entities.each do |key, value|
        # The filter is keyed by entity name.
        next if entity_filter && entity_filter.include?(key)

        copy.gsub!(value, "&#{key};")
      end
    end
    copy.gsub!(EREFERENCE, '&amp;')
    DEFAULT_ENTITIES.each_value do |patterns|
      copy.gsub!(patterns[3], patterns[1])
    end
    copy
  end
  # Unescapes all possible entities.
  #
  # string::   the text to unescape; it is not modified (a copy is
  #            processed and returned, so frozen strings are accepted).
  # entities:: optional name => value pairs, consulted through #entity.
  # filter::   optional collection of entity names to leave untouched.
  #
  # Line endings (CRLF and lone CR) are converted to LF, numeric character
  # references are replaced by the character they denote, named references
  # are replaced by their entity values, and finally <tt>&amp;</tt> becomes
  # <tt>&</tt>.
  def unnormalize(string, entities = nil, filter = nil)
    rv = string.dup
    rv.gsub!(/\r\n?/, "\n")
    matches = rv.scan(REFERENCE_RE)
    return rv if matches.size == 0

    rv.gsub!(/&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/) {
      m = $1
      # "x41" -> "0x41" so that Integer() parses it as hexadecimal.
      m = "0#{m}" if m[0] == ?x
      [Integer(m)].pack('U*')
    }

    # REFERENCE_RE captures the name only for named references; numeric
    # references yield nil and are dropped here.
    names = matches.map { |captures| captures[0] }.compact
    unless names.empty?
      names.each do |entity_reference|
        next if filter && filter.include?(entity_reference)

        entity_value = entity(entity_reference, entities)
        if entity_value
          # A literal String pattern plus a block: the name is not treated
          # as a regexp (names may contain "."), and backslashes in the
          # value are inserted verbatim rather than read as back-references.
          rv.gsub!("&#{entity_reference};") { entity_value }
        else
          er = DEFAULT_ENTITIES[entity_reference]
          rv.gsub!(er[0], er[2]) if er
        end
      end
      rv.gsub!(/&amp;/, '&')
    end
    rv
  end
  private

  # Decides whether the encoding named in the XML declaration should be
  # applied to the source.
  #
  # Returns false when the declaration names no encoding, or names exactly
  # "UTF-16" (case-insensitive); returns true for any other value.
  def need_source_encoding_update?(xml_declaration_encoding)
    return false if xml_declaration_encoding.nil?
    return false if /\AUTF-16\z/i =~ xml_declaration_encoding

    true
  end
  471. end
  472. end
  473. end
  474. =begin
  475. case event[0]
  476. when :start_element
  477. when :text
  478. when :end_element
  479. when :processing_instruction
  480. when :cdata
  481. when :comment
  482. when :xmldecl
  483. when :start_doctype
  484. when :end_doctype
  485. when :externalentity
  486. when :elementdecl
  487. when :entitydecl
  488. when :attlistdecl
  489. when :notationdecl
  490. when :end_doctype
  491. end
  492. =end