PageRenderTime 89ms CodeModel.GetById 38ms app.highlight 44ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/Ruby/lib/ruby/1.8/rss/parser.rb

http://github.com/agross/netopenspace
Ruby | 541 lines | 460 code | 61 blank | 20 comment | 26 complexity | d13c7c257401d4ad4b4e513c67eba41b MD5 | raw file
  1require "forwardable"
  2require "open-uri"
  3
  4require "rss/rss"
  5require "rss/xml"
  6
  7module RSS
  8
  # Error whose message reports that the XML is not well formed,
  # optionally naming the element and approximate line involved.
  class NotWellFormedError < Error
    # The +line+ and +element+ given to the constructor (nil when omitted).
    attr_reader :line, :element

    # Create a new NotWellFormedError for an error at +line+
    # in +element+.  If a block is given, its return value is
    # appended to the error message on a line of its own.
    def initialize(line=nil, element=nil)
      # Remember the location so the readers declared above report it;
      # without these assignments they would always return nil.
      @line = line
      @element = element

      message = "This is not well formed XML"
      if element or line
        message << "\nerror occurred"
        message << " in #{element}" if element
        message << " at about #{line} line" if line
      end
      message << "\n#{yield}" if block_given?
      super(message)
    end
  end
 26
 27  class XMLParserNotFound < Error
 28    def initialize
 29      super("available XML parser was not found in " <<
 30            "#{AVAILABLE_PARSER_LIBRARIES.inspect}.")
 31    end
 32  end
 33
 34  class NotValidXMLParser < Error
 35    def initialize(parser)
 36      super("#{parser} is not an available XML parser. " <<
 37            "Available XML parser"<<
 38            (AVAILABLE_PARSERS.size > 1 ? "s are ": " is ") <<
 39            "#{AVAILABLE_PARSERS.inspect}.")
 40    end
 41  end
 42
 43  class NSError < InvalidRSSError
 44    attr_reader :tag, :prefix, :uri
 45    def initialize(tag, prefix, require_uri)
 46      @tag, @prefix, @uri = tag, prefix, require_uri
 47      super("prefix <#{prefix}> doesn't associate uri " <<
 48            "<#{require_uri}> in tag <#{tag}>")
 49    end
 50  end
 51
 52  class Parser
 53
 54    extend Forwardable
 55
 56    class << self
 57
 58      @@default_parser = nil
 59
 60      def default_parser
 61        @@default_parser || AVAILABLE_PARSERS.first
 62      end
 63
 64      # Set @@default_parser to new_value if it is one of the
 65      # available parsers. Else raise NotValidXMLParser error.
 66      def default_parser=(new_value)
 67        if AVAILABLE_PARSERS.include?(new_value)
 68          @@default_parser = new_value
 69        else
 70          raise NotValidXMLParser.new(new_value)
 71        end
 72      end
 73
 74      def parse(rss, do_validate=true, ignore_unknown_element=true,
 75                parser_class=default_parser)
 76        parser = new(rss, parser_class)
 77        parser.do_validate = do_validate
 78        parser.ignore_unknown_element = ignore_unknown_element
 79        parser.parse
 80      end
 81    end
 82
 83    def_delegators(:@parser, :parse, :rss,
 84                   :ignore_unknown_element,
 85                   :ignore_unknown_element=, :do_validate,
 86                   :do_validate=)
 87
 88    def initialize(rss, parser_class=self.class.default_parser)
 89      @parser = parser_class.new(normalize_rss(rss))
 90    end
 91
 92    private
 93
 94    # Try to get the XML associated with +rss+.
 95    # Return +rss+ if it already looks like XML, or treat it as a URI,
 96    # or a file to get the XML,
 97    def normalize_rss(rss)
 98      return rss if maybe_xml?(rss)
 99
100      uri = to_uri(rss)
101      
102      if uri.respond_to?(:read)
103        uri.read
104      elsif !rss.tainted? and File.readable?(rss)
105        File.open(rss) {|f| f.read}
106      else
107        rss
108      end
109    end
110
111    # maybe_xml? tests if source is a string that looks like XML.
112    def maybe_xml?(source)
113      source.is_a?(String) and /</ =~ source
114    end
115
116    # Attempt to convert rss to a URI, but just return it if 
117    # there's a ::URI::Error
118    def to_uri(rss)
119      return rss if rss.is_a?(::URI::Generic)
120
121      begin
122        ::URI.parse(rss)
123      rescue ::URI::Error
124        rss
125      end
126    end
127  end
128
129  class BaseParser
130
131    class << self
132      def raise_for_undefined_entity?
133        listener.raise_for_undefined_entity?
134      end
135    end
136    
137    def initialize(rss)
138      @listener = self.class.listener.new
139      @rss = rss
140    end
141
142    def rss
143      @listener.rss
144    end
145
146    def ignore_unknown_element
147      @listener.ignore_unknown_element
148    end
149
150    def ignore_unknown_element=(new_value)
151      @listener.ignore_unknown_element = new_value
152    end
153
154    def do_validate
155      @listener.do_validate
156    end
157
158    def do_validate=(new_value)
159      @listener.do_validate = new_value
160    end
161
162    def parse
163      if @listener.rss.nil?
164        _parse
165      end
166      @listener.rss
167    end
168
169  end
170
171  class BaseListener
172
173    extend Utils
174
175    class << self
176
177      @@accessor_bases = {}
178      @@registered_uris = {}
179      @@class_names = {}
180
181      # return the setter for the uri, tag_name pair, or nil.
182      def setter(uri, tag_name)
183        _getter = getter(uri, tag_name)
184        if _getter
185          "#{_getter}="
186        else
187          nil
188        end
189      end
190
191      def getter(uri, tag_name)
192        (@@accessor_bases[uri] || {})[tag_name]
193      end
194
195      # return the tag_names for setters associated with uri
196      def available_tags(uri)
197        (@@accessor_bases[uri] || {}).keys
198      end
199      
200      # register uri against this name.
201      def register_uri(uri, name)
202        @@registered_uris[name] ||= {}
203        @@registered_uris[name][uri] = nil
204      end
205      
206      # test if this uri is registered against this name
207      def uri_registered?(uri, name)
208        @@registered_uris[name].has_key?(uri)
209      end
210
211      # record class_name for the supplied uri and tag_name
212      def install_class_name(uri, tag_name, class_name)
213        @@class_names[uri] ||= {}
214        @@class_names[uri][tag_name] = class_name
215      end
216
217      # retrieve class_name for the supplied uri and tag_name
218      # If it doesn't exist, capitalize the tag_name
219      def class_name(uri, tag_name)
220        name = (@@class_names[uri] || {})[tag_name]
221        return name if name
222
223        tag_name = tag_name.gsub(/[_\-]([a-z]?)/) do
224          $1.upcase
225        end
226        tag_name[0, 1].upcase + tag_name[1..-1]
227      end
228
229      def install_get_text_element(uri, name, accessor_base)
230        install_accessor_base(uri, name, accessor_base)
231        def_get_text_element(uri, name, *get_file_and_line_from_caller(1))
232      end
233      
234      def raise_for_undefined_entity?
235        true
236      end
237    
238      private
239      # set the accessor for the uri, tag_name pair
240      def install_accessor_base(uri, tag_name, accessor_base)
241        @@accessor_bases[uri] ||= {}
242        @@accessor_bases[uri][tag_name] = accessor_base.chomp("=")
243      end
244
245      def def_get_text_element(uri, element_name, file, line)
246        register_uri(uri, element_name)
247        method_name = "start_#{element_name}"
248        unless private_method_defined?(method_name)
249          define_method(method_name) do |name, prefix, attrs, ns|
250            uri = _ns(ns, prefix)
251            if self.class.uri_registered?(uri, element_name)
252              start_get_text_element(name, prefix, ns, uri)
253            else
254              start_else_element(name, prefix, attrs, ns)
255            end
256          end
257          private(method_name)
258        end
259      end
260    end
261  end
262
263  module ListenerMixin
264    attr_reader :rss
265
266    attr_accessor :ignore_unknown_element
267    attr_accessor :do_validate
268
269    def initialize
270      @rss = nil
271      @ignore_unknown_element = true
272      @do_validate = true
273      @ns_stack = [{"xml" => :xml}]
274      @tag_stack = [[]]
275      @text_stack = ['']
276      @proc_stack = []
277      @last_element = nil
278      @version = @encoding = @standalone = nil
279      @xml_stylesheets = []
280      @xml_child_mode = false
281      @xml_element = nil
282      @last_xml_element = nil
283    end
284    
285    # set instance vars for version, encoding, standalone
286    def xmldecl(version, encoding, standalone)
287      @version, @encoding, @standalone = version, encoding, standalone
288    end
289
290    def instruction(name, content)
291      if name == "xml-stylesheet"
292        params = parse_pi_content(content)
293        if params.has_key?("href")
294          @xml_stylesheets << XMLStyleSheet.new(params)
295        end
296      end
297    end
298
299    def tag_start(name, attributes)
300      @text_stack.push('')
301
302      ns = @ns_stack.last.dup
303      attrs = {}
304      attributes.each do |n, v|
305        if /\Axmlns(?:\z|:)/ =~ n
306          ns[$POSTMATCH] = v
307        else
308          attrs[n] = v
309        end
310      end
311      @ns_stack.push(ns)
312
313      prefix, local = split_name(name)
314      @tag_stack.last.push([_ns(ns, prefix), local])
315      @tag_stack.push([])
316      if @xml_child_mode
317        previous = @last_xml_element
318        element_attrs = attributes.dup
319        unless previous
320          ns.each do |ns_prefix, value|
321            next if ns_prefix == "xml"
322            key = ns_prefix.empty? ? "xmlns" : "xmlns:#{ns_prefix}"
323            element_attrs[key] ||= value
324          end
325        end
326        next_element = XML::Element.new(local,
327                                        prefix.empty? ? nil : prefix,
328                                        _ns(ns, prefix),
329                                        element_attrs)
330        previous << next_element if previous
331        @last_xml_element = next_element
332        pr = Proc.new do |text, tags|
333          if previous
334            @last_xml_element = previous
335          else
336            @xml_element = @last_xml_element
337            @last_xml_element = nil
338          end
339        end
340        @proc_stack.push(pr)
341      else
342        if @rss.nil? and respond_to?("initial_start_#{local}", true)
343          __send__("initial_start_#{local}", local, prefix, attrs, ns.dup)
344        elsif respond_to?("start_#{local}", true)
345          __send__("start_#{local}", local, prefix, attrs, ns.dup)
346        else
347          start_else_element(local, prefix, attrs, ns.dup)
348        end
349      end
350    end
351
352    def tag_end(name)
353      if DEBUG
354        p "end tag #{name}"
355        p @tag_stack
356      end
357      text = @text_stack.pop
358      tags = @tag_stack.pop
359      pr = @proc_stack.pop
360      pr.call(text, tags) unless pr.nil?
361      @ns_stack.pop
362    end
363
364    def text(data)
365      if @xml_child_mode
366        @last_xml_element << data if @last_xml_element
367      else
368        @text_stack.last << data
369      end
370    end
371
372    private
373    def _ns(ns, prefix)
374      ns.fetch(prefix, "")
375    end
376
377    CONTENT_PATTERN = /\s*([^=]+)=(["'])([^\2]+?)\2/
378    # Extract the first name="value" pair from content.
379    # Works with single quotes according to the constant
380    # CONTENT_PATTERN. Return a Hash.
381    def parse_pi_content(content)
382      params = {}
383      content.scan(CONTENT_PATTERN) do |name, quote, value|
384        params[name] = value
385      end
386      params
387    end
388
389    def start_else_element(local, prefix, attrs, ns)
390      class_name = self.class.class_name(_ns(ns, prefix), local)
391      current_class = @last_element.class
392      if class_name and
393          (current_class.const_defined?(class_name) or
394           current_class.constants.include?(class_name))
395        next_class = current_class.const_get(class_name)
396        start_have_something_element(local, prefix, attrs, ns, next_class)
397      else
398        if !@do_validate or @ignore_unknown_element
399          @proc_stack.push(nil)
400        else
401          parent = "ROOT ELEMENT???"
402          if current_class.tag_name
403            parent = current_class.tag_name
404          end
405          raise NotExpectedTagError.new(local, _ns(ns, prefix), parent)
406        end
407      end
408    end
409
410    NAMESPLIT = /^(?:([\w:][-\w\d.]*):)?([\w:][-\w\d.]*)/
411    def split_name(name)
412      name =~ NAMESPLIT
413      [$1 || '', $2]
414    end
415
416    def check_ns(tag_name, prefix, ns, require_uri)
417      unless _ns(ns, prefix) == require_uri
418        if @do_validate
419          raise NSError.new(tag_name, prefix, require_uri)
420        else
421          # Force bind required URI with prefix
422          @ns_stack.last[prefix] = require_uri
423        end
424      end
425    end
426
427    def start_get_text_element(tag_name, prefix, ns, required_uri)
428      pr = Proc.new do |text, tags|
429        setter = self.class.setter(required_uri, tag_name)
430        if @last_element.respond_to?(setter)
431          if @do_validate
432            getter = self.class.getter(required_uri, tag_name)
433            if @last_element.__send__(getter)
434              raise TooMuchTagError.new(tag_name, @last_element.tag_name)
435            end
436          end
437          @last_element.__send__(setter, text.to_s)
438        else
439          if @do_validate and !@ignore_unknown_element
440            raise NotExpectedTagError.new(tag_name, _ns(ns, prefix),
441                                          @last_element.tag_name)
442          end
443        end
444      end
445      @proc_stack.push(pr)
446    end
447
448    def start_have_something_element(tag_name, prefix, attrs, ns, klass)
449      check_ns(tag_name, prefix, ns, klass.required_uri)
450      attributes = collect_attributes(tag_name, prefix, attrs, ns, klass)
451      @proc_stack.push(setup_next_element(tag_name, klass, attributes))
452    end
453
454    def collect_attributes(tag_name, prefix, attrs, ns, klass)
455      attributes = {}
456      klass.get_attributes.each do |a_name, a_uri, required, element_name|
457        if a_uri.is_a?(String) or !a_uri.respond_to?(:include?)
458          a_uri = [a_uri]
459        end
460        unless a_uri == [""]
461          for prefix, uri in ns
462            if a_uri.include?(uri)
463              val = attrs["#{prefix}:#{a_name}"]
464              break if val
465            end
466          end
467        end
468        if val.nil? and a_uri.include?("")
469          val = attrs[a_name]
470        end
471
472        if @do_validate and required and val.nil?
473          unless a_uri.include?("")
474            for prefix, uri in ns
475              if a_uri.include?(uri)
476                a_name = "#{prefix}:#{a_name}"
477              end
478            end
479          end
480          raise MissingAttributeError.new(tag_name, a_name)
481        end
482
483        attributes[a_name] = val
484      end
485      attributes
486    end
487
488    def setup_next_element(tag_name, klass, attributes)
489      previous = @last_element
490      next_element = klass.new(@do_validate, attributes)
491      previous.set_next_element(tag_name, next_element)
492      @last_element = next_element
493      @last_element.parent = previous if klass.need_parent?
494      @xml_child_mode = @last_element.have_xml_content?
495
496      Proc.new do |text, tags|
497        p(@last_element.class) if DEBUG
498        if @xml_child_mode
499          @last_element.content = @xml_element.to_s
500          xml_setter = @last_element.class.xml_setter
501          @last_element.__send__(xml_setter, @xml_element)
502          @xml_element = nil
503          @xml_child_mode = false
504        else
505          if klass.have_content?
506            if @last_element.need_base64_encode?
507              text = Base64.decode64(text.lstrip)
508            end
509            @last_element.content = text
510          end
511        end
512        if @do_validate
513          @last_element.validate_for_stream(tags, @ignore_unknown_element)
514        end
515        @last_element = previous
516      end
517    end
518  end
519
520  unless const_defined? :AVAILABLE_PARSER_LIBRARIES
521    AVAILABLE_PARSER_LIBRARIES = [
522      ["rss/xmlparser", :XMLParserParser],
523      ["rss/xmlscanner", :XMLScanParser],
524      ["rss/rexmlparser", :REXMLParser],
525    ]
526  end
527
528  AVAILABLE_PARSERS = []
529
530  AVAILABLE_PARSER_LIBRARIES.each do |lib, parser|
531    begin
532      require lib
533      AVAILABLE_PARSERS.push(const_get(parser))
534    rescue LoadError
535    end
536  end
537
538  if AVAILABLE_PARSERS.empty?
539    raise XMLParserNotFound
540  end
541end