/tools/Ruby/lib/ruby/1.8/rss/parser.rb

http://github.com/agross/netopenspace · Ruby · 541 lines · 460 code · 61 blank · 20 comment · 26 complexity · d13c7c257401d4ad4b4e513c67eba41b MD5 · raw file

  1. require "forwardable"
  2. require "open-uri"
  3. require "rss/rss"
  4. require "rss/xml"
  5. module RSS
  # Error whose message reports that the input is not well formed XML,
  # optionally naming the element and approximate line involved.
  class NotWellFormedError < Error
    # The +line+ and +element+ given to the constructor (nil when omitted).
    attr_reader :line, :element

    # Create a new NotWellFormedError for an error at +line+
    # in +element+. If a block is given the return value of
    # the block is appended to the error message on its own line.
    def initialize(line=nil, element=nil)
      # Keep the location so the readers declared above report what the
      # caller supplied; previously they were never assigned.
      @line = line
      @element = element

      message = "This is not well formed XML"
      if element or line
        message += "\nerror occurred"
        message += " in #{element}" if element
        message += " at about #{line} line" if line
      end
      message += "\n#{yield}" if block_given?
      super(message)
    end
  end
  # Error reporting that no XML parser could be found among the
  # libraries listed in AVAILABLE_PARSER_LIBRARIES.
  class XMLParserNotFound < Error
    # Builds the message from the inspected library list.
    def initialize
      libraries = AVAILABLE_PARSER_LIBRARIES.inspect
      super("available XML parser was not found in #{libraries}.")
    end
  end
  # Error reporting that +parser+ is not one of AVAILABLE_PARSERS.
  class NotValidXMLParser < Error
    # +parser+ is the rejected value; it is interpolated into the message
    # together with the inspected list of available parsers.
    def initialize(parser)
      # Pluralize the sentence when more than one parser is available.
      verb = if AVAILABLE_PARSERS.size > 1 then "s are " else " is " end
      super("#{parser} is not an available XML parser. " \
            "Available XML parser#{verb}#{AVAILABLE_PARSERS.inspect}.")
    end
  end
  # Error reporting that a tag's prefix is not bound to the URI
  # required for that tag.
  class NSError < InvalidRSSError
    # The tag name, its prefix, and the URI that was required.
    attr_reader :tag, :prefix, :uri

    # Records the three values and builds the message from them.
    def initialize(tag, prefix, require_uri)
      @tag = tag
      @prefix = prefix
      @uri = require_uri
      super("prefix <#{prefix}> doesn't associate uri <#{require_uri}> in tag <#{tag}>")
    end
  end
  # Front end for parsing: wraps an instance of the chosen parser class
  # and delegates parsing and option accessors to it.
  class Parser
    extend Forwardable

    class << self
      @@default_parser = nil

      # Returns the parser class chosen through default_parser=, or the
      # first entry of AVAILABLE_PARSERS when none has been chosen.
      def default_parser
        @@default_parser || AVAILABLE_PARSERS.first
      end

      # Set @@default_parser to new_value if it is one of the
      # available parsers. Else raise NotValidXMLParser error.
      def default_parser=(new_value)
        unless AVAILABLE_PARSERS.include?(new_value)
          raise NotValidXMLParser.new(new_value)
        end
        @@default_parser = new_value
      end

      # Convenience entry point: build a Parser for +rss+, apply the two
      # options, and return the result of parsing.
      def parse(rss, do_validate=true, ignore_unknown_element=true,
                parser_class=default_parser)
        instance = new(rss, parser_class)
        instance.do_validate = do_validate
        instance.ignore_unknown_element = ignore_unknown_element
        instance.parse
      end
    end

    def_delegators(:@parser,
                   :parse, :rss,
                   :ignore_unknown_element, :ignore_unknown_element=,
                   :do_validate, :do_validate=)

    # Wraps a new +parser_class+ instance created with the normalized +rss+.
    def initialize(rss, parser_class=self.class.default_parser)
      @parser = parser_class.new(normalize_rss(rss))
    end

    private

    # Try to get the XML associated with +rss+.
    # Returns +rss+ unchanged if it already looks like XML. Otherwise reads
    # it as a URI when the converted value responds to +read+, or as a file
    # when +rss+ is untainted and readable; failing all that, returns +rss+.
    def normalize_rss(rss)
      return rss if maybe_xml?(rss)

      uri = to_uri(rss)
      return uri.read if uri.respond_to?(:read)
      return rss if rss.tainted? or !File.readable?(rss)

      File.open(rss) {|file| file.read}
    end

    # maybe_xml? tests if source is a string that looks like XML,
    # i.e. a String containing a "<" character.
    def maybe_xml?(source)
      source.is_a?(String) && source =~ /</
    end

    # Attempt to convert rss to a URI, but just return it if
    # there's a ::URI::Error. URI objects are returned as they are.
    def to_uri(rss)
      return rss if rss.is_a?(::URI::Generic)
      ::URI.parse(rss)
    rescue ::URI::Error
      rss
    end
  end
  # Common behaviour for parser classes. A subclass is expected to
  # provide a class-level +listener+ and an instance-level +_parse+;
  # everything else is forwarded to the listener instance.
  class BaseParser
    # Forwards the question to the listener class.
    def self.raise_for_undefined_entity?
      listener.raise_for_undefined_entity?
    end

    # Creates a fresh listener and keeps +rss+ for later parsing.
    def initialize(rss)
      @rss = rss
      @listener = self.class.listener.new
    end

    # The listener's current +rss+ value.
    def rss
      @listener.rss
    end

    # Reads the listener's ignore_unknown_element option.
    def ignore_unknown_element
      @listener.ignore_unknown_element
    end

    # Writes the listener's ignore_unknown_element option.
    def ignore_unknown_element=(new_value)
      @listener.ignore_unknown_element = new_value
    end

    # Reads the listener's do_validate option.
    def do_validate
      @listener.do_validate
    end

    # Writes the listener's do_validate option.
    def do_validate=(new_value)
      @listener.do_validate = new_value
    end

    # Runs +_parse+ only while the listener has no +rss+ yet, then
    # returns the listener's +rss+.
    def parse
      _parse if @listener.rss.nil?
      @listener.rss
    end
  end
  # Class-level registries used by listeners: accessor names, registered
  # URIs and class names, each keyed by namespace URI and tag name.
  class BaseListener
    extend Utils

    class << self
      @@accessor_bases = {}
      @@registered_uris = {}
      @@class_names = {}

      # return the setter for the uri, tag_name pair, or nil.
      def setter(uri, tag_name)
        _getter = getter(uri, tag_name)
        _getter ? "#{_getter}=" : nil
      end

      # return the getter (accessor base) for the uri, tag_name pair, or nil.
      def getter(uri, tag_name)
        (@@accessor_bases[uri] || {})[tag_name]
      end

      # return the tag_names for setters associated with uri
      def available_tags(uri)
        (@@accessor_bases[uri] || {}).keys
      end

      # register uri against this name.
      def register_uri(uri, name)
        @@registered_uris[name] ||= {}
        @@registered_uris[name][uri] = nil
      end

      # test if this uri is registered against this name.
      # A name that was never registered simply yields false, matching
      # how getter and available_tags treat unknown keys.
      def uri_registered?(uri, name)
        (@@registered_uris[name] || {}).has_key?(uri)
      end

      # record class_name for the supplied uri and tag_name
      def install_class_name(uri, tag_name, class_name)
        @@class_names[uri] ||= {}
        @@class_names[uri][tag_name] = class_name
      end

      # retrieve class_name for the supplied uri and tag_name
      # If it doesn't exist, derive one from tag_name: drop each "_" or "-",
      # upcase the lowercase letter following it (if any), and upcase the
      # first character.
      def class_name(uri, tag_name)
        name = (@@class_names[uri] || {})[tag_name]
        return name if name

        tag_name = tag_name.gsub(/[_\-]([a-z]?)/) do
          $1.upcase
        end
        tag_name[0, 1].upcase + tag_name[1..-1]
      end

      # Record +accessor_base+ for the uri, name pair and make sure a
      # private start_<name> handler exists for it.
      def install_get_text_element(uri, name, accessor_base)
        install_accessor_base(uri, name, accessor_base)
        # Must be called directly from this method: the helper is asked
        # for caller frame 1.
        def_get_text_element(uri, name, *get_file_and_line_from_caller(1))
      end

      # Always true here.
      def raise_for_undefined_entity?
        true
      end

      private
      # set the accessor for the uri, tag_name pair
      # (a trailing "=" on accessor_base is removed).
      def install_accessor_base(uri, tag_name, accessor_base)
        @@accessor_bases[uri] ||= {}
        @@accessor_bases[uri][tag_name] = accessor_base.chomp("=")
      end

      # Register +uri+ for +element_name+ and, unless a private
      # start_<element_name> method already exists, define one that
      # dispatches on whether the element's namespace URI is registered.
      # +file+ and +line+ are accepted but not used here.
      def def_get_text_element(uri, element_name, file, line)
        register_uri(uri, element_name)
        method_name = "start_#{element_name}"
        unless private_method_defined?(method_name)
          define_method(method_name) do |name, prefix, attrs, ns|
            # Use a block-local name so the captured +uri+ argument is
            # not overwritten (and shared) by every call of this method.
            element_uri = _ns(ns, prefix)
            if self.class.uri_registered?(element_uri, element_name)
              start_get_text_element(name, prefix, ns, element_uri)
            else
              start_else_element(name, prefix, attrs, ns)
            end
          end
          private(method_name)
        end
      end
    end
  end
  # Parser callbacks (xmldecl, instruction, tag_start, tag_end, text) and
  # the private helpers they use to turn a stream of tags into elements.
  module ListenerMixin
    attr_reader :rss
    attr_accessor :ignore_unknown_element
    attr_accessor :do_validate

    # Set up empty parsing state; both options default to true.
    def initialize
      @rss = nil
      @ignore_unknown_element = true
      @do_validate = true
      @ns_stack = [{"xml" => :xml}]
      @tag_stack = [[]]
      @text_stack = ['']
      @proc_stack = []
      @last_element = nil
      @version = @encoding = @standalone = nil
      @xml_stylesheets = []
      @xml_child_mode = false
      @xml_element = nil
      @last_xml_element = nil
    end

    # set instance vars for version, encoding, standalone
    def xmldecl(version, encoding, standalone)
      @version, @encoding, @standalone = version, encoding, standalone
    end

    # Collect an XMLStyleSheet for each "xml-stylesheet" processing
    # instruction whose content includes an href; other instructions
    # are ignored.
    def instruction(name, content)
      if name == "xml-stylesheet"
        params = parse_pi_content(content)
        if params.has_key?("href")
          @xml_stylesheets << XMLStyleSheet.new(params)
        end
      end
    end

    # Handle an opening tag. Namespace declarations in +attributes+
    # (xmlns / xmlns:prefix) extend a copy of the current namespace scope;
    # the remaining attributes are passed on. In XML child mode an
    # XML::Element is built; otherwise the tag is dispatched to
    # initial_start_<local> (only while @rss is nil), start_<local>, or
    # start_else_element.
    def tag_start(name, attributes)
      @text_stack.push('')

      ns = @ns_stack.last.dup
      attrs = {}
      attributes.each do |n, v|
        if /\Axmlns(?:\z|:)/ =~ n
          # Text after "xmlns" / "xmlns:" is the prefix ("" for the
          # default namespace). Taken from the match itself rather than
          # the $POSTMATCH alias, which needs the English library loaded.
          ns[Regexp.last_match.post_match] = v
        else
          attrs[n] = v
        end
      end
      @ns_stack.push(ns)

      prefix, local = split_name(name)
      @tag_stack.last.push([_ns(ns, prefix), local])
      @tag_stack.push([])
      if @xml_child_mode
        previous = @last_xml_element
        element_attrs = attributes.dup
        unless previous
          # Topmost XML child: carry every in-scope namespace binding
          # (except "xml") onto the element unless already declared.
          ns.each do |ns_prefix, value|
            next if ns_prefix == "xml"
            key = ns_prefix.empty? ? "xmlns" : "xmlns:#{ns_prefix}"
            element_attrs[key] ||= value
          end
        end
        next_element = XML::Element.new(local,
                                        prefix.empty? ? nil : prefix,
                                        _ns(ns, prefix),
                                        element_attrs)
        previous << next_element if previous
        @last_xml_element = next_element
        # On the matching end tag, step back to the parent XML element,
        # or publish the finished tree in @xml_element at the top.
        pr = Proc.new do |text, tags|
          if previous
            @last_xml_element = previous
          else
            @xml_element = @last_xml_element
            @last_xml_element = nil
          end
        end
        @proc_stack.push(pr)
      else
        if @rss.nil? and respond_to?("initial_start_#{local}", true)
          __send__("initial_start_#{local}", local, prefix, attrs, ns.dup)
        elsif respond_to?("start_#{local}", true)
          __send__("start_#{local}", local, prefix, attrs, ns.dup)
        else
          start_else_element(local, prefix, attrs, ns.dup)
        end
      end
    end

    # Handle a closing tag: pop the text, child-tag list and handler
    # pushed for this tag, run the handler if there is one, and leave
    # the tag's namespace scope.
    def tag_end(name)
      if DEBUG
        p "end tag #{name}"
        p @tag_stack
      end
      text = @text_stack.pop
      tags = @tag_stack.pop
      pr = @proc_stack.pop
      pr.call(text, tags) unless pr.nil?
      @ns_stack.pop
    end

    # Append character data to the current XML element (XML child mode)
    # or to the text buffer of the current tag.
    def text(data)
      if @xml_child_mode
        @last_xml_element << data if @last_xml_element
      else
        @text_stack.last << data
      end
    end

    private
    # The URI bound to +prefix+ in +ns+, or "" when it is unbound.
    def _ns(ns, prefix)
      ns.fetch(prefix, "")
    end

    CONTENT_PATTERN = /\s*([^=]+)=(["'])([^\2]+?)\2/
    # Extract every name="value" pair from content (all matches of
    # CONTENT_PATTERN, which accepts single or double quotes).
    # Return a Hash of name => value; a repeated name keeps its
    # last value.
    def parse_pi_content(content)
      params = {}
      content.scan(CONTENT_PATTERN) do |name, quote, value|
        params[name] = value
      end
      params
    end

    # Handle a tag without a dedicated start_ handler: if the current
    # element's class has a constant named after the tag, start that
    # element; otherwise ignore the tag (pushing a nil handler) or raise
    # NotExpectedTagError, depending on the two options.
    def start_else_element(local, prefix, attrs, ns)
      class_name = self.class.class_name(_ns(ns, prefix), local)
      current_class = @last_element.class
      if class_name and
          (current_class.const_defined?(class_name) or
           current_class.constants.include?(class_name))
        next_class = current_class.const_get(class_name)
        start_have_something_element(local, prefix, attrs, ns, next_class)
      else
        if !@do_validate or @ignore_unknown_element
          @proc_stack.push(nil)
        else
          parent = "ROOT ELEMENT???"
          if current_class.tag_name
            parent = current_class.tag_name
          end
          raise NotExpectedTagError.new(local, _ns(ns, prefix), parent)
        end
      end
    end

    NAMESPLIT = /^(?:([\w:][-\w\d.]*):)?([\w:][-\w\d.]*)/
    # Split a qualified name into [prefix, local]; prefix is '' when the
    # name has none.
    def split_name(name)
      name =~ NAMESPLIT
      [$1 || '', $2]
    end

    # Ensure +prefix+ is bound to +require_uri+. When it is not, raise
    # NSError if validating, else bind it in the current namespace scope.
    def check_ns(tag_name, prefix, ns, require_uri)
      unless _ns(ns, prefix) == require_uri
        if @do_validate
          raise NSError.new(tag_name, prefix, require_uri)
        else
          # Force bind required URI with prefix
          @ns_stack.last[prefix] = require_uri
        end
      end
    end

    # Push a handler that, at the end tag, stores the tag's text on the
    # current element through the setter registered for the tag. When
    # validating, a value that is already set raises TooMuchTagError; a
    # missing setter raises NotExpectedTagError unless unknown elements
    # are ignored.
    def start_get_text_element(tag_name, prefix, ns, required_uri)
      pr = Proc.new do |text, tags|
        setter = self.class.setter(required_uri, tag_name)
        if @last_element.respond_to?(setter)
          if @do_validate
            getter = self.class.getter(required_uri, tag_name)
            if @last_element.__send__(getter)
              raise TooMuchTagError.new(tag_name, @last_element.tag_name)
            end
          end
          @last_element.__send__(setter, text.to_s)
        else
          if @do_validate and !@ignore_unknown_element
            raise NotExpectedTagError.new(tag_name, _ns(ns, prefix),
                                          @last_element.tag_name)
          end
        end
      end
      @proc_stack.push(pr)
    end

    # Start an element of class +klass+: check its namespace, gather its
    # attributes, and push the handler that finishes it at the end tag.
    def start_have_something_element(tag_name, prefix, attrs, ns, klass)
      check_ns(tag_name, prefix, ns, klass.required_uri)
      attributes = collect_attributes(tag_name, prefix, attrs, ns, klass)
      @proc_stack.push(setup_next_element(tag_name, klass, attributes))
    end

    # Build a Hash of attribute name => value for each attribute that
    # +klass+ declares through get_attributes. A namespaced attribute is
    # looked up as "prefix:name" under every prefix in +ns+ bound to one
    # of its URIs; an attribute allowing the "" URI falls back to the
    # bare name. When validating, a required attribute without a value
    # raises MissingAttributeError.
    def collect_attributes(tag_name, prefix, attrs, ns, klass)
      attributes = {}
      klass.get_attributes.each do |a_name, a_uri, required, element_name|
        if a_uri.is_a?(String) or !a_uri.respond_to?(:include?)
          a_uri = [a_uri]
        end

        val = nil
        unless a_uri == [""]
          # Block parameters (not a for loop) so the +prefix+ argument
          # is left untouched.
          ns.each do |ns_prefix, uri|
            next unless a_uri.include?(uri)
            val = attrs["#{ns_prefix}:#{a_name}"]
            break if val
          end
        end
        if val.nil? and a_uri.include?("")
          val = attrs[a_name]
        end

        if @do_validate and required and val.nil?
          missing_name = a_name
          unless a_uri.include?("")
            # Qualify the reported name with the first matching prefix
            # only; qualifying once per match would stack prefixes.
            ns.each do |ns_prefix, uri|
              if a_uri.include?(uri)
                missing_name = "#{ns_prefix}:#{a_name}"
                break
              end
            end
          end
          raise MissingAttributeError.new(tag_name, missing_name)
        end

        attributes[a_name] = val
      end
      attributes
    end

    # Create the +klass+ element, attach it to the current element, make
    # it current, and return the handler to run at its end tag. The
    # handler stores the collected XML or text content, validates when
    # requested, and makes the previous element current again.
    def setup_next_element(tag_name, klass, attributes)
      previous = @last_element
      next_element = klass.new(@do_validate, attributes)
      previous.set_next_element(tag_name, next_element)
      @last_element = next_element
      @last_element.parent = previous if klass.need_parent?
      @xml_child_mode = @last_element.have_xml_content?

      Proc.new do |text, tags|
        p(@last_element.class) if DEBUG
        if @xml_child_mode
          @last_element.content = @xml_element.to_s
          xml_setter = @last_element.class.xml_setter
          @last_element.__send__(xml_setter, @xml_element)
          @xml_element = nil
          @xml_child_mode = false
        else
          if klass.have_content?
            if @last_element.need_base64_encode?
              text = Base64.decode64(text.lstrip)
            end
            @last_element.content = text
          end
        end
        if @do_validate
          @last_element.validate_for_stream(tags, @ignore_unknown_element)
        end
        @last_element = previous
      end
    end
  end
  # Candidate backends as [library to require, parser constant name]
  # pairs. Only defined here when the constant does not exist yet.
  AVAILABLE_PARSER_LIBRARIES = [
    ["rss/xmlparser", :XMLParserParser],
    ["rss/xmlscanner", :XMLScanParser],
    ["rss/rexmlparser", :REXMLParser],
  ] unless const_defined?(:AVAILABLE_PARSER_LIBRARIES)

  # Parser classes whose library could be required, in list order.
  AVAILABLE_PARSERS = []

  AVAILABLE_PARSER_LIBRARIES.each do |library, parser_name|
    begin
      require library
      AVAILABLE_PARSERS << const_get(parser_name)
    rescue LoadError
      # This library could not be loaded; move on to the next candidate.
    end
  end

  # Without a single usable backend nothing can be parsed.
  raise XMLParserNotFound if AVAILABLE_PARSERS.empty?
  466. end