PageRenderTime 268ms CodeModel.GetById 201ms app.highlight 61ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/Ruby/lib/ruby/1.8/open-uri.rb

http://github.com/agross/netopenspace
Ruby | 678 lines | 417 code | 44 blank | 217 comment | 80 complexity | 6fa3faa13fbfc6ea54afde7d32ed8e5c MD5 | raw file
  1require 'uri'
  2require 'stringio'
  3require 'time'
  4
  5module Kernel
  6  private
  7  alias open_uri_original_open open # :nodoc:
  8
  9  # makes possible to open various resources including URIs.
 10  # If the first argument respond to `open' method,
 11  # the method is called with the rest arguments.
 12  #
 13  # If the first argument is a string which begins with xxx://,
 14  # it is parsed by URI.parse.  If the parsed object respond to `open' method,
 15  # the method is called with the rest arguments.
 16  #
 17  # Otherwise original open is called.
 18  #
 19  # Since open-uri.rb provides URI::HTTP#open, URI::HTTPS#open and
 20  # URI::FTP#open,
 21  # Kernel[#.]open can accepts such URIs and strings which begins with
 22  # http://, https:// and ftp://.
 23  # In these case, the opened file object is extended by OpenURI::Meta.
 24  def open(name, *rest, &block) # :doc:
 25    if name.respond_to?(:open)
 26      name.open(*rest, &block)
 27    elsif name.respond_to?(:to_str) &&
 28          %r{\A[A-Za-z][A-Za-z0-9+\-\.]*://} =~ name &&
 29          (uri = URI.parse(name)).respond_to?(:open)
 30      uri.open(*rest, &block)
 31    else
 32      open_uri_original_open(name, *rest, &block)
 33    end
 34  end
 35  module_function :open
 36end
 37
 38# OpenURI is an easy-to-use wrapper for net/http, net/https and net/ftp.
 39#
 40#== Example
 41#
 42# It is possible to open http/https/ftp URL as usual like opening a file:
 43#
 44#   open("http://www.ruby-lang.org/") {|f|
 45#     f.each_line {|line| p line}
 46#   }
 47#
 48# The opened file has several methods for meta information as follows since
 49# it is extended by OpenURI::Meta.
 50#
 51#   open("http://www.ruby-lang.org/en") {|f|
 52#     f.each_line {|line| p line}
 53#     p f.base_uri         # <URI::HTTP:0x40e6ef2 URL:http://www.ruby-lang.org/en/>
 54#     p f.content_type     # "text/html"
 55#     p f.charset          # "iso-8859-1"
 56#     p f.content_encoding # []
 57#     p f.last_modified    # Thu Dec 05 02:45:02 UTC 2002
 58#   }
 59#
 60# Additional header fields can be specified by an optional hash argument.
 61#
 62#   open("http://www.ruby-lang.org/en/",
 63#     "User-Agent" => "Ruby/#{RUBY_VERSION}",
 64#     "From" => "foo@bar.invalid",
 65#     "Referer" => "http://www.ruby-lang.org/") {|f|
 66#     # ...
 67#   }
 68#
 69# The environment variables such as http_proxy, https_proxy and ftp_proxy
 70# are in effect by default.  :proxy => nil disables proxy.
 71#
 72#   open("http://www.ruby-lang.org/en/raa.html", :proxy => nil) {|f|
 73#     # ...
 74#   }
 75#
 76# URI objects can be opened in a similar way.
 77#
 78#   uri = URI.parse("http://www.ruby-lang.org/en/")
 79#   uri.open {|f|
 80#     # ...
 81#   }
 82#
 83# URI objects can be read directly. The returned string is also extended by
 84# OpenURI::Meta.
 85#
 86#   str = uri.read
 87#   p str.base_uri
 88#
 89# Author:: Tanaka Akira <akr@m17n.org>
 90
 91module OpenURI
 92  Options = {
 93    :proxy => true,
 94    :progress_proc => true,
 95    :content_length_proc => true,
 96    :http_basic_authentication => true,
 97  }
 98
 99  def OpenURI.check_options(options) # :nodoc:
100    options.each {|k, v|
101      next unless Symbol === k
102      unless Options.include? k
103        raise ArgumentError, "unrecognized option: #{k}"
104      end
105    }
106  end
107
108  def OpenURI.scan_open_optional_arguments(*rest) # :nodoc:
109    if !rest.empty? && (String === rest.first || Integer === rest.first)
110      mode = rest.shift
111      if !rest.empty? && Integer === rest.first
112        perm = rest.shift
113      end
114    end
115    return mode, perm, rest
116  end
117
118  def OpenURI.open_uri(name, *rest) # :nodoc:
119    uri = URI::Generic === name ? name : URI.parse(name)
120    mode, perm, rest = OpenURI.scan_open_optional_arguments(*rest)
121    options = rest.shift if !rest.empty? && Hash === rest.first
122    raise ArgumentError.new("extra arguments") if !rest.empty?
123    options ||= {}
124    OpenURI.check_options(options)
125
126    unless mode == nil ||
127           mode == 'r' || mode == 'rb' ||
128           mode == File::RDONLY
129      raise ArgumentError.new("invalid access mode #{mode} (#{uri.class} resource is read only.)")
130    end
131
132    io = open_loop(uri, options)
133    if block_given?
134      begin
135        yield io
136      ensure
137        io.close
138      end
139    else
140      io
141    end
142  end
143
144  def OpenURI.open_loop(uri, options) # :nodoc:
145    case opt_proxy = options.fetch(:proxy, true)
146    when true
147      find_proxy = lambda {|u| u.find_proxy}
148    when nil, false
149      find_proxy = lambda {|u| nil}
150    when String
151      opt_proxy = URI.parse(opt_proxy)
152      find_proxy = lambda {|u| opt_proxy}
153    when URI::Generic
154      find_proxy = lambda {|u| opt_proxy}
155    else
156      raise ArgumentError.new("Invalid proxy option: #{opt_proxy}")
157    end
158
159    uri_set = {}
160    buf = nil
161    while true
162      redirect = catch(:open_uri_redirect) {
163        buf = Buffer.new
164        uri.buffer_open(buf, find_proxy.call(uri), options)
165        nil
166      }
167      if redirect
168        if redirect.relative?
169          # Although it violates RFC2616, Location: field may have relative
170          # URI.  It is converted to absolute URI using uri as a base URI.
171          redirect = uri + redirect
172        end
173        unless OpenURI.redirectable?(uri, redirect)
174          raise "redirection forbidden: #{uri} -> #{redirect}"
175        end
176        if options.include? :http_basic_authentication
177          # send authentication only for the URI directly specified.
178          options = options.dup
179          options.delete :http_basic_authentication
180        end
181        uri = redirect
182        raise "HTTP redirection loop: #{uri}" if uri_set.include? uri.to_s
183        uri_set[uri.to_s] = true
184      else
185        break
186      end
187    end
188    io = buf.io
189    io.base_uri = uri
190    io
191  end
192
193  def OpenURI.redirectable?(uri1, uri2) # :nodoc:
194    # This test is intended to forbid a redirection from http://... to
195    # file:///etc/passwd.
196    # However this is ad hoc.  It should be extensible/configurable.
197    uri1.scheme.downcase == uri2.scheme.downcase ||
198    (/\A(?:http|ftp)\z/i =~ uri1.scheme && /\A(?:http|ftp)\z/i =~ uri2.scheme)
199  end
200
201  def OpenURI.open_http(buf, target, proxy, options) # :nodoc:
202    if proxy
203      raise "Non-HTTP proxy URI: #{proxy}" if proxy.class != URI::HTTP
204    end
205
206    if target.userinfo && "1.9.0" <= RUBY_VERSION
207      # don't raise for 1.8 because compatibility.
208      raise ArgumentError, "userinfo not supported.  [RFC3986]"
209    end
210
211    require 'net/http'
212    klass = Net::HTTP
213    if URI::HTTP === target
214      # HTTP or HTTPS
215      if proxy
216        klass = Net::HTTP::Proxy(proxy.host, proxy.port)
217      end
218      target_host = target.host
219      target_port = target.port
220      request_uri = target.request_uri
221    else
222      # FTP over HTTP proxy
223      target_host = proxy.host
224      target_port = proxy.port
225      request_uri = target.to_s
226    end
227
228    http = klass.new(target_host, target_port)
229    if target.class == URI::HTTPS
230      require 'net/https'
231      http.use_ssl = true
232      http.verify_mode = OpenSSL::SSL::VERIFY_PEER
233      store = OpenSSL::X509::Store.new
234      store.set_default_paths
235      http.cert_store = store
236    end
237
238    header = {}
239    options.each {|k, v| header[k] = v if String === k }
240
241    resp = nil
242    http.start {
243      req = Net::HTTP::Get.new(request_uri, header)
244      if options.include? :http_basic_authentication
245        user, pass = options[:http_basic_authentication]
246        req.basic_auth user, pass
247      end
248      http.request(req) {|response|
249        resp = response
250        if options[:content_length_proc] && Net::HTTPSuccess === resp
251          if resp.key?('Content-Length')
252            options[:content_length_proc].call(resp['Content-Length'].to_i)
253          else
254            options[:content_length_proc].call(nil)
255          end
256        end
257        resp.read_body {|str|
258          buf << str
259          if options[:progress_proc] && Net::HTTPSuccess === resp
260            options[:progress_proc].call(buf.size)
261          end
262        }
263      }
264    }
265    io = buf.io
266    io.rewind
267    io.status = [resp.code, resp.message]
268    resp.each {|name,value| buf.io.meta_add_field name, value }
269    case resp
270    when Net::HTTPSuccess
271    when Net::HTTPMovedPermanently, # 301
272         Net::HTTPFound, # 302
273         Net::HTTPSeeOther, # 303
274         Net::HTTPTemporaryRedirect # 307
275      throw :open_uri_redirect, URI.parse(resp['location'])
276    else
277      raise OpenURI::HTTPError.new(io.status.join(' '), io)
278    end
279  end
280
281  class HTTPError < StandardError
282    def initialize(message, io)
283      super(message)
284      @io = io
285    end
286    attr_reader :io
287  end
288
289  class Buffer # :nodoc:
290    def initialize
291      @io = StringIO.new
292      @size = 0
293    end
294    attr_reader :size
295
296    StringMax = 10240
297    def <<(str)
298      @io << str
299      @size += str.length
300      if StringIO === @io && StringMax < @size
301        require 'tempfile'
302        io = Tempfile.new('open-uri')
303        io.binmode
304        Meta.init io, @io if @io.respond_to? :meta
305        io << @io.string
306        @io = io
307      end
308    end
309
310    def io
311      Meta.init @io unless @io.respond_to? :meta
312      @io
313    end
314  end
315
316  # Mixin for holding meta-information.
317  module Meta
318    def Meta.init(obj, src=nil) # :nodoc:
319      obj.extend Meta
320      obj.instance_eval {
321        @base_uri = nil
322        @meta = {}
323      }
324      if src
325        obj.status = src.status
326        obj.base_uri = src.base_uri
327        src.meta.each {|name, value|
328          obj.meta_add_field(name, value)
329        }
330      end
331    end
332
333    # returns an Array which consists status code and message.
334    attr_accessor :status
335
336    # returns a URI which is base of relative URIs in the data.
337    # It may differ from the URI supplied by a user because redirection.
338    attr_accessor :base_uri
339
340    # returns a Hash which represents header fields.
341    # The Hash keys are downcased for canonicalization.
342    attr_reader :meta
343
344    def meta_add_field(name, value) # :nodoc:
345      @meta[name.downcase] = value
346    end
347
348    # returns a Time which represents Last-Modified field.
349    def last_modified
350      if v = @meta['last-modified']
351        Time.httpdate(v)
352      else
353        nil
354      end
355    end
356
357    RE_LWS = /[\r\n\t ]+/n
358    RE_TOKEN = %r{[^\x00- ()<>@,;:\\"/\[\]?={}\x7f]+}n
359    RE_QUOTED_STRING = %r{"(?:[\r\n\t !#-\[\]-~\x80-\xff]|\\[\x00-\x7f])*"}n
360    RE_PARAMETERS = %r{(?:;#{RE_LWS}?#{RE_TOKEN}#{RE_LWS}?=#{RE_LWS}?(?:#{RE_TOKEN}|#{RE_QUOTED_STRING})#{RE_LWS}?)*}n
361
362    def content_type_parse # :nodoc:
363      v = @meta['content-type']
364      # The last (?:;#{RE_LWS}?)? matches extra ";" which violates RFC2045.
365      if v && %r{\A#{RE_LWS}?(#{RE_TOKEN})#{RE_LWS}?/(#{RE_TOKEN})#{RE_LWS}?(#{RE_PARAMETERS})(?:;#{RE_LWS}?)?\z}no =~ v
366        type = $1.downcase
367        subtype = $2.downcase
368        parameters = []
369        $3.scan(/;#{RE_LWS}?(#{RE_TOKEN})#{RE_LWS}?=#{RE_LWS}?(?:(#{RE_TOKEN})|(#{RE_QUOTED_STRING}))/no) {|att, val, qval|
370          val = qval.gsub(/[\r\n\t !#-\[\]-~\x80-\xff]+|(\\[\x00-\x7f])/) { $1 ? $1[1,1] : $& } if qval
371          parameters << [att.downcase, val]
372        }
373        ["#{type}/#{subtype}", *parameters]
374      else
375        nil
376      end
377    end
378
379    # returns "type/subtype" which is MIME Content-Type.
380    # It is downcased for canonicalization.
381    # Content-Type parameters are stripped.
382    def content_type
383      type, *parameters = content_type_parse
384      type || 'application/octet-stream'
385    end
386
387    # returns a charset parameter in Content-Type field.
388    # It is downcased for canonicalization.
389    #
390    # If charset parameter is not given but a block is given,
391    # the block is called and its result is returned.
392    # It can be used to guess charset.
393    #
394    # If charset parameter and block is not given,
395    # nil is returned except text type in HTTP.
396    # In that case, "iso-8859-1" is returned as defined by RFC2616 3.7.1.
397    def charset
398      type, *parameters = content_type_parse
399      if pair = parameters.assoc('charset')
400        pair.last.downcase
401      elsif block_given?
402        yield
403      elsif type && %r{\Atext/} =~ type &&
404            @base_uri && /\Ahttp\z/i =~ @base_uri.scheme
405        "iso-8859-1" # RFC2616 3.7.1
406      else
407        nil
408      end
409    end
410
411    # returns a list of encodings in Content-Encoding field
412    # as an Array of String.
413    # The encodings are downcased for canonicalization.
414    def content_encoding
415      v = @meta['content-encoding']
416      if v && %r{\A#{RE_LWS}?#{RE_TOKEN}#{RE_LWS}?(?:,#{RE_LWS}?#{RE_TOKEN}#{RE_LWS}?)*}o =~ v
417        v.scan(RE_TOKEN).map {|content_coding| content_coding.downcase}
418      else
419        []
420      end
421    end
422  end
423
424  # Mixin for HTTP and FTP URIs.
425  module OpenRead
426    # OpenURI::OpenRead#open provides `open' for URI::HTTP and URI::FTP.
427    #
428    # OpenURI::OpenRead#open takes optional 3 arguments as:
429    # OpenURI::OpenRead#open([mode [, perm]] [, options]) [{|io| ... }]
430    #
431    # `mode', `perm' is same as Kernel#open.
432    #
433    # However, `mode' must be read mode because OpenURI::OpenRead#open doesn't
434    # support write mode (yet).
435    # Also `perm' is just ignored because it is meaningful only for file
436    # creation.
437    #
438    # `options' must be a hash.
439    #
440    # Each pairs which key is a string in the hash specify a extra header
441    # field for HTTP.
442    # I.e. it is ignored for FTP without HTTP proxy.
443    #
444    # The hash may include other options which key is a symbol:
445    #
446    # [:proxy]
447    #  Synopsis:
448    #    :proxy => "http://proxy.foo.com:8000/"
449    #    :proxy => URI.parse("http://proxy.foo.com:8000/")
450    #    :proxy => true
451    #    :proxy => false
452    #    :proxy => nil
453    #   
454    #  If :proxy option is specified, the value should be String, URI,
455    #  boolean or nil.
456    #  When String or URI is given, it is treated as proxy URI.
457    #  When true is given or the option itself is not specified,
458    #  environment variable `scheme_proxy' is examined.
459    #  `scheme' is replaced by `http', `https' or `ftp'.
460    #  When false or nil is given, the environment variables are ignored and
461    #  connection will be made to a server directly.
462    #
463    # [:http_basic_authentication]
464    #  Synopsis:
465    #    :http_basic_authentication=>[user, password]
466    #
467    #  If :http_basic_authentication is specified,
468    #  the value should be an array which contains 2 strings:
469    #  username and password.
470    #  It is used for HTTP Basic authentication defined by RFC 2617.
471    #
472    # [:content_length_proc]
473    #  Synopsis:
474    #    :content_length_proc => lambda {|content_length| ... }
475    # 
476    #  If :content_length_proc option is specified, the option value procedure
477    #  is called before actual transfer is started.
478    #  It takes one argument which is expected content length in bytes.
479    # 
480    #  If two or more transfer is done by HTTP redirection, the procedure
481    #  is called only one for a last transfer.
482    # 
483    #  When expected content length is unknown, the procedure is called with
484    #  nil.
485    #  It is happen when HTTP response has no Content-Length header.
486    #
487    # [:progress_proc]
488    #  Synopsis:
489    #    :progress_proc => lambda {|size| ...}
490    #
491    #  If :progress_proc option is specified, the proc is called with one
492    #  argument each time when `open' gets content fragment from network.
493    #  The argument `size' `size' is a accumulated transfered size in bytes.
494    #
495    #  If two or more transfer is done by HTTP redirection, the procedure
496    #  is called only one for a last transfer.
497    #
498    #  :progress_proc and :content_length_proc are intended to be used for
499    #  progress bar.
500    #  For example, it can be implemented as follows using Ruby/ProgressBar.
501    #
502    #    pbar = nil
503    #    open("http://...",
504    #      :content_length_proc => lambda {|t|
505    #        if t && 0 < t
506    #          pbar = ProgressBar.new("...", t)
507    #          pbar.file_transfer_mode
508    #        end
509    #      },
510    #      :progress_proc => lambda {|s|
511    #        pbar.set s if pbar
512    #      }) {|f| ... }
513    #
514    # OpenURI::OpenRead#open returns an IO like object if block is not given.
515    # Otherwise it yields the IO object and return the value of the block.
516    # The IO object is extended with OpenURI::Meta.
517    def open(*rest, &block)
518      OpenURI.open_uri(self, *rest, &block)
519    end
520
521    # OpenURI::OpenRead#read([options]) reads a content referenced by self and
522    # returns the content as string.
523    # The string is extended with OpenURI::Meta.
524    # The argument `options' is same as OpenURI::OpenRead#open.
525    def read(options={})
526      self.open(options) {|f|
527        str = f.read
528        Meta.init str, f
529        str
530      }
531    end
532  end
533end
534
535module URI
536  class Generic
537    # returns a proxy URI.
538    # The proxy URI is obtained from environment variables such as http_proxy,
539    # ftp_proxy, no_proxy, etc.
540    # If there is no proper proxy, nil is returned.
541    #
542    # Note that capitalized variables (HTTP_PROXY, FTP_PROXY, NO_PROXY, etc.)
543    # are examined too.
544    #
545    # But http_proxy and HTTP_PROXY is treated specially under CGI environment.
546    # It's because HTTP_PROXY may be set by Proxy: header.
547    # So HTTP_PROXY is not used.
548    # http_proxy is not used too if the variable is case insensitive.
549    # CGI_HTTP_PROXY can be used instead.
550    def find_proxy
551      name = self.scheme.downcase + '_proxy'
552      proxy_uri = nil
553      if name == 'http_proxy' && ENV.include?('REQUEST_METHOD') # CGI?
554        # HTTP_PROXY conflicts with *_proxy for proxy settings and
555        # HTTP_* for header information in CGI.
556        # So it should be careful to use it.
557        pairs = ENV.reject {|k, v| /\Ahttp_proxy\z/i !~ k }
558        case pairs.length
559        when 0 # no proxy setting anyway.
560          proxy_uri = nil
561        when 1
562          k, v = pairs.shift
563          if k == 'http_proxy' && ENV[k.upcase] == nil
564            # http_proxy is safe to use because ENV is case sensitive.
565            proxy_uri = ENV[name]
566          else
567            proxy_uri = nil
568          end
569        else # http_proxy is safe to use because ENV is case sensitive.
570          proxy_uri = ENV.to_hash[name]
571        end
572        if !proxy_uri
573          # Use CGI_HTTP_PROXY.  cf. libwww-perl.
574          proxy_uri = ENV["CGI_#{name.upcase}"]
575        end
576      elsif name == 'http_proxy'
577        unless proxy_uri = ENV[name]
578          if proxy_uri = ENV[name.upcase]
579            warn 'The environment variable HTTP_PROXY is discouraged.  Use http_proxy.'
580          end
581        end
582      else
583        proxy_uri = ENV[name] || ENV[name.upcase]
584      end
585
586      if proxy_uri && self.host
587        require 'socket'
588        begin
589          addr = IPSocket.getaddress(self.host)
590          proxy_uri = nil if /\A127\.|\A::1\z/ =~ addr
591        rescue SocketError
592        end
593      end
594
595      if proxy_uri
596        proxy_uri = URI.parse(proxy_uri)
597        name = 'no_proxy'
598        if no_proxy = ENV[name] || ENV[name.upcase]
599          no_proxy.scan(/([^:,]*)(?::(\d+))?/) {|host, port|
600            if /(\A|\.)#{Regexp.quote host}\z/i =~ self.host &&
601               (!port || self.port == port.to_i)
602              proxy_uri = nil
603              break
604            end
605          }
606        end
607        proxy_uri
608      else
609        nil
610      end
611    end
612  end
613
614  class HTTP
615    def buffer_open(buf, proxy, options) # :nodoc:
616      OpenURI.open_http(buf, self, proxy, options)
617    end
618
619    include OpenURI::OpenRead
620  end
621
622  class FTP
623    def buffer_open(buf, proxy, options) # :nodoc:
624      if proxy
625        OpenURI.open_http(buf, self, proxy, options)
626        return
627      end
628      require 'net/ftp'
629
630      directories = self.path.split(%r{/}, -1)
631      directories.shift if directories[0] == '' # strip a field before leading slash
632      directories.each {|d|
633        d.gsub!(/%([0-9A-Fa-f][0-9A-Fa-f])/) { [$1].pack("H2") }
634      }
635      unless filename = directories.pop
636        raise ArgumentError, "no filename: #{self.inspect}"
637      end
638      directories.each {|d|
639        if /[\r\n]/ =~ d
640          raise ArgumentError, "invalid directory: #{d.inspect}"
641        end
642      }
643      if /[\r\n]/ =~ filename
644        raise ArgumentError, "invalid filename: #{filename.inspect}"
645      end
646      typecode = self.typecode
647      if typecode && /\A[aid]\z/ !~ typecode
648        raise ArgumentError, "invalid typecode: #{typecode.inspect}"
649      end
650
651      # The access sequence is defined by RFC 1738
652      ftp = Net::FTP.open(self.host)
653      # todo: extract user/passwd from .netrc.
654      user = 'anonymous'
655      passwd = nil
656      user, passwd = self.userinfo.split(/:/) if self.userinfo
657      ftp.login(user, passwd)
658      directories.each {|cwd|
659        ftp.voidcmd("CWD #{cwd}")
660      }
661      if typecode
662        # xxx: typecode D is not handled.
663        ftp.voidcmd("TYPE #{typecode.upcase}")
664      end
665      if options[:content_length_proc]
666        options[:content_length_proc].call(ftp.size(filename))
667      end
668      ftp.retrbinary("RETR #{filename}", 4096) { |str|
669        buf << str
670        options[:progress_proc].call(buf.size) if options[:progress_proc]
671      }
672      ftp.close
673      buf.io.rewind
674    end
675
676    include OpenURI::OpenRead
677  end
678end