PageRenderTime 51ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/extensions/uri/uri/common.rb

http://github.com/rhomobile/rhodes
Ruby | 857 lines | 474 code | 48 blank | 335 comment | 7 complexity | 600147d8426454bd3207cc432a3e93a1 MD5 | raw file
Possible License(s): CC-BY-SA-3.0, MIT, Apache-2.0, LGPL-2.1, GPL-2.0, MPL-2.0-no-copyleft-exception
  1. # = uri/common.rb
  2. #
  3. # Author:: Akira Yamada <akira@ruby-lang.org>
  4. # Revision:: $Id: common.rb 31799 2011-05-29 22:49:36Z yugui $
  5. # License::
  6. # You can redistribute it and/or modify it under the same term as Ruby.
  7. #
  8. module URI
  module REGEXP
    #
    # String fragments from which the URI parsing regexps are assembled.
    #
    module PATTERN
      # :stopdoc:

      # Grammar sources:
      #   RFC 2396 (URI Generic Syntax)
      #   RFC 2732 (IPv6 Literal Addresses in URL's)
      #   RFC 2373 (IPv6 Addressing Architecture)

      # alpha = lowalpha | upalpha
      ALPHA = 'a-zA-Z'
      # alphanum = alpha | digit
      ALNUM = ALPHA + '\d'

      # hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
      #       "a" | "b" | "c" | "d" | "e" | "f"
      HEX = 'a-fA-F\d'
      # escaped = "%" hex hex
      ESCAPED = '%[' + HEX + ']{2}'
      # mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
      #        "(" | ")"
      # unreserved = alphanum | mark
      UNRESERVED = "-_.!~*'()" + ALNUM
      # reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #            "$" | ","
      # reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #            "$" | "," | "[" | "]" (RFC 2732)
      RESERVED = ';/?:@&=+$,\[\]'

      # Shared tail of both label rules: *( alphanum | "-" ) alphanum, optional.
      label_tail = "(?:[-#{ALNUM}]*[#{ALNUM}])?"

      # domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
      DOMLABEL = "(?:[#{ALNUM}]#{label_tail})"
      # toplabel = alpha | alpha *( alphanum | "-" ) alphanum
      TOPLABEL = "(?:[#{ALPHA}]#{label_tail})"
      # hostname = *( domainlabel "." ) toplabel [ "." ]
      HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"

      # :startdoc:
    end # PATTERN

    # :startdoc:
  end # REGEXP
  46. class Parser
  47. include REGEXP
  #
  # == Synopsis
  #
  #   URI::Parser.new([opts])
  #
  # == Args
  #
  # +opts+ is a hash whose keys are pattern names of URI components and
  # whose values are pattern strings that replace the defaults. The
  # constructor generates the set of regexps used for parsing URIs from them.
  #
  # Recognized keys:
  #
  # * <tt>:ESCAPED</tt> (URI::PATTERN::ESCAPED by default)
  # * <tt>:UNRESERVED</tt> (URI::PATTERN::UNRESERVED by default)
  # * <tt>:RESERVED</tt> (URI::PATTERN::RESERVED by default)
  # * <tt>:DOMLABEL</tt> (URI::PATTERN::DOMLABEL by default)
  # * <tt>:TOPLABEL</tt> (URI::PATTERN::TOPLABEL by default)
  # * <tt>:HOSTNAME</tt> (URI::PATTERN::HOSTNAME by default)
  #
  # == Examples
  #
  #   p = URI::Parser.new(:ESCAPED => "(?:%[a-fA-F0-9]{2}|%u[a-fA-F0-9]{4})")
  #   u = p.parse("http://example.jp/%uABCD") #=> #<URI::HTTP:0xb78cf4f8 URL:http://example.jp/%uABCD>
  #   URI.parse(u.to_s) #=> raises URI::InvalidURIError
  #
  #   s = "http://example.com/ABCD"
  #   u1 = p.parse(s) #=> #<URI::HTTP:0xb78c3220 URL:http://example.com/ABCD>
  #   u2 = URI.parse(s) #=> #<URI::HTTP:0xb78b6d54 URL:http://example.com/ABCD>
  #   u1 == u2 #=> true
  #   u1.eql?(u2) #=> false
  #
  def initialize(opts = {})
    # Both tables, and every value stored in them, are frozen after creation.
    patterns = initialize_pattern(opts)
    patterns.each_value(&:freeze)
    @pattern = patterns.freeze

    regexps = initialize_regexp(@pattern)
    regexps.each_value(&:freeze)
    @regexp = regexps.freeze
  end

  # The frozen pattern-string table and the frozen compiled-regexp table.
  attr_reader :pattern, :regexp
  # Splits +uri+ into a nine-element array:
  #
  #   [scheme, userinfo, host, port, registry, path, opaque, query, fragment]
  #
  # The empty string yields all +nil+ components except an empty path.
  # Raises InvalidURIError when +uri+ matches neither the absolute nor the
  # relative URI regexp, or when an absolute match lacks a scheme or has
  # none of opaque part, path, host and registry.
  def split(uri)
    scheme = userinfo = host = port = registry = nil
    path = opaque = query = fragment = nil

    case uri
    when ''
      # null uri: nothing to extract
    when @regexp[:ABS_URI]
      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
      # absoluteURI   = scheme ":" ( hier_part | opaque_part )
      # hier_part     = ( net_path | abs_path ) [ "?" query ]
      # opaque_part   = uric_no_slash *uric
      # abs_path      = "/" path_segments
      # net_path      = "//" authority [ abs_path ]
      # authority     = server | reg_name
      # server        = [ [ userinfo "@" ] hostport ]
      scheme, opaque, userinfo, host, port,
        registry, path, query, fragment = Regexp.last_match.captures

      unless scheme
        raise InvalidURIError,
              "bad URI(absolute but no scheme): #{uri}"
      end
      unless opaque || path || host || registry
        raise InvalidURIError,
              "bad URI(absolute but no path): #{uri}"
      end
    when @regexp[:REL_URI]
      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
      # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
      # net_path      = "//" authority [ abs_path ]
      # abs_path      = "/" path_segments
      # rel_path      = rel_segment [ abs_path ]
      # authority     = server | reg_name
      # server        = [ [ userinfo "@" ] hostport ]
      userinfo, host, port, registry,
        rel_segment, abs_path, query, fragment = Regexp.last_match.captures

      # The path is whichever of rel_segment / abs_path matched, concatenated.
      path = rel_segment ? rel_segment + abs_path.to_s : abs_path
    else
      raise InvalidURIError, "bad URI(is not URI?): #{uri}"
    end

    # (see RFC2396 Section 5.2)
    path ||= '' unless opaque

    [
      scheme,
      userinfo, host, port, # X
      registry,             # X
      path,                 # Y
      opaque,               # Y
      query,
      fragment
    ]
  end
  # Splits +uri+ and instantiates a URI object from its components.
  #
  # The class registered in URI.scheme_list under the upcased scheme is used
  # when there is one; otherwise Generic is used. The parser itself is passed
  # as the last constructor argument.
  def parse(uri)
    scheme, userinfo, host, port,
      registry, path, opaque, query, fragment = split(uri)

    scheme_key = scheme && scheme.upcase
    uri_class =
      if scheme_key && URI.scheme_list.include?(scheme_key)
        URI.scheme_list[scheme_key]
      else
        Generic
      end

    uri_class.new(scheme, userinfo, host, port,
                  registry, path, opaque, query,
                  fragment, self)
  end
  # Parses the first string and merges every following string into the
  # result, in order, via the parsed object's +merge+. Returns the final
  # merged object.
  def join(*str)
    str[1..-1].inject(parse(str.first)) do |base, reference|
      base.merge(reference)
    end
  end
  # Scans +str+ with the regexp from make_regexp(schemes).
  #
  # With a block, yields each whole match and returns +nil+; without one,
  # returns an array of the whole matches.
  def extract(str, schemes = nil, &block)
    matcher = make_regexp(schemes)

    unless block_given?
      found = []
      # The whole match is taken from the match data because scan would
      # yield capture groups instead when the regexp has any.
      str.scan(matcher) { found << Regexp.last_match(0) }
      return found
    end

    str.scan(matcher) { yield Regexp.last_match(0) }
    nil
  end
  # Returns the regexp used to find absolute URIs.
  #
  # Without +schemes+ this is the precompiled @regexp[:ABS_URI_REF]. With
  # +schemes+, a new extended-mode regexp is built from @pattern[:X_ABS_URI],
  # prefixed by a lookahead requiring one of the given schemes and a ":".
  def make_regexp(schemes = nil)
    return @regexp[:ABS_URI_REF] unless schemes

    scheme_guard = Regexp.union(*schemes)
    Regexp.new("(?=#{scheme_guard}:)#{@pattern[:X_ABS_URI]}", Regexp::EXTENDED)
  end
  # Percent-encodes every part of +str+ matched by +unsafe+.
  #
  # +unsafe+ is a Regexp, or otherwise a string treated as a set of
  # characters to escape. Each byte of a match becomes "%XX" (uppercase hex).
  # The result is tagged as US-ASCII.
  def escape(str, unsafe = @regexp[:UNSAFE])
    unsafe_re =
      if unsafe.kind_of?(Regexp)
        unsafe
      else
        # perhaps unsafe is String object
        Regexp.new("[#{Regexp.quote(unsafe)}]", false)
      end

    encoded = str.gsub(unsafe_re) do
      Regexp.last_match(0).each_byte.map { |byte| sprintf('%%%02X', byte) }.join
    end
    encoded.force_encoding("US-ASCII") # Encoding::US_ASCII
  end
  # Replaces every part of +str+ matched by +escaped+ (by default the
  # parser's "%XX" escape regexp) with the byte named by the two hex digits
  # that follow the "%". The result carries the encoding of +str+.
  #
  # Each decoded byte is tagged with the input's encoding before it is
  # spliced in. A raw binary byte >= 0x80 cannot be combined with a string
  # that already holds non-ASCII characters in another encoding; doing so
  # raises Encoding::CompatibilityError.
  def unescape(str, escaped = @regexp[:ESCAPED])
    enc = str.encoding
    decoded = str.gsub(escaped) do
      [$&[1, 2].hex].pack('C').force_encoding(enc)
    end
    decoded.force_encoding(enc)
  end
  # Unbound Kernel#to_s, captured once when the class body is evaluated.
  @@to_s = Kernel.instance_method(:to_s)

  # Returns the result of the captured Kernel#to_s bound to this object.
  def inspect
    plain_to_s = @@to_s.bind(self)
    plain_to_s.call
  end
  204. private
  # Builds the table of pattern strings (Symbol => String) from which the
  # parser's regexps are compiled.
  #
  # +opts+ may override :ESCAPED, :UNRESERVED, :RESERVED, :DOMLABEL,
  # :TOPLABEL and :HOSTNAME; any key not given falls back to the PATTERN
  # constant of the same name (:HOSTNAME is derived from the label patterns).
  # +opts+ is only read, never modified, so a caller may reuse or freeze it.
  #
  # The table includes :PORT, which initialize_regexp reads to build the
  # port validator.
  def initialize_pattern(opts = {})
    ret = {}
    ret[:ESCAPED] = escaped = (opts[:ESCAPED] || PATTERN::ESCAPED)
    ret[:UNRESERVED] = unreserved = opts[:UNRESERVED] || PATTERN::UNRESERVED
    ret[:RESERVED] = reserved = opts[:RESERVED] || PATTERN::RESERVED
    ret[:DOMLABEL] = domlabel = opts[:DOMLABEL] || PATTERN::DOMLABEL
    ret[:TOPLABEL] = toplabel = opts[:TOPLABEL] || PATTERN::TOPLABEL
    ret[:HOSTNAME] = hostname = opts[:HOSTNAME]

    # RFC 2396 (URI Generic Syntax)
    # RFC 2732 (IPv6 Literal Addresses in URL's)
    # RFC 2373 (IPv6 Addressing Architecture)

    # uric = reserved | unreserved | escaped
    ret[:URIC] = uric = "(?:[#{unreserved}#{reserved}]|#{escaped})"
    # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
    #                 "&" | "=" | "+" | "$" | ","
    ret[:URIC_NO_SLASH] = uric_no_slash = "(?:[#{unreserved};?:@&=+$,]|#{escaped})"
    # query = *uric
    ret[:QUERY] = query = "#{uric}*"
    # fragment = *uric
    ret[:FRAGMENT] = fragment = "#{uric}*"

    # hostname = *( domainlabel "." ) toplabel [ "." ]
    unless hostname
      ret[:HOSTNAME] = hostname = "(?:#{domlabel}\\.)*#{toplabel}\\.?"
    end

    # RFC 2373, APPENDIX B:
    # IPv6address = hexpart [ ":" IPv4address ]
    # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
    # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
    # hexseq = hex4 *( ":" hex4)
    # hex4 = 1*4HEXDIG
    #
    # XXX: This definition has a flaw. "::" + IPv4address must be
    # allowed too. Here is a replacement.
    #
    # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
    ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
    # hex4 = 1*4HEXDIG
    hex4 = "[#{PATTERN::HEX}]{1,4}"
    # lastpart = hex4 | IPv4address
    lastpart = "(?:#{hex4}|#{ipv4addr})"
    # hexseq1 = *( hex4 ":" ) hex4
    hexseq1 = "(?:#{hex4}:)*#{hex4}"
    # hexseq2 = *( hex4 ":" ) lastpart
    hexseq2 = "(?:#{hex4}:)*#{lastpart}"
    # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
    ret[:IPV6ADDR] = ipv6addr = "(?:#{hexseq2}|(?:#{hexseq1})?::(?:#{hexseq2})?)"

    # IPv6prefix = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
    # unused

    # ipv6reference = "[" IPv6address "]" (RFC 2732)
    ret[:IPV6REF] = ipv6ref = "\\[#{ipv6addr}\\]"

    # host = hostname | IPv4address
    # host = hostname | IPv4address | IPv6reference (RFC 2732)
    ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})"
    # port = *digit
    # Stored in the table so that initialize_regexp can build the port
    # validator from pattern[:PORT].
    ret[:PORT] = port = '\d*'
    # hostport = host [ ":" port ]
    ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?"

    # userinfo = *( unreserved | escaped |
    #               ";" | ":" | "&" | "=" | "+" | "$" | "," )
    ret[:USERINFO] = userinfo = "(?:[#{unreserved};:&=+$,]|#{escaped})*"

    # pchar = unreserved | escaped |
    #         ":" | "@" | "&" | "=" | "+" | "$" | ","
    pchar = "(?:[#{unreserved}:@&=+$,]|#{escaped})"
    # param = *pchar
    param = "#{pchar}*"
    # segment = *pchar *( ";" param )
    segment = "#{pchar}*(?:;#{param})*"
    # path_segments = segment *( "/" segment )
    ret[:PATH_SEGMENTS] = path_segments = "#{segment}(?:/#{segment})*"

    # server = [ [ userinfo "@" ] hostport ]
    server = "(?:#{userinfo}@)?#{hostport}"
    # reg_name = 1*( unreserved | escaped | "$" | "," |
    #                ";" | ":" | "@" | "&" | "=" | "+" )
    ret[:REG_NAME] = reg_name = "(?:[#{unreserved}$,;:@&=+]|#{escaped})+"
    # authority = server | reg_name
    authority = "(?:#{server}|#{reg_name})"

    # rel_segment = 1*( unreserved | escaped |
    #                   ";" | "@" | "&" | "=" | "+" | "$" | "," )
    ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+"

    # scheme = alpha *( alpha | digit | "+" | "-" | "." )
    ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][-+.#{PATTERN::ALPHA}\\d]*"

    # abs_path = "/" path_segments
    ret[:ABS_PATH] = abs_path = "/#{path_segments}"
    # rel_path = rel_segment [ abs_path ]
    ret[:REL_PATH] = rel_path = "#{rel_segment}(?:#{abs_path})?"
    # net_path = "//" authority [ abs_path ]
    ret[:NET_PATH] = net_path = "//#{authority}(?:#{abs_path})?"

    # hier_part = ( net_path | abs_path ) [ "?" query ]
    ret[:HIER_PART] = hier_part = "(?:#{net_path}|#{abs_path})(?:\\?(?:#{query}))?"
    # opaque_part = uric_no_slash *uric
    ret[:OPAQUE_PART] = opaque_part = "#{uric_no_slash}#{uric}*"

    # absoluteURI = scheme ":" ( hier_part | opaque_part )
    ret[:ABS_URI] = abs_uri = "#{scheme}:(?:#{hier_part}|#{opaque_part})"
    # relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
    ret[:REL_URI] = rel_uri = "(?:#{net_path}|#{abs_path}|#{rel_path})(?:\\?#{query})?"

    # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
    ret[:URI_REF] = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?"

    # The X_* patterns are written for extended mode: whitespace is ignored
    # and the (?# ...) groups name the capture indexes that split relies on.
    ret[:X_ABS_URI] = "
      (#{scheme}):                           (?# 1: scheme)
      (?:
         (#{opaque_part})                    (?# 2: opaque)
      |
         (?:(?:
           //(?:
               (?:(?:(#{userinfo})@)?        (?# 3: userinfo)
                 (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port)
             |
               (#{reg_name})                 (?# 6: registry)
             )
           |
           (?!//))                           (?# XXX: '//' is the mark for hostport)
           (#{abs_path})?                    (?# 7: path)
         )(?:\\?(#{query}))?                 (?# 8: query)
      )
      (?:\\#(#{fragment}))?                  (?# 9: fragment)
    "

    ret[:X_REL_URI] = "
      (?:
        (?:
          //
          (?:
            (?:(#{userinfo})@)?       (?# 1: userinfo)
              (#{host})?(?::(\\d*))?  (?# 2: host, 3: port)
          |
            (#{reg_name})             (?# 4: registry)
          )
        )
      |
        (#{rel_segment})              (?# 5: rel_segment)
      )?
      (#{abs_path})?                  (?# 6: abs_path)
      (?:\\?(#{query}))?              (?# 7: query)
      (?:\\#(#{fragment}))?           (?# 8: fragment)
    "

    ret
  end
  # Compiles the pattern strings in +pattern+ into the regexps used by the
  # parser and returns them as a Symbol => Regexp hash.
  #
  # The component validators (:SCHEME, :USERINFO, ... :FRAGMENT) are
  # anchored with \A and \z so that they must match the whole string. Line
  # anchors (^ and $) would accept any multi-line string in which a single
  # line is valid.
  #
  # When +pattern+ has no :PORT entry the RFC 2396 rule "port = *digit" is
  # used, so the port validator never degenerates to matching only the empty
  # string.
  def initialize_regexp(pattern)
    ret = {}

    # for URI::split
    ret[:ABS_URI] = Regexp.new('\A\s*' + pattern[:X_ABS_URI] + '\s*\z', Regexp::EXTENDED)
    ret[:REL_URI] = Regexp.new('\A\s*' + pattern[:X_REL_URI] + '\s*\z', Regexp::EXTENDED)

    # for URI::extract
    ret[:URI_REF] = Regexp.new(pattern[:URI_REF])
    ret[:ABS_URI_REF] = Regexp.new(pattern[:X_ABS_URI], Regexp::EXTENDED)
    ret[:REL_URI_REF] = Regexp.new(pattern[:X_REL_URI], Regexp::EXTENDED)

    # for URI::escape/unescape
    ret[:ESCAPED] = Regexp.new(pattern[:ESCAPED])
    ret[:UNSAFE] = Regexp.new("[^#{pattern[:UNRESERVED]}#{pattern[:RESERVED]}]")

    # for Generic#initialize: [regexp name, source pattern string]
    whole_string_validators = [
      [:SCHEME,   pattern[:SCHEME]],
      [:USERINFO, pattern[:USERINFO]],
      [:HOST,     pattern[:HOST]],
      [:PORT,     pattern[:PORT] || '\d*'],
      [:OPAQUE,   pattern[:OPAQUE_PART]],
      [:REGISTRY, pattern[:REG_NAME]],
      [:ABS_PATH, pattern[:ABS_PATH]],
      [:REL_PATH, pattern[:REL_PATH]],
      [:QUERY,    pattern[:QUERY]],
      [:FRAGMENT, pattern[:FRAGMENT]]
    ]
    whole_string_validators.each do |name, source|
      ret[name] = Regexp.new("\\A#{source}\\z")
    end

    ret
  end
  366. end # class Parser
  # Parser instance built with default options; the module-level helpers
  # delegate to it.
  DEFAULT_PARSER = Parser.new

  # Publish each generated pattern string as a REGEXP::PATTERN constant,
  # leaving any constant that is already defined there untouched.
  DEFAULT_PARSER.pattern.each_pair do |name, source|
    next if REGEXP::PATTERN.const_defined?(name)
    REGEXP::PATTERN.const_set(name, source)
  end

  # Publish each compiled regexp as a constant of the enclosing module.
  DEFAULT_PARSER.regexp.each_pair { |name, regexp| const_set(name, regexp) }
  module Util # :nodoc:
    # Builds a component hash for +klass+ from +array_hash+.
    #
    # +array_hash+ is either an Array holding one value for each entry of
    # klass.component after the first, or a Hash of component => value.
    # Values are cloned when they can be. The :scheme entry is always set to
    # the downcased class name with any leading namespace removed.
    #
    # Raises ArgumentError for any other input, including an Array of the
    # wrong length.
    def make_components_hash(klass, array_hash)
      # Clone a value, falling back to the value itself if clone raises
      # TypeError.
      copy_of = lambda do |value|
        begin
          value.clone
        rescue TypeError
          value
        end
      end

      tmp = {}
      if array_hash.kind_of?(Array) &&
         array_hash.size == klass.component.size - 1
        names = klass.component[1..-1]
        names.zip(array_hash) { |name, value| tmp[name] = copy_of.call(value) }
      elsif array_hash.kind_of?(Hash)
        array_hash.each { |key, value| tmp[key] = copy_of.call(value) }
      else
        raise ArgumentError,
              "expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})"
      end

      tmp[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase
      tmp
    end
    module_function :make_components_hash
  end
  module Escape
    #
    # == Synopsis
    #
    #   URI.escape(str [, unsafe])
    #
    # == Args
    #
    # +str+::
    #   String to escape.
    # +unsafe+::
    #   Regexp matching every character that must be replaced with a code.
    #   Defaults to <tt>REGEXP::UNSAFE</tt>.
    #   A String argument is treated as a set of characters.
    #
    # == Description
    #
    # Escapes the string by replacing all unsafe characters with %XX codes.
    # The work is delegated to DEFAULT_PARSER. Emits an "obsolete" warning
    # when $VERBOSE is set.
    #
    # == Usage
    #
    #   require 'uri'
    #
    #   enc_uri = URI.escape("http://example.com/?a=\11\15")
    #   p enc_uri
    #   # => "http://example.com/?a=%09%0D"
    #
    #   p URI.unescape(enc_uri)
    #   # => "http://example.com/?a=\t\r"
    #
    #   p URI.escape("@?@!", "!?")
    #   # => "@%3F@%21"
    #
    def escape(*arg)
      if $VERBOSE
        warn "#{caller(1)[0]}: warning: URI.escape is obsolete"
      end
      DEFAULT_PARSER.escape(*arg)
    end
    alias_method :encode, :escape

    #
    # == Synopsis
    #
    #   URI.unescape(str)
    #
    # == Args
    #
    # +str+::
    #   String to unescape.
    #
    # == Description
    #
    # Reverses URI.escape by delegating to DEFAULT_PARSER. Emits an
    # "obsolete" warning when $VERBOSE is set.
    #
    # == Usage
    #
    #   require 'uri'
    #
    #   enc_uri = URI.escape("http://example.com/?a=\11\15")
    #   p enc_uri
    #   # => "http://example.com/?a=%09%0D"
    #
    #   p URI.unescape(enc_uri)
    #   # => "http://example.com/?a=\t\r"
    #
    def unescape(*arg)
      if $VERBOSE
        warn "#{caller(1)[0]}: warning: URI.unescape is obsolete"
      end
      DEFAULT_PARSER.unescape(*arg)
    end
    alias_method :decode, :unescape
  end
  # Make escape/unescape (and their aliases) callable on the module itself,
  # and mix the REGEXP constants into this namespace.
  extend Escape
  include REGEXP

  # Registry consulted by Parser#parse: it looks up the upcased scheme name
  # here and instantiates the value it finds.
  @@schemes = {}

  # Returns the scheme registry hash itself, not a copy.
  def self.scheme_list
    @@schemes
  end
  #
  # Base class for all URI exceptions.
  #
  class Error < StandardError
  end

  #
  # Not a URI.
  #
  class InvalidURIError < Error
  end

  #
  # Not a URI component.
  #
  class InvalidComponentError < Error
  end

  #
  # URI is valid, bad usage is not.
  #
  class BadURIError < Error
  end
  #
  # == Synopsis
  #
  #   URI::split(uri)
  #
  # == Args
  #
  # +uri+::
  #   String with URI.
  #
  # == Description
  #
  # Splits the string into the following parts, using DEFAULT_PARSER, and
  # returns them as an array in this order:
  #
  # * Scheme
  # * Userinfo
  # * Host
  # * Port
  # * Registry
  # * Path
  # * Opaque
  # * Query
  # * Fragment
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   p URI.split("http://www.ruby-lang.org/")
  #   # => ["http", nil, "www.ruby-lang.org", nil, nil, "/", nil, nil, nil]
  #
  def self.split(uri)
    parser = DEFAULT_PARSER
    parser.split(uri)
  end
  #
  # == Synopsis
  #
  #   URI::parse(uri_str)
  #
  # == Args
  #
  # +uri_str+::
  #   String with URI.
  #
  # == Description
  #
  # Creates an instance of one of URI's subclasses from the string, using
  # DEFAULT_PARSER.
  #
  # == Raises
  #
  # URI::InvalidURIError
  #   Raised if the given URI is not a correct one.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   uri = URI.parse("http://www.ruby-lang.org/")
  #   p uri
  #   # => #<URI::HTTP:0x202281be URL:http://www.ruby-lang.org/>
  #   p uri.scheme
  #   # => "http"
  #   p uri.host
  #   # => "www.ruby-lang.org"
  #
  def self.parse(uri)
    parser = DEFAULT_PARSER
    parser.parse(uri)
  end
  #
  # == Synopsis
  #
  #   URI::join(str[, str, ...])
  #
  # == Args
  #
  # +str+::
  #   String(s) to work with.
  #
  # == Description
  #
  # Joins URIs, using DEFAULT_PARSER: the first string is parsed and each
  # following string is merged into the result.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   p URI.join("http://localhost/","main.rbx")
  #   # => #<URI::HTTP:0x2022ac02 URL:http://localhost/main.rbx>
  #
  def self.join(*str)
    parser = DEFAULT_PARSER
    parser.join(*str)
  end
  #
  # == Synopsis
  #
  #   URI::extract(str[, schemes][,&blk])
  #
  # == Args
  #
  # +str+::
  #   String to extract URIs from.
  # +schemes+::
  #   Limits URI matching to the given schemes.
  #
  # == Description
  #
  # Extracts URIs from a string, using DEFAULT_PARSER. If a block is given,
  # it is called with each matched URI and +nil+ is returned; otherwise an
  # array of the matches is returned.
  #
  # == Usage
  #
  #   require "uri"
  #
  #   URI.extract("text here http://foo.example.org/bla and here mailto:test@example.com and here also.")
  #   # => ["http://foo.example.org/bla", "mailto:test@example.com"]
  #
  def self.extract(str, schemes = nil, &block)
    parser = DEFAULT_PARSER
    parser.extract(str, schemes, &block)
  end
  #
  # == Synopsis
  #
  #   URI::regexp([match_schemes])
  #
  # == Args
  #
  # +match_schemes+::
  #   Array of schemes. If given, the resulting regexp matches only URIs
  #   whose scheme is one of the match_schemes.
  #
  # == Description
  #
  # Returns a Regexp object, obtained from DEFAULT_PARSER, which matches
  # URI-like strings. The Regexp object returned by this method includes an
  # arbitrary number of capture groups (parentheses). Never rely on its
  # number.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   # extract first URI from html_string
  #   html_string.slice(URI.regexp)
  #
  #   # remove ftp URIs
  #   html_string.sub(URI.regexp(['ftp']), '')
  #
  #   # You should not rely on the number of parentheses
  #   html_string.scan(URI.regexp) do |*matches|
  #     p $&
  #   end
  #
  def self.regexp(schemes = nil)
    parser = DEFAULT_PARSER
    parser.make_regexp(schemes)
  end
  # Lookup tables for the www-form codec. They start empty and are filled,
  # then frozen, by the first call to the method that uses them.
  TBLENCWWWCOMP_ = {} # :nodoc:
  TBLDECWWWCOMP_ = {} # :nodoc:
  # Encodings to transcode to UTF-8 before escaping. The list is empty here;
  # the original entries are kept below for reference.
  HTML5ASCIIINCOMPAT = [] # :nodoc:
  # [Encoding::UTF_7, Encoding::UTF_16BE, Encoding::UTF_16LE,
  #  Encoding::UTF_32BE, Encoding::UTF_32LE]

  # Encode given +str+ to URL-encoded form data.
  #
  # This doesn't convert *, -, ., 0-9, A-Z, _, a-z,
  # does convert SP to +, and converts every other byte to %XX.
  # +str+ is converted with to_s first and is not modified.
  #
  # This refers http://www.w3.org/TR/html5/forms.html#url-encoded-form-data
  #
  # See URI.decode_www_form_component, URI.encode_www_form
  def self.encode_www_form_component(str)
    if TBLENCWWWCOMP_.empty?
      table = (0...256).each_with_object({}) do |byte, memo|
        memo[byte.chr] = '%%%02X' % byte
      end
      table[' '] = '+'
      begin
        TBLENCWWWCOMP_.replace(table)
        TBLENCWWWCOMP_.freeze
      rescue
        # Errors while installing the table are deliberately ignored.
      end
    end

    text = str.to_s
    text = if HTML5ASCIIINCOMPAT.include?(text.encoding)
             text.encode("UTF-8") # Encoding::UTF_8
           else
             text.dup
           end
    # Work byte-wise so that each byte outside the safe set is escaped.
    text.force_encoding("ASCII-8BIT") # Encoding::ASCII_8BIT
    text.gsub!(/[^*\-.0-9A-Z_a-z]/, TBLENCWWWCOMP_)
    text.force_encoding("US-ASCII") # Encoding::US_ASCII
  end
  # Decode given +str+ of URL-encoded form data.
  #
  # This decodes + to SP and each %XX to the byte it names, then tags the
  # result with the encoding +enc+ (UTF-8 by default). +str+ is not modified.
  #
  # Raises ArgumentError if +str+ contains a "%" that is not followed by two
  # hex digits.
  #
  # See URI.encode_www_form_component, URI.decode_www_form
  def self.decode_www_form_component(str, enc="UTF-8") #Encoding::UTF_8)
    if TBLDECWWWCOMP_.empty?
      tbl = {}
      256.times do |i|
        h, l = i>>4, i&15
        # Register every upper/lower-case spelling of the two hex digits.
        tbl['%%%X%X' % [h, l]] = i.chr
        tbl['%%%x%X' % [h, l]] = i.chr
        tbl['%%%X%x' % [h, l]] = i.chr
        tbl['%%%x%x' % [h, l]] = i.chr
      end
      tbl['+'] = ' '
      begin
        TBLDECWWWCOMP_.replace(tbl)
        TBLDECWWWCOMP_.freeze
      rescue
        # Errors while installing the table are deliberately ignored.
      end
    end

    # Accepts the same strings as /\A(?:%\h\h|[^%]+)*\z/ but without a
    # quantifier nested inside a quantifier, so rejecting a malformed input
    # such as "aaaa...%" cannot trigger exponential backtracking.
    unless /\A[^%]*(?:%\h\h[^%]*)*\z/ =~ str
      raise ArgumentError, "invalid %-encoding (#{str})"
    end

    # Substitute on a binary copy: the table values for bytes >= 0x80 are
    # binary strings, which cannot be spliced into a string that already
    # holds non-ASCII characters in another encoding.
    bytes = str.dup.force_encoding("ASCII-8BIT")
    bytes.gsub(/\+|%\h\h/, TBLDECWWWCOMP_).force_encoding(enc)
  end
  # Generate URL-encoded form data from given +enum+.
  #
  # This generates application/x-www-form-urlencoded data defined in HTML5
  # from the given object, which is iterated with +each+ and must yield
  # key/value pairs. Pairs are written as "key=value" and joined with "&".
  # An +enum+ that yields nothing produces an empty string.
  #
  # This internally uses URI.encode_www_form_component(str).
  #
  # This doesn't convert the encodings of the given items, so convert them
  # before calling this method if you want to send data in an encoding other
  # than the original one, or in mixed encodings. (Strings in an HTML5
  # ASCII-incompatible encoding are converted to UTF-8.)
  #
  # This doesn't treat files. When you send a file, use multipart/form-data.
  #
  # This refers http://www.w3.org/TR/html5/forms.html#url-encoded-form-data
  #
  # See URI.encode_www_form_component, URI.decode_www_form
  def self.encode_www_form(enum)
    # Start from a fresh, mutable US-ASCII buffer. nil.to_s must not be used
    # as the buffer: it is a frozen string on newer Rubies and cannot be
    # appended to.
    buffer = ''.dup.force_encoding("US-ASCII")
    enum.each do |k, v|
      # Every pair appends at least "=", so a non-empty buffer means a
      # previous pair has already been written.
      buffer << '&' unless buffer.empty?
      buffer << encode_www_form_component(k)
      buffer << '='
      buffer << encode_www_form_component(v)
    end
    buffer
  end
  # One character of a form key or value: a %XX escape, or any character
  # other than "%", "#", "=", ";" and "&".
  WFKV_ = '(?:%\h\h|[^%#=;&])' # :nodoc:

  # Decode URL-encoded form data from given +str+.
  #
  # This decodes application/x-www-form-urlencoded data
  # and returns an array of [key, value] arrays. Pairs may be separated by
  # "&" or ";". An empty +str+ yields an empty array.
  # This internally uses URI.decode_www_form_component.
  #
  # Raises ArgumentError if +str+ is not a sequence of key=value pairs.
  #
  # _charset_ hack is not supported now because the mapping from given charset
  # to Ruby's encoding is not clear yet.
  # see also http://www.w3.org/TR/html5/syntax.html#character-encodings-0
  #
  # This refers http://www.w3.org/TR/html5/forms.html#url-encoded-form-data
  #
  #   ary = URI.decode_www_form("a=1&a=2&b=3")
  #   p ary                  #=> [['a', '1'], ['a', '2'], ['b', '3']]
  #   p ary.assoc('a').last  #=> '1'
  #   p ary.assoc('b').last  #=> '3'
  #   p ary.rassoc('a').last #=> '2'
  #   p Hash[ary]            # => {"a"=>"2", "b"=>"3"}
  #
  # See URI.decode_www_form_component, URI.encode_www_form
  def self.decode_www_form(str, enc="UTF-8") #Encoding::UTF_8)
    return [] if str.empty?

    well_formed = /\A#{WFKV_}*=#{WFKV_}*(?:[;&]#{WFKV_}*=#{WFKV_}*)*\z/o
    unless well_formed =~ str
      raise ArgumentError, "invalid data of application/x-www-form-urlencoded (#{str})"
    end

    # The validator is anchored at both ends, so the whole match is the
    # complete input. Only pairs with a non-empty key are picked up.
    pairs = Regexp.last_match(0).scan(/([^=;&]+)=([^;&]*)/)
    pairs.map do |name, value|
      [decode_www_form_component(name, enc), decode_www_form_component(value, enc)]
    end
  end
  768. end
  module Kernel
    module_function

    # Shorthand for URI.parse(uri_str), callable as URI(str) or
    # Kernel.URI(str).
    #
    # This method is introduced at 1.8.2.
    def URI(uri_str) # :doc:
      URI.parse(uri_str)
    end
  end