PageRenderTime 138ms CodeModel.GetById 36ms RepoModel.GetById 6ms app.codeStats 0ms

/sources/ruby-1.8.5/lib/uri/common.rb

http://rubyworks.googlecode.com/
Ruby | 607 lines | 325 code | 37 blank | 245 comment | 14 complexity | 20fea1d815764b5380e8a0ee90f9f764 MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, LGPL-2.1, AGPL-3.0, 0BSD, Unlicense
  1. # = uri/common.rb
  2. #
  3. # Author:: Akira Yamada <akira@ruby-lang.org>
  4. # Revision:: $Id: common.rb,v 1.11.2.7 2005/06/24 04:15:20 akira Exp $
  5. # License::
  6. # You can redistribute it and/or modify it under the same term as Ruby.
  7. #
  8. module URI
  #
  # Regular expressions (and the pattern strings they are built from) used
  # to split, validate, extract and escape URIs.
  #
  module REGEXP
    #
    # Patterns used to parse URI's
    #
    # Every constant is a String holding a regular-expression fragment that
    # transcribes the grammar rule quoted in the comment above it.
    #
    module PATTERN
      # :stopdoc:

      # RFC 2396 (URI Generic Syntax)
      # RFC 2732 (IPv6 Literal Addresses in URL's)
      # RFC 2373 (IPv6 Addressing Architecture)

      # alpha         = lowalpha | upalpha
      ALPHA = "a-zA-Z"
      # alphanum      = alpha | digit
      ALNUM = "#{ALPHA}\\d"

      # hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
      #                         "a" | "b" | "c" | "d" | "e" | "f"
      HEX = "a-fA-F\\d"
      # escaped       = "%" hex hex
      ESCAPED = "%[#{HEX}]{2}"
      # mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
      #                 "(" | ")"
      # unreserved    = alphanum | mark
      UNRESERVED = "-_.!~*'()#{ALNUM}"
      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #                 "$" | ","
      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #                 "$" | "," | "[" | "]" (RFC 2732)
      RESERVED = ";/?:@&=+$,\\[\\]"

      # uric          = reserved | unreserved | escaped
      URIC = "(?:[#{UNRESERVED}#{RESERVED}]|#{ESCAPED})"
      # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
      #                 "&" | "=" | "+" | "$" | ","
      URIC_NO_SLASH = "(?:[#{UNRESERVED};?:@&=+$,]|#{ESCAPED})"
      # query         = *uric
      QUERY = "#{URIC}*"
      # fragment      = *uric
      FRAGMENT = "#{URIC}*"

      # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
      DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
      # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
      TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
      # hostname      = *( domainlabel "." ) toplabel [ "." ]
      HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"

      # RFC 2373, APPENDIX B:
      # IPv6address = hexpart [ ":" IPv4address ]
      # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
      # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
      # hexseq  = hex4 *( ":" hex4)
      # hex4    = 1*4HEXDIG
      #
      # XXX: This definition has a flaw. "::" + IPv4address must be
      # allowed too.  Here is a replacement.
      #
      # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
      IPV4ADDR = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
      # hex4     = 1*4HEXDIG
      HEX4 = "[#{HEX}]{1,4}"
      # lastpart = hex4 | IPv4address
      LASTPART = "(?:#{HEX4}|#{IPV4ADDR})"
      # hexseq1  = *( hex4 ":" ) hex4
      HEXSEQ1 = "(?:#{HEX4}:)*#{HEX4}"
      # hexseq2  = *( hex4 ":" ) lastpart
      HEXSEQ2 = "(?:#{HEX4}:)*#{LASTPART}"
      # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
      IPV6ADDR = "(?:#{HEXSEQ2}|(?:#{HEXSEQ1})?::(?:#{HEXSEQ2})?)"

      # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
      # unused

      # ipv6reference = "[" IPv6address "]" (RFC 2732)
      IPV6REF = "\\[#{IPV6ADDR}\\]"

      # host          = hostname | IPv4address
      # host          = hostname | IPv4address | IPv6reference (RFC 2732)
      HOST = "(?:#{HOSTNAME}|#{IPV4ADDR}|#{IPV6REF})"
      # port          = *digit
      PORT = '\d*'
      # hostport      = host [ ":" port ]
      HOSTPORT = "#{HOST}(?::#{PORT})?"

      # userinfo      = *( unreserved | escaped |
      #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
      USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})*"

      # pchar         = unreserved | escaped |
      #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
      PCHAR = "(?:[#{UNRESERVED}:@&=+$,]|#{ESCAPED})"
      # param         = *pchar
      PARAM = "#{PCHAR}*"
      # segment       = *pchar *( ";" param )
      SEGMENT = "#{PCHAR}*(?:;#{PARAM})*"
      # path_segments = segment *( "/" segment )
      PATH_SEGMENTS = "#{SEGMENT}(?:/#{SEGMENT})*"

      # server        = [ [ userinfo "@" ] hostport ]
      SERVER = "(?:#{USERINFO}@)?#{HOSTPORT}"
      # reg_name      = 1*( unreserved | escaped | "$" | "," |
      #                     ";" | ":" | "@" | "&" | "=" | "+" )
      REG_NAME = "(?:[#{UNRESERVED}$,;:@&=+]|#{ESCAPED})+"
      # authority     = server | reg_name
      AUTHORITY = "(?:#{SERVER}|#{REG_NAME})"

      # rel_segment   = 1*( unreserved | escaped |
      #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
      REL_SEGMENT = "(?:[#{UNRESERVED};@&=+$,]|#{ESCAPED})+"

      # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
      SCHEME = "[#{ALPHA}][-+.#{ALPHA}\\d]*"

      # abs_path      = "/"  path_segments
      ABS_PATH = "/#{PATH_SEGMENTS}"
      # rel_path      = rel_segment [ abs_path ]
      REL_PATH = "#{REL_SEGMENT}(?:#{ABS_PATH})?"
      # net_path      = "//" authority [ abs_path ]
      NET_PATH = "//#{AUTHORITY}(?:#{ABS_PATH})?"

      # hier_part     = ( net_path | abs_path ) [ "?" query ]
      HIER_PART = "(?:#{NET_PATH}|#{ABS_PATH})(?:\\?(?:#{QUERY}))?"
      # opaque_part   = uric_no_slash *uric
      OPAQUE_PART = "#{URIC_NO_SLASH}#{URIC}*"

      # absoluteURI   = scheme ":" ( hier_part | opaque_part )
      ABS_URI = "#{SCHEME}:(?:#{HIER_PART}|#{OPAQUE_PART})"
      # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
      REL_URI = "(?:#{NET_PATH}|#{ABS_PATH}|#{REL_PATH})(?:\\?#{QUERY})?"

      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
      URI_REF = "(?:#{ABS_URI}|#{REL_URI})?(?:##{FRAGMENT})?"

      # XXX:
      # Capturing variants of the absolute / relative URI grammar.  They are
      # only ever compiled with Regexp::EXTENDED, so the layout whitespace
      # and the (?# ...) groups are ignored by the regexp engine; the group
      # numbers in the comments are the capture indexes.
      X_ABS_URI = "
        (#{PATTERN::SCHEME}):                     (?# 1: scheme)
        (?:
           (#{PATTERN::OPAQUE_PART})              (?# 2: opaque)
        |
           (?:(?:
             //(?:
                 (?:(?:(#{PATTERN::USERINFO})@)?  (?# 3: userinfo)
                   (?:(#{PATTERN::HOST})(?::(\\d*))?))?(?# 4: host, 5: port)
               |
                 (#{PATTERN::REG_NAME})           (?# 6: registry)
               )
             |
             (?!//))                              (?# XXX: '//' is the mark for hostport)
             (#{PATTERN::ABS_PATH})?              (?# 7: path)
           )(?:\\?(#{PATTERN::QUERY}))?           (?# 8: query)
        )
        (?:\\#(#{PATTERN::FRAGMENT}))?            (?# 9: fragment)
      "
      X_REL_URI = "
        (?:
          (?:
            //
            (?:
              (?:(#{PATTERN::USERINFO})@)?        (?# 1: userinfo)
                (#{PATTERN::HOST})?(?::(\\d*))?   (?# 2: host, 3: port)
            |
              (#{PATTERN::REG_NAME})              (?# 4: registry)
            )
          )
        |
          (#{PATTERN::REL_SEGMENT})               (?# 5: rel_segment)
        )?
        (#{PATTERN::ABS_PATH})?                   (?# 6: abs_path)
        (?:\\?(#{PATTERN::QUERY}))?               (?# 7: query)
        (?:\\#(#{PATTERN::FRAGMENT}))?            (?# 8: fragment)
      "
      # :startdoc:
    end # PATTERN

    # :stopdoc:

    # NOTE: the whole-string regexps below are anchored with \A and \z.
    # In Ruby ^ and $ match at every line boundary, so with ^...$ a
    # multi-line string was accepted as soon as any single line of it
    # was valid.

    # for URI::split
    # Surrounding whitespace (e.g. the newline left by IO#gets) is tolerated.
    ABS_URI = /\A\s*#{PATTERN::X_ABS_URI}\s*\z/xn.freeze
    REL_URI = /\A\s*#{PATTERN::X_REL_URI}\s*\z/xn.freeze

    # for URI::extract
    # Deliberately unanchored: these search for URIs inside larger text.
    URI_REF     = /#{PATTERN::URI_REF}/n.freeze
    ABS_URI_REF = /#{PATTERN::X_ABS_URI}/xn.freeze
    REL_URI_REF = /#{PATTERN::X_REL_URI}/xn.freeze

    # for URI::escape/unescape
    ESCAPED = /#{PATTERN::ESCAPED}/n.freeze
    UNSAFE  = /[^#{PATTERN::UNRESERVED}#{PATTERN::RESERVED}]/n.freeze

    # for Generic#initialize
    SCHEME   = /\A#{PATTERN::SCHEME}\z/n.freeze
    USERINFO = /\A#{PATTERN::USERINFO}\z/n.freeze
    HOST     = /\A#{PATTERN::HOST}\z/n.freeze
    PORT     = /\A#{PATTERN::PORT}\z/n.freeze
    OPAQUE   = /\A#{PATTERN::OPAQUE_PART}\z/n.freeze
    REGISTRY = /\A#{PATTERN::REG_NAME}\z/n.freeze
    ABS_PATH = /\A#{PATTERN::ABS_PATH}\z/n.freeze
    REL_PATH = /\A#{PATTERN::REL_PATH}\z/n.freeze
    QUERY    = /\A#{PATTERN::QUERY}\z/n.freeze
    FRAGMENT = /\A#{PATTERN::FRAGMENT}\z/n.freeze
    # :startdoc:
  end # REGEXP
  module Util # :nodoc:
    # Builds the component Hash for +klass+ from +array_hash+.
    #
    # +array_hash+ is either
    # * an Array holding one value for each entry of
    #   <tt>klass.component</tt> except the first, in the same order, or
    # * a Hash, whose pairs are copied as they are.
    #
    # Values are cloned when they allow it; a value whose #clone raises
    # TypeError is stored uncloned.  The :scheme entry is always set to the
    # lower-cased last segment of the class name.
    #
    # Raises ArgumentError for any other argument, including an Array of
    # the wrong length.
    def make_components_hash(klass, array_hash)
      # Clone when possible, otherwise hand back the very same object.
      copy = lambda do |value|
        begin
          value.clone
        rescue TypeError
          value
        end
      end

      components = {}
      if array_hash.kind_of?(Array) &&
          array_hash.size == klass.component.size - 1
        # Positional form: pair each value with the component after :scheme.
        klass.component[1..-1].each_with_index do |name, idx|
          components[name] = copy.call(array_hash[idx])
        end
      elsif array_hash.kind_of?(Hash)
        array_hash.each do |key, value|
          components[key] = copy.call(value)
        end
      else
        raise ArgumentError,
          "expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})"
      end

      # Strip the namespace: "URI::HTTP" gives "http".
      components[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase
      components
    end
    module_function :make_components_hash
  end
  220. module Escape
  221. include REGEXP
  222. #
  223. # == Synopsis
  224. #
  225. # URI.escape(str [, unsafe])
  226. #
  227. # == Args
  228. #
  229. # +str+::
  230. # String to replaces in.
  231. # +unsafe+::
  232. # Regexp that matches all symbols that must be replaced with codes.
  233. # By default uses <tt>REGEXP::UNSAFE</tt>.
  234. #
  235. # == Description
  236. #
  237. # Escapes the string, replacing all unsafe characters with codes.
  238. #
  239. # == Usage
  240. #
  241. # require 'uri'
  242. #
  243. # enc_uri = URI.escape("http://example.com/?a=\11\15")
  244. # p enc_uri
  245. # # => "http://example.com/?a=%09%0D"
  246. #
  247. # p URI.unescape(enc_uri)
  248. # # => "http://example.com/?a=\t\r"
  249. #
  250. def escape(str, unsafe = UNSAFE)
  251. unless unsafe.kind_of?(Regexp)
  252. # perhaps unsafe is String object
  253. unsafe = Regexp.new(Regexp.quote(unsafe), false, 'N')
  254. end
  255. str.gsub(unsafe) do |us|
  256. tmp = ''
  257. us.each_byte do |uc|
  258. tmp << sprintf('%%%02X', uc)
  259. end
  260. tmp
  261. end
  262. end
  263. alias encode escape
  264. #
  265. # == Synopsis
  266. #
  267. # URI.unescape(str)
  268. #
  269. # == Args
  270. #
  271. # +str+::
  272. # Unescapes the string.
  273. #
  274. # == Usage
  275. #
  276. # require 'uri'
  277. #
  278. # enc_uri = URI.escape("http://example.com/?a=\11\15")
  279. # p enc_uri
  280. # # => "http://example.com/?a=%09%0D"
  281. #
  282. # p URI.unescape(enc_uri)
  283. # # => "http://example.com/?a=\t\r"
  284. #
  285. def unescape(str)
  286. str.gsub(ESCAPED) do
  287. $&[1,2].hex.chr
  288. end
  289. end
  290. alias decode unescape
  291. end
  292. include REGEXP
  293. extend Escape
  294. @@schemes = {}
  #
  # Common ancestor of every URI exception.
  #
  class Error < StandardError
  end

  #
  # The given string is not a URI.
  #
  class InvalidURIError < Error
  end

  #
  # The given value is not a URI component.
  #
  class InvalidComponentError < Error
  end

  #
  # The URI itself is valid, but the way it is being used is not.
  #
  class BadURIError < Error
  end
  #
  # == Synopsis
  #
  #   URI::split(uri)
  #
  # == Args
  #
  # +uri+::
  #   String with URI.
  #
  # == Description
  #
  # Breaks the string into its parts and returns them as a nine-element
  # Array, in this order:
  #
  # * Scheme
  # * Userinfo
  # * Host
  # * Port
  # * Registry
  # * Path
  # * Opaque
  # * Query
  # * Fragment
  #
  # A part that is absent is +nil+, except that Path is an empty String
  # when the URI has neither a path nor an opaque part.
  #
  # == Raises
  #
  # URI::InvalidURIError
  #   Raised if the string matches neither ABS_URI nor REL_URI.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   p URI.split("http://www.ruby-lang.org/")
  #   # => ["http", nil, "www.ruby-lang.org", nil, nil, "/", nil, nil, nil]
  #
  def self.split(uri)
    scheme = userinfo = host = port = registry = nil
    path = opaque = query = fragment = nil

    case uri
    when ''
      # null uri: every part stays nil

    when ABS_URI
      # absoluteURI = scheme ":" ( hier_part | opaque_part )
      # hier_part   = ( net_path | abs_path ) [ "?" query ]
      # net_path    = "//" authority [ abs_path ]
      # authority   = server | reg_name
      # server      = [ [ userinfo "@" ] hostport ]
      scheme, opaque, userinfo, host, port,
        registry, path, query, fragment = $~.captures

      unless scheme
        raise InvalidURIError,
          "bad URI(absolute but no scheme): #{uri}"
      end
      unless opaque || path || host || registry
        raise InvalidURIError,
          "bad URI(absolute but no path): #{uri}"
      end

    when REL_URI
      # relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
      # rel_path    = rel_segment [ abs_path ]
      userinfo, host, port, registry,
        rel_segment, abs_path, query, fragment = $~.captures

      # The path is whichever of the two pieces are present, glued together.
      path = "#{rel_segment}#{abs_path}" if rel_segment || abs_path

    else
      raise InvalidURIError, "bad URI(is not URI?): #{uri}"
    end

    path = '' unless path || opaque # (see RFC2396 Section 5.2)

    [scheme, userinfo, host, port, registry, path, opaque, query, fragment]
  end
  #
  # == Synopsis
  #
  #   URI::parse(uri_str)
  #
  # == Args
  #
  # +uri_str+::
  #   String with URI.
  #
  # == Description
  #
  # Splits the string with URI::split and builds a URI object from the
  # parts.  The class registered in <tt>@@schemes</tt> under the upper-cased
  # scheme is used; when there is no scheme, or none is registered for it,
  # the object is a URI::Generic.
  #
  # == Raises
  #
  # URI::InvalidURIError
  #   Raised if URI given is not a correct one.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   uri = URI.parse("http://www.ruby-lang.org/")
  #   p uri
  #   # => #<URI::HTTP:0x202281be URL:http://www.ruby-lang.org/>
  #   p uri.scheme
  #   # => "http"
  #   p uri.host
  #   # => "www.ruby-lang.org"
  #
  def self.parse(uri)
    # scheme, userinfo, host, port, registry, path, opaque, query, fragment
    parts = self.split(uri)
    scheme = parts[0]

    klass =
      if scheme && @@schemes.include?(scheme.upcase)
        @@schemes[scheme.upcase]
      else
        Generic
      end

    klass.new(*parts)
  end
  #
  # == Synopsis
  #
  #   URI::join(str[, str, ...])
  #
  # == Args
  #
  # +str+::
  #   String(s) to work with
  #
  # == Description
  #
  # Joins URIs: parses the first string, then calls +merge+ with each of
  # the remaining strings in turn, and returns the final result.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   p URI.join("http://localhost/","main.rbx")
  #   # => #<URI::HTTP:0x2022ac02 URL:http://localhost/main.rbx>
  #
  def self.join(*str)
    # Fold every later reference onto the parsed first URI.
    str[1..-1].inject(self.parse(str[0])) do |base, ref|
      base.merge(ref)
    end
  end
  #
  # == Synopsis
  #
  #   URI::extract(str[, schemes][,&blk])
  #
  # == Args
  #
  # +str+::
  #   String to extract URIs from.
  # +schemes+::
  #   Limit URI matching to the given schemes.
  #
  # == Description
  #
  # Extracts URIs from a string, using the Regexp returned by
  # URI::regexp(schemes).  If a block is given, each matched URI is
  # yielded to it and +nil+ is returned; otherwise an Array with all
  # matches is returned.
  #
  # == Usage
  #
  #   require "uri"
  #
  #   URI.extract("text here http://foo.example.org/bla and here mailto:test@example.com and here also.")
  #   # => ["http://foo.example.org/bla", "mailto:test@example.com"]
  #
  def self.extract(str, schemes = nil, &block)
    pattern = regexp(schemes)

    # The pattern contains capture groups, so scan would hand the groups to
    # the block; $& is the complete match.
    unless block_given?
      found = []
      str.scan(pattern) { found << $& }
      return found
    end

    str.scan(pattern) { yield $& }
    nil
  end
  #
  # == Synopsis
  #
  #   URI::regexp([match_schemes])
  #
  # == Args
  #
  # +match_schemes+::
  #   Array of schemes. If given, the resulting regexp only matches URIs
  #   whose scheme is one of the match_schemes.
  #
  # == Description
  # Returns a Regexp object which matches URI-like strings.
  # The Regexp object returned by this method contains an arbitrary
  # number of capture groups (parentheses).  Never rely on their number.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   # extract first URI from html_string
  #   html_string.slice(URI.regexp)
  #
  #   # remove ftp URIs
  #   html_string.sub(URI.regexp(['ftp']), '')
  #
  #   # You should not rely on the number of parentheses
  #   html_string.scan(URI.regexp) do |*matches|
  #     p $&
  #   end
  #
  def self.regexp(schemes = nil)
    # No restriction requested: hand out the shared, precompiled regexp.
    return ABS_URI_REF unless schemes

    # The look-ahead admits only the listed schemes, followed by ":".
    /(?=#{Regexp.union(*schemes)}:)#{PATTERN::X_ABS_URI}/xn
  end
  543. end
  module Kernel
    # alias for URI.parse.
    #
    # Passes +uri_str+ to URI.parse and returns its result, so any
    # exception raised by URI.parse propagates unchanged.
    #
    # This method is introduced at 1.8.2.
    def URI(uri_str) # :doc:
      URI.parse(uri_str)
    end
    # module_function makes URI() callable as Kernel.URI(...) as well as
    # a private method of every object that mixes in Kernel.
    module_function :URI
  end