PageRenderTime 47ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/ruby/1.8/uri/common.rb

https://github.com/MagLev/maglev
Ruby | 620 lines | 334 code | 37 blank | 249 comment | 15 complexity | ac289fb9cce2e1387b7b25439f917e97 MD5 | raw file
Possible License(s): LGPL-2.1
  1. # = uri/common.rb
  2. #
  3. # Author:: Akira Yamada <akira@ruby-lang.org>
  4. # Revision:: $Id: common.rb 14178 2007-12-10 09:31:55Z matz $
  5. # License::
  6. # You can redistribute it and/or modify it under the same terms as Ruby.
  7. #
  8. module URI
  9. module REGEXP
  10. #
  11. # Patterns used to parse URI's
  12. #
  #
  # String fragments (not Regexp objects) for the grammar productions quoted
  # in the comments below.  Later constants are built by interpolating earlier
  # ones, so the definition order matters.
  #
  # ALPHA, ALNUM, HEX, UNRESERVED and RESERVED hold only the *contents* of a
  # character class; they are always embedded inside "[...]" by the patterns
  # that use them.
  #
  module PATTERN
    # :stopdoc:
    # RFC 2396 (URI Generic Syntax)
    # RFC 2732 (IPv6 Literal Addresses in URL's)
    # RFC 2373 (IPv6 Addressing Architecture)
    # alpha = lowalpha | upalpha
    ALPHA = "a-zA-Z"
    # alphanum = alpha | digit
    ALNUM = "#{ALPHA}\\d"
    # hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
    # "a" | "b" | "c" | "d" | "e" | "f"
    HEX = "a-fA-F\\d"
    # escaped = "%" hex hex
    ESCAPED = "%[#{HEX}]{2}"
    # mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
    # "(" | ")"
    # unreserved = alphanum | mark
    # (the leading "-" is literal because it is first in the character class)
    UNRESERVED = "-_.!~*'()#{ALNUM}"
    # reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    # "$" | ","
    # reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    # "$" | "," | "[" | "]" (RFC 2732)
    RESERVED = ";/?:@&=+$,\\[\\]"
    # uric = reserved | unreserved | escaped
    URIC = "(?:[#{UNRESERVED}#{RESERVED}]|#{ESCAPED})"
    # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
    # "&" | "=" | "+" | "$" | ","
    URIC_NO_SLASH = "(?:[#{UNRESERVED};?:@&=+$,]|#{ESCAPED})"
    # query = *uric
    QUERY = "#{URIC}*"
    # fragment = *uric
    FRAGMENT = "#{URIC}*"
    # domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
    DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
    # toplabel = alpha | alpha *( alphanum | "-" ) alphanum
    TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
    # hostname = *( domainlabel "." ) toplabel [ "." ]
    HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"
    # RFC 2373, APPENDIX B:
    # IPv6address = hexpart [ ":" IPv4address ]
    # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
    # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
    # hexseq = hex4 *( ":" hex4)
    # hex4 = 1*4HEXDIG
    #
    # XXX: This definition has a flaw. "::" + IPv4address must be
    # allowed too. Here is a replacement.
    #
    # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
    # (each group is only checked for length, not for the 0..255 range)
    IPV4ADDR = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
    # hex4 = 1*4HEXDIG
    HEX4 = "[#{HEX}]{1,4}"
    # lastpart = hex4 | IPv4address
    LASTPART = "(?:#{HEX4}|#{IPV4ADDR})"
    # hexseq1 = *( hex4 ":" ) hex4
    HEXSEQ1 = "(?:#{HEX4}:)*#{HEX4}"
    # hexseq2 = *( hex4 ":" ) lastpart
    HEXSEQ2 = "(?:#{HEX4}:)*#{LASTPART}"
    # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
    IPV6ADDR = "(?:#{HEXSEQ2}|(?:#{HEXSEQ1})?::(?:#{HEXSEQ2})?)"
    # IPv6prefix = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
    # unused
    # ipv6reference = "[" IPv6address "]" (RFC 2732)
    IPV6REF = "\\[#{IPV6ADDR}\\]"
    # host = hostname | IPv4address
    # host = hostname | IPv4address | IPv6reference (RFC 2732)
    HOST = "(?:#{HOSTNAME}|#{IPV4ADDR}|#{IPV6REF})"
    # port = *digit
    PORT = '\d*'
    # hostport = host [ ":" port ]
    HOSTPORT = "#{HOST}(?::#{PORT})?"
    # userinfo = *( unreserved | escaped |
    # ";" | ":" | "&" | "=" | "+" | "$" | "," )
    USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})*"
    # pchar = unreserved | escaped |
    # ":" | "@" | "&" | "=" | "+" | "$" | ","
    PCHAR = "(?:[#{UNRESERVED}:@&=+$,]|#{ESCAPED})"
    # param = *pchar
    PARAM = "#{PCHAR}*"
    # segment = *pchar *( ";" param )
    SEGMENT = "#{PCHAR}*(?:;#{PARAM})*"
    # path_segments = segment *( "/" segment )
    PATH_SEGMENTS = "#{SEGMENT}(?:/#{SEGMENT})*"
    # server = [ [ userinfo "@" ] hostport ]
    SERVER = "(?:#{USERINFO}@)?#{HOSTPORT}"
    # reg_name = 1*( unreserved | escaped | "$" | "," |
    # ";" | ":" | "@" | "&" | "=" | "+" )
    REG_NAME = "(?:[#{UNRESERVED}$,;:@&=+]|#{ESCAPED})+"
    # authority = server | reg_name
    AUTHORITY = "(?:#{SERVER}|#{REG_NAME})"
    # rel_segment = 1*( unreserved | escaped |
    # ";" | "@" | "&" | "=" | "+" | "$" | "," )
    REL_SEGMENT = "(?:[#{UNRESERVED};@&=+$,]|#{ESCAPED})+"
    # scheme = alpha *( alpha | digit | "+" | "-" | "." )
    SCHEME = "[#{ALPHA}][-+.#{ALPHA}\\d]*"
    # abs_path = "/" path_segments
    ABS_PATH = "/#{PATH_SEGMENTS}"
    # rel_path = rel_segment [ abs_path ]
    REL_PATH = "#{REL_SEGMENT}(?:#{ABS_PATH})?"
    # net_path = "//" authority [ abs_path ]
    NET_PATH = "//#{AUTHORITY}(?:#{ABS_PATH})?"
    # hier_part = ( net_path | abs_path ) [ "?" query ]
    HIER_PART = "(?:#{NET_PATH}|#{ABS_PATH})(?:\\?(?:#{QUERY}))?"
    # opaque_part = uric_no_slash *uric
    OPAQUE_PART = "#{URIC_NO_SLASH}#{URIC}*"
    # absoluteURI = scheme ":" ( hier_part | opaque_part )
    ABS_URI = "#{SCHEME}:(?:#{HIER_PART}|#{OPAQUE_PART})"
    # relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
    REL_URI = "(?:#{NET_PATH}|#{ABS_PATH}|#{REL_PATH})(?:\\?#{QUERY})?"
    # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
    URI_REF = "(?:#{ABS_URI}|#{REL_URI})?(?:##{FRAGMENT})?"
    # XXX:
    # Capturing variant of an absolute URI reference.  The string spans
    # several lines and carries inline (?# ...) notes, so it only works when
    # compiled with Regexp::EXTENDED (or the /x flag).
    # Capture groups: 1 scheme, 2 opaque, 3 userinfo, 4 host, 5 port,
    # 6 registry, 7 path, 8 query, 9 fragment.
    X_ABS_URI = "
(#{PATTERN::SCHEME}): (?# 1: scheme)
(?:
(#{PATTERN::OPAQUE_PART}) (?# 2: opaque)
|
(?:(?:
//(?:
(?:(?:(#{PATTERN::USERINFO})@)? (?# 3: userinfo)
(?:(#{PATTERN::HOST})(?::(\\d*))?))?(?# 4: host, 5: port)
|
(#{PATTERN::REG_NAME}) (?# 6: registry)
)
|
(?!//)) (?# XXX: '//' is the mark for hostport)
(#{PATTERN::ABS_PATH})? (?# 7: path)
)(?:\\?(#{PATTERN::QUERY}))? (?# 8: query)
)
(?:\\#(#{PATTERN::FRAGMENT}))? (?# 9: fragment)
"
    # Capturing variant of a relative URI reference; like X_ABS_URI it must
    # be compiled in extended mode.
    # Capture groups: 1 userinfo, 2 host, 3 port, 4 registry, 5 rel_segment,
    # 6 abs_path, 7 query, 8 fragment.
    X_REL_URI = "
(?:
(?:
//
(?:
(?:(#{PATTERN::USERINFO})@)? (?# 1: userinfo)
(#{PATTERN::HOST})?(?::(\\d*))? (?# 2: host, 3: port)
|
(#{PATTERN::REG_NAME}) (?# 4: registry)
)
)
|
(#{PATTERN::REL_SEGMENT}) (?# 5: rel_segment)
)?
(#{PATTERN::ABS_PATH})? (?# 6: abs_path)
(?:\\?(#{PATTERN::QUERY}))? (?# 7: query)
(?:\\#(#{PATTERN::FRAGMENT}))? (?# 8: fragment)
"
    # :startdoc:
  end # PATTERN
  164. # :stopdoc:
  # Compiled, frozen Regexp objects built from the PATTERN strings.
  #
  # The third argument 'N' is the Ruby 1.8 "language" argument of Regexp.new
  # and selects no multibyte handling.
  #
  # Anchoring: in Ruby, ^ and $ match at every line boundary, not just at the
  # ends of the string.  Whole-string checks therefore use \A together with
  # \Z or \z; with ^...$ a multi-line value such as "http://a/\nanything"
  # would be accepted on the strength of a single line.

  # for URI::split
  #
  # \Z (rather than \z) still tolerates one trailing newline, as "$" did, so
  # input read with +gets+ keeps parsing; embedded newlines are rejected.
  ABS_URI = Regexp.new('\A' + PATTERN::X_ABS_URI + '\Z',
                       Regexp::EXTENDED, 'N').freeze
  REL_URI = Regexp.new('\A' + PATTERN::X_REL_URI + '\Z',
                       Regexp::EXTENDED, 'N').freeze

  # for URI::extract
  #
  # Deliberately unanchored: these are used to find URIs inside larger text.
  URI_REF = Regexp.new(PATTERN::URI_REF, false, 'N').freeze
  ABS_URI_REF = Regexp.new(PATTERN::X_ABS_URI, Regexp::EXTENDED, 'N').freeze
  REL_URI_REF = Regexp.new(PATTERN::X_REL_URI, Regexp::EXTENDED, 'N').freeze

  # for URI::escape/unescape
  ESCAPED = Regexp.new(PATTERN::ESCAPED, false, 'N').freeze
  UNSAFE = Regexp.new("[^#{PATTERN::UNRESERVED}#{PATTERN::RESERVED}]",
                      false, 'N').freeze

  # for Generic#initialize
  #
  # Component validators are strict (\A ... \z): a component must consist of
  # the production and nothing else, so a value carrying a line break (for
  # example "example.com\nX-Injected: 1" as a host) no longer validates.
  SCHEME = Regexp.new("\\A#{PATTERN::SCHEME}\\z", false, 'N').freeze
  USERINFO = Regexp.new("\\A#{PATTERN::USERINFO}\\z", false, 'N').freeze
  HOST = Regexp.new("\\A#{PATTERN::HOST}\\z", false, 'N').freeze
  PORT = Regexp.new("\\A#{PATTERN::PORT}\\z", false, 'N').freeze
  OPAQUE = Regexp.new("\\A#{PATTERN::OPAQUE_PART}\\z", false, 'N').freeze
  REGISTRY = Regexp.new("\\A#{PATTERN::REG_NAME}\\z", false, 'N').freeze
  ABS_PATH = Regexp.new("\\A#{PATTERN::ABS_PATH}\\z", false, 'N').freeze
  REL_PATH = Regexp.new("\\A#{PATTERN::REL_PATH}\\z", false, 'N').freeze
  QUERY = Regexp.new("\\A#{PATTERN::QUERY}\\z", false, 'N').freeze
  FRAGMENT = Regexp.new("\\A#{PATTERN::FRAGMENT}\\z", false, 'N').freeze
  189. # :startdoc:
  190. end # REGEXP
  module Util # :nodoc:
    # Builds the Hash of components used to construct an instance of +klass+.
    #
    # +array_hash+ is either
    # * an Array holding one value for every entry of <tt>klass.component</tt>
    #   except the first, in the same order, or
    # * a Hash, whose key/value pairs are taken over as they are.
    #
    # Values are cloned where possible.  The :scheme entry is always set to
    # the lower-cased last segment of the class name.
    #
    # Raises ArgumentError for anything else (including an Array of the
    # wrong length).
    def make_components_hash(klass, array_hash)
      # Values that refuse to be cloned (TypeError) are stored as they are.
      copy = lambda do |value|
        begin
          value.clone
        rescue TypeError
          value
        end
      end

      components = {}
      if array_hash._isArray &&
         array_hash.size == klass.component.size - 1
        klass.component[1..-1].each_with_index do |name, index|
          components[name] = copy.call(array_hash[index])
        end
      elsif array_hash._isHash
        array_hash.each { |key, value| components[key] = copy.call(value) }
      else
        raise ArgumentError,
          "expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})"
      end

      components[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase
      components
    end
    module_function :make_components_hash
  end
  220. module Escape
  221. include REGEXP
  222. #
  223. # == Synopsis
  224. #
  225. # URI.escape(str [, unsafe])
  226. #
  227. # == Args
  228. #
  229. # +str+::
  230. # String in which unsafe characters are replaced.
  231. # +unsafe+::
  232. # Regexp that matches all symbols that must be replaced with codes.
  233. # By default uses <tt>REGEXP::UNSAFE</tt>.
  234. # When this argument is a String, it represents a character set.
  235. #
  236. # == Description
  237. #
  238. # Escapes the string, replacing all unsafe characters with codes.
  239. #
  240. # == Usage
  241. #
  242. # require 'uri'
  243. #
  244. # enc_uri = URI.escape("http://example.com/?a=\11\15")
  245. # p enc_uri
  246. # # => "http://example.com/?a=%09%0D"
  247. #
  248. # p URI.unescape(enc_uri)
  249. # # => "http://example.com/?a=\t\r"
  250. #
  251. # p URI.escape("@?@!", "!?")
  252. # # => "@%3F@%21"
  253. #
  def escape(str, unsafe = UNSAFE)
    # Anything that is not already a Regexp is taken to be a String listing
    # the characters to escape and is turned into a character class.
    pattern = if unsafe._isRegexp
                unsafe
              else
                Regexp.new("[#{Regexp.quote(unsafe)}]", false, 'N')
              end

    # Every byte of a match becomes its own "%XX" (upper-case hex) triplet.
    str.gsub(pattern) do |match|
      match.unpack('C*').map { |byte| sprintf('%%%02X', byte) }.join
    end
  end
  alias encode escape
  268. #
  269. # == Synopsis
  270. #
  271. # URI.unescape(str)
  272. #
  273. # == Args
  274. #
  275. # +str+::
  276. # String to unescape.
  277. #
  278. # == Usage
  279. #
  280. # require 'uri'
  281. #
  282. # enc_uri = URI.escape("http://example.com/?a=\11\15")
  283. # p enc_uri
  284. # # => "http://example.com/?a=%09%0D"
  285. #
  286. # p URI.unescape(enc_uri)
  287. # # => "http://example.com/?a=\t\r"
  288. #
  def unescape(str)
    # Each match is "%" plus two hex digits; drop the "%" and turn the two
    # digits into the character with that code.
    str.gsub(ESCAPED) { |sequence| sequence[1, 2].hex.chr }
  end
  alias decode unescape
  295. end
  # Make the REGEXP constants (ABS_URI, REL_URI, ABS_URI_REF, ...) resolvable
  # without qualification inside URI, e.g. in URI.split and URI.regexp.
  include REGEXP
  # Expose the Escape methods on the module itself: URI.escape / URI.encode
  # and URI.unescape / URI.decode.
  extend Escape
  # Scheme registry consulted by URI.parse: looked up with the upper-cased
  # scheme name, and the stored value is instantiated with .new.  It starts
  # out empty here; presumably the scheme-specific files register
  # themselves -- confirm against those files.
  @@schemes = {}
  #
  # Root of the URI exception hierarchy; rescue this to catch any of the
  # errors below.
  #
  class Error < StandardError
  end

  #
  # The given string is not a URI.
  #
  class InvalidURIError < Error
  end

  #
  # The given value is not a valid URI component.
  #
  class InvalidComponentError < Error
  end

  #
  # The URI itself is valid, but it was used in a way that is not.
  #
  class BadURIError < Error
  end
  315. #
  316. # == Synopsis
  317. #
  318. # URI::split(uri)
  319. #
  320. # == Args
  321. #
  322. # +uri+::
  323. # String with URI.
  324. #
  325. # == Description
  326. #
  327. # Splits the string on following parts and returns array with result:
  328. #
  329. # * Scheme
  330. # * Userinfo
  331. # * Host
  332. # * Port
  333. # * Registry
  334. # * Path
  335. # * Opaque
  336. # * Query
  337. # * Fragment
  338. #
  339. # == Usage
  340. #
  341. # require 'uri'
  342. #
  343. # p URI.split("http://www.ruby-lang.org/")
  344. # # => ["http", nil, "www.ruby-lang.org", nil, nil, "/", nil, nil, nil]
  345. #
  def self.split(uri)
    scheme = userinfo = host = port = registry = nil
    path = opaque = query = fragment = nil

    case uri
    when ''
      # Null URI: every component stays nil (path is defaulted below).
    when ABS_URI
      # absoluteURI = scheme ":" ( hier_part | opaque_part )
      # hier_part   = ( net_path | abs_path ) [ "?" query ]
      # net_path    = "//" authority [ abs_path ]
      # authority   = server | reg_name
      # server      = [ [ userinfo "@" ] hostport ]
      #
      # Note the capture order: opaque comes second in the match but is
      # returned after path.
      scheme, opaque, userinfo, host, port,
        registry, path, query, fragment = $~.captures

      unless scheme
        raise InvalidURIError,
          "bad URI(absolute but no scheme): #{uri}"
      end
      unless opaque || path || host || registry
        raise InvalidURIError,
          "bad URI(absolute but no path): #{uri}"
      end
    when REL_URI
      # relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
      # rel_path    = rel_segment [ abs_path ]
      userinfo, host, port, registry,
        rel_segment, abs_path, query, fragment = $~.captures

      # The path is whatever was matched of rel_segment followed by abs_path.
      path = if rel_segment && abs_path
               rel_segment + abs_path
             else
               rel_segment || abs_path
             end
    else
      raise InvalidURIError, "bad URI(is not URI?): #{uri}"
    end

    # A hierarchical URI always has a path, possibly empty
    # (see RFC2396 Section 5.2); an opaque one has none.
    path ||= '' unless opaque

    [scheme, userinfo, host, port, registry, path, opaque, query, fragment]
  end
  403. #
  404. # == Synopsis
  405. #
  406. # URI::parse(uri_str)
  407. #
  408. # == Args
  409. #
  410. # +uri_str+::
  411. # String with URI.
  412. #
  413. # == Description
  414. #
  415. # Creates one of the URI's subclasses instance from the string.
  416. #
  417. # == Raises
  418. #
  419. # URI::InvalidURIError
  420. # Raised if URI given is not a correct one.
  421. #
  422. # == Usage
  423. #
  424. # require 'uri'
  425. #
  426. # uri = URI.parse("http://www.ruby-lang.org/")
  427. # p uri
  428. # # => #<URI::HTTP:0x202281be URL:http://www.ruby-lang.org/>
  429. # p uri.scheme
  430. # # => "http"
  431. # p uri.host
  432. # # => "www.ruby-lang.org"
  433. #
  def self.parse(uri)
    scheme, userinfo, host, port,
      registry, path, opaque, query, fragment = self.split(uri)

    # Schemes are registered under their upper-case name; anything that is
    # not registered (or has no scheme at all) becomes a Generic URI.
    key = scheme && scheme.upcase
    klass = (key && @@schemes.include?(key)) ? @@schemes[key] : Generic

    klass.new(scheme, userinfo, host, port,
              registry, path, opaque, query,
              fragment)
  end
  447. #
  448. # == Synopsis
  449. #
  450. # URI::join(str[, str, ...])
  451. #
  452. # == Args
  453. #
  454. # +str+::
  455. # String(s) to work with
  456. #
  457. # == Description
  458. #
  459. # Joins URIs.
  460. #
  461. # == Usage
  462. #
  463. # require 'uri'
  464. #
  465. # p URI.join("http://localhost/","main.rbx")
  466. # # => #<URI::HTTP:0x2022ac02 URL:http://localhost/main.rbx>
  467. #
  def self.join(*str)
    # The first argument is the base: used as it is when it already is a
    # URI, otherwise coerced to a String and parsed.
    base = str[0]
    unless base._is_a?(URI)
      base = self.parse(Type.coerce_to(base, String, :to_str))
    end

    # Merge each remaining reference, in order, into the running result.
    str[1 .. -1].inject(base) { |merged, reference| merged.merge(reference) }
  end
  479. #
  480. # == Synopsis
  481. #
  482. # URI::extract(str[, schemes][,&blk])
  483. #
  484. # == Args
  485. #
  486. # +str+::
  487. # String to extract URIs from.
  488. # +schemes+::
  489. # Limit URI matching to the given schemes.
  490. #
  491. # == Description
  492. #
  493. # Extracts URIs from a string. If block given, iterates through all matched URIs.
  494. # Returns nil if block given or array with matches.
  495. #
  496. # == Usage
  497. #
  498. # require "uri"
  499. #
  500. # URI.extract("text here http://foo.example.org/bla and here mailto:test@example.com and here also.")
  501. # # => ["http://foo.example.org/bla", "mailto:test@example.com"]
  502. #
  def self.extract(str, schemes = nil, &block)
    pattern = regexp(schemes)

    # With a block: hand every matched URI to the block and return nil.
    if block_given?
      str.scan(pattern) { yield $& }
      return nil
    end

    # Without a block: collect the matched URIs, in order of appearance.
    matches = []
    str.scan(pattern) { matches << $& }
    matches
  end
  513. #
  514. # == Synopsis
  515. #
  516. # URI::regexp([match_schemes])
  517. #
  518. # == Args
  519. #
  520. # +match_schemes+::
  521. # Array of schemes. If given, resulting regexp matches to URIs
  522. # whose scheme is one of the match_schemes.
  523. #
  524. # == Description
  525. # Returns a Regexp object which matches to URI-like strings.
  526. # The Regexp object returned by this method includes an arbitrary
  527. # number of capture groups (parentheses). Never rely on their number.
  528. #
  529. # == Usage
  530. #
  531. # require 'uri'
  532. #
  533. # # extract first URI from html_string
  534. # html_string.slice(URI.regexp)
  535. #
  536. # # remove ftp URIs
  537. # html_string.sub(URI.regexp(['ftp']), '')
  538. #
  539. # # You should not rely on the number of parentheses
  540. # html_string.scan(URI.regexp) do |*matches|
  541. # p $&
  542. # end
  543. #
  def self.regexp(schemes = nil)
    # Without a scheme filter the precompiled pattern is returned.
    return ABS_URI_REF unless schemes

    # Otherwise a look-ahead for "<scheme>:" restricts the absolute-URI
    # pattern to matches that start with one of the given schemes.
    /(?=#{Regexp.union(*schemes)}:)#{PATTERN::X_ABS_URI}/xn
  end
  551. end
  module Kernel
    # Shorthand for URI.parse.
    #
    # An argument that already is a URI is returned unchanged; anything else
    # is coerced to a String (via to_str) and parsed.
    #
    # This method is introduced at 1.8.2.
    def URI(uri_str) # :doc:
      return uri_str if uri_str._is_a?(URI)

      URI.parse(Type.coerce_to(uri_str, String, :to_str))
    end
    module_function :URI
  end