PageRenderTime 55ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/uri/rfc2396_parser.rb

http://github.com/ruby/ruby
Ruby | 533 lines | 245 code | 59 blank | 229 comment | 18 complexity | 3c0259235630afc8da1c8be482c7d582 MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause, AGPL-3.0
  1. # frozen_string_literal: false
  2. #--
  3. # = uri/rfc2396_parser.rb
  4. #
  5. # Author:: Akira Yamada <akira@ruby-lang.org>
  6. # License::
  7. # You can redistribute it and/or modify it under the same term as Ruby.
  8. #
  9. # See URI for general documentation
  10. #
  11. module URI
  #
  # Namespace holding URI::REGEXP::PATTERN, the pattern fragments of the
  # RFC 2396 grammar.
  #
  module RFC2396_REGEXP
    #
    # Building blocks used to parse URIs. Every constant is a String holding
    # either the body of a character class or a complete sub-pattern, so it
    # can be interpolated into larger patterns.
    #
    module PATTERN
      # :stopdoc:

      # Grammar sources:
      #   RFC 2396 (URI Generic Syntax)
      #   RFC 2732 (IPv6 Literal Addresses in URL's)
      #   RFC 2373 (IPv6 Addressing Architecture)

      # alpha         = lowalpha | upalpha
      ALPHA = 'a-zA-Z'

      # alphanum      = alpha | digit
      ALNUM = "#{ALPHA}\\d"

      # hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
      #                         "a" | "b" | "c" | "d" | "e" | "f"
      HEX = 'a-fA-F\d'

      # escaped       = "%" hex hex
      ESCAPED = "%[#{HEX}]{2}"

      # mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
      #                 "(" | ")"
      # unreserved    = alphanum | mark
      UNRESERVED = "\\-_.!~*'()#{ALNUM}"

      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #                 "$" | ","
      # RFC 2732 extends the set with "[" and "]":
      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #                 "$" | "," | "[" | "]"
      RESERVED = ';/?:@&=+$,\[\]'

      # Optional remainder of a label: *( alphanum | "-" ) alphanum.
      # Shared by DOMLABEL and TOPLABEL; a local, so no constant is added.
      label_tail = "(?:[-#{ALNUM}]*[#{ALNUM}])?"

      # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
      DOMLABEL = "(?:[#{ALNUM}]#{label_tail})"

      # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
      TOPLABEL = "(?:[#{ALPHA}]#{label_tail})"

      # hostname      = *( domainlabel "." ) toplabel [ "." ]
      HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"

      # :startdoc:
    end # PATTERN

    # :startdoc:
  end # RFC2396_REGEXP
  52. # Class that parses String's into URI's.
  53. #
  54. # It contains a Hash set of patterns and Regexp's that match and validate.
  55. #
  56. class RFC2396_Parser
  57. include RFC2396_REGEXP
  #
  # == Synopsis
  #
  #   URI::Parser.new([opts])
  #
  # == Args
  #
  # +opts+ is a Hash that overrides pattern building blocks: each key is
  # the name of a URI component pattern and each value is the pattern
  # String to use for it. The full set of parsing regexps is generated
  # from these building blocks.
  #
  # Recognized keys:
  #
  # * :ESCAPED (URI::PATTERN::ESCAPED in default)
  # * :UNRESERVED (URI::PATTERN::UNRESERVED in default)
  # * :RESERVED (URI::PATTERN::RESERVED in default)
  # * :DOMLABEL (URI::PATTERN::DOMLABEL in default)
  # * :TOPLABEL (URI::PATTERN::TOPLABEL in default)
  # * :HOSTNAME (URI::PATTERN::HOSTNAME in default)
  #
  # == Examples
  #
  #   p = URI::Parser.new(:ESCAPED => "(?:%[a-fA-F0-9]{2}|%u[a-fA-F0-9]{4})")
  #   u = p.parse("http://example.jp/%uABCD") #=> #<URI::HTTP http://example.jp/%uABCD>
  #   URI.parse(u.to_s) #=> raises URI::InvalidURIError
  #
  #   s = "http://example.com/ABCD"
  #   u1 = p.parse(s) #=> #<URI::HTTP http://example.com/ABCD>
  #   u2 = URI.parse(s) #=> #<URI::HTTP http://example.com/ABCD>
  #   u1 == u2 #=> true
  #   u1.eql?(u2) #=> false
  #
  def initialize(opts = {})
    # Both tables are deep-frozen (every value, then the Hash itself) so a
    # parser cannot be altered after construction. Hash#each_value returns
    # its receiver, which is what allows the calls to be chained.
    @pattern = initialize_pattern(opts).each_value(&:freeze).freeze
    @regexp = initialize_regexp(@pattern).each_value(&:freeze).freeze
  end
  # The Hash of patterns: pattern source Strings keyed by component name,
  # as returned by initialize_pattern. The constructor freezes the Hash
  # and each of its values.
  #
  # See also URI::Parser.initialize_pattern.
  attr_reader :pattern

  # The Hash of Regexp: the Regexp objects compiled from #pattern by
  # initialize_regexp. The constructor freezes the Hash and each of its
  # values.
  #
  # See also URI::Parser.initialize_regexp.
  attr_reader :regexp
  # Splits +uri+ into its components, trying regexp[:ABS_URI] first and
  # regexp[:REL_URI] second.
  #
  # Returns the nine-element Array
  #
  #   [scheme, userinfo, host, port, registry, path, opaque, query, fragment]
  #
  # Components that do not occur in +uri+ are nil, except that +path+ is ''
  # when neither a path nor an opaque part was found.
  #
  # Raises InvalidURIError when +uri+ matches neither regexp, or when the
  # absolute form yields no scheme, or none of opaque, path, host and
  # registry.
  def split(uri)
    scheme = opaque = userinfo = host = port = nil
    registry = path = query = fragment = nil

    case uri
    when ''
      # null uri: nothing to capture, every component stays nil
    when @regexp[:ABS_URI]
      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
      # absoluteURI   = scheme ":" ( hier_part | opaque_part )
      # hier_part     = ( net_path | abs_path ) [ "?" query ]
      # opaque_part   = uric_no_slash *uric
      # abs_path      = "/" path_segments
      # net_path      = "//" authority [ abs_path ]
      # authority     = server | reg_name
      # server        = [ [ userinfo "@" ] hostport ]
      scheme, opaque, userinfo, host, port,
        registry, path, query, fragment = Regexp.last_match.captures

      raise InvalidURIError, "bad URI(absolute but no scheme): #{uri}" unless scheme

      unless opaque || path || host || registry
        raise InvalidURIError, "bad URI(absolute but no path): #{uri}"
      end
    when @regexp[:REL_URI]
      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
      # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
      # net_path      = "//" authority [ abs_path ]
      # abs_path      = "/" path_segments
      # rel_path      = rel_segment [ abs_path ]
      # authority     = server | reg_name
      # server        = [ [ userinfo "@" ] hostport ]
      userinfo, host, port, registry,
        rel_segment, abs_path, query, fragment = Regexp.last_match.captures

      # rel_segment followed by abs_path, whichever of the two are present
      # (nil when both are absent).
      path = [rel_segment, abs_path].compact.reduce(:+)
    else
      raise InvalidURIError, "bad URI(is not URI?): #{uri}"
    end

    path ||= '' unless opaque # (see RFC2396 Section 5.2)

    # userinfo/host/port and registry are the two alternative authority
    # forms; path and opaque are the two alternative scheme-specific forms.
    [scheme, userinfo, host, port, registry, path, opaque, query, fragment]
  end
  #
  # == Args
  #
  # +uri+::
  #    String
  #
  # == Description
  #
  # Splits +uri+ into its components with #split and passes them, followed
  # by this parser, to URI.for. The result is either the matching URI
  # scheme object (File, FTP, HTTP, HTTPS, LDAP, LDAPS, or MailTo) or
  # URI::Generic.
  #
  # == Usage
  #
  #   p = URI::Parser.new
  #   p.parse("ldap://ldap.example.com/dc=example?user=john")
  #   #=> #<URI::LDAP ldap://ldap.example.com/dc=example?user=john>
  #
  def parse(uri)
    components = split(uri)
    URI.for(*components, self)
  end
  #
  # == Args
  #
  # +uris+::
  #    an Array of Strings
  #
  # == Description
  #
  # Attempts to parse and merge a set of URIs: the first argument is
  # converted to a URI object, and each following argument is merged into
  # the running result, left to right.
  #
  def join(*uris)
    first, *rest = uris
    [convert_to_uri(first), *rest].inject(:merge)
  end
  #
  # :call-seq:
  #   extract( str )
  #   extract( str, schemes )
  #   extract( str, schemes ) {|item| block }
  #
  # == Args
  #
  # +str+::
  #    String to search
  # +schemes+::
  #    Patterns to apply to +str+
  #
  # == Description
  #
  # Scans +str+ with the Regexp returned by make_regexp(schemes) and
  # collects the full text of every match.
  # If no +block+ given, then returns the matches as an Array,
  # else it calls +block+ with each match in turn and returns nil.
  #
  # See also URI::Parser.make_regexp.
  #
  def extract(str, schemes = nil)
    matcher = make_regexp(schemes)

    if block_given?
      # The regexp contains capture groups, so scan would yield arrays of
      # captures; the whole match is taken from the last MatchData instead.
      str.scan(matcher) { yield Regexp.last_match(0) }
      return nil
    end

    found = []
    str.scan(matcher) { found << Regexp.last_match(0) }
    found
  end
  # Returns the Regexp used to find absolute URIs inside a String.
  #
  # Without +schemes+ this is self.regexp[:ABS_URI_REF]. With +schemes+ it
  # is self.pattern[:X_ABS_URI] in extended mode, preceded by a lookahead
  # that only lets a match start where one of +schemes+ (combined with
  # Regexp.union) is immediately followed by ":".
  def make_regexp(schemes = nil)
    return @regexp[:ABS_URI_REF] unless schemes

    scheme_alternatives = Regexp.union(*schemes)
    absolute_uri = @pattern[:X_ABS_URI]
    /(?=#{scheme_alternatives}:)#{absolute_uri}/x
  end
  #
  # :call-seq:
  #   escape( str )
  #   escape( str, unsafe )
  #
  # == Args
  #
  # +str+::
  #    String to make safe
  # +unsafe+::
  #    Regexp to apply. Defaults to self.regexp[:UNSAFE]
  #
  # == Description
  #
  # Constructs a safe String from +str+: every match of +unsafe+ is
  # replaced by the "%XX" codes (upper-case hex) of its bytes. The result
  # is tagged as US-ASCII.
  #
  def escape(str, unsafe = @regexp[:UNSAFE])
    unless Regexp === unsafe
      # perhaps unsafe is String object: treat it as the set of characters
      # to escape
      unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false)
    end

    percent_encoded = str.gsub(unsafe) do |match|
      match.each_byte.with_object(+'') do |byte, codes|
        codes << format('%%%02X', byte)
      end
    end
    percent_encoded.force_encoding(Encoding::US_ASCII)
  end
  #
  # :call-seq:
  #   unescape( str )
  #   unescape( str, escaped )
  #
  # == Args
  #
  # +str+::
  #    String to remove escapes from
  # +escaped+::
  #    Regexp to apply. Defaults to self.regexp[:ESCAPED]
  #
  # == Description
  #
  # Removes escapes from +str+: every match of +escaped+ is replaced by the
  # single byte named by the two hex digits that follow its first
  # character.
  #
  def unescape(str, escaped = @regexp[:ESCAPED])
    # Decoded bytes are tagged with the encoding of +str+, except that a
    # US-ASCII input is treated as UTF-8.
    source_encoding = str.encoding
    target = source_encoding == Encoding::US_ASCII ? Encoding::UTF_8 : source_encoding

    str.gsub(escaped) do |sequence|
      hex_digits = sequence[1, 2]
      [hex_digits].pack('H2').force_encoding(target)
    end
  end
  # Unbound Kernel#to_s, looked up once when the class body is evaluated.
  # Held in a constant rather than a class variable: a class variable is
  # shared with every subclass and can collide with a same-named variable
  # defined on an ancestor.
  TO_S = Kernel.instance_method(:to_s) # :nodoc:

  # Returns the Kernel#to_s representation of the parser (class name and
  # object id) rather than an instance-variable dump, which would include
  # the complete pattern and regexp tables.
  def inspect
    TO_S.bind_call(self)
  end
  294. private
  # Constructs the default Hash of patterns.
  #
  # Returns a Hash that maps component names (Symbols such as :SCHEME,
  # :HOST or :X_ABS_URI) to pattern source Strings. Every Regexp of the
  # parser is compiled from these Strings.
  #
  # +opts+ may override the building blocks :ESCAPED, :UNRESERVED,
  # :RESERVED, :DOMLABEL, :TOPLABEL and :HOSTNAME. A missing, nil or false
  # entry falls back to the matching PATTERN constant, except :HOSTNAME,
  # which falls back to the relaxed pattern defined below.
  #
  # +opts+ is only read, never modified. Removing the keys from it would
  # make a second parser built from the same Hash silently lose its
  # overrides, and would raise FrozenError for a frozen Hash.
  def initialize_pattern(opts = {})
    ret = {}
    ret[:ESCAPED] = escaped = opts[:ESCAPED] || PATTERN::ESCAPED
    ret[:UNRESERVED] = unreserved = opts[:UNRESERVED] || PATTERN::UNRESERVED
    ret[:RESERVED] = reserved = opts[:RESERVED] || PATTERN::RESERVED
    # DOMLABEL and TOPLABEL are recorded in the result but are not
    # interpolated into any of the patterns built below.
    ret[:DOMLABEL] = opts[:DOMLABEL] || PATTERN::DOMLABEL
    ret[:TOPLABEL] = opts[:TOPLABEL] || PATTERN::TOPLABEL

    # hostname      = *( domainlabel "." ) toplabel [ "." ]
    # reg-name      = *( unreserved / pct-encoded / sub-delims ) # RFC3986
    ret[:HOSTNAME] = hostname = opts[:HOSTNAME] || "(?:[a-zA-Z0-9\\-.]|%\\h\\h)+"

    # RFC 2396 (URI Generic Syntax)
    # RFC 2732 (IPv6 Literal Addresses in URL's)
    # RFC 2373 (IPv6 Addressing Architecture)

    # uric          = reserved | unreserved | escaped
    ret[:URIC] = uric = "(?:[#{unreserved}#{reserved}]|#{escaped})"
    # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
    #                 "&" | "=" | "+" | "$" | ","
    ret[:URIC_NO_SLASH] = uric_no_slash = "(?:[#{unreserved};?:@&=+$,]|#{escaped})"
    # query         = *uric
    ret[:QUERY] = query = "#{uric}*"
    # fragment      = *uric
    ret[:FRAGMENT] = fragment = "#{uric}*"

    # RFC 2373, APPENDIX B:
    # IPv6address = hexpart [ ":" IPv4address ]
    # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
    # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
    # hexseq  = hex4 *( ":" hex4)
    # hex4    = 1*4HEXDIG
    #
    # XXX: This definition has a flaw. "::" + IPv4address must be
    # allowed too.  Here is a replacement.
    #
    # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
    ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
    # hex4     = 1*4HEXDIG
    hex4 = "[#{PATTERN::HEX}]{1,4}"
    # lastpart = hex4 | IPv4address
    lastpart = "(?:#{hex4}|#{ipv4addr})"
    # hexseq1  = *( hex4 ":" ) hex4
    hexseq1 = "(?:#{hex4}:)*#{hex4}"
    # hexseq2  = *( hex4 ":" ) lastpart
    hexseq2 = "(?:#{hex4}:)*#{lastpart}"
    # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
    ret[:IPV6ADDR] = ipv6addr = "(?:#{hexseq2}|(?:#{hexseq1})?::(?:#{hexseq2})?)"

    # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
    # unused

    # ipv6reference = "[" IPv6address "]" (RFC 2732)
    ret[:IPV6REF] = ipv6ref = "\\[#{ipv6addr}\\]"

    # host          = hostname | IPv4address
    # host          = hostname | IPv4address | IPv6reference (RFC 2732)
    ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})"
    # port          = *digit
    ret[:PORT] = port = '\d*'
    # hostport      = host [ ":" port ]
    ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?"

    # userinfo      = *( unreserved | escaped |
    #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
    ret[:USERINFO] = userinfo = "(?:[#{unreserved};:&=+$,]|#{escaped})*"

    # pchar         = unreserved | escaped |
    #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
    pchar = "(?:[#{unreserved}:@&=+$,]|#{escaped})"
    # param         = *pchar
    param = "#{pchar}*"
    # segment       = *pchar *( ";" param )
    segment = "#{pchar}*(?:;#{param})*"
    # path_segments = segment *( "/" segment )
    ret[:PATH_SEGMENTS] = path_segments = "#{segment}(?:/#{segment})*"

    # server        = [ [ userinfo "@" ] hostport ]
    server = "(?:#{userinfo}@)?#{hostport}"
    # reg_name      = 1*( unreserved | escaped | "$" | "," |
    #                     ";" | ":" | "@" | "&" | "=" | "+" )
    ret[:REG_NAME] = reg_name = "(?:[#{unreserved}$,;:@&=+]|#{escaped})+"
    # authority     = server | reg_name
    authority = "(?:#{server}|#{reg_name})"

    # rel_segment   = 1*( unreserved | escaped |
    #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
    ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+"

    # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
    ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][\\-+.#{PATTERN::ALPHA}\\d]*"

    # abs_path      = "/"  path_segments
    ret[:ABS_PATH] = abs_path = "/#{path_segments}"
    # rel_path      = rel_segment [ abs_path ]
    ret[:REL_PATH] = rel_path = "#{rel_segment}(?:#{abs_path})?"
    # net_path      = "//" authority [ abs_path ]
    ret[:NET_PATH] = net_path = "//#{authority}(?:#{abs_path})?"

    # hier_part     = ( net_path | abs_path ) [ "?" query ]
    ret[:HIER_PART] = hier_part = "(?:#{net_path}|#{abs_path})(?:\\?(?:#{query}))?"
    # opaque_part   = uric_no_slash *uric
    ret[:OPAQUE_PART] = opaque_part = "#{uric_no_slash}#{uric}*"

    # absoluteURI   = scheme ":" ( hier_part | opaque_part )
    ret[:ABS_URI] = abs_uri = "#{scheme}:(?:#{hier_part}|#{opaque_part})"
    # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
    ret[:REL_URI] = rel_uri = "(?:#{net_path}|#{abs_path}|#{rel_path})(?:\\?#{query})?"

    # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
    ret[:URI_REF] = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?"

    # The two X_ patterns are written for Regexp::EXTENDED: the layout
    # whitespace is ignored and the (?# ...) groups are inline comments
    # that number the capture groups.
    ret[:X_ABS_URI] = "
      (#{scheme}):                           (?# 1: scheme)
      (?:
         (#{opaque_part})                    (?# 2: opaque)
      |
         (?:(?:
           //(?:
               (?:(?:(#{userinfo})@)?        (?# 3: userinfo)
                 (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port)
             |
               (#{reg_name})                 (?# 6: registry)
             )
           |
           (?!//))                           (?# XXX: '//' is the mark for hostport)
           (#{abs_path})?                    (?# 7: path)
         )(?:\\?(#{query}))?                 (?# 8: query)
      )
      (?:\\#(#{fragment}))?                  (?# 9: fragment)
    "

    ret[:X_REL_URI] = "
      (?:
        (?:
          //
          (?:
            (?:(#{userinfo})@)?       (?# 1: userinfo)
              (#{host})?(?::(\\d*))?  (?# 2: host, 3: port)
          |
            (#{reg_name})             (?# 4: registry)
          )
        )
      |
        (#{rel_segment})              (?# 5: rel_segment)
      )?
      (#{abs_path})?                  (?# 6: abs_path)
      (?:\\?(#{query}))?              (?# 7: query)
      (?:\\#(#{fragment}))?           (?# 8: fragment)
    "

    ret
  end
  433. # Constructs the default Hash of Regexp's.
  434. def initialize_regexp(pattern)
  435. ret = {}
  436. # for URI::split
  437. ret[:ABS_URI] = Regexp.new('\A\s*' + pattern[:X_ABS_URI] + '\s*\z', Regexp::EXTENDED)
  438. ret[:REL_URI] = Regexp.new('\A\s*' + pattern[:X_REL_URI] + '\s*\z', Regexp::EXTENDED)
  439. # for URI::extract
  440. ret[:URI_REF] = Regexp.new(pattern[:URI_REF])
  441. ret[:ABS_URI_REF] = Regexp.new(pattern[:X_ABS_URI], Regexp::EXTENDED)
  442. ret[:REL_URI_REF] = Regexp.new(pattern[:X_REL_URI], Regexp::EXTENDED)
  443. # for URI::escape/unescape
  444. ret[:ESCAPED] = Regexp.new(pattern[:ESCAPED])
  445. ret[:UNSAFE] = Regexp.new("[^#{pattern[:UNRESERVED]}#{pattern[:RESERVED]}]")
  446. # for Generic#initialize
  447. ret[:SCHEME] = Regexp.new("\\A#{pattern[:SCHEME]}\\z")
  448. ret[:USERINFO] = Regexp.new("\\A#{pattern[:USERINFO]}\\z")
  449. ret[:HOST] = Regexp.new("\\A#{pattern[:HOST]}\\z")
  450. ret[:PORT] = Regexp.new("\\A#{pattern[:PORT]}\\z")
  451. ret[:OPAQUE] = Regexp.new("\\A#{pattern[:OPAQUE_PART]}\\z")
  452. ret[:REGISTRY] = Regexp.new("\\A#{pattern[:REG_NAME]}\\z")
  453. ret[:ABS_PATH] = Regexp.new("\\A#{pattern[:ABS_PATH]}\\z")
  454. ret[:REL_PATH] = Regexp.new("\\A#{pattern[:REL_PATH]}\\z")
  455. ret[:QUERY] = Regexp.new("\\A#{pattern[:QUERY]}\\z")
  456. ret[:FRAGMENT] = Regexp.new("\\A#{pattern[:FRAGMENT]}\\z")
  457. ret
  458. end
  # Coerces +uri+ into a URI object.
  #
  # A URI::Generic (or subclass) instance is returned unchanged. Anything
  # that String.try_convert can turn into a String is parsed with #parse.
  # Any other argument raises ArgumentError.
  def convert_to_uri(uri)
    return uri if uri.is_a?(URI::Generic)

    text = String.try_convert(uri)
    unless text
      raise ArgumentError,
            "bad argument (expected URI object or URI string)"
    end

    parse(text)
  end
  469. end # class Parser
  470. end # module URI