1#--
2# = uri/common.rb
3#
4# Author:: Akira Yamada <akira@ruby-lang.org>
5# Revision:: $Id: common.rb 42355 2013-08-03 13:27:01Z nagachika $
6# License::
7#   You can redistribute it and/or modify it under the same term as Ruby.
8#
9# See URI for general documentation
10#
11
12module URI
13  #
14  # Includes URI::REGEXP::PATTERN
15  #
16  module REGEXP
17    #
18    # Patterns used to parse URI's
19    #
20    module PATTERN
21      # :stopdoc:
22
23      # RFC 2396 (URI Generic Syntax)
24      # RFC 2732 (IPv6 Literal Addresses in URL's)
25      # RFC 2373 (IPv6 Addressing Architecture)
26
27      # alpha         = lowalpha | upalpha
28      ALPHA = "a-zA-Z"
29      # alphanum      = alpha | digit
30      ALNUM = "#{ALPHA}\\d"
31
32      # hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
33      #                         "a" | "b" | "c" | "d" | "e" | "f"
34      HEX     = "a-fA-F\\d"
35      # escaped       = "%" hex hex
36      ESCAPED = "%[#{HEX}]{2}"
37      # mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
38      #                 "(" | ")"
39      # unreserved    = alphanum | mark
40      UNRESERVED = "\\-_.!~*'()#{ALNUM}"
41      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
42      #                 "$" | ","
43      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
44      #                 "$" | "," | "[" | "]" (RFC 2732)
45      RESERVED = ";/?:@&=+$,\\[\\]"
46
47      # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
48      DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
49      # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
50      TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
51      # hostname      = *( domainlabel "." ) toplabel [ "." ]
52      HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"
53
54      # :startdoc:
55    end # PATTERN
56
57    # :startdoc:
58  end # REGEXP
59
  # Class that parses Strings into URIs.
  #
  # It holds a Hash of patterns and a Hash of Regexps used to match and validate.
63  #
64  class Parser
65    include REGEXP
66
67    #
68    # == Synopsis
69    #
70    #   URI::Parser.new([opts])
71    #
72    # == Args
73    #
74    # The constructor accepts a hash as options for parser.
75    # Keys of options are pattern names of URI components
76    # and values of options are pattern strings.
77    # The constructor generetes set of regexps for parsing URIs.
78    #
79    # You can use the following keys:
80    #
81    #   * :ESCAPED (URI::PATTERN::ESCAPED in default)
82    #   * :UNRESERVED (URI::PATTERN::UNRESERVED in default)
83    #   * :DOMLABEL (URI::PATTERN::DOMLABEL in default)
84    #   * :TOPLABEL (URI::PATTERN::TOPLABEL in default)
85    #   * :HOSTNAME (URI::PATTERN::HOSTNAME in default)
86    #
87    # == Examples
88    #
89    #   p = URI::Parser.new(:ESCAPED => "(?:%[a-fA-F0-9]{2}|%u[a-fA-F0-9]{4})")
90    #   u = p.parse("http://example.jp/%uABCD") #=> #<URI::HTTP:0xb78cf4f8 URL:http://example.jp/%uABCD>
91    #   URI.parse(u.to_s) #=> raises URI::InvalidURIError
92    #
93    #   s = "http://examle.com/ABCD"
94    #   u1 = p.parse(s) #=> #<URI::HTTP:0xb78c3220 URL:http://example.com/ABCD>
95    #   u2 = URI.parse(s) #=> #<URI::HTTP:0xb78b6d54 URL:http://example.com/ABCD>
96    #   u1 == u2 #=> true
97    #   u1.eql?(u2) #=> false
98    #
99    def initialize(opts = {})
100      @pattern = initialize_pattern(opts)
101      @pattern.each_value {|v| v.freeze}
102      @pattern.freeze
103
104      @regexp = initialize_regexp(@pattern)
105      @regexp.each_value {|v| v.freeze}
106      @regexp.freeze
107    end
108
    # The Hash of pattern Strings, keyed by Symbol.
    # It is built by initialize_pattern and frozen (together with its
    # values) by the constructor.
    #
    # see also URI::Parser.initialize_pattern
    attr_reader :pattern

    # The Hash of Regexp objects, keyed by Symbol.
    # It is built by initialize_regexp from #pattern and frozen (together
    # with its values) by the constructor.
    #
    # see also URI::Parser.initialize_regexp
    attr_reader :regexp
118
119    # Returns a split URI against regexp[:ABS_URI]
120    def split(uri)
121      case uri
122      when ''
123        # null uri
124
125      when @regexp[:ABS_URI]
126        scheme, opaque, userinfo, host, port,
127          registry, path, query, fragment = $~[1..-1]
128
129        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
130
131        # absoluteURI   = scheme ":" ( hier_part | opaque_part )
132        # hier_part     = ( net_path | abs_path ) [ "?" query ]
133        # opaque_part   = uric_no_slash *uric
134
135        # abs_path      = "/"  path_segments
136        # net_path      = "//" authority [ abs_path ]
137
138        # authority     = server | reg_name
139        # server        = [ [ userinfo "@" ] hostport ]
140
141        if !scheme
142          raise InvalidURIError,
143            "bad URI(absolute but no scheme): #{uri}"
144        end
145        if !opaque && (!path && (!host && !registry))
146          raise InvalidURIError,
147            "bad URI(absolute but no path): #{uri}"
148        end
149
150      when @regexp[:REL_URI]
151        scheme = nil
152        opaque = nil
153
154        userinfo, host, port, registry,
155          rel_segment, abs_path, query, fragment = $~[1..-1]
156        if rel_segment && abs_path
157          path = rel_segment + abs_path
158        elsif rel_segment
159          path = rel_segment
160        elsif abs_path
161          path = abs_path
162        end
163
164        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
165
166        # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
167
168        # net_path      = "//" authority [ abs_path ]
169        # abs_path      = "/"  path_segments
170        # rel_path      = rel_segment [ abs_path ]
171
172        # authority     = server | reg_name
173        # server        = [ [ userinfo "@" ] hostport ]
174
175      else
176        raise InvalidURIError, "bad URI(is not URI?): #{uri}"
177      end
178
179      path = '' if !path && !opaque # (see RFC2396 Section 5.2)
180      ret = [
181        scheme,
182        userinfo, host, port,         # X
183        registry,                     # X
184        path,                         # Y
185        opaque,                       # Y
186        query,
187        fragment
188      ]
189      return ret
190    end
191
192    #
193    # == Args
194    #
195    # +uri+::
196    #    String
197    #
198    # == Description
199    #
200    # parses +uri+ and constructs either matching URI scheme object
201    # (FTP, HTTP, HTTPS, LDAP, LDAPS, or MailTo) or URI::Generic
202    #
203    # == Usage
204    #
205    #   p = URI::Parser.new
206    #   p.parse("ldap://ldap.example.com/dc=example?user=john")
207    #   #=> #<URI::LDAP:0x00000000b9e7e8 URL:ldap://ldap.example.com/dc=example?user=john>
208    #
209    def parse(uri)
210      scheme, userinfo, host, port,
211        registry, path, opaque, query, fragment = self.split(uri)
212
213      if scheme && URI.scheme_list.include?(scheme.upcase)
214        URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port,
215                                           registry, path, opaque, query,
216                                           fragment, self)
217      else
218        Generic.new(scheme, userinfo, host, port,
219                    registry, path, opaque, query,
220                    fragment, self)
221      end
222    end
223
224
225    #
226    # == Args
227    #
228    # +uris+::
229    #    an Array of Strings
230    #
231    # == Description
232    #
233    # Attempts to parse and merge a set of URIs
234    #
235    def join(*uris)
236      uris[0] = convert_to_uri(uris[0])
237      uris.inject :merge
238    end
239
240    #
241    # :call-seq:
242    #   extract( str )
243    #   extract( str, schemes )
244    #   extract( str, schemes ) {|item| block }
245    #
246    # == Args
247    #
248    # +str+::
249    #    String to search
250    # +schemes+::
251    #    Patterns to apply to +str+
252    #
253    # == Description
254    #
255    # Attempts to parse and merge a set of URIs
256    # If no +block+ given , then returns the result,
257    # else it calls +block+ for each element in result.
258    #
259    # see also URI::Parser.make_regexp
260    #
261    def extract(str, schemes = nil)
262      if block_given?
263        str.scan(make_regexp(schemes)) { yield $& }
264        nil
265      else
266        result = []
267        str.scan(make_regexp(schemes)) { result.push $& }
268        result
269      end
270    end
271
272    # returns Regexp that is default self.regexp[:ABS_URI_REF],
273    # unless +schemes+ is provided. Then it is a Regexp.union with self.pattern[:X_ABS_URI]
274    def make_regexp(schemes = nil)
275      unless schemes
276        @regexp[:ABS_URI_REF]
277      else
278        /(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x
279      end
280    end
281
282    #
283    # :call-seq:
284    #   escape( str )
285    #   escape( str, unsafe )
286    #
287    # == Args
288    #
289    # +str+::
290    #    String to make safe
291    # +unsafe+::
292    #    Regexp to apply. Defaults to self.regexp[:UNSAFE]
293    #
294    # == Description
295    #
296    # constructs a safe String from +str+, removing unsafe characters,
297    # replacing them with codes.
298    #
299    def escape(str, unsafe = @regexp[:UNSAFE])
300      unless unsafe.kind_of?(Regexp)
301        # perhaps unsafe is String object
302        unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false)
303      end
304      str.gsub(unsafe) do
305        us = $&
306        tmp = ''
307        us.each_byte do |uc|
308          tmp << sprintf('%%%02X', uc)
309        end
310        tmp
311      end.force_encoding(Encoding::US_ASCII)
312    end
313
314    #
315    # :call-seq:
316    #   unescape( str )
317    #   unescape( str, unsafe )
318    #
319    # == Args
320    #
321    # +str+::
322    #    String to remove escapes from
323    # +unsafe+::
324    #    Regexp to apply. Defaults to self.regexp[:ESCAPED]
325    #
326    # == Description
327    #
328    # Removes escapes from +str+
329    #
330    def unescape(str, escaped = @regexp[:ESCAPED])
331      str.gsub(escaped) { [$&[1, 2].hex].pack('C') }.force_encoding(str.encoding)
332    end
333
    # Unbound Kernel#to_s, kept so that #inspect can call it on a parser.
    @@to_s = Kernel.instance_method(:to_s)
    # Returns the result of Kernel#to_s bound to this parser.
    def inspect
      @@to_s.bind(self).call
    end
338
339    private
340
341    # Constructs the default Hash of patterns
342    def initialize_pattern(opts = {})
343      ret = {}
344      ret[:ESCAPED] = escaped = (opts.delete(:ESCAPED) || PATTERN::ESCAPED)
345      ret[:UNRESERVED] = unreserved = opts.delete(:UNRESERVED) || PATTERN::UNRESERVED
346      ret[:RESERVED] = reserved = opts.delete(:RESERVED) || PATTERN::RESERVED
347      ret[:DOMLABEL] = opts.delete(:DOMLABEL) || PATTERN::DOMLABEL
348      ret[:TOPLABEL] = opts.delete(:TOPLABEL) || PATTERN::TOPLABEL
349      ret[:HOSTNAME] = hostname = opts.delete(:HOSTNAME)
350
351      # RFC 2396 (URI Generic Syntax)
352      # RFC 2732 (IPv6 Literal Addresses in URL's)
353      # RFC 2373 (IPv6 Addressing Architecture)
354
355      # uric          = reserved | unreserved | escaped
356      ret[:URIC] = uric = "(?:[#{unreserved}#{reserved}]|#{escaped})"
357      # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
358      #                 "&" | "=" | "+" | "$" | ","
359      ret[:URIC_NO_SLASH] = uric_no_slash = "(?:[#{unreserved};?:@&=+$,]|#{escaped})"
360      # query         = *uric
361      ret[:QUERY] = query = "#{uric}*"
362      # fragment      = *uric
363      ret[:FRAGMENT] = fragment = "#{uric}*"
364
365      # hostname      = *( domainlabel "." ) toplabel [ "." ]
366      # reg-name      = *( unreserved / pct-encoded / sub-delims ) # RFC3986
367      unless hostname
368        ret[:HOSTNAME] = hostname = "(?:[a-zA-Z0-9\\-.]|%\\h\\h)+"
369      end
370
371      # RFC 2373, APPENDIX B:
372      # IPv6address = hexpart [ ":" IPv4address ]
373      # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
374      # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
375      # hexseq  = hex4 *( ":" hex4)
376      # hex4    = 1*4HEXDIG
377      #
378      # XXX: This definition has a flaw. "::" + IPv4address must be
379      # allowed too.  Here is a replacement.
380      #
381      # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
382      ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
383      # hex4     = 1*4HEXDIG
384      hex4 = "[#{PATTERN::HEX}]{1,4}"
385      # lastpart = hex4 | IPv4address
386      lastpart = "(?:#{hex4}|#{ipv4addr})"
387      # hexseq1  = *( hex4 ":" ) hex4
388      hexseq1 = "(?:#{hex4}:)*#{hex4}"
389      # hexseq2  = *( hex4 ":" ) lastpart
390      hexseq2 = "(?:#{hex4}:)*#{lastpart}"
391      # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
392      ret[:IPV6ADDR] = ipv6addr = "(?:#{hexseq2}|(?:#{hexseq1})?::(?:#{hexseq2})?)"
393
394      # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
395      # unused
396
397      # ipv6reference = "[" IPv6address "]" (RFC 2732)
398      ret[:IPV6REF] = ipv6ref = "\\[#{ipv6addr}\\]"
399
400      # host          = hostname | IPv4address
401      # host          = hostname | IPv4address | IPv6reference (RFC 2732)
402      ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})"
403      # port          = *digit
404      port = '\d*'
405      # hostport      = host [ ":" port ]
406      ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?"
407
408      # userinfo      = *( unreserved | escaped |
409      #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
410      ret[:USERINFO] = userinfo = "(?:[#{unreserved};:&=+$,]|#{escaped})*"
411
412      # pchar         = unreserved | escaped |
413      #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
414      pchar = "(?:[#{unreserved}:@&=+$,]|#{escaped})"
415      # param         = *pchar
416      param = "#{pchar}*"
417      # segment       = *pchar *( ";" param )
418      segment = "#{pchar}*(?:;#{param})*"
419      # path_segments = segment *( "/" segment )
420      ret[:PATH_SEGMENTS] = path_segments = "#{segment}(?:/#{segment})*"
421
422      # server        = [ [ userinfo "@" ] hostport ]
423      server = "(?:#{userinfo}@)?#{hostport}"
424      # reg_name      = 1*( unreserved | escaped | "$" | "," |
425      #                     ";" | ":" | "@" | "&" | "=" | "+" )
426      ret[:REG_NAME] = reg_name = "(?:[#{unreserved}$,;:@&=+]|#{escaped})+"
427      # authority     = server | reg_name
428      authority = "(?:#{server}|#{reg_name})"
429
430      # rel_segment   = 1*( unreserved | escaped |
431      #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
432      ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+"
433
434      # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
435      ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][\\-+.#{PATTERN::ALPHA}\\d]*"
436
437      # abs_path      = "/"  path_segments
438      ret[:ABS_PATH] = abs_path = "/#{path_segments}"
439      # rel_path      = rel_segment [ abs_path ]
440      ret[:REL_PATH] = rel_path = "#{rel_segment}(?:#{abs_path})?"
441      # net_path      = "//" authority [ abs_path ]
442      ret[:NET_PATH] = net_path = "//#{authority}(?:#{abs_path})?"
443
444      # hier_part     = ( net_path | abs_path ) [ "?" query ]
445      ret[:HIER_PART] = hier_part = "(?:#{net_path}|#{abs_path})(?:\\?(?:#{query}))?"
446      # opaque_part   = uric_no_slash *uric
447      ret[:OPAQUE_PART] = opaque_part = "#{uric_no_slash}#{uric}*"
448
449      # absoluteURI   = scheme ":" ( hier_part | opaque_part )
450      ret[:ABS_URI] = abs_uri = "#{scheme}:(?:#{hier_part}|#{opaque_part})"
451      # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
452      ret[:REL_URI] = rel_uri = "(?:#{net_path}|#{abs_path}|#{rel_path})(?:\\?#{query})?"
453
454      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
455      ret[:URI_REF] = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?"
456
457      ret[:X_ABS_URI] = "
458        (#{scheme}):                           (?# 1: scheme)
459        (?:
460           (#{opaque_part})                    (?# 2: opaque)
461        |
462           (?:(?:
463             //(?:
464                 (?:(?:(#{userinfo})@)?        (?# 3: userinfo)
465                   (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port)
466               |
467                 (#{reg_name})                 (?# 6: registry)
468               )
469             |
470             (?!//))                           (?# XXX: '//' is the mark for hostport)
471             (#{abs_path})?                    (?# 7: path)
472           )(?:\\?(#{query}))?                 (?# 8: query)
473        )
474        (?:\\#(#{fragment}))?                  (?# 9: fragment)
475      "
476
477      ret[:X_REL_URI] = "
478        (?:
479          (?:
480            //
481            (?:
482              (?:(#{userinfo})@)?       (?# 1: userinfo)
483                (#{host})?(?::(\\d*))?  (?# 2: host, 3: port)
484            |
485              (#{reg_name})             (?# 4: registry)
486            )
487          )
488        |
489          (#{rel_segment})              (?# 5: rel_segment)
490        )?
491        (#{abs_path})?                  (?# 6: abs_path)
492        (?:\\?(#{query}))?              (?# 7: query)
493        (?:\\#(#{fragment}))?           (?# 8: fragment)
494      "
495
496      ret
497    end
498
499    # Constructs the default Hash of Regexp's
500    def initialize_regexp(pattern)
501      ret = {}
502
503      # for URI::split
504      ret[:ABS_URI] = Regexp.new('\A\s*' + pattern[:X_ABS_URI] + '\s*\z', Regexp::EXTENDED)
505      ret[:REL_URI] = Regexp.new('\A\s*' + pattern[:X_REL_URI] + '\s*\z', Regexp::EXTENDED)
506
507      # for URI::extract
508      ret[:URI_REF]     = Regexp.new(pattern[:URI_REF])
509      ret[:ABS_URI_REF] = Regexp.new(pattern[:X_ABS_URI], Regexp::EXTENDED)
510      ret[:REL_URI_REF] = Regexp.new(pattern[:X_REL_URI], Regexp::EXTENDED)
511
512      # for URI::escape/unescape
513      ret[:ESCAPED] = Regexp.new(pattern[:ESCAPED])
514      ret[:UNSAFE]  = Regexp.new("[^#{pattern[:UNRESERVED]}#{pattern[:RESERVED]}]")
515
516      # for Generic#initialize
517      ret[:SCHEME]   = Regexp.new("\\A#{pattern[:SCHEME]}\\z")
518      ret[:USERINFO] = Regexp.new("\\A#{pattern[:USERINFO]}\\z")
519      ret[:HOST]     = Regexp.new("\\A#{pattern[:HOST]}\\z")
520      ret[:PORT]     = Regexp.new("\\A#{pattern[:PORT]}\\z")
521      ret[:OPAQUE]   = Regexp.new("\\A#{pattern[:OPAQUE_PART]}\\z")
522      ret[:REGISTRY] = Regexp.new("\\A#{pattern[:REG_NAME]}\\z")
523      ret[:ABS_PATH] = Regexp.new("\\A#{pattern[:ABS_PATH]}\\z")
524      ret[:REL_PATH] = Regexp.new("\\A#{pattern[:REL_PATH]}\\z")
525      ret[:QUERY]    = Regexp.new("\\A#{pattern[:QUERY]}\\z")
526      ret[:FRAGMENT] = Regexp.new("\\A#{pattern[:FRAGMENT]}\\z")
527
528      ret
529    end
530
531    def convert_to_uri(uri)
532      if uri.is_a?(URI::Generic)
533        uri
534      elsif uri = String.try_convert(uri)
535        parse(uri)
536      else
537        raise ArgumentError,
538          "bad argument (expected URI object or URI string)"
539      end
540    end
541
542  end # class Parser
543
544  # URI::Parser.new
545  DEFAULT_PARSER = Parser.new
546  DEFAULT_PARSER.pattern.each_pair do |sym, str|
547    unless REGEXP::PATTERN.const_defined?(sym)
548      REGEXP::PATTERN.const_set(sym, str)
549    end
550  end
551  DEFAULT_PARSER.regexp.each_pair do |sym, str|
552    const_set(sym, str)
553  end
554
555  module Util # :nodoc:
556    def make_components_hash(klass, array_hash)
557      tmp = {}
558      if array_hash.kind_of?(Array) &&
559          array_hash.size == klass.component.size - 1
560        klass.component[1..-1].each_index do |i|
561          begin
562            tmp[klass.component[i + 1]] = array_hash[i].clone
563          rescue TypeError
564            tmp[klass.component[i + 1]] = array_hash[i]
565          end
566        end
567
568      elsif array_hash.kind_of?(Hash)
569        array_hash.each do |key, value|
570          begin
571            tmp[key] = value.clone
572          rescue TypeError
573            tmp[key] = value
574          end
575        end
576      else
577        raise ArgumentError,
578          "expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})"
579      end
580      tmp[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase
581
582      return tmp
583    end
584    module_function :make_components_hash
585  end
586
587  # module for escaping unsafe characters with codes.
588  module Escape
589    #
590    # == Synopsis
591    #
592    #   URI.escape(str [, unsafe])
593    #
594    # == Args
595    #
596    # +str+::
597    #   String to replaces in.
598    # +unsafe+::
599    #   Regexp that matches all symbols that must be replaced with codes.
600    #   By default uses <tt>REGEXP::UNSAFE</tt>.
601    #   When this argument is a String, it represents a character set.
602    #
603    # == Description
604    #
605    # Escapes the string, replacing all unsafe characters with codes.
606    #
607    # == Usage
608    #
609    #   require 'uri'
610    #
611    #   enc_uri = URI.escape("http://example.com/?a=\11\15")
612    #   p enc_uri
613    #   # => "http://example.com/?a=%09%0D"
614    #
615    #   p URI.unescape(enc_uri)
616    #   # => "http://example.com/?a=\t\r"
617    #
618    #   p URI.escape("@?@!", "!?")
619    #   # => "@%3F@%21"
620    #
621    def escape(*arg)
622      warn "#{caller(1)[0]}: warning: URI.escape is obsolete" if $VERBOSE
623      DEFAULT_PARSER.escape(*arg)
624    end
625    alias encode escape
626    #
627    # == Synopsis
628    #
629    #   URI.unescape(str)
630    #
631    # == Args
632    #
633    # +str+::
634    #   Unescapes the string.
635    #
636    # == Usage
637    #
638    #   require 'uri'
639    #
640    #   enc_uri = URI.escape("http://example.com/?a=\11\15")
641    #   p enc_uri
642    #   # => "http://example.com/?a=%09%0D"
643    #
644    #   p URI.unescape(enc_uri)
645    #   # => "http://example.com/?a=\t\r"
646    #
647    def unescape(*arg)
648      warn "#{caller(1)[0]}: warning: URI.unescape is obsolete" if $VERBOSE
649      DEFAULT_PARSER.unescape(*arg)
650    end
651    alias decode unescape
652  end # module Escape
653
654  extend Escape
655  include REGEXP
656
  # Registry of scheme classes. Parser#parse looks a scheme up here by its
  # upcased name.
  @@schemes = {}
  # Returns the Hash of the defined schemes (the registry itself, not a copy).
  def self.scheme_list
    @@schemes
  end
662
  #
  # Base class for all URI exceptions.
  #
  class Error < StandardError; end
  #
  # Raised when a string is not a valid URI (see Parser#split).
  #
  class InvalidURIError < Error; end
  #
  # Raised when a value is not a valid URI component.
  #
  class InvalidComponentError < Error; end
  #
  # Raised when the URI is valid but is used in a bad way.
  #
  class BadURIError < Error; end
679
  #
  # == Synopsis
  #
  #   URI::split(uri)
  #
  # == Args
  #
  # +uri+::
  #   String with URI.
  #
  # == Description
  #
  # Splits the string into the following parts and returns them as an
  # Array, in this order (delegates to DEFAULT_PARSER.split):
  #
  #   * Scheme
  #   * Userinfo
  #   * Host
  #   * Port
  #   * Registry
  #   * Path
  #   * Opaque
  #   * Query
  #   * Fragment
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   p URI.split("http://www.ruby-lang.org/")
  #   # => ["http", nil, "www.ruby-lang.org", nil, nil, "/", nil, nil, nil]
  #
  def self.split(uri)
    DEFAULT_PARSER.split(uri)
  end
714
  #
  # == Synopsis
  #
  #   URI::parse(uri_str)
  #
  # == Args
  #
  # +uri_str+::
  #   String with URI.
  #
  # == Description
  #
  # Creates an instance of one of URI's subclasses from the string
  # (delegates to DEFAULT_PARSER.parse).
  #
  # == Raises
  #
  # URI::InvalidURIError
  #   Raised if the given URI is not a correct one.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   uri = URI.parse("http://www.ruby-lang.org/")
  #   p uri
  #   # => #<URI::HTTP:0x202281be URL:http://www.ruby-lang.org/>
  #   p uri.scheme
  #   # => "http"
  #   p uri.host
  #   # => "www.ruby-lang.org"
  #
  def self.parse(uri)
    DEFAULT_PARSER.parse(uri)
  end
749
  #
  # == Synopsis
  #
  #   URI::join(str[, str, ...])
  #
  # == Args
  #
  # +str+::
  #   String(s) to work with
  #
  # == Description
  #
  # Joins URIs: the first argument is converted to a URI and the remaining
  # ones are merged into it from left to right (delegates to
  # DEFAULT_PARSER.join).
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   p URI.join("http://example.com/","main.rbx")
  #   # => #<URI::HTTP:0x2022ac02 URL:http://example.com/main.rbx>
  #
  #   p URI.join('http://example.com', 'foo')
  #   # => #<URI::HTTP:0x01ab80a0 URL:http://example.com/foo>
  #
  #   p URI.join('http://example.com', '/foo', '/bar')
  #   # => #<URI::HTTP:0x01aaf0b0 URL:http://example.com/bar>
  #
  #   p URI.join('http://example.com', '/foo', 'bar')
  #   # => #<URI::HTTP:0x801a92af0 URL:http://example.com/bar>
  #
  #   p URI.join('http://example.com', '/foo/', 'bar')
  #   # => #<URI::HTTP:0x80135a3a0 URL:http://example.com/foo/bar>
  #
  #
  def self.join(*str)
    DEFAULT_PARSER.join(*str)
  end
787
  #
  # == Synopsis
  #
  #   URI::extract(str[, schemes][,&blk])
  #
  # == Args
  #
  # +str+::
  #   String to extract URIs from.
  # +schemes+::
  #   Limit URI matching to specific schemes.
  #
  # == Description
  #
  # Extracts URIs from a string (delegates to DEFAULT_PARSER.extract).
  # If a block is given, yields each matched URI string and returns nil;
  # otherwise returns an Array of the matches.
  #
  # == Usage
  #
  #   require "uri"
  #
  #   URI.extract("text here http://foo.example.org/bla and here mailto:test@example.com and here also.")
  #   # => ["http://foo.example.org/bla", "mailto:test@example.com"]
  #
  def self.extract(str, schemes = nil, &block)
    DEFAULT_PARSER.extract(str, schemes, &block)
  end
815
  #
  # == Synopsis
  #
  #   URI::regexp([match_schemes])
  #
  # == Args
  #
  # +match_schemes+::
  #   Array of schemes. If given, the resulting regexp matches URIs
  #   whose scheme is one of the match_schemes.
  #
  # == Description
  # Returns a Regexp object which matches URI-like strings
  # (delegates to DEFAULT_PARSER.make_regexp).
  # The Regexp object returned by this method includes an arbitrary
  # number of capture groups (parentheses).  Never rely on their number.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   # extract first URI from html_string
  #   html_string.slice(URI.regexp)
  #
  #   # remove the first ftp URI
  #   html_string.sub(URI.regexp(['ftp']), '')
  #
  #   # You should not rely on the number of parentheses
  #   html_string.scan(URI.regexp) do |*matches|
  #     p $&
  #   end
  #
  def self.regexp(schemes = nil)
    DEFAULT_PARSER.make_regexp(schemes)
  end
850
851  TBLENCWWWCOMP_ = {} # :nodoc:
852  256.times do |i|
853    TBLENCWWWCOMP_[i.chr] = '%%%02X' % i
854  end
855  TBLENCWWWCOMP_[' '] = '+'
856  TBLENCWWWCOMP_.freeze
857  TBLDECWWWCOMP_ = {} # :nodoc:
858  256.times do |i|
859    h, l = i>>4, i&15
860    TBLDECWWWCOMP_['%%%X%X' % [h, l]] = i.chr
861    TBLDECWWWCOMP_['%%%x%X' % [h, l]] = i.chr
862    TBLDECWWWCOMP_['%%%X%x' % [h, l]] = i.chr
863    TBLDECWWWCOMP_['%%%x%x' % [h, l]] = i.chr
864  end
865  TBLDECWWWCOMP_['+'] = ' '
866  TBLDECWWWCOMP_.freeze
867
868  HTML5ASCIIINCOMPAT = [Encoding::UTF_7, Encoding::UTF_16BE, Encoding::UTF_16LE,
869    Encoding::UTF_32BE, Encoding::UTF_32LE] # :nodoc:
870
871  # Encode given +str+ to URL-encoded form data.
872  #
873  # This method doesn't convert *, -, ., 0-9, A-Z, _, a-z, but does convert SP
874  # (ASCII space) to + and converts others to %XX.
875  #
876  # This is an implementation of
877  # http://www.w3.org/TR/html5/association-of-controls-and-forms.html#url-encoded-form-data
878  #
879  # See URI.decode_www_form_component, URI.encode_www_form
880  def self.encode_www_form_component(str)
881    str = str.to_s
882    if HTML5ASCIIINCOMPAT.include?(str.encoding)
883      str = str.encode(Encoding::UTF_8)
884    else
885      str = str.dup
886    end
887    str.force_encoding(Encoding::ASCII_8BIT)
888    str.gsub!(/[^*\-.0-9A-Z_a-z]/, TBLENCWWWCOMP_)
889    str.force_encoding(Encoding::US_ASCII)
890  end
891
892  # Decode given +str+ of URL-encoded form data.
893  #
894  # This decodes + to SP.
895  #
896  # See URI.encode_www_form_component, URI.decode_www_form
897  def self.decode_www_form_component(str, enc=Encoding::UTF_8)
898    raise ArgumentError, "invalid %-encoding (#{str})" unless /\A[^%]*(?:%\h\h[^%]*)*\z/ =~ str
899    str.dup.force_encoding("ASCII-8BIT") \
900       .gsub(/\+|%\h\h/, TBLDECWWWCOMP_) \
901       .force_encoding(enc)
902  end
903
904  # Generate URL-encoded form data from given +enum+.
905  #
906  # This generates application/x-www-form-urlencoded data defined in HTML5
907  # from given an Enumerable object.
908  #
909  # This internally uses URI.encode_www_form_component(str).
910  #
911  # This method doesn't convert the encoding of given items, so convert them
912  # before call this method if you want to send data as other than original
913  # encoding or mixed encoding data. (Strings which are encoded in an HTML5
914  # ASCII incompatible encoding are converted to UTF-8.)
915  #
916  # This method doesn't handle files.  When you send a file, use
917  # multipart/form-data.
918  #
919  # This is an implementation of
920  # http://www.w3.org/TR/html5/forms.html#url-encoded-form-data
921  #
922  #    URI.encode_www_form([["q", "ruby"], ["lang", "en"]])
923  #    #=> "q=ruby&lang=en"
924  #    URI.encode_www_form("q" => "ruby", "lang" => "en")
925  #    #=> "q=ruby&lang=en"
926  #    URI.encode_www_form("q" => ["ruby", "perl"], "lang" => "en")
927  #    #=> "q=ruby&q=perl&lang=en"
928  #    URI.encode_www_form([["q", "ruby"], ["q", "perl"], ["lang", "en"]])
929  #    #=> "q=ruby&q=perl&lang=en"
930  #
931  # See URI.encode_www_form_component, URI.decode_www_form
932  def self.encode_www_form(enum)
933    enum.map do |k,v|
934      if v.nil?
935        encode_www_form_component(k)
936      elsif v.respond_to?(:to_ary)
937        v.to_ary.map do |w|
938          str = encode_www_form_component(k)
939          unless w.nil?
940            str << '='
941            str << encode_www_form_component(w)
942          end
943        end.join('&')
944      else
945        str = encode_www_form_component(k)
946        str << '='
947        str << encode_www_form_component(v)
948      end
949    end.join('&')
950  end
951
952  WFKV_ = '(?:[^%#=;&]*(?:%\h\h[^%#=;&]*)*)' # :nodoc:
953
954  # Decode URL-encoded form data from given +str+.
955  #
956  # This decodes application/x-www-form-urlencoded data
957  # and returns array of key-value array.
958  # This internally uses URI.decode_www_form_component.
959  #
960  # _charset_ hack is not supported now because the mapping from given charset
961  # to Ruby's encoding is not clear yet.
962  # see also http://www.w3.org/TR/html5/syntax.html#character-encodings-0
963  #
964  # This refers http://www.w3.org/TR/html5/forms.html#url-encoded-form-data
965  #
966  # ary = URI.decode_www_form("a=1&a=2&b=3")
967  # p ary                  #=> [['a', '1'], ['a', '2'], ['b', '3']]
968  # p ary.assoc('a').last  #=> '1'
969  # p ary.assoc('b').last  #=> '3'
970  # p ary.rassoc('a').last #=> '2'
971  # p Hash[ary]            # => {"a"=>"2", "b"=>"3"}
972  #
973  # See URI.decode_www_form_component, URI.encode_www_form
974  def self.decode_www_form(str, enc=Encoding::UTF_8)
975    return [] if str.empty?
976    unless /\A#{WFKV_}=#{WFKV_}(?:[;&]#{WFKV_}=#{WFKV_})*\z/o =~ str
977      raise ArgumentError, "invalid data of application/x-www-form-urlencoded (#{str})"
978    end
979    ary = []
980    $&.scan(/([^=;&]+)=([^;&]*)/) do
981      ary << [decode_www_form_component($1, enc), decode_www_form_component($2, enc)]
982    end
983    ary
984  end
985end # module URI
986
module Kernel

  #
  # Returns +uri+ converted to a URI object.
  #
  # A URI::Generic is returned as it is, anything convertible to a String
  # is parsed with URI.parse, and any other argument raises ArgumentError.
  #
  def URI(uri)
    return uri if uri.is_a?(URI::Generic)

    str = String.try_convert(uri)
    unless str
      raise ArgumentError,
        "bad argument (expected URI object or URI string)"
    end
    URI.parse(str)
  end
  module_function :URI
end
1004