#--
# = uri/common.rb
#
# Author:: Akira Yamada <akira@ruby-lang.org>
# Revision:: $Id: common.rb 42355 2013-08-03 13:27:01Z nagachika $
# License::
#   You can redistribute it and/or modify it under the same term as Ruby.
#
# See URI for general documentation
#

module URI
  #
  # Includes URI::REGEXP::PATTERN
  #
  module REGEXP
    #
    # Patterns used to parse URI's
    #
    module PATTERN
      # :stopdoc:

      # RFC 2396 (URI Generic Syntax)
      # RFC 2732 (IPv6 Literal Addresses in URL's)
      # RFC 2373 (IPv6 Addressing Architecture)

      # alpha         = lowalpha | upalpha
      ALPHA = "a-zA-Z"
      # alphanum      = alpha | digit
      ALNUM = "#{ALPHA}\\d"

      # hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
      #                         "a" | "b" | "c" | "d" | "e" | "f"
      HEX     = "a-fA-F\\d"
      # escaped       = "%" hex hex
      ESCAPED = "%[#{HEX}]{2}"
      # mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
      #                 "(" | ")"
      # unreserved    = alphanum | mark
      UNRESERVED = "\\-_.!~*'()#{ALNUM}"
      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #                 "$" | ","
      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #                 "$" | "," | "[" | "]" (RFC 2732)
      RESERVED = ";/?:@&=+$,\\[\\]"

      # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
      DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
      # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
      TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
      # hostname      = *( domainlabel "." ) toplabel [ "." ]
      HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"

      # :startdoc:
    end # PATTERN

    # :startdoc:
  end # REGEXP

  # Class that parses Strings into URIs.
  #
  # It contains a Hash of pattern strings and a Hash of Regexps built from
  # them; these are used to match and validate URIs and their components.
  #
  class Parser
    include REGEXP

    #
    # == Synopsis
    #
    #   URI::Parser.new([opts])
    #
    # == Args
    #
    # The constructor accepts a hash as options for parser.
    # Keys of options are pattern names of URI components
    # and values of options are pattern strings.
    # The constructor generates a set of regexps for parsing URIs.
    # The given hash is only read; it is never modified.
    #
    # You can use the following keys:
    #
    #   * :ESCAPED (URI::PATTERN::ESCAPED by default)
    #   * :UNRESERVED (URI::PATTERN::UNRESERVED by default)
    #   * :RESERVED (URI::PATTERN::RESERVED by default)
    #   * :DOMLABEL (URI::PATTERN::DOMLABEL by default)
    #   * :TOPLABEL (URI::PATTERN::TOPLABEL by default)
    #   * :HOSTNAME (by default a pattern accepting letters, digits, "-",
    #     "." and %-escapes; see initialize_pattern)
    #
    # == Examples
    #
    #   p = URI::Parser.new(:ESCAPED => "(?:%[a-fA-F0-9]{2}|%u[a-fA-F0-9]{4})")
    #   u = p.parse("http://example.jp/%uABCD") #=> #<URI::HTTP:0xb78cf4f8 URL:http://example.jp/%uABCD>
    #   URI.parse(u.to_s) #=> raises URI::InvalidURIError
    #
    #   s = "http://example.com/ABCD"
    #   u1 = p.parse(s) #=> #<URI::HTTP:0xb78c3220 URL:http://example.com/ABCD>
    #   u2 = URI.parse(s) #=> #<URI::HTTP:0xb78b6d54 URL:http://example.com/ABCD>
    #   u1 == u2 #=> true
    #   u1.eql?(u2) #=> false
    #
    def initialize(opts = {})
      @pattern = initialize_pattern(opts)
      @pattern.each_value {|v| v.freeze}
      @pattern.freeze

      @regexp = initialize_regexp(@pattern)
      @regexp.each_value {|v| v.freeze}
      @regexp.freeze
    end

    # The Hash of patterns.
    #
    # see also URI::Parser.initialize_pattern
    attr_reader :pattern

    # The Hash of Regexp
    #
    # see also URI::Parser.initialize_regexp
    attr_reader :regexp

    # Returns a split URI against regexp[:ABS_URI] (or regexp[:REL_URI] when
    # +uri+ is not an absolute URI).
    #
    # The result is the Array
    # [scheme, userinfo, host, port, registry, path, opaque, query, fragment].
    # Raises URI::InvalidURIError when +uri+ matches neither regexp or is an
    # absolute URI without any path, host, registry or opaque part.
    def split(uri)
      case uri
      when ''
        # null uri

      when @regexp[:ABS_URI]
        scheme, opaque, userinfo, host, port,
          registry, path, query, fragment = $~[1..-1]

        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]

        # absoluteURI   = scheme ":" ( hier_part | opaque_part )
        # hier_part     = ( net_path | abs_path ) [ "?" query ]
        # opaque_part   = uric_no_slash *uric

        # abs_path      = "/"  path_segments
        # net_path      = "//" authority [ abs_path ]

        # authority     = server | reg_name
        # server        = [ [ userinfo "@" ] hostport ]

        if !scheme
          raise InvalidURIError,
            "bad URI(absolute but no scheme): #{uri}"
        end
        if !opaque && (!path && (!host && !registry))
          raise InvalidURIError,
            "bad URI(absolute but no path): #{uri}"
        end

      when @regexp[:REL_URI]
        scheme = nil
        opaque = nil

        userinfo, host, port, registry,
          rel_segment, abs_path, query, fragment = $~[1..-1]
        if rel_segment && abs_path
          path = rel_segment + abs_path
        elsif rel_segment
          path = rel_segment
        elsif abs_path
          path = abs_path
        end

        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]

        # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]

        # net_path      = "//" authority [ abs_path ]
        # abs_path      = "/"  path_segments
        # rel_path      = rel_segment [ abs_path ]

        # authority     = server | reg_name
        # server        = [ [ userinfo "@" ] hostport ]

      else
        raise InvalidURIError, "bad URI(is not URI?): #{uri}"
      end

      path = '' if !path && !opaque # (see RFC2396 Section 5.2)
      ret = [
        scheme,
        userinfo, host, port,         # X
        registry,                     # X
        path,                         # Y
        opaque,                       # Y
        query,
        fragment
      ]
      return ret
    end

    #
    # == Args
    #
    # +uri+::
    #    String
    #
    # == Description
    #
    # parses +uri+ and constructs either matching URI scheme object
    # (FTP, HTTP, HTTPS, LDAP, LDAPS, or MailTo) or URI::Generic
    #
    # == Usage
    #
    #   p = URI::Parser.new
    #   p.parse("ldap://ldap.example.com/dc=example?user=john")
    #   #=> #<URI::LDAP:0x00000000b9e7e8 URL:ldap://ldap.example.com/dc=example?user=john>
    #
    def parse(uri)
      scheme, userinfo, host, port,
        registry, path, opaque, query, fragment = self.split(uri)

      # Scheme classes are looked up by upper-cased scheme name; anything
      # not registered in URI.scheme_list falls back to URI::Generic.
      if scheme && URI.scheme_list.include?(scheme.upcase)
        URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port,
                                           registry, path, opaque, query,
                                           fragment, self)
      else
        Generic.new(scheme, userinfo, host, port,
                    registry, path, opaque, query,
                    fragment, self)
      end
    end


    #
    # == Args
    #
    # +uris+::
    #    an Array of Strings
    #
    # == Description
    #
    # Attempts to parse and merge a set of URIs
    #
    def join(*uris)
      # Only the first element is converted here; the rest are handed to
      # #merge of the accumulated URI as they are.
      uris[0] = convert_to_uri(uris[0])
      uris.inject :merge
    end

    #
    # :call-seq:
    #   extract( str )
    #   extract( str, schemes )
    #   extract( str, schemes ) {|item| block }
    #
    # == Args
    #
    # +str+::
    #    String to search
    # +schemes+::
    #    Patterns to apply to +str+
    #
    # == Description
    #
    # Scans +str+ for URIs using the Regexp returned by make_regexp.
    # If no +block+ given, then returns the Array of matched Strings,
    # else it calls +block+ for each matched String and returns nil.
    #
    # see also URI::Parser.make_regexp
    #
    def extract(str, schemes = nil)
      if block_given?
        str.scan(make_regexp(schemes)) { yield $& }
        nil
      else
        result = []
        str.scan(make_regexp(schemes)) { result.push $& }
        result
      end
    end

    # returns Regexp that is default self.regexp[:ABS_URI_REF],
    # unless +schemes+ is provided. Then it is a Regexp.union with self.pattern[:X_ABS_URI]
    def make_regexp(schemes = nil)
      unless schemes
        @regexp[:ABS_URI_REF]
      else
        /(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x
      end
    end

    #
    # :call-seq:
    #   escape( str )
    #   escape( str, unsafe )
    #
    # == Args
    #
    # +str+::
    #    String to make safe
    # +unsafe+::
    #    Regexp to apply. Defaults to self.regexp[:UNSAFE].
    #    When it is not a Regexp it is treated as a set of characters.
    #
    # == Description
    #
    # constructs a safe String from +str+, replacing every match of
    # +unsafe+ with the %XX codes of its bytes.
    #
    def escape(str, unsafe = @regexp[:UNSAFE])
      unless unsafe.kind_of?(Regexp)
        # perhaps unsafe is String object
        unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false)
      end
      str.gsub(unsafe) do
        us = $&
        tmp = ''
        us.each_byte do |uc|
          tmp << sprintf('%%%02X', uc)
        end
        tmp
      end.force_encoding(Encoding::US_ASCII)
    end

    #
    # :call-seq:
    #   unescape( str )
    #   unescape( str, escaped )
    #
    # == Args
    #
    # +str+::
    #    String to remove escapes from
    # +escaped+::
    #    Regexp to apply. Defaults to self.regexp[:ESCAPED]
    #
    # == Description
    #
    # Removes escapes from +str+. The result keeps the encoding of +str+.
    #
    def unescape(str, escaped = @regexp[:ESCAPED])
      str.gsub(escaped) { [$&[1, 2].hex].pack('C') }.force_encoding(str.encoding)
    end

    # Kernel#to_s is used for #inspect so that the (large) pattern and
    # regexp tables are not dumped.
    @@to_s = Kernel.instance_method(:to_s)
    def inspect
      @@to_s.bind(self).call
    end

    private

    # Constructs the default Hash of patterns.
    #
    # +opts+ may override :ESCAPED, :UNRESERVED, :RESERVED, :DOMLABEL,
    # :TOPLABEL and :HOSTNAME. It is only read, never modified.
    # Returns a Hash from Symbol to pattern String.
    def initialize_pattern(opts = {})
      ret = {}
      # Options are read with [] rather than delete so that the caller's
      # Hash (which may be shared or frozen) is left untouched.
      ret[:ESCAPED] = escaped = (opts[:ESCAPED] || PATTERN::ESCAPED)
      ret[:UNRESERVED] = unreserved = opts[:UNRESERVED] || PATTERN::UNRESERVED
      ret[:RESERVED] = reserved = opts[:RESERVED] || PATTERN::RESERVED
      ret[:DOMLABEL] = opts[:DOMLABEL] || PATTERN::DOMLABEL
      ret[:TOPLABEL] = opts[:TOPLABEL] || PATTERN::TOPLABEL
      ret[:HOSTNAME] = hostname = opts[:HOSTNAME]

      # RFC 2396 (URI Generic Syntax)
      # RFC 2732 (IPv6 Literal Addresses in URL's)
      # RFC 2373 (IPv6 Addressing Architecture)

      # uric          = reserved | unreserved | escaped
      ret[:URIC] = uric = "(?:[#{unreserved}#{reserved}]|#{escaped})"
      # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
      #                 "&" | "=" | "+" | "$" | ","
      ret[:URIC_NO_SLASH] = uric_no_slash = "(?:[#{unreserved};?:@&=+$,]|#{escaped})"
      # query         = *uric
      ret[:QUERY] = query = "#{uric}*"
      # fragment      = *uric
      ret[:FRAGMENT] = fragment = "#{uric}*"

      # hostname      = *( domainlabel "." ) toplabel [ "." ]
      # reg-name      = *( unreserved / pct-encoded / sub-delims ) # RFC3986
      unless hostname
        ret[:HOSTNAME] = hostname = "(?:[a-zA-Z0-9\\-.]|%\\h\\h)+"
      end

      # RFC 2373, APPENDIX B:
      # IPv6address = hexpart [ ":" IPv4address ]
      # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
      # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
      # hexseq  = hex4 *( ":" hex4)
      # hex4    = 1*4HEXDIG
      #
      # XXX: This definition has a flaw. "::" + IPv4address must be
      # allowed too.  Here is a replacement.
      #
      # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
      ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
      # hex4     = 1*4HEXDIG
      hex4 = "[#{PATTERN::HEX}]{1,4}"
      # lastpart = hex4 | IPv4address
      lastpart = "(?:#{hex4}|#{ipv4addr})"
      # hexseq1  = *( hex4 ":" ) hex4
      hexseq1 = "(?:#{hex4}:)*#{hex4}"
      # hexseq2  = *( hex4 ":" ) lastpart
      hexseq2 = "(?:#{hex4}:)*#{lastpart}"
      # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
      ret[:IPV6ADDR] = ipv6addr = "(?:#{hexseq2}|(?:#{hexseq1})?::(?:#{hexseq2})?)"

      # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
      # unused

      # ipv6reference = "[" IPv6address "]" (RFC 2732)
      ret[:IPV6REF] = ipv6ref = "\\[#{ipv6addr}\\]"

      # host          = hostname | IPv4address
      # host          = hostname | IPv4address | IPv6reference (RFC 2732)
      ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})"
      # port          = *digit
      # The pattern must be stored under :PORT as well: initialize_regexp
      # builds regexp[:PORT] from it, and without the entry that Regexp
      # would only ever match the empty string.
      ret[:PORT] = port = '\d*'
      # hostport      = host [ ":" port ]
      ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?"

      # userinfo      = *( unreserved | escaped |
      #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
      ret[:USERINFO] = userinfo = "(?:[#{unreserved};:&=+$,]|#{escaped})*"

      # pchar         = unreserved | escaped |
      #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
      pchar = "(?:[#{unreserved}:@&=+$,]|#{escaped})"
      # param         = *pchar
      param = "#{pchar}*"
      # segment       = *pchar *( ";" param )
      segment = "#{pchar}*(?:;#{param})*"
      # path_segments = segment *( "/" segment )
      ret[:PATH_SEGMENTS] = path_segments = "#{segment}(?:/#{segment})*"

      # server        = [ [ userinfo "@" ] hostport ]
      server = "(?:#{userinfo}@)?#{hostport}"
      # reg_name      = 1*( unreserved | escaped | "$" | "," |
      #                     ";" | ":" | "@" | "&" | "=" | "+" )
      ret[:REG_NAME] = reg_name = "(?:[#{unreserved}$,;:@&=+]|#{escaped})+"
      # authority     = server | reg_name
      authority = "(?:#{server}|#{reg_name})"

      # rel_segment   = 1*( unreserved | escaped |
      #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
      ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+"

      # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
      ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][\\-+.#{PATTERN::ALPHA}\\d]*"

      # abs_path      = "/"  path_segments
      ret[:ABS_PATH] = abs_path = "/#{path_segments}"
      # rel_path      = rel_segment [ abs_path ]
      ret[:REL_PATH] = rel_path = "#{rel_segment}(?:#{abs_path})?"
      # net_path      = "//" authority [ abs_path ]
      ret[:NET_PATH] = net_path = "//#{authority}(?:#{abs_path})?"

      # hier_part     = ( net_path | abs_path ) [ "?" query ]
      ret[:HIER_PART] = hier_part = "(?:#{net_path}|#{abs_path})(?:\\?(?:#{query}))?"
      # opaque_part   = uric_no_slash *uric
      ret[:OPAQUE_PART] = opaque_part = "#{uric_no_slash}#{uric}*"

      # absoluteURI   = scheme ":" ( hier_part | opaque_part )
      ret[:ABS_URI] = abs_uri = "#{scheme}:(?:#{hier_part}|#{opaque_part})"
      # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
      ret[:REL_URI] = rel_uri = "(?:#{net_path}|#{abs_path}|#{rel_path})(?:\\?#{query})?"

      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
      ret[:URI_REF] = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?"

      # The X_ patterns below are written for Regexp::EXTENDED: whitespace
      # is insignificant and the (?# ...) groups are comments naming the
      # capture groups that #split relies on.
      ret[:X_ABS_URI] = "
        (#{scheme}):                           (?# 1: scheme)
        (?:
           (#{opaque_part})                    (?# 2: opaque)
        |
           (?:(?:
             //(?:
                 (?:(?:(#{userinfo})@)?        (?# 3: userinfo)
                   (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port)
               |
                 (#{reg_name})                 (?# 6: registry)
               )
             |
             (?!//))                           (?# XXX: '//' is the mark for hostport)
             (#{abs_path})?                    (?# 7: path)
           )(?:\\?(#{query}))?                 (?# 8: query)
        )
        (?:\\#(#{fragment}))?                  (?# 9: fragment)
      "

      ret[:X_REL_URI] = "
        (?:
          (?:
            //
            (?:
              (?:(#{userinfo})@)?       (?# 1: userinfo)
                (#{host})?(?::(\\d*))?  (?# 2: host, 3: port)
            |
              (#{reg_name})             (?# 4: registry)
            )
          )
        |
          (#{rel_segment})              (?# 5: rel_segment)
        )?
        (#{abs_path})?                  (?# 6: abs_path)
        (?:\\?(#{query}))?              (?# 7: query)
        (?:\\#(#{fragment}))?           (?# 8: fragment)
      "

      ret
    end

    # Constructs the default Hash of Regexp's from the Hash of pattern
    # Strings returned by initialize_pattern.
    def initialize_regexp(pattern)
      ret = {}

      # for URI::split
      ret[:ABS_URI] = Regexp.new('\A\s*' + pattern[:X_ABS_URI] + '\s*\z', Regexp::EXTENDED)
      ret[:REL_URI] = Regexp.new('\A\s*' + pattern[:X_REL_URI] + '\s*\z', Regexp::EXTENDED)

      # for URI::extract
      ret[:URI_REF]     = Regexp.new(pattern[:URI_REF])
      ret[:ABS_URI_REF] = Regexp.new(pattern[:X_ABS_URI], Regexp::EXTENDED)
      ret[:REL_URI_REF] = Regexp.new(pattern[:X_REL_URI], Regexp::EXTENDED)

      # for URI::escape/unescape
      ret[:ESCAPED] = Regexp.new(pattern[:ESCAPED])
      ret[:UNSAFE]  = Regexp.new("[^#{pattern[:UNRESERVED]}#{pattern[:RESERVED]}]")

      # for Generic#initialize
      ret[:SCHEME]   = Regexp.new("\\A#{pattern[:SCHEME]}\\z")
      ret[:USERINFO] = Regexp.new("\\A#{pattern[:USERINFO]}\\z")
      ret[:HOST]     = Regexp.new("\\A#{pattern[:HOST]}\\z")
      ret[:PORT]     = Regexp.new("\\A#{pattern[:PORT]}\\z")
      ret[:OPAQUE]   = Regexp.new("\\A#{pattern[:OPAQUE_PART]}\\z")
      ret[:REGISTRY] = Regexp.new("\\A#{pattern[:REG_NAME]}\\z")
      ret[:ABS_PATH] = Regexp.new("\\A#{pattern[:ABS_PATH]}\\z")
      ret[:REL_PATH] = Regexp.new("\\A#{pattern[:REL_PATH]}\\z")
      ret[:QUERY]    = Regexp.new("\\A#{pattern[:QUERY]}\\z")
      ret[:FRAGMENT] = Regexp.new("\\A#{pattern[:FRAGMENT]}\\z")

      ret
    end

    # Returns +uri+ itself when it is a URI::Generic, parses it when it can
    # be converted to a String, and raises ArgumentError otherwise.
    def convert_to_uri(uri)
      if uri.is_a?(URI::Generic)
        uri
      elsif uri = String.try_convert(uri)
        parse(uri)
      else
        raise ArgumentError,
          "bad argument (expected URI object or URI string)"
      end
    end

  end # class Parser

  # URI::Parser.new
  DEFAULT_PARSER = Parser.new
  # Export the default parser's tables as constants: pattern Strings go to
  # URI::REGEXP::PATTERN (without overriding the ones defined there) and
  # Regexps go to URI itself.
  DEFAULT_PARSER.pattern.each_pair do |sym, str|
    unless REGEXP::PATTERN.const_defined?(sym)
      REGEXP::PATTERN.const_set(sym, str)
    end
  end
  DEFAULT_PARSER.regexp.each_pair do |sym, str|
    const_set(sym, str)
  end

  module Util # :nodoc:
    # Builds a Hash of component name => value for +klass+ from either an
    # Array holding the values of klass.component[1..-1] in order, or a Hash.
    # Values are cloned when possible. The :scheme entry is always derived
    # from the class name. Raises ArgumentError for any other argument.
    def make_components_hash(klass, array_hash)
      tmp = {}
      if array_hash.kind_of?(Array) &&
          array_hash.size == klass.component.size - 1
        klass.component[1..-1].each_index do |i|
          begin
            tmp[klass.component[i + 1]] = array_hash[i].clone
          rescue TypeError
            # value cannot be cloned; store it as is
            tmp[klass.component[i + 1]] = array_hash[i]
          end
        end

      elsif array_hash.kind_of?(Hash)
        array_hash.each do |key, value|
          begin
            tmp[key] = value.clone
          rescue TypeError
            # value cannot be cloned; store it as is
            tmp[key] = value
          end
        end
      else
        raise ArgumentError,
          "expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})"
      end
      tmp[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase

      return tmp
    end
    module_function :make_components_hash
  end

  # module for escaping unsafe characters with codes.
  module Escape
    #
    # == Synopsis
    #
    #   URI.escape(str [, unsafe])
    #
    # == Args
    #
    # +str+::
    #   String to replaces in.
    # +unsafe+::
    #   Regexp that matches all symbols that must be replaced with codes.
    #   By default uses <tt>REGEXP::UNSAFE</tt>.
    #   When this argument is a String, it represents a character set.
    #
    # == Description
    #
    # Escapes the string, replacing all unsafe characters with codes.
    #
    # == Usage
    #
    #   require 'uri'
    #
    #   enc_uri = URI.escape("http://example.com/?a=\11\15")
    #   p enc_uri
    #   # => "http://example.com/?a=%09%0D"
    #
    #   p URI.unescape(enc_uri)
    #   # => "http://example.com/?a=\t\r"
    #
    #   p URI.escape("@?@!", "!?")
    #   # => "@%3F@%21"
    #
    def escape(*arg)
      warn "#{caller(1)[0]}: warning: URI.escape is obsolete" if $VERBOSE
      DEFAULT_PARSER.escape(*arg)
    end
    alias encode escape
    #
    # == Synopsis
    #
    #   URI.unescape(str)
    #
    # == Args
    #
    # +str+::
    #   Unescapes the string.
    #
    # == Usage
    #
    #   require 'uri'
    #
    #   enc_uri = URI.escape("http://example.com/?a=\11\15")
    #   p enc_uri
    #   # => "http://example.com/?a=%09%0D"
    #
    #   p URI.unescape(enc_uri)
    #   # => "http://example.com/?a=\t\r"
    #
    def unescape(*arg)
      warn "#{caller(1)[0]}: warning: URI.unescape is obsolete" if $VERBOSE
      DEFAULT_PARSER.unescape(*arg)
    end
    alias decode unescape
  end # module Escape

  extend Escape
  include REGEXP

  # Maps upper-cased scheme names to URI classes (see Parser#parse).
  # NOTE(review): nothing in this file adds entries; presumably the scheme
  # classes register themselves here from their own files, so this must
  # stay a class variable of URI -- confirm before changing.
  @@schemes = {}
  # Returns a Hash of the defined schemes
  def self.scheme_list
    @@schemes
  end

  #
  # Base class for all URI exceptions.
  #
  class Error < StandardError; end
  #
  # Not a URI.
  #
  class InvalidURIError < Error; end
  #
  # Not a URI component.
  #
  class InvalidComponentError < Error; end
  #
  # URI is valid, bad usage is not.
  #
  class BadURIError < Error; end

  #
  # == Synopsis
  #
  #   URI::split(uri)
  #
  # == Args
  #
  # +uri+::
  #   String with URI.
  #
  # == Description
  #
  # Splits the string on following parts and returns array with result:
  #
  #   * Scheme
  #   * Userinfo
  #   * Host
  #   * Port
  #   * Registry
  #   * Path
  #   * Opaque
  #   * Query
  #   * Fragment
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   p URI.split("http://www.ruby-lang.org/")
  #   # => ["http", nil, "www.ruby-lang.org", nil, nil, "/", nil, nil, nil]
  #
  def self.split(uri)
    DEFAULT_PARSER.split(uri)
  end

  #
  # == Synopsis
  #
  #   URI::parse(uri_str)
  #
  # == Args
  #
  # +uri_str+::
  #   String with URI.
  #
  # == Description
  #
  # Creates one of the URI's subclasses instance from the string.
  #
  # == Raises
  #
  # URI::InvalidURIError
  #   Raised if URI given is not a correct one.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   uri = URI.parse("http://www.ruby-lang.org/")
  #   p uri
  #   # => #<URI::HTTP:0x202281be URL:http://www.ruby-lang.org/>
  #   p uri.scheme
  #   # => "http"
  #   p uri.host
  #   # => "www.ruby-lang.org"
  #
  def self.parse(uri)
    DEFAULT_PARSER.parse(uri)
  end

  #
  # == Synopsis
  #
  #   URI::join(str[, str, ...])
  #
  # == Args
  #
  # +str+::
  #   String(s) to work with
  #
  # == Description
  #
  # Joins URIs.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   p URI.join("http://example.com/","main.rbx")
  #   # => #<URI::HTTP:0x2022ac02 URL:http://example.com/main.rbx>
  #
  #   p URI.join('http://example.com', 'foo')
  #   # => #<URI::HTTP:0x01ab80a0 URL:http://example.com/foo>
  #
  #   p URI.join('http://example.com', '/foo', '/bar')
  #   # => #<URI::HTTP:0x01aaf0b0 URL:http://example.com/bar>
  #
  #   p URI.join('http://example.com', '/foo', 'bar')
  #   # => #<URI::HTTP:0x801a92af0 URL:http://example.com/bar>
  #
  #   p URI.join('http://example.com', '/foo/', 'bar')
  #   # => #<URI::HTTP:0x80135a3a0 URL:http://example.com/foo/bar>
  #
  #
  def self.join(*str)
    DEFAULT_PARSER.join(*str)
  end

  #
  # == Synopsis
  #
  #   URI::extract(str[, schemes][,&blk])
  #
  # == Args
  #
  # +str+::
  #   String to extract URIs from.
  # +schemes+::
  #   Limit URI matching to a specific schemes.
  #
  # == Description
  #
  # Extracts URIs from a string. If block given, iterates through all matched URIs.
  # Returns nil if block given or array with matches.
  #
  # == Usage
  #
  #   require "uri"
  #
  #   URI.extract("text here http://foo.example.org/bla and here mailto:test@example.com and here also.")
  #   # => ["http://foo.example.org/bla", "mailto:test@example.com"]
  #
  def self.extract(str, schemes = nil, &block)
    DEFAULT_PARSER.extract(str, schemes, &block)
  end

  #
  # == Synopsis
  #
  #   URI::regexp([match_schemes])
  #
  # == Args
  #
  # +match_schemes+::
  #   Array of schemes. If given, resulting regexp matches to URIs
  #   whose scheme is one of the match_schemes.
  #
  # == Description
  # Returns a Regexp object which matches to URI-like strings.
  # The Regexp object returned by this method includes arbitrary
  # number of capture group (parentheses).  Never rely on its number.
  #
  # == Usage
  #
  #   require 'uri'
  #
  #   # extract first URI from html_string
  #   html_string.slice(URI.regexp)
  #
  #   # remove ftp URIs
  #   html_string.sub(URI.regexp(['ftp']), '')
  #
  #   # You should not rely on the number of parentheses
  #   html_string.scan(URI.regexp) do |*matches|
  #     p $&
  #   end
  #
  def self.regexp(schemes = nil)
    DEFAULT_PARSER.make_regexp(schemes)
  end

  # Encoding table: every single byte => its %XX form, except SP => "+".
  TBLENCWWWCOMP_ = {} # :nodoc:
  256.times do |i|
    TBLENCWWWCOMP_[i.chr] = '%%%02X' % i
  end
  TBLENCWWWCOMP_[' '] = '+'
  TBLENCWWWCOMP_.freeze
  # Decoding table: "%XX" in every upper/lower-case hex spelling => byte,
  # plus "+" => SP.
  TBLDECWWWCOMP_ = {} # :nodoc:
  256.times do |i|
    h, l = i>>4, i&15
    TBLDECWWWCOMP_['%%%X%X' % [h, l]] = i.chr
    TBLDECWWWCOMP_['%%%x%X' % [h, l]] = i.chr
    TBLDECWWWCOMP_['%%%X%x' % [h, l]] = i.chr
    TBLDECWWWCOMP_['%%%x%x' % [h, l]] = i.chr
  end
  TBLDECWWWCOMP_['+'] = ' '
  TBLDECWWWCOMP_.freeze

  HTML5ASCIIINCOMPAT = [Encoding::UTF_7, Encoding::UTF_16BE, Encoding::UTF_16LE,
    Encoding::UTF_32BE, Encoding::UTF_32LE] # :nodoc:

  # Encode given +str+ to URL-encoded form data.
  #
  # This method doesn't convert *, -, ., 0-9, A-Z, _, a-z, but does convert SP
  # (ASCII space) to + and converts others to %XX.
  #
  # +str+ is converted with to_s first and is never modified itself; the
  # result is a new US-ASCII String.
  #
  # This is an implementation of
  # http://www.w3.org/TR/html5/association-of-controls-and-forms.html#url-encoded-form-data
  #
  # See URI.decode_www_form_component, URI.encode_www_form
  def self.encode_www_form_component(str)
    str = str.to_s
    if HTML5ASCIIINCOMPAT.include?(str.encoding)
      str = str.encode(Encoding::UTF_8)
    else
      str = str.dup
    end
    # Work byte-wise so that every non-safe byte is looked up in the table.
    str.force_encoding(Encoding::ASCII_8BIT)
    str.gsub!(/[^*\-.0-9A-Z_a-z]/, TBLENCWWWCOMP_)
    str.force_encoding(Encoding::US_ASCII)
  end

  # Decode given +str+ of URL-encoded form data.
  #
  # This decodes + to SP.
  #
  # The result is tagged with the encoding +enc+.
  # Raises ArgumentError when +str+ contains a "%" that is not followed by
  # two hex digits.
  #
  # See URI.encode_www_form_component, URI.decode_www_form
  def self.decode_www_form_component(str, enc=Encoding::UTF_8)
    raise ArgumentError, "invalid %-encoding (#{str})" unless /\A[^%]*(?:%\h\h[^%]*)*\z/ =~ str
    str.dup.force_encoding("ASCII-8BIT") \
       .gsub(/\+|%\h\h/, TBLDECWWWCOMP_) \
       .force_encoding(enc)
  end

  # Generate URL-encoded form data from given +enum+.
  #
  # This generates application/x-www-form-urlencoded data defined in HTML5
  # from given an Enumerable object.
  #
  # This internally uses URI.encode_www_form_component(str).
  #
  # This method doesn't convert the encoding of given items, so convert them
  # before call this method if you want to send data as other than original
  # encoding or mixed encoding data. (Strings which are encoded in an HTML5
  # ASCII incompatible encoding are converted to UTF-8.)
  #
  # This method doesn't handle files.  When you send a file, use
  # multipart/form-data.
  #
  # A nil value -- on its own or as an element of an Array value -- produces
  # the bare key without "=".
  #
  # This is an implementation of
  # http://www.w3.org/TR/html5/forms.html#url-encoded-form-data
  #
  #    URI.encode_www_form([["q", "ruby"], ["lang", "en"]])
  #    #=> "q=ruby&lang=en"
  #    URI.encode_www_form("q" => "ruby", "lang" => "en")
  #    #=> "q=ruby&lang=en"
  #    URI.encode_www_form("q" => ["ruby", "perl"], "lang" => "en")
  #    #=> "q=ruby&q=perl&lang=en"
  #    URI.encode_www_form([["q", "ruby"], ["q", "perl"], ["lang", "en"]])
  #    #=> "q=ruby&q=perl&lang=en"
  #    URI.encode_www_form("q" => [nil, "ruby"])
  #    #=> "q&q=ruby"
  #
  # See URI.encode_www_form_component, URI.decode_www_form
  def self.encode_www_form(enum)
    enum.map do |k,v|
      if v.nil?
        encode_www_form_component(k)
      elsif v.respond_to?(:to_ary)
        v.to_ary.map do |w|
          str = encode_www_form_component(k)
          unless w.nil?
            str << '='
            str << encode_www_form_component(w)
          end
          # Return the String explicitly: the value of the +unless+ above is
          # nil for a nil element, which would drop the key from the output.
          str
        end.join('&')
      else
        str = encode_www_form_component(k)
        str << '='
        str << encode_www_form_component(v)
      end
    end.join('&')
  end

  # A key or a value: any run of characters other than "%", "#", "=", ";"
  # and "&", with well-formed %XX escapes allowed in between.
  WFKV_ = '(?:[^%#=;&]*(?:%\h\h[^%#=;&]*)*)' # :nodoc:

  # Decode URL-encoded form data from given +str+.
  #
  # This decodes application/x-www-form-urlencoded data
  # and returns array of key-value array.
  # This internally uses URI.decode_www_form_component.
  #
  # Pairs may be separated by "&" or ";". Raises ArgumentError when +str+
  # is not a sequence of key=value pairs.
  #
  # _charset_ hack is not supported now because the mapping from given charset
  # to Ruby's encoding is not clear yet.
  # see also http://www.w3.org/TR/html5/syntax.html#character-encodings-0
  #
  # This refers http://www.w3.org/TR/html5/forms.html#url-encoded-form-data
  #
  # ary = URI.decode_www_form("a=1&a=2&b=3")
  # p ary                  #=> [['a', '1'], ['a', '2'], ['b', '3']]
  # p ary.assoc('a').last  #=> '1'
  # p ary.assoc('b').last  #=> '3'
  # p ary.rassoc('2').first #=> 'a'
  # p Hash[ary]            # => {"a"=>"2", "b"=>"3"}
  #
  # See URI.decode_www_form_component, URI.encode_www_form
  def self.decode_www_form(str, enc=Encoding::UTF_8)
    return [] if str.empty?
    unless /\A#{WFKV_}=#{WFKV_}(?:[;&]#{WFKV_}=#{WFKV_})*\z/o =~ str
      raise ArgumentError, "invalid data of application/x-www-form-urlencoded (#{str})"
    end
    ary = []
    # The check above is anchored to the whole string, so scanning +str+
    # visits exactly the validated text.
    str.scan(/([^=;&]+)=([^;&]*)/) do |key, val|
      ary << [decode_www_form_component(key, enc), decode_www_form_component(val, enc)]
    end
    ary
  end
end # module URI

module Kernel

  #
  # Returns +uri+ converted to a URI object.
  #
  # A URI::Generic is returned as is, anything convertible to String is
  # parsed with URI.parse, and any other argument raises ArgumentError.
  #
  def URI(uri)
    if uri.is_a?(URI::Generic)
      uri
    elsif uri = String.try_convert(uri)
      URI.parse(uri)
    else
      raise ArgumentError,
        "bad argument (expected URI object or URI string)"
    end
  end
  module_function :URI
end