1/* 2 * Copyright (C) 2005, 2007, 2014 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of Apple Inc. ("Apple") nor the names of 14 * its contributors may be used to endorse or promote products derived 15 * from this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY 18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29#import "config.h" 30#import "WebCoreObjCExtras.h" 31#import "WebCoreNSStringExtras.h" 32#import "WebCoreNSURLExtras.h" 33#import "WebCoreSystemInterface.h" 34#import <wtf/ObjcRuntimeExtras.h> 35#import <wtf/RetainPtr.h> 36#import <wtf/Vector.h> 37#import <unicode/uchar.h> 38#import <unicode/uidna.h> 39#import <unicode/uscript.h> 40 41// Needs to be big enough to hold an IDN-encoded name. 
// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
#define HOST_NAME_BUFFER_LENGTH 2048
#define URL_BYTES_BUFFER_LENGTH 2048

// Callback invoked with each host name range found inside a URL string.
typedef void (* StringRangeApplierFunction)(NSString *string, NSRange range, void *context);

// Bit set indexed by UScriptCode; a set bit means the script may be shown decoded.
// Filled in lazily (exactly once) by readIDNScriptWhiteList().
static pthread_once_t IDNScriptWhiteListFileRead = PTHREAD_ONCE_INIT;
static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32];


@interface NSURLProtocol (WKNSURLProtocolInternal)
+ (Class)_protocolClassForRequest:(NSURLRequest *)request;
@end

namespace WebCore {

// Returns YES if charCode must not be shown verbatim in a user-visible URL.
static BOOL isLookalikeCharacter(UChar32 charCode)
{
    // This function treats the following as unsafe, lookalike characters:
    // any non-printable character, any character considered as whitespace,
    // any ignorable character, and emoji characters related to locks.

    // We also considered the characters in Mozilla's blacklist <http://kb.mozillazine.org/Network.IDN.blacklist_chars>.

    // Some of the characters here will never appear once ICU has encoded.
    // For example, ICU transforms most spaces into an ASCII space and most
    // slashes into an ASCII solidus. But one of the two callers uses this
    // on characters that have not been processed by ICU, so they are needed here.

    if (!u_isprint(charCode) || u_isUWhiteSpace(charCode) || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
        return YES;

    switch (charCode) {
    case 0x00BC: /* VULGAR FRACTION ONE QUARTER */
    case 0x00BD: /* VULGAR FRACTION ONE HALF */
    case 0x00BE: /* VULGAR FRACTION THREE QUARTERS */
    case 0x00ED: /* LATIN SMALL LETTER I WITH ACUTE */
    case 0x01C3: /* LATIN LETTER RETROFLEX CLICK */
    case 0x0251: /* LATIN SMALL LETTER ALPHA */
    case 0x0261: /* LATIN SMALL LETTER SCRIPT G */
    case 0x02D0: /* MODIFIER LETTER TRIANGULAR COLON */
    case 0x0335: /* COMBINING SHORT STROKE OVERLAY */
    case 0x0337: /* COMBINING SHORT SOLIDUS OVERLAY */
    case 0x0338: /* COMBINING LONG SOLIDUS OVERLAY */
    case 0x0589: /* ARMENIAN FULL STOP */
    case 0x05B4: /* HEBREW POINT HIRIQ */
    case 0x05BC: /* HEBREW POINT DAGESH OR MAPIQ */
    case 0x05C3: /* HEBREW PUNCTUATION SOF PASUQ */
    case 0x05F4: /* HEBREW PUNCTUATION GERSHAYIM */
    case 0x0609: /* ARABIC-INDIC PER MILLE SIGN */
    case 0x060A: /* ARABIC-INDIC PER TEN THOUSAND SIGN */
    case 0x0660: /* ARABIC INDIC DIGIT ZERO */
    case 0x066A: /* ARABIC PERCENT SIGN */
    case 0x06D4: /* ARABIC FULL STOP */
    case 0x06F0: /* EXTENDED ARABIC INDIC DIGIT ZERO */
    case 0x0701: /* SYRIAC SUPRALINEAR FULL STOP */
    case 0x0702: /* SYRIAC SUBLINEAR FULL STOP */
    case 0x0703: /* SYRIAC SUPRALINEAR COLON */
    case 0x0704: /* SYRIAC SUBLINEAR COLON */
    case 0x1735: /* PHILIPPINE SINGLE PUNCTUATION */
    case 0x2024: /* ONE DOT LEADER */
    case 0x2027: /* HYPHENATION POINT */
    case 0x2039: /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
    case 0x203A: /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
    case 0x2041: /* CARET INSERTION POINT */
    case 0x2044: /* FRACTION SLASH */
    case 0x2052: /* COMMERCIAL MINUS SIGN */
    case 0x2153: /* VULGAR FRACTION ONE THIRD */
    case 0x2154: /* VULGAR FRACTION TWO THIRDS */
    case 0x2155: /* VULGAR FRACTION ONE FIFTH */
    case 0x2156: /* VULGAR FRACTION TWO FIFTHS */
    case 0x2157: /* VULGAR FRACTION THREE FIFTHS */
    case 0x2158: /* VULGAR FRACTION FOUR FIFTHS */
    case 0x2159: /* VULGAR FRACTION ONE SIXTH */
    case 0x215A: /* VULGAR FRACTION FIVE SIXTHS */
    case 0x215B: /* VULGAR FRACTION ONE EIGHTH */
    case 0x215C: /* VULGAR FRACTION THREE EIGHTHS */
    case 0x215D: /* VULGAR FRACTION FIVE EIGHTHS */
    case 0x215E: /* VULGAR FRACTION SEVEN EIGHTHS */
    case 0x215F: /* FRACTION NUMERATOR ONE */
    case 0x2215: /* DIVISION SLASH */
    case 0x2216: /* SET MINUS */
    case 0x2236: /* RATIO */
    case 0x233F: /* APL FUNCTIONAL SYMBOL SLASH BAR */
    case 0x23AE: /* INTEGRAL EXTENSION */
    case 0x244A: /* OCR DOUBLE BACKSLASH */
    case 0x2571: /* BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT */
    case 0x2572: /* BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT */
    case 0x29F6: /* SOLIDUS WITH OVERBAR */
    case 0x29F8: /* BIG SOLIDUS */
    case 0x2AFB: /* TRIPLE SOLIDUS BINARY RELATION */
    case 0x2AFD: /* DOUBLE SOLIDUS OPERATOR */
    case 0x2FF0: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT */
    case 0x2FF1: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW */
    case 0x2FF2: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT */
    case 0x2FF3: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW */
    case 0x2FF4: /* IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND */
    case 0x2FF5: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE */
    case 0x2FF6: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW */
    case 0x2FF7: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT */
    case 0x2FF8: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT */
    case 0x2FF9: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT */
    case 0x2FFA: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT */
    case 0x2FFB: /* IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID */
    case 0x3002: /* IDEOGRAPHIC FULL STOP */
    case 0x3008: /* LEFT ANGLE BRACKET */
    case 0x3014: /* LEFT TORTOISE SHELL BRACKET */
    case 0x3015: /* RIGHT TORTOISE SHELL BRACKET */
    case 0x3033: /* VERTICAL KANA REPEAT MARK UPPER HALF */
    case 0x3035: /* VERTICAL KANA REPEAT MARK LOWER HALF */
    case 0x321D: /* PARENTHESIZED KOREAN CHARACTER OJEON */
    case 0x321E: /* PARENTHESIZED KOREAN CHARACTER O HU */
    case 0x33AE: /* SQUARE RAD OVER S */
    case 0x33AF: /* SQUARE RAD OVER S SQUARED */
    case 0x33C6: /* SQUARE C OVER KG */
    case 0x33DF: /* SQUARE A OVER M */
    case 0xA789: /* MODIFIER LETTER COLON */
    case 0xFE14: /* PRESENTATION FORM FOR VERTICAL SEMICOLON */
    case 0xFE15: /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
    case 0xFE3F: /* PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET */
    case 0xFE5D: /* SMALL LEFT TORTOISE SHELL BRACKET */
    case 0xFE5E: /* SMALL RIGHT TORTOISE SHELL BRACKET */
    case 0xFF0E: /* FULLWIDTH FULL STOP */
    case 0xFF0F: /* FULLWIDTH SOLIDUS */
    case 0xFF61: /* HALFWIDTH IDEOGRAPHIC FULL STOP */
    case 0xFFFC: /* OBJECT REPLACEMENT CHARACTER */
    case 0xFFFD: /* REPLACEMENT CHARACTER */
    case 0x1F50F: /* LOCK WITH INK PEN */
    case 0x1F510: /* CLOSED LOCK WITH KEY */
    case 0x1F511: /* KEY */
    case 0x1F512: /* LOCK */
    case 0x1F513: /* OPEN LOCK */
        return YES;
    default:
        return NO;
    }
}

// Parses a whitelist file of script names and sets the matching bits in IDNScriptWhiteList.
// Returns NO if filename is nil or the file cannot be opened, YES otherwise.
static BOOL readIDNScriptWhiteListFile(NSString *filename)
{
    if (!filename)
        return NO;

    FILE *file = fopen([filename fileSystemRepresentation], "r");
    if (!file)
        return NO;

    // Read a word at a time.
    // Allow comments, starting with # character to the end of the line.
    while (1) {
        // Skip a comment if present.
        if (fscanf(file, " #%*[^\n\r]%*[\n\r]") == EOF)
            break;

        // Read a script name if present.
        // Names longer than 32 characters are truncated; the remainder is discarded.
        char word[33];
        int result = fscanf(file, " %32[^# \t\n\r]%*[^# \t\n\r] ", word);
        if (result == EOF)
            break;

        if (result == 1) {
            // Got a word, map to script code and put it into the array.
            int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
            if (script >= 0 && script < USCRIPT_CODE_LIMIT) {
                size_t index = script / 32;
                // Unsigned literal: shifting a signed 1 into bit 31 is not well defined.
                uint32_t mask = 1u << (script % 32);
                IDNScriptWhiteList[index] |= mask;
            }
        }
    }
    fclose(file);
    return YES;
}

// pthread_once initializer: loads the first IDNScriptWhiteList.txt found in a Library
// directory, falling back to the copy inside the WebCore bundle. Crashes if neither loads.
static void readIDNScriptWhiteList(void)
{
    // Read white list from library.
    NSArray *dirs = NSSearchPathForDirectoriesInDomains(NSLibraryDirectory, NSAllDomainsMask, YES);
    int numDirs = [dirs count];
    for (int i = 0; i < numDirs; i++) {
        if (readIDNScriptWhiteListFile([[dirs objectAtIndex:i] stringByAppendingPathComponent:@"IDNScriptWhiteList.txt"]))
            return;
    }

    // Fall back on white list inside bundle.
    NSBundle *bundle = [NSBundle bundleWithIdentifier:@"com.apple.WebCore"];

    if (!readIDNScriptWhiteListFile([bundle pathForResource:@"IDNScriptWhiteList" ofType:@"txt"]))
        CRASH();
}

// Returns YES only if every code point in the UTF-16 buffer belongs to a whitelisted
// script and is not a lookalike character.
static BOOL allCharactersInIDNScriptWhiteList(const UChar *buffer, int32_t length)
{
    pthread_once(&IDNScriptWhiteListFileRead, readIDNScriptWhiteList);

    int32_t i = 0;
    while (i < length) {
        UChar32 c;
        U16_NEXT(buffer, i, length, c)
        UErrorCode error = U_ZERO_ERROR;
        UScriptCode script = uscript_getScript(c, &error);
        if (error != U_ZERO_ERROR) {
            LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
            return NO;
        }
        if (script < 0) {
            LOG_ERROR("got negative number for script code from ICU: %d", script);
            return NO;
        }
        if (script >= USCRIPT_CODE_LIMIT)
            return NO;

        size_t index = script / 32;
        uint32_t mask = 1u << (script % 32);
        if (!(IDNScriptWhiteList[index] & mask))
            return NO;

        if (isLookalikeCharacter(c))
            return NO;
    }
    return YES;
}

// Returns YES if the host name ends in a top level domain with known presentation rules
// (currently only ".рф") and the label directly below it satisfies those rules.
static BOOL allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length)
{
    // Nothing to inspect; also protects the buffer[length - 1] access below.
    if (length <= 0)
        return NO;

    // Skip trailing dot for root domain.
    if (buffer[length - 1] == '.')
        length--;

    if (length > 3 && buffer[length - 3] == '.'
        && buffer[length - 2] == 0x0440 // CYRILLIC SMALL LETTER ER
        && buffer[length - 1] == 0x0444) // CYRILLIC SMALL LETTER EF
    {
        // Rules defined by <http://www.cctld.ru/ru/docs/rulesrf.php>. This code only checks requirements that matter for presentation purposes.
        // Walk backwards from the character before ".рф" down to and including index 0,
        // so that the first character of the host name is validated as well.
        for (int32_t i = length - 4; i >= 0; --i) {
            UChar ch = buffer[i];

            // Only modern Russian letters, digits and dashes are allowed.
            if ((ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || (ch >= '0' && ch <= '9') || ch == '-')
                continue;

            // Only check top level domain. Lower level registrars may have different rules.
            if (ch == '.')
                break;

            return NO;
        }
        return YES;
    }

    // Not a known top level domain with special rules.
    return NO;
}

// Return value of nil means no mapping is necessary.
// If makeString is NO, then return value is either nil or self to indicate mapping is necessary.
// If makeString is YES, then return value is either nil or the mapped string.
static NSString *mapHostNameWithRange(NSString *string, NSRange range, BOOL encode, BOOL makeString)
{
    if (range.length > HOST_NAME_BUFFER_LENGTH)
        return nil;

    if (![string length])
        return nil;

    UChar sourceBuffer[HOST_NAME_BUFFER_LENGTH];
    UChar destinationBuffer[HOST_NAME_BUFFER_LENGTH];

    // When encoding, undo any percent escapes first so ICU sees the real characters.
    if (encode && [string rangeOfString:@"%" options:NSLiteralSearch range:range].location != NSNotFound) {
        NSString *substring = [string substringWithRange:range];
        substring = CFBridgingRelease(CFURLCreateStringByReplacingPercentEscapes(NULL, (CFStringRef)substring, CFSTR("")));
        if (substring) {
            string = substring;
            range = NSMakeRange(0, [string length]);
        }
    }

    int length = range.length;
    [string getCharacters:sourceBuffer range:range];

    UErrorCode error = U_ZERO_ERROR;
    int32_t numCharactersConverted = (encode ? uidna_IDNToASCII : uidna_IDNToUnicode)(sourceBuffer, length, destinationBuffer, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &error);
    if (error != U_ZERO_ERROR)
        return nil;

    // Conversion produced the same characters: no mapping is necessary.
    if (numCharactersConverted == length && !memcmp(sourceBuffer, destinationBuffer, length * sizeof(UChar)))
        return nil;

    // Refuse to decode unless either the script whitelist or the TLD-specific rules allow it.
    if (!encode && !allCharactersInIDNScriptWhiteList(destinationBuffer, numCharactersConverted) && !allCharactersAllowedByTLDRules(destinationBuffer, numCharactersConverted))
        return nil;

    return makeString ? (NSString *)[NSString stringWithCharacters:destinationBuffer length:numCharactersConverted] : string;
}

// Returns YES if decoding the host name in range would change it.
BOOL hostNameNeedsDecodingWithRange(NSString *string, NSRange range)
{
    return mapHostNameWithRange(string, range, NO, NO) != nil;
}

// Returns YES if encoding the host name in range would change it.
BOOL hostNameNeedsEncodingWithRange(NSString *string, NSRange range)
{
    return mapHostNameWithRange(string, range, YES, NO) != nil;
}

// Returns the decoded host name for range, or nil if no mapping is necessary.
NSString *decodeHostNameWithRange(NSString *string, NSRange range)
{
    return mapHostNameWithRange(string, range, NO, YES);
}

// Returns the encoded host name for range, or nil if no mapping is necessary.
NSString *encodeHostNameWithRange(NSString *string, NSRange range)
{
    return mapHostNameWithRange(string, range, YES, YES);
}

// Decodes a whole-string host name; returns string itself when no mapping is necessary.
NSString *decodeHostName(NSString *string)
{
    NSString *name = mapHostNameWithRange(string, NSMakeRange(0, [string length]), NO, YES);
    return !name ? string : name;
}

// Encodes a whole-string host name; returns string itself when no mapping is necessary.
NSString *encodeHostName(NSString *string)
{
    NSString *name = mapHostNameWithRange(string, NSMakeRange(0, [string length]), YES, YES);
    return !name ? string : name;
}

// Appends range (boxed in an NSValue) to the NSMutableArray * that context points at,
// creating the array on first use, when the host name in range needs mapping.
// The caller owns the array that gets created.
static void collectRangesThatNeedMapping(NSString *string, NSRange range, void *context, BOOL encode)
{
    BOOL needsMapping = encode ? hostNameNeedsEncodingWithRange(string, range) : hostNameNeedsDecodingWithRange(string, range);
    if (!needsMapping)
        return;

    NSMutableArray **array = (NSMutableArray **)context;
    if (!*array)
        *array = [[NSMutableArray alloc] init];

    [*array addObject:[NSValue valueWithRange:range]];
}

static void collectRangesThatNeedEncoding(NSString *string, NSRange range, void *context)
{
    return collectRangesThatNeedMapping(string, range, context, YES);
}

static void collectRangesThatNeedDecoding(NSString *string, NSRange range, void *context)
{
    return collectRangesThatNeedMapping(string, range, context, NO);
}

// Retains a character set so it can be stored in a function-local static.
static inline NSCharacterSet *retain(NSCharacterSet *charset)
{
    CFRetain(charset);
    return charset;
}

// Calls f once for every host name range found in a mailto: URL string.
static void applyHostNameFunctionToMailToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character.
    // Skip quoted strings so that characters in them don't confuse us.
    // When we find a '?' character, we are past the part of the URL that contains host names.

    static NSCharacterSet *hostNameOrStringStartCharacters = retain([NSCharacterSet characterSetWithCharactersInString:@"\"@?"]);
    static NSCharacterSet *hostNameEndCharacters = retain([NSCharacterSet characterSetWithCharactersInString:@">,?"]);
    static NSCharacterSet *quotedStringCharacters = retain([NSCharacterSet characterSetWithCharactersInString:@"\"\\"]);

    unsigned stringLength = [string length];
    NSRange remaining = NSMakeRange(0, stringLength);

    while (1) {
        // Find start of host name or of quoted string.
        NSRange hostNameOrStringStart = [string rangeOfCharacterFromSet:hostNameOrStringStartCharacters options:0 range:remaining];
        if (hostNameOrStringStart.location == NSNotFound)
            return;

        unichar c = [string characterAtIndex:hostNameOrStringStart.location];
        remaining.location = NSMaxRange(hostNameOrStringStart);
        remaining.length = stringLength - remaining.location;

        if (c == '?')
            return;

        if (c == '@') {
            // Find end of host name.
            unsigned hostNameStart = remaining.location;
            NSRange hostNameEnd = [string rangeOfCharacterFromSet:hostNameEndCharacters options:0 range:remaining];
            BOOL done;
            if (hostNameEnd.location == NSNotFound) {
                hostNameEnd.location = stringLength;
                done = YES;
            } else {
                // Resume at the terminator itself so that a '?' is still seen by the outer loop.
                remaining.location = hostNameEnd.location;
                remaining.length = stringLength - remaining.location;
                done = NO;
            }

            // Process host name range.
            f(string, NSMakeRange(hostNameStart, hostNameEnd.location - hostNameStart), context);

            if (done)
                return;
        } else {
            // Skip quoted string.
            ASSERT(c == '"');
            while (1) {
                NSRange escapedCharacterOrStringEnd = [string rangeOfCharacterFromSet:quotedStringCharacters options:0 range:remaining];
                if (escapedCharacterOrStringEnd.location == NSNotFound)
                    return;

                c = [string characterAtIndex:escapedCharacterOrStringEnd.location];
                remaining.location = NSMaxRange(escapedCharacterOrStringEnd);
                remaining.length = stringLength - remaining.location;

                // If we are at the end of the quoted string, then break from the string loop back to the host name loop.
                if (c == '"')
                    break;

                // Skip escaped character.
                ASSERT(c == '\\');
                if (!remaining.length)
                    return;

                remaining.location += 1;
                remaining.length -= 1;
            }
        }
    }
}

// Calls f with the host name range of a URL string (possibly several ranges for mailto: URLs).
static void applyHostNameFunctionToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    // Find hostnames. Too bad we can't use any real URL-parsing code to do this,
    // but we have to do it before doing all the %-escaping, and this is the only
    // code we have that parses mailto URLs anyway.

    // Maybe we should implement this using a character buffer instead?

    if (hasCaseInsensitivePrefix(string, @"mailto:")) {
        applyHostNameFunctionToMailToURLString(string, f, context);
        return;
    }

    // Find the host name in a hierarchical URL.
    // It comes after a "://" sequence, with scheme characters preceding.
    // It ends at the end of the string or at a ":", "/", "?", or "#".
    // If there is a "@" character, the host part is just the part after the "@".
    NSRange separatorRange = [string rangeOfString:@"://"];
    if (separatorRange.location == NSNotFound)
        return;

    // Check that all characters before the :// are valid scheme characters.
    static NSCharacterSet *nonSchemeCharacters = retain([[NSCharacterSet characterSetWithCharactersInString:@"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-."] invertedSet]);
    if ([string rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:NSMakeRange(0, separatorRange.location)].location != NSNotFound)
        return;

    unsigned stringLength = [string length];

    static NSCharacterSet *hostTerminators = retain([NSCharacterSet characterSetWithCharactersInString:@":/?#"]);

    // Start after the separator.
    unsigned authorityStart = NSMaxRange(separatorRange);

    // Find terminating character.
    NSRange hostNameTerminator = [string rangeOfCharacterFromSet:hostTerminators options:0 range:NSMakeRange(authorityStart, stringLength - authorityStart)];
    unsigned hostNameEnd = hostNameTerminator.location == NSNotFound ? stringLength : hostNameTerminator.location;

    // Find "@" for the start of the host name.
    NSRange userInfoTerminator = [string rangeOfString:@"@" options:0 range:NSMakeRange(authorityStart, hostNameEnd - authorityStart)];
    unsigned hostNameStart = userInfoTerminator.location == NSNotFound ? authorityStart : NSMaxRange(userInfoTerminator);

    f(string, NSMakeRange(hostNameStart, hostNameEnd - hostNameStart), context);
}

// Returns string with every host name that needs it encoded (encode == YES) or decoded.
// Returns string itself when nothing needs mapping.
static NSString *mapHostNames(NSString *string, BOOL encode)
{
    // Generally, we want to optimize for the case where there is one host name that does not need mapping.

    if (encode && [string canBeConvertedToEncoding:NSASCIIStringEncoding])
        return string;

    // Make a list of ranges that actually need mapping.
    NSMutableArray *hostNameRanges = nil;
    StringRangeApplierFunction f = encode ? collectRangesThatNeedEncoding : collectRangesThatNeedDecoding;
    applyHostNameFunctionToURLString(string, f, &hostNameRanges);
    if (!hostNameRanges)
        return string;

    // Do the mapping.
    // Replace from the last range to the first so earlier ranges stay valid.
    NSMutableString *mutableCopy = [string mutableCopy];
    unsigned i = [hostNameRanges count];
    while (i--) {
        NSRange hostNameRange = [[hostNameRanges objectAtIndex:i] rangeValue];
        NSString *mappedHostName = encode ? encodeHostNameWithRange(string, hostNameRange) : decodeHostNameWithRange(string, hostNameRange);
        // A nil replacement string would raise an exception; leave the range untouched instead.
        if (mappedHostName)
            [mutableCopy replaceCharactersInRange:hostNameRange withString:mappedHostName];
    }
    [hostNameRanges release];
    return [mutableCopy autorelease];
}

static BOOL isHexDigit(char c)
{
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
}

// Returns the uppercase hexadecimal digit for a value in 0...15, or '0' for anything else.
static char hexDigit(int i)
{
    if (i < 0 || i > 15)
        return '0';

    return static_cast<char>(i >= 10 ? i - 10 + 'A' : i + '0');
}

// Returns the numeric value of a hexadecimal digit; logs an error and returns 0 for other characters.
static int hexDigitValue(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';

    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;

    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;

    LOG_ERROR("illegal hex digit");
    return 0;
}

// Returns an autoreleased copy of string with leading and trailing whitespace removed.
static NSString *stringByTrimmingWhitespace(NSString *string)
{
    NSMutableString *trimmed = [[string mutableCopy] autorelease];
    CFStringTrimWhitespace((CFMutableStringRef)trimmed);
    return trimmed;
}

// Returns a URL built from the bytes of URL that precede the given component, minus the one
// character directly before it. Returns URL unchanged if the component is absent, starts at
// offset 0, or the truncated bytes do not form a URL.
NSURL *URLByTruncatingOneCharacterBeforeComponent(NSURL *URL, CFURLComponentType component)
{
    if (!URL)
        return nil;

    CFRange fragRg = CFURLGetByteRangeForComponent((CFURLRef)URL, component, NULL);
    if (fragRg.location == kCFNotFound)
        return URL;

    // There is no character before a component at offset 0; avoid a negative byte count below.
    if (fragRg.location < 1)
        return URL;

    UInt8 *urlBytes, buffer[2048];
    CFIndex numBytes = CFURLGetBytes((CFURLRef)URL, buffer, 2048);
    if (numBytes == -1) {
        // The stack buffer was too small; ask for the required size and use the heap.
        numBytes = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        urlBytes = static_cast<UInt8*>(malloc(numBytes));
        CFURLGetBytes((CFURLRef)URL, urlBytes, numBytes);
    } else
        urlBytes = buffer;

    NSURL *result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingUTF8, NULL));
    if (!result)
        result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingISOLatin1, NULL));

    if (urlBytes != buffer)
        free(urlBytes);
    return result ? [result autorelease] : URL;
}

static NSURL *URLByRemovingResourceSpecifier(NSURL *URL)
{
    return URLByTruncatingOneCharacterBeforeComponent(URL, kCFURLComponentResourceSpecifier);
}

// Creates a URL from raw bytes, resolved against baseURL. Tries UTF-8 first, then ISO Latin 1.
// Returns nil for nil data, and an empty URL for empty data.
NSURL *URLWithData(NSData *data, NSURL *baseURL)
{
    if (!data)
        return nil;

    NSURL *result = nil;
    size_t length = [data length];
    if (length > 0) {
        // work around <rdar://4470771>: CFURLCreateAbsoluteURLWithBytes(.., TRUE) doesn't remove non-path components.
        baseURL = URLByRemovingResourceSpecifier(baseURL);

        const UInt8 *bytes = static_cast<const UInt8*>([data bytes]);

        // CFURLCreateAbsoluteURLWithBytes would complain to console if we passed a path to it.
        if (bytes[0] == '/' && !baseURL)
            return nil;

        // NOTE: We use UTF-8 here since this encoding is used when computing strings when returning URL components
        // (e.g calls to NSURL -path). However, this function is not tolerant of illegal UTF-8 sequences, which
        // could either be a malformed string or bytes in a different encoding, like shift-jis, so we fall back
        // onto using ISO Latin 1 in those cases.
        result = CFBridgingRelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingUTF8, (CFURLRef)baseURL, YES));
        if (!result)
            result = CFBridgingRelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingISOLatin1, (CFURLRef)baseURL, YES));
    } else
        result = [NSURL URLWithString:@""];

    return result;
}

// Builds a URL from text typed by the user: trims whitespace, IDN-encodes host names and
// %-escapes every byte <= 0x20 or >= 0x7f of the UTF-8 representation.
NSURL *URLWithUserTypedString(NSString *string, NSURL *URL)
{
    if (!string)
        return nil;

    string = mapHostNames(stringByTrimmingWhitespace(string), YES);

    NSData *userTypedData = [string dataUsingEncoding:NSUTF8StringEncoding];
    ASSERT(userTypedData);

    const UInt8* inBytes = static_cast<const UInt8 *>([userTypedData bytes]);
    int inLength = [userTypedData length];
    if (!inLength)
        return [NSURL URLWithString:@""];

    char* outBytes = static_cast<char *>(malloc(inLength * 3)); // large enough to %-escape every character
    char* p = outBytes;
    int outLength = 0;
    for (int i = 0; i < inLength; i++) {
        UInt8 c = inBytes[i];
        if (c <= 0x20 || c >= 0x7f) {
            *p++ = '%';
            *p++ = hexDigit(c >> 4);
            *p++ = hexDigit(c & 0xf);
            outLength += 3;
        } else {
            *p++ = c;
            outLength++;
        }
    }

    NSData *data = [NSData dataWithBytesNoCopy:outBytes length:outLength]; // adopts outBytes
    return URLWithData(data, URL);
}

// Returns YES if the query component of URL, measured including its separators, is a single byte.
static BOOL hasQuestionMarkOnlyQueryString(NSURL *URL)
{
    CFRange rangeWithSeparators;
    CFURLGetByteRangeForComponent((CFURLRef)URL, kCFURLComponentQuery, &rangeWithSeparators);
    if (rangeWithSeparators.location != kCFNotFound && rangeWithSeparators.length == 1)
        return YES;

    return NO;
}

#define completeURL (CFURLComponentType)-1

// Returns the bytes of one component of URL (or of the whole URL for completeURL) with every
// byte <= 0x20 or >= 0x7f %-escaped. Returns nil if the component is not present.
NSData *dataForURLComponentType(NSURL *URL, CFURLComponentType componentType)
{
    // Compile-time constant so the stack buffer below is a fixed-size array.
    static const int URLComponentTypeBufferLength = 2048;

    UInt8 staticAllBytesBuffer[URLComponentTypeBufferLength];
    UInt8 *allBytesBuffer = staticAllBytesBuffer;

    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)URL, allBytesBuffer, URLComponentTypeBufferLength);
    if (bytesFilled == -1) {
        // The stack buffer was too small; ask for the required size and use the heap.
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        allBytesBuffer = static_cast<UInt8 *>(malloc(bytesToAllocate));
        bytesFilled = CFURLGetBytes((CFURLRef)URL, allBytesBuffer, bytesToAllocate);
    }

    CFRange range;
    if (componentType != completeURL) {
        range = CFURLGetByteRangeForComponent((CFURLRef)URL, componentType, NULL);
        if (range.location == kCFNotFound) {
            // Release the heap buffer (if any) before bailing out.
            if (staticAllBytesBuffer != allBytesBuffer)
                free(allBytesBuffer);
            return nil;
        }
    } else {
        range.location = 0;
        range.length = bytesFilled;
    }

    const unsigned char *bytes = allBytesBuffer + range.location;
    NSMutableData *resultData = [NSMutableData data];
    // NOTE: add leading '?' to non-zero length query strings.
    // NOTE: retain question-mark only query strings.
    if (componentType == kCFURLComponentQuery) {
        if (range.length > 0 || hasQuestionMarkOnlyQueryString(URL))
            [resultData appendBytes:"?" length:1];
    }
    for (CFIndex i = 0; i < range.length; i++) {
        unsigned char c = bytes[i];
        if (c <= 0x20 || c >= 0x7f) {
            char escaped[3];
            escaped[0] = '%';
            escaped[1] = hexDigit(c >> 4);
            escaped[2] = hexDigit(c & 0xf);
            [resultData appendBytes:escaped length:3];
        } else {
            char b[1];
            b[0] = c;
            [resultData appendBytes:b length:1];
        }
    }

    if (staticAllBytesBuffer != allBytesBuffer)
        free(allBytesBuffer);

    return resultData;
}

// Returns URL with the given component and the one character following it removed.
// Returns URL unchanged if the component is absent or the remaining bytes do not form a URL.
static NSURL *URLByRemovingComponentAndSubsequentCharacter(NSURL *URL, CFURLComponentType component)
{
    CFRange range = CFURLGetByteRangeForComponent((CFURLRef)URL, component, 0);
    if (range.location == kCFNotFound)
        return URL;

    // Remove one subsequent character.
    range.length++;

    Vector<UInt8, 2048> buffer(2048);
    CFIndex numBytes = CFURLGetBytes((CFURLRef)URL, buffer.data(), 2048);
    if (numBytes == -1) {
        numBytes = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        buffer.grow(numBytes);
        CFURLGetBytes((CFURLRef)URL, buffer.data(), numBytes);
    }
    UInt8* urlBytes = buffer.data();

    if (numBytes < range.location)
        return URL;
    if (numBytes < range.location + range.length)
        range.length = numBytes - range.location;

    // Slide the tail (everything after the removed range) down over the removed range.
    // Only the bytes that follow the range may be moved; anything more runs past the URL bytes.
    memmove(urlBytes + range.location, urlBytes + range.location + range.length, numBytes - (range.location + range.length));

    NSURL *result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, numBytes - range.length, kCFStringEncodingUTF8, NULL));
    if (!result)
        result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, numBytes - range.length, kCFStringEncodingISOLatin1, NULL));

    return result ? [result autorelease] : URL;
}

NSURL *URLByRemovingUserInfo(NSURL *URL)
{
    return URLByRemovingComponentAndSubsequentCharacter(URL, kCFURLComponentUserInfo);
}

// Returns the URL of the canonical request that the NSURLProtocol class handling URL produces,
// or URL itself when no protocol class is found for it.
NSURL *URLByCanonicalizingURL(NSURL *URL)
{
    RetainPtr<NSURLRequest> request = adoptNS([[NSURLRequest alloc] initWithURL:URL]);
    Class concreteClass = [NSURLProtocol _protocolClassForRequest:request.get()];
    if (!concreteClass) {
        return URL;
    }

    // This applies NSURL's concept of canonicalization, but not URL's concept. It would
    // make sense to apply both, but when we tried that it caused a performance degradation
    // (see 5315926). It might make sense to apply only the URL concept and not the NSURL
    // concept, but it's too risky to make that change for WebKit 3.0.
    NSURLRequest *newRequest = [concreteClass canonicalRequestForRequest:request.get()];
    NSURL *newURL = [newRequest URL];
    return [[newURL retain] autorelease];
}

// Returns the bytes of URL as reported by CFURLGetBytes. If URL has a base URL, the bytes are
// first resolved against it via URLWithData and the result is processed again.
NSData *originalURLData(NSURL *URL)
{
    UInt8 *buffer = (UInt8 *)malloc(URL_BYTES_BUFFER_LENGTH);
    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)URL, buffer, URL_BYTES_BUFFER_LENGTH);
    if (bytesFilled == -1) {
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        buffer = (UInt8 *)realloc(buffer, bytesToAllocate);
        bytesFilled = CFURLGetBytes((CFURLRef)URL, buffer, bytesToAllocate);
        ASSERT(bytesFilled == bytesToAllocate);
    }

    // buffer is adopted by the NSData
    NSData *data = [NSData dataWithBytesNoCopy:buffer length:bytesFilled freeWhenDone:YES];

    NSURL *baseURL = (NSURL *)CFURLGetBaseURL((CFURLRef)URL);
    if (baseURL)
        return originalURLData(URLWithData(data, baseURL));
    return data;
}

// Returns a new string (caller owns it) in which every lookalike character of string is
// replaced by the %-escaped bytes of its UTF-8 encoding.
static CFStringRef createStringWithEscapedUnsafeCharacters(CFStringRef string)
{
    CFIndex length = CFStringGetLength(string);
    Vector<UChar, 2048> sourceBuffer(length);
    CFStringGetCharacters(string, CFRangeMake(0, length), sourceBuffer.data());

    Vector<UChar, 2048> outBuffer;

    CFIndex i = 0;
    while (i < length) {
        UChar32 c;
        U16_NEXT(sourceBuffer, i, length, c)

        if (isLookalikeCharacter(c)) {
            uint8_t utf8Buffer[4];
            CFIndex offset = 0;
            UBool failure = false;
            U8_APPEND(utf8Buffer, offset, 4, c, failure)
            ASSERT(!failure);

            for (CFIndex j = 0; j < offset; ++j) {
                outBuffer.append('%');
                outBuffer.append(hexDigit(utf8Buffer[j] >> 4));
                outBuffer.append(hexDigit(utf8Buffer[j] & 0xf));
            }
        } else {
            UChar utf16Buffer[2];
            CFIndex offset = 0;
            UBool failure = false;
            U16_APPEND(utf16Buffer, offset, 2, c, failure)
            ASSERT(!failure);
            for (CFIndex j = 0; j < offset; ++j)
                outBuffer.append(utf16Buffer[j]);
        }
    }

    return CFStringCreateWithCharacters(NULL, outBuffer.data(), outBuffer.size());
}

// Returns a string form of URL meant for display: escape sequences for bytes above 0x7f are
// unescaped when the result is valid UTF-8, "xn--" host names are decoded where allowed, and
// lookalike characters are %-escaped.
NSString *userVisibleString(NSURL *URL)
{
    NSData *data = originalURLData(URL);
    const unsigned char *before = static_cast<const unsigned char*>([data bytes]);
    int length = [data length];

    bool mayNeedHostNameDecoding = false;

    const unsigned char *p = before;
    int bufferLength = (length * 3) + 1;
    char *after = static_cast<char *>(malloc(bufferLength)); // large enough to %-escape every character
    char *q = after;
    for (int i = 0; i < length; i++) {
        unsigned char c = p[i];
        // unescape escape sequences that indicate bytes greater than 0x7f
        if (c == '%' && (i + 1 < length && isHexDigit(p[i + 1])) && i + 2 < length && isHexDigit(p[i + 2])) {
            unsigned char u = (hexDigitValue(p[i + 1]) << 4) | hexDigitValue(p[i + 2]);
            if (u > 0x7f) {
                // unescape
                *q++ = u;
            } else {
                // do not unescape
                *q++ = p[i];
                *q++ = p[i + 1];
                *q++ = p[i + 2];
            }
            i += 2;
        } else {
            *q++ = c;

            // Check for "xn--" in an efficient, non-case-sensitive, way.
            // The look-behind is on the output buffer, which can be shorter than the input
            // consumed so far (unescaping shrinks 3 bytes to 1), so bound it by bytes written.
            if (c == '-' && q - after >= 4 && !mayNeedHostNameDecoding && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-')
                mayNeedHostNameDecoding = true;
        }
    }
    *q = '\0';

    // Check string to see if it can be converted to display using UTF-8
    NSString *result = [NSString stringWithUTF8String:after];
    if (!result) {
        // Could not convert to UTF-8.
        // Convert characters greater than 0x7f to escape sequences.
        // Shift current string to the end of the buffer
        // then we will copy back bytes to the start of the buffer
        // as we convert.
        int afterlength = q - after;
        char *source = after + bufferLength - afterlength - 1;
        memmove(source, after, afterlength + 1); // copies trailing '\0'
        char *destination = after;
        while (*source) {
            unsigned char c = *source;
            if (c > 0x7f) {
                *destination++ = '%';
                *destination++ = hexDigit(c >> 4);
                *destination++ = hexDigit(c & 0xf);
            } else
                *destination++ = *source;
            source++;
        }
        *destination = '\0';
        result = [NSString stringWithUTF8String:after];
    }

    free(after);

    if (mayNeedHostNameDecoding)
        result = mapHostNames(result, NO);
    result = [result precomposedStringWithCanonicalMapping];
    return CFBridgingRelease(createStringWithEscapedUnsafeCharacters((CFStringRef)result));
}

// Returns NO if string contains a byte <= 0x20 or equal to 0x7f, a %-escape of a byte above
// 0x7f, or the sequence "xn--"; returns YES otherwise.
BOOL isUserVisibleURL(NSString *string)
{
    BOOL valid = YES;
    // get buffer

    char static_buffer[1024];
    const char *p;
    BOOL success = CFStringGetCString((CFStringRef)string, static_buffer, 1023, kCFStringEncodingUTF8);
    p = success ? static_buffer : [string UTF8String];

    int length = strlen(p);

    // check for characters <= 0x20 or >=0x7f, %-escape sequences of %7f, and xn--, these
    // are the things that will lead _web_userVisibleString to actually change things.
    for (int i = 0; i < length; i++) {
        unsigned char c = p[i];
        // escape control characters, space, and delete
        if (c <= 0x20 || c == 0x7f) {
            valid = NO;
            break;
        } else if (c == '%' && (i + 1 < length && isHexDigit(p[i + 1])) && i + 2 < length && isHexDigit(p[i + 2])) {
            unsigned char u = (hexDigitValue(p[i + 1]) << 4) | hexDigitValue(p[i + 2]);
            if (u > 0x7f) {
                valid = NO;
                break;
            }
            i += 2;
        } else {
            // Check for "xn--" in an efficient, non-case-sensitive, way.
            if (c == '-' && i >= 3 && (p[i - 3] | 0x20) == 'x' && (p[i - 2] | 0x20) == 'n' && p[i - 1] == '-') {
                valid = NO;
                break;
            }
        }
    }

    return valid;
}

// Returns the range of the scheme (everything before the first ':') when it is non-empty and
// made only of scheme characters; otherwise returns {NSNotFound, 0}.
NSRange rangeOfURLScheme(NSString *string)
{
    NSRange colon = [string rangeOfString:@":"];
    if (colon.location != NSNotFound && colon.location > 0) {
        NSRange scheme = {0, colon.location};
        static NSCharacterSet *InverseSchemeCharacterSet = nil;
        if (!InverseSchemeCharacterSet) {
            /*
             This stuff is very expensive. 10-15 msec on a 2x1.2GHz. If not cached it swamps
             everything else when adding items to the autocomplete DB. Makes me wonder if we
             even need to enforce the character set here.
            */
            NSString *acceptableCharacters = @"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
            InverseSchemeCharacterSet = [[[NSCharacterSet characterSetWithCharactersInString:acceptableCharacters] invertedSet] retain];
        }
        NSRange illegals = [string rangeOfCharacterFromSet:InverseSchemeCharacterSet options:0 range:scheme];
        if (illegals.location == NSNotFound)
            return scheme;
    }
    return NSMakeRange(NSNotFound, 0);
}

BOOL looksLikeAbsoluteURL(NSString *string)
{
    // Trim whitespace because _web_URLWithString allows whitespace.
    return rangeOfURLScheme(stringByTrimmingWhitespace(string)).location != NSNotFound;
}

} // namespace WebCore