1/*
2 * Copyright (C) 2005, 2007, 2014 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1.  Redistributions of source code must retain the above copyright
9 *     notice, this list of conditions and the following disclaimer.
10 * 2.  Redistributions in binary form must reproduce the above copyright
11 *     notice, this list of conditions and the following disclaimer in the
12 *     documentation and/or other materials provided with the distribution.
13 * 3.  Neither the name of Apple Inc. ("Apple") nor the names of
14 *     its contributors may be used to endorse or promote products derived
15 *     from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#import "config.h"
30#import "WebCoreObjCExtras.h"
31#import "WebCoreNSStringExtras.h"
32#import "WebCoreNSURLExtras.h"
33#import "WebCoreSystemInterface.h"
34#import <wtf/ObjcRuntimeExtras.h>
35#import <wtf/RetainPtr.h>
36#import <wtf/Vector.h>
37#import <unicode/uchar.h>
38#import <unicode/uidna.h>
39#import <unicode/uscript.h>
40
41// Needs to be big enough to hold an IDN-encoded name.
42// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
43#define HOST_NAME_BUFFER_LENGTH 2048
44#define URL_BYTES_BUFFER_LENGTH 2048
45
46typedef void (* StringRangeApplierFunction)(NSString *string, NSRange range, void *context);
47
48static pthread_once_t IDNScriptWhiteListFileRead = PTHREAD_ONCE_INIT;
49static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32];
50
51
52@interface NSURLProtocol (WKNSURLProtocolInternal)
53+ (Class)_protocolClassForRequest:(NSURLRequest *)request;
54@end
55
56namespace WebCore {
57
// Returns YES when charCode should be treated as an unsafe "lookalike" in a host name.
//
// A character is rejected when it is non-printable, counts as whitespace, is a
// default-ignorable code point, or appears in the explicit list below (which
// includes the lock- and key-related emoji).
static BOOL isLookalikeCharacter(UChar32 charCode)
{
    // The explicit list was assembled with Mozilla's blacklist in mind
    // <http://kb.mozillazine.org/Network.IDN.blacklist_chars>.

    // Some of the characters here will never appear once ICU has encoded.
    // For example, ICU transforms most spaces into an ASCII space and most
    // slashes into an ASCII solidus. But one of the two callers uses this
    // on characters that have not been processed by ICU, so they are needed here.

    BOOL rejectedByProperty = !u_isprint(charCode)
        || u_isUWhiteSpace(charCode)
        || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT);
    if (rejectedByProperty)
        return YES;

    static const UChar32 lookalikeCodePoints[] = {
        0x00BC, // VULGAR FRACTION ONE QUARTER
        0x00BD, // VULGAR FRACTION ONE HALF
        0x00BE, // VULGAR FRACTION THREE QUARTERS
        0x00ED, // LATIN SMALL LETTER I WITH ACUTE
        0x01C3, // LATIN LETTER RETROFLEX CLICK
        0x0251, // LATIN SMALL LETTER ALPHA
        0x0261, // LATIN SMALL LETTER SCRIPT G
        0x02D0, // MODIFIER LETTER TRIANGULAR COLON
        0x0335, // COMBINING SHORT STROKE OVERLAY
        0x0337, // COMBINING SHORT SOLIDUS OVERLAY
        0x0338, // COMBINING LONG SOLIDUS OVERLAY
        0x0589, // ARMENIAN FULL STOP
        0x05B4, // HEBREW POINT HIRIQ
        0x05BC, // HEBREW POINT DAGESH OR MAPIQ
        0x05C3, // HEBREW PUNCTUATION SOF PASUQ
        0x05F4, // HEBREW PUNCTUATION GERSHAYIM
        0x0609, // ARABIC-INDIC PER MILLE SIGN
        0x060A, // ARABIC-INDIC PER TEN THOUSAND SIGN
        0x0660, // ARABIC INDIC DIGIT ZERO
        0x066A, // ARABIC PERCENT SIGN
        0x06D4, // ARABIC FULL STOP
        0x06F0, // EXTENDED ARABIC INDIC DIGIT ZERO
        0x0701, // SYRIAC SUPRALINEAR FULL STOP
        0x0702, // SYRIAC SUBLINEAR FULL STOP
        0x0703, // SYRIAC SUPRALINEAR COLON
        0x0704, // SYRIAC SUBLINEAR COLON
        0x1735, // PHILIPPINE SINGLE PUNCTUATION
        0x2024, // ONE DOT LEADER
        0x2027, // HYPHENATION POINT
        0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        0x2041, // CARET INSERTION POINT
        0x2044, // FRACTION SLASH
        0x2052, // COMMERCIAL MINUS SIGN
        0x2153, // VULGAR FRACTION ONE THIRD
        0x2154, // VULGAR FRACTION TWO THIRDS
        0x2155, // VULGAR FRACTION ONE FIFTH
        0x2156, // VULGAR FRACTION TWO FIFTHS
        0x2157, // VULGAR FRACTION THREE FIFTHS
        0x2158, // VULGAR FRACTION FOUR FIFTHS
        0x2159, // VULGAR FRACTION ONE SIXTH
        0x215A, // VULGAR FRACTION FIVE SIXTHS
        0x215B, // VULGAR FRACTION ONE EIGHTH
        0x215C, // VULGAR FRACTION THREE EIGHTHS
        0x215D, // VULGAR FRACTION FIVE EIGHTHS
        0x215E, // VULGAR FRACTION SEVEN EIGHTHS
        0x215F, // FRACTION NUMERATOR ONE
        0x2215, // DIVISION SLASH
        0x2216, // SET MINUS
        0x2236, // RATIO
        0x233F, // APL FUNCTIONAL SYMBOL SLASH BAR
        0x23AE, // INTEGRAL EXTENSION
        0x244A, // OCR DOUBLE BACKSLASH
        0x2571, // BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT
        0x2572, // BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT
        0x29F6, // SOLIDUS WITH OVERBAR
        0x29F8, // BIG SOLIDUS
        0x2AFB, // TRIPLE SOLIDUS BINARY RELATION
        0x2AFD, // DOUBLE SOLIDUS OPERATOR
        0x2FF0, // IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT
        0x2FF1, // IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW
        0x2FF2, // IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT
        0x2FF3, // IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW
        0x2FF4, // IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND
        0x2FF5, // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE
        0x2FF6, // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW
        0x2FF7, // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT
        0x2FF8, // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT
        0x2FF9, // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT
        0x2FFA, // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT
        0x2FFB, // IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
        0x3002, // IDEOGRAPHIC FULL STOP
        0x3008, // LEFT ANGLE BRACKET
        0x3014, // LEFT TORTOISE SHELL BRACKET
        0x3015, // RIGHT TORTOISE SHELL BRACKET
        0x3033, // VERTICAL KANA REPEAT MARK UPPER HALF
        0x3035, // VERTICAL KANA REPEAT MARK LOWER HALF
        0x321D, // PARENTHESIZED KOREAN CHARACTER OJEON
        0x321E, // PARENTHESIZED KOREAN CHARACTER O HU
        0x33AE, // SQUARE RAD OVER S
        0x33AF, // SQUARE RAD OVER S SQUARED
        0x33C6, // SQUARE C OVER KG
        0x33DF, // SQUARE A OVER M
        0xA789, // MODIFIER LETTER COLON
        0xFE14, // PRESENTATION FORM FOR VERTICAL SEMICOLON
        0xFE15, // PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK
        0xFE3F, // PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
        0xFE5D, // SMALL LEFT TORTOISE SHELL BRACKET
        0xFE5E, // SMALL RIGHT TORTOISE SHELL BRACKET
        0xFF0E, // FULLWIDTH FULL STOP
        0xFF0F, // FULL WIDTH SOLIDUS
        0xFF61, // HALFWIDTH IDEOGRAPHIC FULL STOP
        0xFFFC, // OBJECT REPLACEMENT CHARACTER
        0xFFFD, // REPLACEMENT CHARACTER
        0x1F50F, // LOCK WITH INK PEN
        0x1F510, // CLOSED LOCK WITH KEY
        0x1F511, // KEY
        0x1F512, // LOCK
        0x1F513, // OPEN LOCK
    };

    const size_t codePointCount = sizeof(lookalikeCodePoints) / sizeof(lookalikeCodePoints[0]);
    for (size_t position = 0; position < codePointCount; ++position) {
        if (lookalikeCodePoints[position] == charCode)
            return YES;
    }
    return NO;
}
179
// Reads the script white list stored at |filename| and sets the bit in
// IDNScriptWhiteList for every script name that ICU recognizes.
//
// The file is a whitespace-separated list of script names; '#' starts a comment
// that runs to the end of the line. Names ICU does not recognize are ignored.
//
// Returns NO when |filename| is nil or the file cannot be opened, YES otherwise
// (even if no script name was found).
static BOOL readIDNScriptWhiteListFile(NSString *filename)
{
    if (!filename)
        return NO;

    FILE *file = fopen([filename fileSystemRepresentation], "r");
    if (!file)
        return NO;

    // Read a word at a time.
    // Allow comments, starting with # character to the end of the line.
    while (1) {
        // Skip a comment if present.
        if (fscanf(file, " #%*[^\n\r]%*[\n\r]") == EOF)
            break;

        // Read a script name if present. At most 32 characters are stored;
        // the remainder of an over-long word is consumed and discarded.
        char word[33];
        int result = fscanf(file, " %32[^# \t\n\r]%*[^# \t\n\r] ", word);
        if (result == EOF)
            break;

        if (result == 1) {
            // Got a word, map to script code and put it into the array.
            int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
            if (script >= 0 && script < USCRIPT_CODE_LIMIT) {
                size_t index = script / 32;
                // The literal must be unsigned: script % 32 can be 31, and shifting
                // a signed 1 into the sign bit is not a well-defined operation.
                uint32_t mask = 1U << (script % 32);
                IDNScriptWhiteList[index] |= mask;
            }
        }
    }
    fclose(file);
    return YES;
}
215
// Populates IDNScriptWhiteList from the first readable IDNScriptWhiteList.txt.
//
// Every Library directory (all domains) is tried in order; if none of them
// provides a readable file, the copy inside the com.apple.WebCore bundle is
// used. Failing to read that one as well is treated as fatal.
static void readIDNScriptWhiteList(void)
{
    NSArray *libraryDirectories = NSSearchPathForDirectoriesInDomains(NSLibraryDirectory, NSAllDomainsMask, YES);
    for (NSString *directory in libraryDirectories) {
        NSString *candidatePath = [directory stringByAppendingPathComponent:@"IDNScriptWhiteList.txt"];
        if (readIDNScriptWhiteListFile(candidatePath))
            return;
    }

    // No list in any Library directory; fall back on the one shipped in the bundle.
    NSBundle *webCoreBundle = [NSBundle bundleWithIdentifier:@"com.apple.WebCore"];
    NSString *bundledPath = [webCoreBundle pathForResource:@"IDNScriptWhiteList" ofType:@"txt"];
    if (readIDNScriptWhiteListFile(bundledPath))
        return;

    CRASH();
}
232
// Returns YES only if every code point in the UTF-16 |buffer| of |length| code
// units belongs to a script whose bit is set in IDNScriptWhiteList and is not a
// lookalike character. The white list is read once, on first use.
//
// Returns NO as soon as ICU reports an error or an out-of-range script code.
static BOOL allCharactersInIDNScriptWhiteList(const UChar *buffer, int32_t length)
{
    pthread_once(&IDNScriptWhiteListFileRead, readIDNScriptWhiteList);

    int32_t i = 0;
    while (i < length) {
        // U16_NEXT decodes one code point (combining surrogate pairs) and advances i.
        UChar32 c;
        U16_NEXT(buffer, i, length, c);
        UErrorCode error = U_ZERO_ERROR;
        UScriptCode script = uscript_getScript(c, &error);
        if (error != U_ZERO_ERROR) {
            LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
            return NO;
        }
        if (script < 0) {
            LOG_ERROR("got negative number for script code from ICU: %d", script);
            return NO;
        }
        if (script >= USCRIPT_CODE_LIMIT)
            return NO;

        size_t index = script / 32;
        // The literal must be unsigned: script % 32 can be 31, and shifting
        // a signed 1 into the sign bit is not a well-defined operation.
        uint32_t mask = 1U << (script % 32);
        if (!(IDNScriptWhiteList[index] & mask))
            return NO;

        if (isLookalikeCharacter(c))
            return NO;
    }
    return YES;
}
264
// Returns YES if the host name in |buffer| (|length| UTF-16 code units, with an
// optional trailing root dot) ends in a top level domain with known presentation
// rules and the label directly in front of that domain obeys them.
//
// The only top level domain handled here is the Cyrillic ".рф" (U+0440 U+0444).
// Every other host name, including an empty one, yields NO.
static BOOL allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length)
{
    // Nothing to examine; also keeps the trailing-dot check below in bounds.
    if (length <= 0)
        return NO;

    // Skip trailing dot for root domain.
    if (buffer[length - 1] == '.')
        length--;

    if (length > 3 && buffer[length - 3] == '.'
        && buffer[length - 2] == 0x0440 // CYRILLIC SMALL LETTER ER
        && buffer[length - 1] == 0x0444) // CYRILLIC SMALL LETTER EF
    {
        // Rules defined by <http://www.cctld.ru/ru/docs/rulesrf.php>. This code only checks requirements that matter for presentation purposes.
        // Walk backwards from the character in front of ".рф" down to and
        // including index 0, so the first character of the name is checked too.
        for (int32_t i = length - 4; i >= 0; --i) {
            UChar ch = buffer[i];

            // Only modern Russian letters, digits and dashes are allowed.
            if ((ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || (ch >= '0' && ch <= '9') || ch == '-')
                continue;

            // Only check top level domain. Lower level registrars may have different rules.
            if (ch == '.')
                break;

            return NO;
        }
        return YES;
    }

    // Not a known top level domain with special rules.
    return NO;
}
295
// Converts the host name found in |range| of |string| to its ASCII form
// (encode == YES) or to its Unicode form (encode == NO) using ICU's IDNA functions.
//
// Return value of nil means no mapping is necessary, or that none is performed:
// the range is longer than HOST_NAME_BUFFER_LENGTH, the string is empty, ICU did
// not report U_ZERO_ERROR, the converted text equals the input, or (when
// decoding) the result passes neither the script white list nor the TLD rules.
// If makeString is NO, a non-nil return value only indicates that mapping is
// necessary; it is the string that was examined, not the mapped name.
// If makeString is YES, a non-nil return value is the mapped string.
static NSString *mapHostNameWithRange(NSString *string, NSRange range, BOOL encode, BOOL makeString)
{
    if (range.length > HOST_NAME_BUFFER_LENGTH || ![string length])
        return nil;

    UChar input[HOST_NAME_BUFFER_LENGTH];
    UChar output[HOST_NAME_BUFFER_LENGTH];

    if (encode) {
        // Percent escapes are resolved before encoding; from here on the
        // unescaped host name replaces the original string and range.
        BOOL containsPercent = [string rangeOfString:@"%" options:NSLiteralSearch range:range].location != NSNotFound;
        if (containsPercent) {
            NSString *hostSubstring = [string substringWithRange:range];
            NSString *unescaped = CFBridgingRelease(CFURLCreateStringByReplacingPercentEscapes(NULL, (CFStringRef)hostSubstring, CFSTR("")));
            if (unescaped) {
                string = unescaped;
                range = NSMakeRange(0, [unescaped length]);
            }
        }
    }

    int inputLength = range.length;
    [string getCharacters:input range:range];

    UErrorCode status = U_ZERO_ERROR;
    int32_t outputLength;
    if (encode)
        outputLength = uidna_IDNToASCII(input, inputLength, output, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &status);
    else
        outputLength = uidna_IDNToUnicode(input, inputLength, output, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &status);
    if (status != U_ZERO_ERROR)
        return nil;

    // The conversion left the text untouched, so there is nothing to map.
    BOOL isUnchanged = outputLength == inputLength && !memcmp(input, output, inputLength * sizeof(UChar));
    if (isUnchanged)
        return nil;

    if (!encode) {
        // A decoded name is only used when the script white list or the
        // per-TLD rules accept all of its characters.
        BOOL isAllowed = allCharactersInIDNScriptWhiteList(output, outputLength) || allCharactersAllowedByTLDRules(output, outputLength);
        if (!isAllowed)
            return nil;
    }

    if (!makeString)
        return string;
    return [NSString stringWithCharacters:output length:outputLength];
}
335
// Returns YES when mapHostNameWithRange() reports that the host name in |range|
// of |string| needs to be mapped to its Unicode form.
BOOL hostNameNeedsDecodingWithRange(NSString *string, NSRange range)
{
    NSString *mappingIndicator = mapHostNameWithRange(string, range, NO, NO);
    return mappingIndicator ? YES : NO;
}
340
// Returns YES when mapHostNameWithRange() reports that the host name in |range|
// of |string| needs to be mapped to its ASCII form.
BOOL hostNameNeedsEncodingWithRange(NSString *string, NSRange range)
{
    NSString *mappingIndicator = mapHostNameWithRange(string, range, YES, NO);
    return mappingIndicator ? YES : NO;
}
345
// Returns the Unicode form of the host name in |range| of |string|, or nil when
// mapHostNameWithRange() performs no mapping.
NSString *decodeHostNameWithRange(NSString *string, NSRange range)
{
    NSString *decodedHostName = mapHostNameWithRange(string, range, NO, YES);
    return decodedHostName;
}
350
// Returns the ASCII form of the host name in |range| of |string|, or nil when
// mapHostNameWithRange() performs no mapping.
NSString *encodeHostNameWithRange(NSString *string, NSRange range)
{
    NSString *encodedHostName = mapHostNameWithRange(string, range, YES, YES);
    return encodedHostName;
}
355
// Treats all of |string| as a host name and returns its Unicode form, or
// |string| itself when no mapping is performed.
NSString *decodeHostName(NSString *string)
{
    NSRange entireString = NSMakeRange(0, [string length]);
    NSString *decoded = mapHostNameWithRange(string, entireString, NO, YES);
    if (decoded)
        return decoded;
    return string;
}
361
// Treats all of |string| as a host name and returns its ASCII form, or
// |string| itself when no mapping is performed.
NSString *encodeHostName(NSString *string)
{
    NSRange entireString = NSMakeRange(0, [string length]);
    NSString *encoded = mapHostNameWithRange(string, entireString, YES, YES);
    if (encoded)
        return encoded;
    return string;
}
367
// Appends |range| (boxed in an NSValue) to the array behind |context| when the
// host name in that range needs encoding (encode == YES) or decoding (encode == NO).
//
// |context| must point to an NSMutableArray * variable. The array is created on
// first use, so the variable is left untouched when nothing needs mapping. The
// array comes from alloc/init and is not released here.
static void collectRangesThatNeedMapping(NSString *string, NSRange range, void *context, BOOL encode)
{
    BOOL needsMapping;
    if (encode)
        needsMapping = hostNameNeedsEncodingWithRange(string, range);
    else
        needsMapping = hostNameNeedsDecodingWithRange(string, range);
    if (!needsMapping)
        return;

    NSMutableArray **collectedRangesSlot = (NSMutableArray **)context;
    NSMutableArray *collectedRanges = *collectedRangesSlot;
    if (!collectedRanges) {
        collectedRanges = [[NSMutableArray alloc] init];
        *collectedRangesSlot = collectedRanges;
    }

    [collectedRanges addObject:[NSValue valueWithRange:range]];
}
380
// StringRangeApplierFunction adapter: records |range| if its host name needs encoding.
static void collectRangesThatNeedEncoding(NSString *string, NSRange range, void *context)
{
    const BOOL encode = YES;
    collectRangesThatNeedMapping(string, range, context, encode);
}
385
// StringRangeApplierFunction adapter: records |range| if its host name needs decoding.
static void collectRangesThatNeedDecoding(NSString *string, NSRange range, void *context)
{
    const BOOL encode = NO;
    collectRangesThatNeedMapping(string, range, context, encode);
}
390
// Takes a Core Foundation retain on |charset| and hands the same object back,
// so the call can be used inline when initializing a static variable.
static inline NSCharacterSet *retain(NSCharacterSet *charset)
{
    CFTypeRef retainedSet = CFRetain(charset);
    return (NSCharacterSet *)retainedSet;
}
396
// Calls |f| once for every host name range found in the mailto: URL |string|.
//
// In a mailto: URL, a host name starts right after a '@' and runs up to the next
// '>', ',' or '?' (or to the end of the string). Double-quoted strings are
// skipped, honoring backslash escapes, so that characters inside them are not
// mistaken for delimiters. Scanning stops at the first unquoted '?', because
// what follows no longer contains host names.
static void applyHostNameFunctionToMailToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    static NSCharacterSet *scanStopCharacters = retain([NSCharacterSet characterSetWithCharactersInString:@"\"@?"]);
    static NSCharacterSet *hostNameTerminatorCharacters = retain([NSCharacterSet characterSetWithCharactersInString:@">,?"]);
    static NSCharacterSet *quoteOrEscapeCharacters = retain([NSCharacterSet characterSetWithCharactersInString:@"\"\\"]);

    unsigned stringLength = [string length];

    // Index of the first character that has not been consumed yet.
    NSUInteger cursor = 0;

    for (;;) {
        // Find start of host name or of quoted string.
        NSRange stop = [string rangeOfCharacterFromSet:scanStopCharacters options:0 range:NSMakeRange(cursor, stringLength - cursor)];
        if (stop.location == NSNotFound)
            return;

        unichar marker = [string characterAtIndex:stop.location];
        cursor = NSMaxRange(stop);

        if (marker == '?')
            return;

        if (marker == '@') {
            unsigned hostNameStart = cursor;
            NSRange terminator = [string rangeOfCharacterFromSet:hostNameTerminatorCharacters options:0 range:NSMakeRange(cursor, stringLength - cursor)];

            // Without a terminator the host name runs to the end of the string
            // and there is nothing left to scan afterwards.
            if (terminator.location == NSNotFound) {
                f(string, NSMakeRange(hostNameStart, stringLength - hostNameStart), context);
                return;
            }

            // Resume at the terminator itself rather than after it, so that a
            // '?' terminator is seen by the next pass and ends the scan.
            cursor = terminator.location;
            f(string, NSMakeRange(hostNameStart, terminator.location - hostNameStart), context);
            continue;
        }

        // Skip quoted string.
        ASSERT(marker == '"');
        for (;;) {
            NSRange special = [string rangeOfCharacterFromSet:quoteOrEscapeCharacters options:0 range:NSMakeRange(cursor, stringLength - cursor)];
            if (special.location == NSNotFound)
                return;

            marker = [string characterAtIndex:special.location];
            cursor = NSMaxRange(special);

            // Closing quote: go back to looking for host names.
            if (marker == '"')
                break;

            // A backslash escapes the next character; skip it, unless the
            // backslash was the last character of the string.
            ASSERT(marker == '\\');
            if (cursor == stringLength)
                return;

            cursor += 1;
        }
    }
}
469
// Locates the host name(s) in the URL |string| and calls |f| with each range.
//
// mailto: URLs are handed to applyHostNameFunctionToMailToURLString(). For any
// other URL the host name is searched for after a "://" that is preceded only by
// scheme characters; it ends at the first ':', '/', '?' or '#' (or at the end of
// the string), and starts after a '@' if one occurs before that end. If there is
// no "://" or the text before it is not a valid scheme, |f| is not called.
//
// This is deliberately plain string scanning: it has to run before any
// %-escaping is done.
//
// NOTE(review): ':' is one of the terminators and the '@' search is limited to
// the text before the terminator, so for "scheme://user:password@host" the range
// passed to |f| covers "user" rather than "host" -- confirm whether that matters
// to callers.
static void applyHostNameFunctionToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    if (hasCaseInsensitivePrefix(string, @"mailto:")) {
        applyHostNameFunctionToMailToURLString(string, f, context);
        return;
    }

    NSRange schemeSeparator = [string rangeOfString:@"://"];
    if (schemeSeparator.location == NSNotFound)
        return;

    // Everything in front of the "://" has to be a scheme character.
    static NSCharacterSet *nonSchemeCharacters = retain([[NSCharacterSet characterSetWithCharactersInString:@"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-."] invertedSet]);
    NSRange schemeRange = NSMakeRange(0, schemeSeparator.location);
    NSRange invalidSchemeCharacter = [string rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:schemeRange];
    if (invalidSchemeCharacter.location != NSNotFound)
        return;

    unsigned stringLength = [string length];

    static NSCharacterSet *hostTerminators = retain([NSCharacterSet characterSetWithCharactersInString:@":/?#"]);

    // The authority begins right after the separator.
    unsigned authorityStart = NSMaxRange(schemeSeparator);

    // The host name ends at the first terminator, or at the end of the string.
    NSRange afterSeparator = NSMakeRange(authorityStart, stringLength - authorityStart);
    NSRange terminator = [string rangeOfCharacterFromSet:hostTerminators options:0 range:afterSeparator];
    unsigned hostNameEnd;
    if (terminator.location == NSNotFound)
        hostNameEnd = stringLength;
    else
        hostNameEnd = terminator.location;

    // The host name starts after a "@" if there is one, otherwise at the authority start.
    NSRange atSign = [string rangeOfString:@"@" options:0 range:NSMakeRange(authorityStart, hostNameEnd - authorityStart)];
    unsigned hostNameStart;
    if (atSign.location == NSNotFound)
        hostNameStart = authorityStart;
    else
        hostNameStart = NSMaxRange(atSign);

    f(string, NSMakeRange(hostNameStart, hostNameEnd - hostNameStart), context);
}
513
// Returns |string| with every host name that needs it replaced by its ASCII form
// (encode == YES) or Unicode form (encode == NO). |string| itself is returned
// when nothing has to change.
static NSString *mapHostNames(NSString *string, BOOL encode)
{
    // Generally, we want to optimize for the case where there is one host name that does not need mapping.

    // A string that is representable in ASCII is returned unchanged when encoding.
    if (encode && [string canBeConvertedToEncoding:NSASCIIStringEncoding])
        return string;

    // Collect the ranges that actually need mapping; the array stays nil if there are none.
    NSMutableArray *rangesToMap = nil;
    if (encode)
        applyHostNameFunctionToURLString(string, collectRangesThatNeedEncoding, &rangesToMap);
    else
        applyHostNameFunctionToURLString(string, collectRangesThatNeedDecoding, &rangesToMap);
    if (!rangesToMap)
        return string;

    // Replace from the last range to the first, so that replacing one host
    // name does not shift the ranges that are still to be processed.
    NSMutableString *mappedString = [string mutableCopy];
    for (NSValue *boxedRange in [rangesToMap reverseObjectEnumerator]) {
        NSRange hostNameRange = [boxedRange rangeValue];
        NSString *replacement;
        if (encode)
            replacement = encodeHostNameWithRange(string, hostNameRange);
        else
            replacement = decodeHostNameWithRange(string, hostNameRange);
        [mappedString replaceCharactersInRange:hostNameRange withString:replacement];
    }
    [rangesToMap release];
    return [mappedString autorelease];
}
539
// Returns YES if |c| is an ASCII hexadecimal digit (0-9, A-F or a-f).
static BOOL isHexDigit(char c)
{
    if (c >= '0' && c <= '9')
        return YES;
    if (c >= 'A' && c <= 'F')
        return YES;
    return c >= 'a' && c <= 'f';
}
544
// Returns the uppercase hexadecimal digit character for |i|.
// Values outside 0...15 have no single-digit representation and yield '0'.
static char hexDigit(int i)
{
    // The upper bound is 15: 16 would otherwise map to 'G'.
    if (i < 0 || i > 15)
        return '0';

    return (i >= 10) ? i - 10 + 'A' : i + '0';
}
552
// Returns the numeric value (0...15) of the hexadecimal digit |c|, accepting
// both uppercase and lowercase letters. Any other character is logged as an
// error and yields 0.
static int hexDigitValue(char c)
{
    BOOL isDecimalDigit = c >= '0' && c <= '9';
    if (isDecimalDigit)
        return c - '0';

    BOOL isUppercaseLetter = c >= 'A' && c <= 'F';
    if (isUppercaseLetter)
        return 10 + (c - 'A');

    BOOL isLowercaseLetter = c >= 'a' && c <= 'f';
    if (isLowercaseLetter)
        return 10 + (c - 'a');

    LOG_ERROR("illegal hex digit");
    return 0;
}
567
// Returns an autoreleased mutable copy of |string| with leading and trailing
// whitespace removed by CFStringTrimWhitespace().
static NSString *stringByTrimmingWhitespace(NSString *string)
{
    NSMutableString *result = [string mutableCopy];
    CFStringTrimWhitespace((CFMutableStringRef)result);
    return [result autorelease];
}
574
// Returns a URL made from the bytes of |URL| that precede |component|, minus the
// one byte directly in front of the component (its separator).
//
// Returns nil for a nil |URL|. Returns |URL| unchanged when it does not have the
// component, when the component starts at the very first byte (so there is no
// preceding byte to drop), when a temporary buffer cannot be allocated, or when
// no URL can be created from the truncated bytes.
NSURL *URLByTruncatingOneCharacterBeforeComponent(NSURL *URL, CFURLComponentType component)
{
    if (!URL)
        return nil;

    CFRange componentRange = CFURLGetByteRangeForComponent((CFURLRef)URL, component, NULL);
    if (componentRange.location == kCFNotFound)
        return URL;

    // location - 1 is used as a byte count below and must not go negative.
    if (componentRange.location < 1)
        return URL;

    // Use the stack buffer when the URL fits, a heap buffer otherwise.
    UInt8 stackBuffer[2048];
    UInt8 *urlBytes = stackBuffer;
    CFIndex numBytes = CFURLGetBytes((CFURLRef)URL, stackBuffer, sizeof(stackBuffer));
    if (numBytes == -1) {
        // A NULL buffer makes CFURLGetBytes report the size that is needed.
        numBytes = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        urlBytes = static_cast<UInt8*>(malloc(numBytes));
        if (!urlBytes)
            return URL;
        CFURLGetBytes((CFURLRef)URL, urlBytes, numBytes);
    }

    // Try UTF-8 first and fall back on ISO Latin 1 if that does not produce a URL.
    CFIndex truncatedLength = componentRange.location - 1;
    NSURL *result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, truncatedLength, kCFStringEncodingUTF8, NULL));
    if (!result)
        result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, truncatedLength, kCFStringEncodingISOLatin1, NULL));

    if (urlBytes != stackBuffer)
        free(urlBytes);
    return result ? [result autorelease] : URL;
}
601
// Returns |URL| cut off one character before its resource specifier; see
// URLByTruncatingOneCharacterBeforeComponent() for the cases that return the
// URL unchanged.
static NSURL *URLByRemovingResourceSpecifier(NSURL *URL)
{
    const CFURLComponentType componentToDrop = kCFURLComponentResourceSpecifier;
    return URLByTruncatingOneCharacterBeforeComponent(URL, componentToDrop);
}
606
// Creates a URL from the raw bytes in |data|, resolved against |baseURL|.
//
// Returns nil when |data| is nil, or when the bytes start with '/' and there is
// no base URL to resolve them against. Empty data produces the URL for the empty
// string. The bytes are interpreted as UTF-8 first and as ISO Latin 1 if that
// does not yield a URL.
NSURL *URLWithData(NSData *data, NSURL *baseURL)
{
    if (!data)
        return nil;

    size_t byteCount = [data length];
    if (!byteCount)
        return [NSURL URLWithString:@""];

    // work around <rdar://4470771>: CFURLCreateAbsoluteURLWithBytes(.., TRUE) doesn't remove non-path components.
    baseURL = URLByRemovingResourceSpecifier(baseURL);

    const UInt8 *rawBytes = static_cast<const UInt8*>([data bytes]);

    // CFURLCreateAbsoluteURLWithBytes would complain to console if we passed a path to it.
    if (!baseURL && rawBytes[0] == '/')
        return nil;

    // NOTE: We use UTF-8 here since this encoding is used when computing strings when returning URL components
    // (e.g calls to NSURL -path). However, this function is not tolerant of illegal UTF-8 sequences, which
    // could either be a malformed string or bytes in a different encoding, like shift-jis, so we fall back
    // onto using ISO Latin 1 in those cases.
    NSURL *absoluteURL = CFBridgingRelease(CFURLCreateAbsoluteURLWithBytes(NULL, rawBytes, byteCount, kCFStringEncodingUTF8, (CFURLRef)baseURL, YES));
    if (absoluteURL)
        return absoluteURL;

    return CFBridgingRelease(CFURLCreateAbsoluteURLWithBytes(NULL, rawBytes, byteCount, kCFStringEncodingISOLatin1, (CFURLRef)baseURL, YES));
}
636
// Builds a URL from text typed by the user, resolved against |URL|.
//
// The text is trimmed of surrounding whitespace, its host names are mapped to
// their ASCII form, and every UTF-8 byte that is <= 0x20 or >= 0x7f is
// %-escaped before the bytes are passed to URLWithData().
// Returns nil for a nil |string| and the URL for the empty string when nothing
// is left after trimming.
NSURL *URLWithUserTypedString(NSString *string, NSURL *URL)
{
    if (!string)
        return nil;

    NSString *mappedString = mapHostNames(stringByTrimmingWhitespace(string), YES);

    NSData *utf8Data = [mappedString dataUsingEncoding:NSUTF8StringEncoding];
    ASSERT(utf8Data);

    const UInt8* sourceBytes = static_cast<const UInt8 *>([utf8Data bytes]);
    int sourceLength = [utf8Data length];
    if (!sourceLength)
        return [NSURL URLWithString:@""];

    // Three output bytes per input byte is enough to %-escape every one of them.
    char* escapedBytes = static_cast<char *>(malloc(sourceLength * 3));
    char* writeCursor = escapedBytes;
    for (int index = 0; index < sourceLength; ++index) {
        UInt8 byte = sourceBytes[index];
        BOOL needsEscaping = byte <= 0x20 || byte >= 0x7f;
        if (!needsEscaping) {
            *writeCursor++ = byte;
            continue;
        }
        *writeCursor++ = '%';
        *writeCursor++ = hexDigit(byte >> 4);
        *writeCursor++ = hexDigit(byte & 0xf);
    }
    int escapedLength = writeCursor - escapedBytes;

    // The NSData object takes ownership of escapedBytes.
    NSData *escapedData = [NSData dataWithBytesNoCopy:escapedBytes length:escapedLength];
    return URLWithData(escapedData, URL);
}
671
// Returns YES when the query component of |URL|, measured together with its
// separators, is exactly one byte long -- presumably just the '?' itself with
// no query text after it.
static BOOL hasQuestionMarkOnlyQueryString(NSURL *URL)
{
    CFRange queryIncludingSeparators;
    CFURLGetByteRangeForComponent((CFURLRef)URL, kCFURLComponentQuery, &queryIncludingSeparators);

    BOOL hasLocation = queryIncludingSeparators.location != kCFNotFound;
    return hasLocation && queryIncludingSeparators.length == 1;
}
681
// Pseudo component type that asks dataForURLComponentType() for the bytes of the whole URL.
#define completeURL (CFURLComponentType)-1

// Returns the bytes of |componentType| in |URL| (or of the entire URL when
// |componentType| is completeURL), with every byte that is <= 0x20 or >= 0x7f
// %-escaped.
//
// For the query component a leading '?' is added when the query is non-empty
// or when the URL has a question-mark-only query string.
// Returns nil when the URL does not have the requested component, or when a
// temporary buffer cannot be allocated.
NSData *dataForURLComponentType(NSURL *URL, CFURLComponentType componentType)
{
    const BOOL wantsCompleteURL = componentType == completeURL;

    // Look the component up before any buffer is allocated, so that the
    // "not found" exit has nothing to clean up.
    CFRange range = CFRangeMake(0, 0);
    if (!wantsCompleteURL) {
        range = CFURLGetByteRangeForComponent((CFURLRef)URL, componentType, NULL);
        if (range.location == kCFNotFound)
            return nil;
    }

    static const int URLComponentTypeBufferLength = 2048;

    // Use the stack buffer when the URL fits, a heap buffer otherwise.
    UInt8 staticAllBytesBuffer[URLComponentTypeBufferLength];
    UInt8 *allBytesBuffer = staticAllBytesBuffer;

    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)URL, allBytesBuffer, URLComponentTypeBufferLength);
    if (bytesFilled == -1) {
        // A NULL buffer makes CFURLGetBytes report the size that is needed.
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        allBytesBuffer = static_cast<UInt8 *>(malloc(bytesToAllocate));
        if (!allBytesBuffer)
            return nil;
        bytesFilled = CFURLGetBytes((CFURLRef)URL, allBytesBuffer, bytesToAllocate);
    }

    if (wantsCompleteURL)
        range.length = bytesFilled;

    const unsigned char *bytes = allBytesBuffer + range.location;
    NSMutableData *resultData = [NSMutableData data];

    // Add a leading '?' to non-empty query strings, and keep it for query
    // strings that consist of the question mark only.
    if (componentType == kCFURLComponentQuery) {
        if (range.length > 0 || hasQuestionMarkOnlyQueryString(URL))
            [resultData appendBytes:"?" length:1];
    }

    for (CFIndex i = 0; i < range.length; i++) {
        unsigned char c = bytes[i];
        if (c <= 0x20 || c >= 0x7f) {
            char escaped[3];
            escaped[0] = '%';
            escaped[1] = hexDigit(c >> 4);
            escaped[2] = hexDigit(c & 0xf);
            [resultData appendBytes:escaped length:3];
        } else
            [resultData appendBytes:&c length:1];
    }

    if (staticAllBytesBuffer != allBytesBuffer)
        free(allBytesBuffer);

    return resultData;
}
738
// Returns a copy of |URL| with |component| and the single character that
// follows it (its separator) removed.
//
// Returns |URL| unchanged when it does not have the component, when the reported
// component range lies beyond the URL's bytes, or when no URL can be created
// from the remaining bytes.
static NSURL *URLByRemovingComponentAndSubsequentCharacter(NSURL *URL, CFURLComponentType component)
{
    CFRange range = CFURLGetByteRangeForComponent((CFURLRef)URL, component, NULL);
    if (range.location == kCFNotFound)
        return URL;

    // Remove one subsequent character.
    range.length++;

    // The inline capacity covers most URLs; grow to the exact size otherwise.
    Vector<UInt8, 2048> buffer(2048);
    CFIndex numBytes = CFURLGetBytes((CFURLRef)URL, buffer.data(), 2048);
    if (numBytes == -1) {
        numBytes = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        buffer.grow(numBytes);
        CFURLGetBytes((CFURLRef)URL, buffer.data(), numBytes);
    }
    UInt8* urlBytes = buffer.data();

    if (numBytes < range.location)
        return URL;
    // Clamp the range so that it never extends past the end of the URL.
    if (numBytes < range.location + range.length)
        range.length = numBytes - range.location;

    // Close the gap: move the bytes that follow the removed range down over it.
    // Only the tail [tailStart, numBytes) may be moved; a larger count would
    // read and write past the end of the buffer.
    CFIndex tailStart = range.location + range.length;
    memmove(urlBytes + range.location, urlBytes + tailStart, numBytes - tailStart);

    // Try UTF-8 first and fall back on ISO Latin 1 if that does not produce a URL.
    NSURL *result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, numBytes - range.length, kCFStringEncodingUTF8, NULL));
    if (!result)
        result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, numBytes - range.length, kCFStringEncodingISOLatin1, NULL));

    return result ? [result autorelease] : URL;
}
770
// Returns URL with its user-info component, and the single character that follows
// that component, stripped out. See URLByRemovingComponentAndSubsequentCharacter()
// for the cases in which URL is returned unchanged.
NSURL *URLByRemovingUserInfo(NSURL *URL)
{
    const CFURLComponentType componentToStrip = kCFURLComponentUserInfo;
    return URLByRemovingComponentAndSubsequentCharacter(URL, componentToStrip);
}
775
// Returns the URL of the canonical request that the NSURLProtocol class selected for
// URL produces. When no protocol class is found for the request, URL is returned as is.
NSURL *URLByCanonicalizingURL(NSURL *URL)
{
    RetainPtr<NSURLRequest> originalRequest = adoptNS([[NSURLRequest alloc] initWithURL:URL]);
    Class protocolClass = [NSURLProtocol _protocolClassForRequest:originalRequest.get()];
    if (protocolClass) {
        // Only NSURL's notion of canonicalization is applied here, not URL's. Applying
        // both would make sense, but doing so caused a performance degradation
        // (see 5315926). Applying only the URL notion might also make sense, but that
        // change was considered too risky for WebKit 3.0.
        NSURLRequest *canonicalRequest = [protocolClass canonicalRequestForRequest:originalRequest.get()];
        NSURL *canonicalURL = [canonicalRequest URL];
        return [[canonicalURL retain] autorelease];
    }

    return URL;
}
792
// Returns the bytes of URL as reported by CFURLGetBytes, wrapped in an NSData.
//
// When URL has a base URL, the bytes are first combined with that base through
// URLWithData() and the bytes of the resulting URL are returned instead.
//
// Returns nil if the byte buffer cannot be allocated or filled.
NSData *originalURLData(NSURL *URL)
{
    UInt8 *buffer = (UInt8 *)malloc(URL_BYTES_BUFFER_LENGTH);
    if (!buffer)
        return nil;

    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)URL, buffer, URL_BYTES_BUFFER_LENGTH);
    if (bytesFilled == -1) {
        // The default-sized buffer was too small; ask for the required size and retry.
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)URL, NULL, 0);
        if (bytesToAllocate <= 0) {
            free(buffer);
            return nil;
        }

        // Keep the old pointer until realloc succeeds so it can still be freed on failure.
        UInt8 *largerBuffer = (UInt8 *)realloc(buffer, bytesToAllocate);
        if (!largerBuffer) {
            free(buffer);
            return nil;
        }
        buffer = largerBuffer;

        bytesFilled = CFURLGetBytes((CFURLRef)URL, buffer, bytesToAllocate);
        ASSERT(bytesFilled == bytesToAllocate);
        if (bytesFilled < 0) {
            // Never hand a negative length to NSData.
            free(buffer);
            return nil;
        }
    }

    // buffer is adopted by the NSData
    NSData *data = [NSData dataWithBytesNoCopy:buffer length:bytesFilled freeWhenDone:YES];

    NSURL *baseURL = (NSURL *)CFURLGetBaseURL((CFURLRef)URL);
    if (baseURL)
        return originalURLData(URLWithData(data, baseURL));
    return data;
}
812
// Creates a new CFString (the caller owns the returned reference) from string, in which
// every code point flagged by isLookalikeCharacter() is replaced by the %-escaped bytes
// of its UTF-8 encoding. All other code points are copied through as UTF-16.
static CFStringRef createStringWithEscapedUnsafeCharacters(CFStringRef string)
{
    const CFIndex sourceLength = CFStringGetLength(string);
    Vector<UChar, 2048> source(sourceLength);
    CFStringGetCharacters(string, CFRangeMake(0, sourceLength), source.data());

    Vector<UChar, 2048> escaped;

    // U16_NEXT advances position past each decoded code point, so the loop has no
    // increment expression of its own.
    for (CFIndex position = 0; position < sourceLength; ) {
        UChar32 codePoint;
        U16_NEXT(source, position, sourceLength, codePoint)

        if (!isLookalikeCharacter(codePoint)) {
            // Not flagged: re-encode the code point as UTF-16 and copy it through.
            UChar units[2];
            CFIndex unitCount = 0;
            UBool appendFailed = false;
            U16_APPEND(units, unitCount, 2, codePoint, appendFailed)
            ASSERT(!appendFailed);
            for (CFIndex unitIndex = 0; unitIndex < unitCount; ++unitIndex)
                escaped.append(units[unitIndex]);
            continue;
        }

        // Flagged: emit "%XY" for each byte of the code point's UTF-8 encoding.
        uint8_t utf8Bytes[4];
        CFIndex byteCount = 0;
        UBool appendFailed = false;
        U8_APPEND(utf8Bytes, byteCount, 4, codePoint, appendFailed)
        ASSERT(!appendFailed);

        for (CFIndex byteIndex = 0; byteIndex < byteCount; ++byteIndex) {
            const uint8_t byte = utf8Bytes[byteIndex];
            escaped.append('%');
            escaped.append(hexDigit(byte >> 4));
            escaped.append(hexDigit(byte & 0xf));
        }
    }

    return CFStringCreateWithCharacters(NULL, escaped.data(), escaped.size());
}
851
// Builds a string form of URL intended for display.
//
// Steps, in order:
//  1. Take the URL's bytes from originalURLData().
//  2. Unescape %XX sequences whose value is greater than 0x7f; leave all other bytes
//     (including escapes of ASCII values) untouched.
//  3. Interpret the result as UTF-8. If that fails, re-escape every byte greater than
//     0x7f so the string is pure ASCII and convert again.
//  4. If "xn--" (case-insensitive on the letters) was seen, pass the string through
//     mapHostNames(result, NO).
//  5. Precompose with canonical mapping, then escape characters flagged by
//     createStringWithEscapedUnsafeCharacters().
//
// Returns nil only if the working buffer cannot be sized or allocated.
NSString *userVisibleString(NSURL *URL)
{
    NSData *data = originalURLData(URL);
    const unsigned char *before = static_cast<const unsigned char*>([data bytes]);
    size_t length = [data length];

    // The buffer must be large enough to %-escape every byte (three characters each)
    // plus a trailing '\0'. Refuse lengths for which that size would overflow.
    if (length > (static_cast<size_t>(-1) - 1) / 3)
        return nil;
    size_t bufferLength = (length * 3) + 1;
    char *after = static_cast<char *>(malloc(bufferLength));
    if (!after)
        return nil;

    bool mayNeedHostNameDecoding = false;

    char *q = after;
    for (size_t i = 0; i < length; i++) {
        unsigned char c = before[i];
        // unescape escape sequences that indicate bytes greater than 0x7f
        if (c == '%' && i + 2 < length && isHexDigit(before[i + 1]) && isHexDigit(before[i + 2])) {
            unsigned char u = (hexDigitValue(before[i + 1]) << 4) | hexDigitValue(before[i + 2]);
            if (u > 0x7f) {
                // unescape
                *q++ = u;
            } else {
                // do not unescape
                *q++ = before[i];
                *q++ = before[i + 1];
                *q++ = before[i + 2];
            }
            i += 2;
        } else {
            *q++ = c;

            // Check for "xn--" in an efficient, non-case-sensitive, way.
            // The look-back is into the output buffer, which can be shorter than the
            // input consumed so far (an unescape turns three bytes into one), so the
            // bound must be on the number of bytes written, not on the input index.
            if (c == '-' && !mayNeedHostNameDecoding && q - after >= 4
                && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-')
                mayNeedHostNameDecoding = true;
        }
    }
    *q = '\0';

    // Check string to see if it can be converted to display using UTF-8
    NSString *result = [NSString stringWithUTF8String:after];
    if (!result) {
        // Could not convert to UTF-8.
        // Convert characters greater than 0x7f to escape sequences.
        // Shift the current string to the end of the buffer, then copy bytes back to
        // the start of the buffer as we convert. Because the buffer holds three bytes
        // per input byte, the write position never catches up with the read position.
        size_t afterLength = q - after;
        char *source = after + bufferLength - afterLength - 1;
        memmove(source, after, afterLength + 1); // copies trailing '\0'
        char *destination = after;
        while (*source) {
            unsigned char c = *source;
            if (c > 0x7f) {
                *destination++ = '%';
                *destination++ = hexDigit(c >> 4);
                *destination++ = hexDigit(c & 0xf);
            } else
                *destination++ = *source;
            source++;
        }
        *destination = '\0';
        result = [NSString stringWithUTF8String:after];
    }

    free(after);

    if (mayNeedHostNameDecoding)
        result = mapHostNames(result, NO);
    result = [result precomposedStringWithCanonicalMapping];
    return CFBridgingRelease(createStringWithEscapedUnsafeCharacters((CFStringRef)result));
}
922
// Scans the UTF-8 bytes of string and returns NO as soon as it finds any of:
//   - a byte <= 0x20 (control characters and space) or the byte 0x7f (delete);
//   - a %XX escape whose decoded value is greater than 0x7f;
//   - the sequence "xn--" (letters matched without regard to case).
// Returns YES when none of these occur.
BOOL isUserVisibleURL(NSString *string)
{
    // Try a stack buffer first; fall back to -UTF8String if the conversion does not succeed.
    char stackBuffer[1024];
    const char *utf8 = CFStringGetCString((CFStringRef)string, stackBuffer, 1023, kCFStringEncodingUTF8)
        ? stackBuffer
        : [string UTF8String];

    int byteCount = strlen(utf8);

    for (int index = 0; index < byteCount; index++) {
        unsigned char current = utf8[index];

        // Control characters, space, and delete.
        if (current <= 0x20 || current == 0x7f)
            return NO;

        bool startsHexEscape = current == '%'
            && (index + 1 < byteCount && isHexDigit(utf8[index + 1]))
            && index + 2 < byteCount && isHexDigit(utf8[index + 2]);
        if (startsHexEscape) {
            unsigned char decoded = (hexDigitValue(utf8[index + 1]) << 4) | hexDigitValue(utf8[index + 2]);
            if (decoded > 0x7f)
                return NO;
            // Skip the two hex digits that were just examined.
            index += 2;
            continue;
        }

        // Check for "xn--" in an efficient, non-case-sensitive, way.
        if (current == '-' && index >= 3
            && (utf8[index - 3] | 0x20) == 'x' && (utf8[index - 2] | 0x20) == 'n' && utf8[index - 1] == '-')
            return NO;
    }

    return YES;
}
961
// Returns the range of the scheme at the start of string: everything before the first
// ':' provided that prefix is non-empty and made up only of ASCII letters, digits,
// '+', '.' and '-'. Returns {NSNotFound, 0} otherwise.
NSRange rangeOfURLScheme(NSString *string)
{
    const NSRange notFound = NSMakeRange(NSNotFound, 0);

    NSRange colonRange = [string rangeOfString:@":"];
    if (colonRange.location == NSNotFound || !colonRange.location)
        return notFound;

    // Building the inverted set is very expensive (10-15 msec on a 2x1.2GHz). If it is
    // not cached it swamps everything else when adding items to the autocomplete DB,
    // which raises the question of whether the character set needs enforcing at all.
    static NSCharacterSet *disallowedSchemeCharacters = nil;
    if (!disallowedSchemeCharacters) {
        NSString *acceptableCharacters = @"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
        NSCharacterSet *allowedSchemeCharacters = [NSCharacterSet characterSetWithCharactersInString:acceptableCharacters];
        disallowedSchemeCharacters = [[allowedSchemeCharacters invertedSet] retain];
    }

    NSRange candidate = NSMakeRange(0, colonRange.location);
    NSRange firstIllegal = [string rangeOfCharacterFromSet:disallowedSchemeCharacters options:0 range:candidate];
    return firstIllegal.location == NSNotFound ? candidate : notFound;
}
983
// Returns YES when string, after trimming with stringByTrimmingWhitespace(), begins
// with something rangeOfURLScheme() accepts as a scheme.
BOOL looksLikeAbsoluteURL(NSString *string)
{
    // Trim whitespace because _web_URLWithString allows whitespace.
    NSString *trimmed = stringByTrimmingWhitespace(string);
    NSRange schemeRange = rangeOfURLScheme(trimmed);
    return schemeRange.location != NSNotFound;
}
989
990} // namespace WebCore
991