1/*
2 * Copyright (c) 2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24/*	CFURL.inc.h
25	Copyright (c) 2012-2013, Apple Inc. All rights reserved.
26	Responsibility: Jim Luther
27*/
28
29
30/*
31
32 What's this file for?
33
34 CFURL's URL string parser needs to be able to parse either an array of char or an array of UniChar.
35
36 The code in CFURL.c used to use this macro "#define STRING_CHAR(x) (useCString ? cstring[(x)] : ustring[(x)])" to determine which array to get a character from for every character looked at in the URL string. That macro added one or more compare and branch instructions to the parser's execution for *every* character in the URL string. Those extra compares and branches added up to 10% of the time (for long URL strings) it takes to create a URL object.
37
38 To ensure the exact same parser code is run over a char or a UniChar string, the source code was moved to this .h file and is included multiple times by CFURL.c as needed. "STRING_CHAR(x)" was replaced by "characterArray[x]", and characterArray is defined as either a "const char *" or a "const UniChar *" for the two sets of function headers that are either parsing an array of char or an array of UniChar.
39
40 Any changes made to the parser are made in this file so that both char and the UniChar strings are parsed exactly the same way.
41
42 */
43
44/*
45    static void _parseComponentsCString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const char *characterArray)
46    static void _parseComponentsUString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const UniChar *characterArray)
47 */
48#ifdef CFURL_INCLUDE_PARSE_COMPONENTS // defined when we want this block of code included
49{
50    CFRange ranges[9];
51    /* index gives the URL part involved; to calculate the correct range index, use the number of the bit of the equivalent flag (i.e. the host flag is HAS_HOST, which is 0x8.  so the range index for the host is 3.)  Note that this is true in this function ONLY, since the ranges stored in (*range) are actually packed, skipping those URL components that don't exist.  This is why the indices are hard-coded in this function. */
52
53    CFIndex idx, base_idx = 0;
54    CFIndex string_length;
55    UInt32 flags = *theFlags;
56    Boolean isCompliant;
57    uint8_t numRanges = 0;
58
59    string_length = cfStringLength;
60
61    // Algorithm is as described in RFC 1808
62    // 1: parse the fragment; remainder after left-most "#" is fragment
63    for (idx = base_idx; idx < string_length; idx++) {
64        if ('#' == characterArray[idx]) {
65            flags |= HAS_FRAGMENT;
66            ranges[8].location = idx + 1;
67            ranges[8].length = string_length - (idx + 1);
68            numRanges ++;
69            string_length = idx;	// remove fragment from parse string
70            break;
71        }
72    }
73    // 2: parse the scheme
74    for (idx = base_idx; idx < string_length; idx++) {
75        UniChar ch = characterArray[idx];
76        if (':' == ch) {
77            flags |= HAS_SCHEME;
78            ranges[0].location = base_idx;
79            ranges[0].length = idx;
80            numRanges ++;
81            base_idx = idx + 1;
82            // optimization for ftp urls
83            if (idx == 3 && characterArray[0] == 'f' && characterArray[1] == 't' && characterArray[2] == 'p') {
84                _setSchemeTypeInFlags(&flags, kHasFtpScheme);
85            }
86            else if (idx == 4) {
87                // optimization for http urls
88                if (characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p') {
89                    _setSchemeTypeInFlags(&flags, kHasHttpScheme);
90                }
91                // optimization for file urls
92                if (characterArray[0] == 'f' && characterArray[1] == 'i' && characterArray[2] == 'l' && characterArray[3] == 'e') {
93                    _setSchemeTypeInFlags(&flags, kHasFileScheme);
94                }
95                // optimization for data urls
96                if (characterArray[0] == 'd' && characterArray[1] == 'a' && characterArray[2] == 't' && characterArray[3] == 'a') {
97                    _setSchemeTypeInFlags(&flags, kHasDataScheme);
98                }
99            }
100            // optimization for https urls
101            else if (idx == 5 && characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p' && characterArray[3] == 's') {
102                _setSchemeTypeInFlags(&flags, kHasHttpsScheme);
103            }
104            break;
105        } else if (!scheme_valid(ch)) {
106            break;	// invalid scheme character -- no scheme
107        }
108    }
109
110    // Make sure we have an RFC-1808 compliant URL - that's either something without a scheme, or scheme:/(stuff) or scheme://(stuff)
111    // Strictly speaking, RFC 1808 & 2396 bar "scheme:" (with nothing following the colon); however, common usage
112    // expects this to be treated identically to "scheme://" - REW, 12/08/03
113    if (!(flags & HAS_SCHEME)) {
114        isCompliant = true;
115    } else if (base_idx == string_length) {
116        isCompliant = false;
117    } else if (characterArray[base_idx] != '/') {
118        isCompliant = false;
119    } else {
120        isCompliant = true;
121    }
122
123    if (!isCompliant) {
124        // Clear the fragment flag if it's been set
125        if (flags & HAS_FRAGMENT) {
126            flags &= (~HAS_FRAGMENT);
127            string_length = cfStringLength;
128        }
129        (*theFlags) = flags;
130        (*range) = (CFRange *)CFAllocatorAllocate(alloc, sizeof(CFRange), 0);
131        (*range)->location = ranges[0].location;
132        (*range)->length = ranges[0].length;
133
134        return;
135    }
136    // URL is 1808-compliant
137    flags |= IS_DECOMPOSABLE;
138
139    // 3: parse the network location and login
140    if (2 <= (string_length - base_idx) && '/' == characterArray[base_idx] && '/' == characterArray[base_idx+1]) {
141        CFIndex base = 2 + base_idx, extent;
142        for (idx = base; idx < string_length; idx++) {
143            if ('/' == characterArray[idx] || '?' == characterArray[idx]) {
144                break;
145            }
146        }
147        extent = idx;
148
149        // net_loc parts extend from base to extent (but not including), which might be to end of string
150        // net location is "<user>:<password>@<host>:<port>"
151        if (extent != base) {
152            for (idx = base; idx < extent; idx++) {
153                if ('@' == characterArray[idx]) {   // there is a user
154                    CFIndex idx2;
155                    flags |= HAS_USER;
156                    numRanges ++;
157                    ranges[1].location = base;  // base of the user
158                    for (idx2 = base; idx2 < idx; idx2++) {
159                        if (':' == characterArray[idx2]) {	// found a password separator
160                            flags |= HAS_PASSWORD;
161                            numRanges ++;
162                            ranges[2].location = idx2+1; // base of the password
163                            ranges[2].length = idx-(idx2+1);  // password extent
164                            ranges[1].length = idx2 - base; // user extent
165                            break;
166                        }
167                    }
168                    if (!(flags & HAS_PASSWORD)) {
169                        // user extends to the '@'
170                        ranges[1].length = idx - base; // user extent
171                    }
172                    base = idx + 1;
173                    break;
174                }
175            }
176            flags |= HAS_HOST;
177            numRanges ++;
178            ranges[3].location = base; // base of host
179
180            // base has been advanced past the user and password if they existed
181            for (idx = base; idx < extent; idx++) {
182                // IPV6 support (RFC 2732) DCJ June/10/2002
183                if ('[' == characterArray[idx]) {	// starting IPV6 explicit address
184                    //	Find the ']' terminator of the IPv6 address, leave idx pointing to ']' or end
185                    for ( ; idx < extent; ++ idx ) {
186                        if ( ']' == characterArray[idx]) {
187                            flags |= IS_IPV6_ENCODED;
188                            break;
189                        }
190                    }
191                }
192                // there is a port if we see a colon.  Only the last one is the port, though.
193                else if ( ':' == characterArray[idx]) {
194                    flags |= HAS_PORT;
195                    numRanges ++;
196                    ranges[4].location = idx+1; // base of port
197                    ranges[4].length = extent - (idx+1); // port extent
198                    ranges[3].length = idx - base; // host extent
199                    break;
200                }
201            }
202            if (!(flags & HAS_PORT)) {
203                ranges[3].length = extent - base;  // host extent
204            }
205        }
206        base_idx = extent;
207    }
208
209    // 4: parse the query; remainder after left-most "?" is query
210    for (idx = base_idx; idx < string_length; idx++) {
211        if ('?' == characterArray[idx]) {
212            flags |= HAS_QUERY;
213            numRanges ++;
214            ranges[7].location = idx + 1;
215            ranges[7].length = string_length - (idx+1);
216            string_length = idx;	// remove query from parse string
217            break;
218        }
219    }
220
221    // 5: parse the parameters; remainder after left-most ";" is parameters
222    for (idx = base_idx; idx < string_length; idx++) {
223        if (';' == characterArray[idx]) {
224            flags |= HAS_PARAMETERS;
225            numRanges ++;
226            ranges[6].location = idx + 1;
227            ranges[6].length = string_length - (idx+1);
228            string_length = idx;	// remove parameters from parse string
229            break;
230        }
231    }
232
233    // 6: parse the path; it's whatever's left between string_length & base_idx
234    if (string_length - base_idx != 0 || (flags & NET_LOCATION_MASK))
235    {
236        // If we have a net location, we are 1808-compliant, and an empty path substring implies a path of "/"
237        UniChar ch;
238        Boolean isDir;
239        CFRange pathRg;
240        flags |= HAS_PATH;
241        numRanges ++;
242        pathRg.location = base_idx;
243        pathRg.length = string_length - base_idx;
244        ranges[5] = pathRg;
245
246        if (pathRg.length > 0) {
247            Boolean sawPercent = FALSE;
248            for (idx = pathRg.location; idx < string_length; idx++) {
249                if ('%' == characterArray[idx]) {
250                    sawPercent = TRUE;
251                    break;
252                }
253            }
254#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI
255	    if (pathRg.length > 6 && characterArray[pathRg.location] == '/' && characterArray[pathRg.location + 1] == '.' && characterArray[pathRg.location + 2] == 'f' && characterArray[pathRg.location + 3] == 'i' && characterArray[pathRg.location + 4] == 'l' && characterArray[pathRg.location + 5] == 'e' && characterArray[pathRg.location + 6] == '/') {
256		flags |= PATH_HAS_FILE_ID;
257	    } else if (!sawPercent) {
258                flags |= POSIX_AND_URL_PATHS_MATCH;
259            }
260#elif DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_WINDOWS
261            if (!sawPercent) {
262                flags |= POSIX_AND_URL_PATHS_MATCH;
263            }
264#endif
265
266            ch = characterArray[pathRg.location + pathRg.length - 1];
267            if (ch == '/') {
268                isDir = true;
269            } else if (ch == '.') {
270                if (pathRg.length == 1) {
271                    isDir = true;
272                } else {
273                    ch = characterArray[pathRg.location + pathRg.length - 2];
274                    if (ch == '/') {
275                        isDir = true;
276                    } else if (ch != '.') {
277                        isDir = false;
278                    } else if (pathRg.length == 2) {
279                        isDir = true;
280                    } else {
281                        isDir = (characterArray[pathRg.location + pathRg.length - 3] == '/');
282                    }
283                }
284            } else {
285                isDir = false;
286            }
287        } else {
288            isDir = (baseURL != NULL) ? CFURLHasDirectoryPath(baseURL) : false;
289        }
290        if (isDir) {
291            flags |= IS_DIRECTORY;
292        }
293    }
294
295    (*theFlags) = flags;
296    (*range) = (CFRange *)CFAllocatorAllocate(alloc, sizeof(CFRange)*numRanges, 0);
297    numRanges = 0;
298    for (idx = 0, flags = 1; flags != (1<<9); flags = (flags<<1), idx ++) {
299        if ((*theFlags) & flags) {
300            (*range)[numRanges] = ranges[idx];
301            numRanges ++;
302        }
303    }
304}
305#endif  // CFURL_INCLUDE_PARSE_COMPONENTS
306
307/*
308    static Boolean scanCharactersCString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const char *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
309    static Boolean scanCharactersUString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const UniChar *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
310 */
311#ifdef CFURL_INCLUDE_SCAN_CHARACTERS  // defined when we want this block of code included
312{
313    CFIndex idx;
314    Boolean sawIllegalChar = false;
315    for (idx = base; idx < end; idx ++) {
316        Boolean shouldEscape;
317        UniChar ch = characterArray[idx];
318        if (isURLLegalCharacter(ch)) {
319            if ((componentFlag == HAS_USER || componentFlag == HAS_PASSWORD) && (ch == '/' || ch == '?' || ch == '@')) {
320                shouldEscape = true;
321            } else {
322                shouldEscape = false;
323            }
324        } else if (ch == '%' && idx + 2 < end && isHexDigit(characterArray[idx + 1]) && isHexDigit(characterArray[idx+2])) {
325            shouldEscape = false;
326        } else if (componentFlag == HAS_HOST && ((idx == base && ch == '[') || (idx == end-1 && ch == ']'))) {
327            shouldEscape = false;
328        } else {
329            shouldEscape = true;
330        }
331        if (shouldEscape) {
332            sawIllegalChar = true;
333            if (componentFlag && flags) {
334                *flags |= componentFlag;
335            }
336            if (!*escapedString) {
337                *escapedString = CFStringCreateMutable(alloc, 0);
338            }
339            if (useCString) {
340                CFStringRef tempString = CFStringCreateWithBytes(alloc, (uint8_t *)&(characterArray[*mark]), idx - *mark, kCFStringEncodingISOLatin1, false);
341                CFStringAppend(*escapedString, tempString);
342                CFRelease(tempString);
343            } else {
344                CFStringAppendCharacters(*escapedString, (const UniChar *)&(characterArray[*mark]), idx - *mark);
345            }
346            *mark = idx + 1;
347            _appendPercentEscapesForCharacter(ch, encoding, *escapedString); // This can never fail because anURL->_string was constructed from the encoding passed in
348        }
349    }
350    return sawIllegalChar;
351}
352#endif  // CFURL_INCLUDE_SCAN_CHARACTERS
353