/*
 * Copyright (c) 2014 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*	CFURL.inc.h
	Copyright (c) 2012-2013, Apple Inc. All rights reserved.
	Responsibility: Jim Luther
*/


/*

 What's this file for?

 CFURL's URL string parser needs to be able to parse either an array of char or an array of UniChar.

 The code in CFURL.c used to use this macro "#define STRING_CHAR(x) (useCString ? cstring[(x)] : ustring[(x)])" to determine which array to get a character from for every character looked at in the URL string. That macro added one or more compare and branch instructions to the parser's execution for *every* character in the URL string. Those extra compares and branches added up to 10% of the time (for long URL strings) it takes to create a URL object.

 To ensure the exact same parser code is run over a char or a UniChar string, the source code was moved to this .h file and is included multiple times by CFURL.c as needed. "STRING_CHAR(x)" was replaced by "characterArray[x]", and characterArray is defined as either a "const char *" or a "const UniChar *" for the two sets of function headers that are either parsing an array of char or an array of UniChar.

 Any changes made to the parser are made in this file so that both char and the UniChar strings are parsed exactly the same way.

 */

/*
 static void _parseComponentsCString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const char *characterArray)
 static void _parseComponentsUString(CFAllocatorRef alloc, CFURLRef baseURL, UInt32 *theFlags, CFRange **range, CFIndex cfStringLength, const UniChar *characterArray)

 Splits the URL string in characterArray (cfStringLength characters) into its components.

 alloc           - allocator used to allocate the array returned through *range.
 baseURL         - only consulted when the path component is empty: the result is then flagged
                   as a directory if baseURL is non-NULL and CFURLHasDirectoryPath(baseURL) is true.
 theFlags        - in/out; read on entry, and on exit holds the incoming flags plus the HAS_*
                   component bits, the scheme type, IS_DECOMPOSABLE, IS_DIRECTORY, etc. that were found.
 range           - out; *range receives a newly allocated, packed array of CFRange holding one entry
                   for each component present, in flag-bit order. If the URL has a scheme that is not
                   followed by '/', only the scheme range is returned and IS_DECOMPOSABLE is not set.
                   NOTE(review): presumably the caller deallocates *range with the same allocator -- confirm in CFURL.c.
 cfStringLength  - number of characters in characterArray.
 characterArray  - the URL string's characters.
 */
#ifdef CFURL_INCLUDE_PARSE_COMPONENTS   // defined when we want this block of code included
{
    CFRange ranges[9];
    /* index gives the URL part involved; to calculate the correct range index, use the number of the bit of the equivalent flag (i.e. the host flag is HAS_HOST, which is 0x8. so the range index for the host is 3.)  Note that this is true in this function ONLY, since the ranges stored in (*range) are actually packed, skipping those URL components that don't exist.  This is why the indices are hard-coded in this function. */

    CFIndex idx, base_idx = 0;
    CFIndex string_length;
    UInt32 flags = *theFlags;
    Boolean isCompliant;
    uint8_t numRanges = 0;  // number of components found; sizes the packed array allocated at the end

    string_length = cfStringLength;

    // Algorithm is as described in RFC 1808
    // 1: parse the fragment; remainder after left-most "#" is fragment
    for (idx = base_idx; idx < string_length; idx++) {
        if ('#' == characterArray[idx]) {
            flags |= HAS_FRAGMENT;
            ranges[8].location = idx + 1;
            ranges[8].length = string_length - (idx + 1);
            numRanges ++;
            string_length = idx;    // remove fragment from parse string
            break;
        }
    }
    // 2: parse the scheme
    for (idx = base_idx; idx < string_length; idx++) {
        UniChar ch = characterArray[idx];
        if (':' == ch) {
            flags |= HAS_SCHEME;
            ranges[0].location = base_idx;
            ranges[0].length = idx;
            numRanges ++;
            base_idx = idx + 1;
            // idx is the scheme length here; record well-known schemes in the flags
            // optimization for ftp urls
            if (idx == 3 && characterArray[0] == 'f' && characterArray[1] == 't' && characterArray[2] == 'p') {
                _setSchemeTypeInFlags(&flags, kHasFtpScheme);
            }
            else if (idx == 4) {
                // optimization for http urls
                if (characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p') {
                    _setSchemeTypeInFlags(&flags, kHasHttpScheme);
                }
                // optimization for file urls
                if (characterArray[0] == 'f' && characterArray[1] == 'i' && characterArray[2] == 'l' && characterArray[3] == 'e') {
                    _setSchemeTypeInFlags(&flags, kHasFileScheme);
                }
                // optimization for data urls
                if (characterArray[0] == 'd' && characterArray[1] == 'a' && characterArray[2] == 't' && characterArray[3] == 'a') {
                    _setSchemeTypeInFlags(&flags, kHasDataScheme);
                }
            }
            // optimization for https urls
            // (the trailing 's' is the fifth character, i.e. index 4; testing index 3 against both 'p' and 's' could never match)
            else if (idx == 5 && characterArray[0] == 'h' && characterArray[1] == 't' && characterArray[2] == 't' && characterArray[3] == 'p' && characterArray[4] == 's') {
                _setSchemeTypeInFlags(&flags, kHasHttpsScheme);
            }
            break;
        } else if (!scheme_valid(ch)) {
            break;  // invalid scheme character -- no scheme
        }
    }

    // Make sure we have an RFC-1808 compliant URL - that's either something without a scheme, or scheme:/(stuff) or scheme://(stuff)
    // Strictly speaking, RFC 1808 & 2396 bar "scheme:" (with nothing following the colon); however, common usage
    // expects this to be treated identically to "scheme://" - REW, 12/08/03
    if (!(flags & HAS_SCHEME)) {
        isCompliant = true;
    } else if (base_idx == string_length) {
        isCompliant = false;
    } else if (characterArray[base_idx] != '/') {
        isCompliant = false;
    } else {
        isCompliant = true;
    }

    if (!isCompliant) {
        // Clear the fragment flag if it's been set
        if (flags & HAS_FRAGMENT) {
            flags &= (~HAS_FRAGMENT);
            string_length = cfStringLength;
        }
        (*theFlags) = flags;
        // Non-compliant implies HAS_SCHEME is set, so ranges[0] is valid; it is the only range returned.
        (*range) = (CFRange *)CFAllocatorAllocate(alloc, sizeof(CFRange), 0);
        (*range)->location = ranges[0].location;
        (*range)->length = ranges[0].length;

        return;
    }
    // URL is 1808-compliant
    flags |= IS_DECOMPOSABLE;

    // 3: parse the network location and login
    if (2 <= (string_length - base_idx) && '/' == characterArray[base_idx] && '/' == characterArray[base_idx+1]) {
        CFIndex base = 2 + base_idx, extent;
        for (idx = base; idx < string_length; idx++) {
            if ('/' == characterArray[idx] || '?' == characterArray[idx]) {
                break;
            }
        }
        extent = idx;

        // net_loc parts extend from base to extent (but not including), which might be to end of string
        // net location is "<user>:<password>@<host>:<port>"
        if (extent != base) {
            for (idx = base; idx < extent; idx++) {
                if ('@' == characterArray[idx]) {   // there is a user
                    CFIndex idx2;
                    flags |= HAS_USER;
                    numRanges ++;
                    ranges[1].location = base;  // base of the user
                    for (idx2 = base; idx2 < idx; idx2++) {
                        if (':' == characterArray[idx2]) {  // found a password separator
                            flags |= HAS_PASSWORD;
                            numRanges ++;
                            ranges[2].location = idx2+1;        // base of the password
                            ranges[2].length = idx-(idx2+1);    // password extent
                            ranges[1].length = idx2 - base;     // user extent
                            break;
                        }
                    }
                    if (!(flags & HAS_PASSWORD)) {
                        // user extends to the '@'
                        ranges[1].length = idx - base;  // user extent
                    }
                    base = idx + 1;
                    break;
                }
            }
            flags |= HAS_HOST;
            numRanges ++;
            ranges[3].location = base;  // base of host

            // base has been advanced past the user and password if they existed
            for (idx = base; idx < extent; idx++) {
                // IPV6 support (RFC 2732) DCJ June/10/2002
                if ('[' == characterArray[idx]) {   // starting IPV6 explicit address
                    // Find the ']' terminator of the IPv6 address, leave idx pointing to ']' or end
                    for ( ; idx < extent; ++ idx ) {
                        if ( ']' == characterArray[idx]) {
                            flags |= IS_IPV6_ENCODED;
                            break;
                        }
                    }
                }
                // there is a port if we see a colon.  Only the last one is the port, though.
                else if ( ':' == characterArray[idx]) {
                    flags |= HAS_PORT;
                    numRanges ++;
                    ranges[4].location = idx+1;             // base of port
                    ranges[4].length = extent - (idx+1);    // port extent
                    ranges[3].length = idx - base;          // host extent
                    break;
                }
            }
            if (!(flags & HAS_PORT)) {
                ranges[3].length = extent - base;   // host extent
            }
        }
        base_idx = extent;
    }

    // 4: parse the query; remainder after left-most "?" is query
    for (idx = base_idx; idx < string_length; idx++) {
        if ('?' == characterArray[idx]) {
            flags |= HAS_QUERY;
            numRanges ++;
            ranges[7].location = idx + 1;
            ranges[7].length = string_length - (idx+1);
            string_length = idx;    // remove query from parse string
            break;
        }
    }

    // 5: parse the parameters; remainder after left-most ";" is parameters
    for (idx = base_idx; idx < string_length; idx++) {
        if (';' == characterArray[idx]) {
            flags |= HAS_PARAMETERS;
            numRanges ++;
            ranges[6].location = idx + 1;
            ranges[6].length = string_length - (idx+1);
            string_length = idx;    // remove parameters from parse string
            break;
        }
    }

    // 6: parse the path; it's whatever's left between string_length & base_idx
    if (string_length - base_idx != 0 || (flags & NET_LOCATION_MASK))
    {
        // If we have a net location, we are 1808-compliant, and an empty path substring implies a path of "/"
        UniChar ch;
        Boolean isDir;
        CFRange pathRg;
        flags |= HAS_PATH;
        numRanges ++;
        pathRg.location = base_idx;
        pathRg.length = string_length - base_idx;
        ranges[5] = pathRg;

        if (pathRg.length > 0) {
            Boolean sawPercent = FALSE;
            for (idx = pathRg.location; idx < string_length; idx++) {
                if ('%' == characterArray[idx]) {
                    sawPercent = TRUE;
                    break;
                }
            }
#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI
            // A path starting with "/.file/" is flagged as carrying a file ID; otherwise a path with no '%' is flagged POSIX_AND_URL_PATHS_MATCH
            if (pathRg.length > 6 && characterArray[pathRg.location] == '/' && characterArray[pathRg.location + 1] == '.' && characterArray[pathRg.location + 2] == 'f' && characterArray[pathRg.location + 3] == 'i' && characterArray[pathRg.location + 4] == 'l' && characterArray[pathRg.location + 5] == 'e' && characterArray[pathRg.location + 6] == '/') {
                flags |= PATH_HAS_FILE_ID;
            } else if (!sawPercent) {
                flags |= POSIX_AND_URL_PATHS_MATCH;
            }
#elif DEPLOYMENT_TARGET_LINUX || DEPLOYMENT_TARGET_WINDOWS
            if (!sawPercent) {
                flags |= POSIX_AND_URL_PATHS_MATCH;
            }
#endif

            // The path is a directory when it ends in "/", or when it is exactly "." or "..", or ends in "/." or "/.."
            ch = characterArray[pathRg.location + pathRg.length - 1];
            if (ch == '/') {
                isDir = true;
            } else if (ch == '.') {
                if (pathRg.length == 1) {
                    isDir = true;
                } else {
                    ch = characterArray[pathRg.location + pathRg.length - 2];
                    if (ch == '/') {
                        isDir = true;
                    } else if (ch != '.') {
                        isDir = false;
                    } else if (pathRg.length == 2) {
                        isDir = true;
                    } else {
                        isDir = (characterArray[pathRg.location + pathRg.length - 3] == '/');
                    }
                }
            } else {
                isDir = false;
            }
        } else {
            // Empty path: take directory-ness from the base URL, if there is one
            isDir = (baseURL != NULL) ? CFURLHasDirectoryPath(baseURL) : false;
        }
        if (isDir) {
            flags |= IS_DIRECTORY;
        }
    }

    (*theFlags) = flags;
    (*range) = (CFRange *)CFAllocatorAllocate(alloc, sizeof(CFRange)*numRanges, 0);
    // Pack the ranges: walk flag bits 0 through 8 and copy ranges[bit] for each component bit that is set.
    // "flags" is reused as the bit mask here; the final flag value has already been stored in *theFlags.
    numRanges = 0;
    for (idx = 0, flags = 1; flags != (1<<9); flags = (flags<<1), idx ++) {
        if ((*theFlags) & flags) {
            (*range)[numRanges] = ranges[idx];
            numRanges ++;
        }
    }
}
#endif  // CFURL_INCLUDE_PARSE_COMPONENTS

/*
 static Boolean scanCharactersCString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const char *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)
 static Boolean scanCharactersUString(CFAllocatorRef alloc, CFMutableStringRef *escapedString, UInt32 *flags, const UniChar *characterArray, Boolean useCString, CFIndex base, CFIndex end, CFIndex *mark, UInt32 componentFlag, CFStringEncoding encoding)

 Scans characterArray[base, end) for characters that need percent-escaping and returns true if any were found.

 For each character that must be escaped: componentFlag is OR'ed into *flags (when both componentFlag and
 flags are non-zero/non-NULL), *escapedString is created on first use, the unescaped run
 characterArray[*mark, idx) is appended to it followed by the percent escapes for the character, and *mark
 is advanced to idx + 1. Characters after the last escaped one are not appended here.
 NOTE(review): presumably the caller appends that remaining tail starting at *mark -- confirm in CFURL.c.

 A character is left alone when it is URL-legal (except '/', '?' and '@' inside the user or password
 component), when it is a '%' followed by two hex digits within the range, or when it is a leading '[' or
 trailing ']' of the host component.

 When useCString is true the run is appended as bytes interpreted as kCFStringEncodingISOLatin1; otherwise
 it is appended as UniChars.
 */
#ifdef CFURL_INCLUDE_SCAN_CHARACTERS    // defined when we want this block of code included
{
    CFIndex idx;
    Boolean sawIllegalChar = false;
    for (idx = base; idx < end; idx ++) {
        Boolean shouldEscape;
        UniChar ch = characterArray[idx];
        if (isURLLegalCharacter(ch)) {
            // These separators are escaped when they appear inside the user or password
            if ((componentFlag == HAS_USER || componentFlag == HAS_PASSWORD) && (ch == '/' || ch == '?' || ch == '@')) {
                shouldEscape = true;
            } else {
                shouldEscape = false;
            }
        } else if (ch == '%' && idx + 2 < end && isHexDigit(characterArray[idx + 1]) && isHexDigit(characterArray[idx+2])) {
            // already a percent escape ("%XX") that lies entirely inside [base, end)
            shouldEscape = false;
        } else if (componentFlag == HAS_HOST && ((idx == base && ch == '[') || (idx == end-1 && ch == ']'))) {
            // brackets delimiting the host (IPv6 literal form) are kept as-is
            shouldEscape = false;
        } else {
            shouldEscape = true;
        }
        if (shouldEscape) {
            sawIllegalChar = true;
            if (componentFlag && flags) {
                *flags |= componentFlag;
            }
            if (!*escapedString) {
                *escapedString = CFStringCreateMutable(alloc, 0);
            }
            // Flush the unescaped run that precedes this character
            if (useCString) {
                CFStringRef tempString = CFStringCreateWithBytes(alloc, (uint8_t *)&(characterArray[*mark]), idx - *mark, kCFStringEncodingISOLatin1, false);
                CFStringAppend(*escapedString, tempString);
                CFRelease(tempString);
            } else {
                CFStringAppendCharacters(*escapedString, (const UniChar *)&(characterArray[*mark]), idx - *mark);
            }
            *mark = idx + 1;
            _appendPercentEscapesForCharacter(ch, encoding, *escapedString); // This can never fail because anURL->_string was constructed from the encoding passed in
        }
    }
    return sawIllegalChar;
}
#endif  // CFURL_INCLUDE_SCAN_CHARACTERS
