1/* 2 * Copyright (C) 2004, 2007, 2008, 2011, 2012, 2013 Apple Inc. All rights reserved. 3 * Copyright (C) 2012 Research In Motion Limited. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include "config.h" 28#include "URL.h" 29 30#include "DecodeEscapeSequences.h" 31#include "MIMETypeRegistry.h" 32#include "TextEncoding.h" 33#include "UUID.h" 34#include <stdio.h> 35#include <unicode/uidna.h> 36#include <wtf/HashMap.h> 37#include <wtf/HexNumber.h> 38#include <wtf/StdLibExtras.h> 39#include <wtf/text/CString.h> 40#include <wtf/text/StringBuilder.h> 41#include <wtf/text/StringHash.h> 42 43// FIXME: This file makes too much use of the + operator on String. 
// We either have to optimize that operator so it doesn't involve
// so many allocations, or change this to use StringBuffer instead.

using namespace WTF;

namespace WebCore {

// Scratch buffers used while building and parsing URL strings.
// NOTE(review): the second template argument is presumably WTF::Vector's
// inline capacity -- confirm against wtf/Vector.h.
typedef Vector<char, 512> CharBuffer;
typedef Vector<UChar, 512> UCharBuffer;

// URL::port() returns invalidPortNumber when the port digits cannot be parsed
// or the parsed value is greater than maximumValidPortNumber.
static const unsigned maximumValidPortNumber = 0xFFFE;
static const unsigned invalidPortNumber = 0xFFFF;

// Returns true when |character|, with bit 0x20 set, equals |lowercaseLetter|.
// |lowercaseLetter| must be an ASCII lowercase letter (asserted).
static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
{
    ASSERT(isASCIILower(lowercaseLetter));
    return (character | 0x20) == lowercaseLetter;
}

// Scheme names and default port strings. These are plain character arrays
// with no terminating null, so they must be used together with their size.
static const char wsScheme[] = {'w', 's'};
static const char ftpScheme[] = {'f', 't', 'p'};
static const char ftpPort[] = {'2', '1'};
static const char wssScheme[] = {'w', 's', 's'};
static const char fileScheme[] = {'f', 'i', 'l', 'e'};
static const char httpScheme[] = {'h', 't', 't', 'p'};
static const char httpPort[] = {'8', '0'};
static const char httpsScheme[] = {'h', 't', 't', 'p', 's'};
static const char httpsPort[] = {'4', '4', '3'};
static const char gopherScheme[] = {'g', 'o', 'p', 'h', 'e', 'r'};
static const char gopherPort[] = {'7', '0'};

// 8-bit overload of the comparison above; same contract.
static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
{
    ASSERT(isASCIILower(lowercaseLetter));
    return (character | 0x20) == lowercaseLetter;
}

// Bit flags stored in characterClassTable. A single character may belong to
// several classes, so the values are distinct bits that get OR'ed together.
enum URLCharacterClasses {
    // alpha
    SchemeFirstChar = 1 << 0,

    // ( alpha | digit | "+" | "-" | "." )
    SchemeChar = 1 << 1,

    // mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    // unreserved  = alphanum | mark
    // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 1 << 2,

    // alnum | "." | "-" | "%"
    // The above is what the specification says, but we are lenient to
    // match existing practice and also allow:
    // "_"
    HostnameChar = 1 << 3,

    // hexdigit | ":" | "%"
    IPv6Char = 1 << 4,

    // "#" | "?" | "/" | nul
    PathSegmentEndChar = 1 << 5,

    // not allowed in path
    BadChar = 1 << 6
};

// Lookup table indexed by byte value (0-255). Each entry is the OR of the
// URLCharacterClasses flags that apply to that byte; 0 means the byte belongs
// to none of the classes. The is*Char() helpers below are thin wrappers that
// test a single flag in this table.
static const unsigned char characterClassTable[256] = {
    /* 0 nul */ PathSegmentEndChar,    /* 1 soh */ BadChar,
    /* 2 stx */ BadChar,    /* 3 etx */ BadChar,
    /* 4 eot */ BadChar,    /* 5 enq */ BadChar,    /* 6 ack */ BadChar,    /* 7 bel */ BadChar,
    /* 8 bs */ BadChar,     /* 9 ht */ BadChar,     /* 10 nl */ BadChar,    /* 11 vt */ BadChar,
    /* 12 np */ BadChar,    /* 13 cr */ BadChar,    /* 14 so */ BadChar,    /* 15 si */ BadChar,
    /* 16 dle */ BadChar,   /* 17 dc1 */ BadChar,   /* 18 dc2 */ BadChar,   /* 19 dc3 */ BadChar,
    /* 20 dc4 */ BadChar,   /* 21 nak */ BadChar,   /* 22 syn */ BadChar,   /* 23 etb */ BadChar,
    /* 24 can */ BadChar,   /* 25 em */ BadChar,    /* 26 sub */ BadChar,   /* 27 esc */ BadChar,
    /* 28 fs */ BadChar,    /* 29 gs */ BadChar,    /* 30 rs */ BadChar,    /* 31 us */ BadChar,
    /* 32 sp */ BadChar,    /* 33 ! */ UserInfoChar,
    /* 34 " */ BadChar,     /* 35 # */ PathSegmentEndChar | BadChar,
    /* 36 $ */ UserInfoChar,    /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
    /* 38 & */ UserInfoChar,    /* 39 ' */ UserInfoChar,
    /* 40 ( */ UserInfoChar,    /* 41 ) */ UserInfoChar,
    /* 42 * */ UserInfoChar,    /* 43 + */ SchemeChar | UserInfoChar,
    /* 44 , */ UserInfoChar,
    /* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
    /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 47 / */ PathSegmentEndChar,
    /* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 58 : */ UserInfoChar | IPv6Char,    /* 59 ; */ UserInfoChar,
    /* 60 < */ BadChar,    /* 61 = */ UserInfoChar,
    /* 62 > */ BadChar,    /* 63 ? */ PathSegmentEndChar | BadChar,
    /* 64 @ */ 0,
    /* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 91 [ */ 0,
    /* 92 \ */ 0,    /* 93 ] */ 0,
    /* 94 ^ */ 0,
    /* 95 _ */ UserInfoChar | HostnameChar,
    /* 96 ` */ 0,
    /* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 123 { */ 0,
    /* 124 | */ 0,   /* 125 } */ 0,   /* 126 ~ */ UserInfoChar,   /* 127 del */ BadChar,
    /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
    /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
    /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
    /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
    /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
    /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
    /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
    /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
    /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
    /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
    /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
    /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
    /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
    /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
    /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
    /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
    /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
    /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
    /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
    /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
    /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
    /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
    /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
    /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
    /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
    /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
    /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
    /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
    /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
    /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
    /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
    /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
};

// Forward declarations of file-local helpers that are used before they are defined.
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
static String substituteBackslashes(const String&);

// Character classifiers backed by characterClassTable.
// The char overloads cast to unsigned char before indexing so that a negative
// char cannot produce a negative index. The UChar overloads report false for
// any code unit above 0xff, which lies outside the table.
static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }

// Compares one character of a URL's scheme against one character of an
// expected scheme by setting bit 0x20 in |character| before comparing.
// The assertions require |character| to be a scheme character and
// |schemeCharacter| to have bit 0x20 set and to be either an ASCII lowercase
// letter or a non-uppercase scheme character.
static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
{
    ASSERT(isSchemeChar(character));
    ASSERT(schemeCharacter & 0x20);
    ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
    return (character | 0x20) == schemeCharacter;
}

// Copies the source to the destination, assuming all the source characters are
// ASCII. The destination buffer must be large enough. Null characters are allowed
// in the source string, and no attempt is made to null-terminate the result.
262static void copyASCII(const String& string, char* dest) 263{ 264 if (string.isEmpty()) 265 return; 266 267 if (string.is8Bit()) 268 memcpy(dest, string.characters8(), string.length()); 269 else { 270 const UChar* src = string.characters16(); 271 size_t length = string.length(); 272 for (size_t i = 0; i < length; i++) 273 dest[i] = static_cast<char>(src[i]); 274 } 275} 276 277static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer) 278{ 279 buffer.resize(base.length() + len + 1); 280 copyASCII(base, buffer.data()); 281 memcpy(buffer.data() + base.length(), rel, len); 282 buffer[buffer.size() - 1] = '\0'; 283} 284 285// FIXME: Move to WTFString.h eventually. 286// Returns the index of the first index in string |s| of any of the characters 287// in |toFind|. |toFind| should be a null-terminated string, all characters up 288// to the null will be searched. Returns int if not found. 289static int findFirstOf(StringView string, unsigned startPosition, const char* target) 290{ 291 unsigned length = string.length(); 292 for (unsigned i = startPosition; i < length; ++i) { 293 for (unsigned j = 0; target[j]; ++j) { 294 if (string[i] == target[j]) 295 return i; 296 } 297 } 298 return -1; 299} 300 301static inline void checkEncodedString(const String& url) 302{ 303 ASSERT_UNUSED(url, url.containsOnlyASCII()); 304 ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0])); 305} 306 307inline bool URL::protocolIs(const String& string, const char* protocol) 308{ 309 return WebCore::protocolIs(string, protocol); 310} 311 312void URL::invalidate() 313{ 314 m_isValid = false; 315 m_protocolIsInHTTPFamily = false; 316 m_schemeEnd = 0; 317 m_userStart = 0; 318 m_userEnd = 0; 319 m_passwordEnd = 0; 320 m_hostEnd = 0; 321 m_portEnd = 0; 322 m_pathEnd = 0; 323 m_pathAfterLastSlash = 0; 324 m_queryEnd = 0; 325 m_fragmentEnd = 0; 326} 327 328URL::URL(ParsedURLStringTag, const String& url) 329{ 330 parse(url); 331 ASSERT(url == m_string); 332} 333 
334URL::URL(const URL& base, const String& relative) 335{ 336 init(base, relative, UTF8Encoding()); 337} 338 339URL::URL(const URL& base, const String& relative, const TextEncoding& encoding) 340{ 341 // For UTF-{7,16,32}, we want to use UTF-8 for the query part as 342 // we do when submitting a form. A form with GET method 343 // has its contents added to a URL as query params and it makes sense 344 // to be consistent. 345 init(base, relative, encoding.encodingForFormSubmission()); 346} 347 348static bool shouldTrimFromURL(unsigned char c) 349{ 350 // Browsers ignore leading/trailing whitespace and control 351 // characters from URLs. Note that c is an *unsigned* char here 352 // so this comparison should only catch control characters. 353 return c <= ' '; 354} 355 356void URL::init(const URL& base, const String& relative, const TextEncoding& encoding) 357{ 358 // Allow resolutions with a null or empty base URL, but not with any other invalid one. 359 // FIXME: Is this a good rule? 360 if (!base.m_isValid && !base.isEmpty()) { 361 m_string = relative; 362 invalidate(); 363 return; 364 } 365 366 // For compatibility with Win IE, treat backslashes as if they were slashes, 367 // as long as we're not dealing with javascript: or data: URLs. 368 String rel = relative; 369 if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data"))) 370 rel = substituteBackslashes(rel); 371 372 bool allASCII = rel.containsOnlyASCII(); 373 CharBuffer strBuffer; 374 char* str; 375 size_t len; 376 if (allASCII) { 377 len = rel.length(); 378 strBuffer.resize(len + 1); 379 copyASCII(rel, strBuffer.data()); 380 strBuffer[len] = 0; 381 str = strBuffer.data(); 382 } else { 383 encodeRelativeString(rel, encoding, strBuffer); 384 str = strBuffer.data(); 385 len = strlen(str); 386 } 387 388 // Get rid of leading whitespace and control characters. 
389 while (len && shouldTrimFromURL(*str)) { 390 str++; 391 --len; 392 } 393 394 // Get rid of trailing whitespace and control characters. 395 while (len && shouldTrimFromURL(str[len - 1])) 396 str[--len] = '\0'; 397 398 // According to the RFC, the reference should be interpreted as an 399 // absolute URI if possible, using the "leftmost, longest" 400 // algorithm. If the URI reference is absolute it will have a 401 // scheme, meaning that it will have a colon before the first 402 // non-scheme element. 403 bool absolute = false; 404 char* p = str; 405 if (isSchemeFirstChar(*p)) { 406 ++p; 407 while (isSchemeChar(*p)) { 408 ++p; 409 } 410 if (*p == ':') { 411 if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical()) 412 str = p + 1; 413 else 414 absolute = true; 415 } 416 } 417 418 CharBuffer parseBuffer; 419 420 if (absolute) { 421 parse(str, &relative); 422 } else { 423 // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid 424 // unless the relative URL is a single fragment. 425 if (!base.isHierarchical()) { 426 if (str[0] == '#') { 427 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); 428 parse(parseBuffer.data(), &relative); 429 } else { 430 m_string = relative; 431 invalidate(); 432 } 433 return; 434 } 435 436 switch (str[0]) { 437 case '\0': 438 // The reference is empty, so this is a reference to the same document with any fragment identifier removed. 
439 *this = base; 440 removeFragmentIdentifier(); 441 break; 442 case '#': { 443 // must be fragment-only reference 444 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); 445 parse(parseBuffer.data(), &relative); 446 break; 447 } 448 case '?': { 449 // query-only reference, special case needed for non-URL results 450 appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer); 451 parse(parseBuffer.data(), &relative); 452 break; 453 } 454 case '/': 455 // must be net-path or absolute-path reference 456 if (str[1] == '/') { 457 // net-path 458 appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer); 459 parse(parseBuffer.data(), &relative); 460 } else { 461 // abs-path 462 appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer); 463 parse(parseBuffer.data(), &relative); 464 } 465 break; 466 default: 467 { 468 // must be relative-path reference 469 470 // Base part plus relative part plus one possible slash added in between plus terminating \0 byte. 
471 const size_t bufferSize = base.m_pathEnd + 1 + len + 1; 472 parseBuffer.resize(bufferSize); 473 474 char* bufferPos = parseBuffer.data(); 475 char* bufferStart = bufferPos; 476 477 // first copy everything before the path from the base 478 CharBuffer baseStringBuffer(base.m_string.length()); 479 copyASCII(base.m_string, baseStringBuffer.data()); 480 const char* baseString = baseStringBuffer.data(); 481 const char* baseStringStart = baseString; 482 const char* pathStart = baseStringStart + base.m_portEnd; 483 while (baseStringStart < pathStart) 484 *bufferPos++ = *baseStringStart++; 485 char* bufferPathStart = bufferPos; 486 487 // now copy the base path 488 const char* baseStringEnd = baseString + base.m_pathEnd; 489 490 // go back to the last slash 491 while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/') 492 baseStringEnd--; 493 494 if (baseStringEnd == baseStringStart) { 495 // no path in base, add a path separator if necessary 496 if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#') 497 *bufferPos++ = '/'; 498 } else { 499 bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart); 500 } 501 502 const char* relStringStart = str; 503 const char* relStringPos = relStringStart; 504 505 while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') { 506 if (relStringPos[0] == '.' && bufferPos[-1] == '/') { 507 if (isPathSegmentEndChar(relStringPos[1])) { 508 // skip over "." segment 509 relStringPos += 1; 510 if (relStringPos[0] == '/') 511 relStringPos++; 512 continue; 513 } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) { 514 // skip over ".." segment and rewind the last segment 515 // the RFC leaves it up to the app to decide what to do with excess 516 // ".." segments - we choose to drop them since some web content 517 // relies on this. 
518 relStringPos += 2; 519 if (relStringPos[0] == '/') 520 relStringPos++; 521 if (bufferPos > bufferPathStart + 1) 522 bufferPos--; 523 while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/') 524 bufferPos--; 525 continue; 526 } 527 } 528 529 *bufferPos = *relStringPos; 530 relStringPos++; 531 bufferPos++; 532 } 533 534 // all done with the path work, now copy any remainder 535 // of the relative reference; this will also add a null terminator 536 strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart)); 537 538 parse(parseBuffer.data(), &relative); 539 540 ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size()); 541 break; 542 } 543 } 544 } 545} 546 547URL URL::copy() const 548{ 549 URL result = *this; 550 result.m_string = result.m_string.isolatedCopy(); 551 return result; 552} 553 554String URL::lastPathComponent() const 555{ 556 if (!hasPath()) 557 return String(); 558 559 unsigned end = m_pathEnd - 1; 560 if (m_string[end] == '/') 561 --end; 562 563 size_t start = m_string.reverseFind('/', end); 564 if (start < static_cast<unsigned>(m_portEnd)) 565 return String(); 566 ++start; 567 568 return m_string.substring(start, end - start + 1); 569} 570 571String URL::protocol() const 572{ 573 return m_string.left(m_schemeEnd); 574} 575 576String URL::host() const 577{ 578 int start = hostStart(); 579 return m_string.substring(start, m_hostEnd - start); 580} 581 582unsigned short URL::port() const 583{ 584 // We return a port of 0 if there is no port specified. This can happen in two situations: 585 // 1) The URL contains no colon after the host name and before the path component of the URL. 586 // 2) The URL contains a colon but there's no port number before the path component of the URL begins. 
587 if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1) 588 return 0; 589 590 bool ok = false; 591 unsigned number; 592 if (m_string.is8Bit()) 593 number = charactersToUIntStrict(m_string.characters8() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok); 594 else 595 number = charactersToUIntStrict(m_string.characters16() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok); 596 if (!ok || number > maximumValidPortNumber) 597 return invalidPortNumber; 598 return number; 599} 600 601String URL::pass() const 602{ 603 if (m_passwordEnd == m_userEnd) 604 return String(); 605 606 return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1)); 607} 608 609String URL::user() const 610{ 611 return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart)); 612} 613 614String URL::fragmentIdentifier() const 615{ 616 if (m_fragmentEnd == m_queryEnd) 617 return String(); 618 619 return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1)); 620} 621 622bool URL::hasFragmentIdentifier() const 623{ 624 return m_fragmentEnd != m_queryEnd; 625} 626 627String URL::baseAsString() const 628{ 629 return m_string.left(m_pathAfterLastSlash); 630} 631 632#if !USE(CF) 633String URL::fileSystemPath() const 634{ 635 if (!isValid() || !isLocalFile()) 636 return String(); 637 638 return decodeURLEscapeSequences(path()); 639} 640#endif 641 642#ifdef NDEBUG 643 644static inline void assertProtocolIsGood(const char*) 645{ 646} 647 648#else 649 650static void assertProtocolIsGood(const char* protocol) 651{ 652 const char* p = protocol; 653 while (*p) { 654 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); 655 ++p; 656 } 657} 658 659#endif 660 661bool URL::protocolIs(const char* protocol) const 662{ 663 assertProtocolIsGood(protocol); 664 665 // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid. 666 // The free function protocolIsJavaScript() should be used instead. 
667 ASSERT(!equalIgnoringCase(protocol, String("javascript"))); 668 669 if (!m_isValid) 670 return false; 671 672 // Do the comparison without making a new string object. 673 for (int i = 0; i < m_schemeEnd; ++i) { 674 if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i])) 675 return false; 676 } 677 return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument. 678} 679 680String URL::query() const 681{ 682 if (m_queryEnd == m_pathEnd) 683 return String(); 684 685 return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1)); 686} 687 688String URL::path() const 689{ 690 return m_string.substring(m_portEnd, m_pathEnd - m_portEnd); 691} 692 693bool URL::setProtocol(const String& s) 694{ 695 // Firefox and IE remove everything after the first ':'. 696 size_t separatorPosition = s.find(':'); 697 String newProtocol = s.substring(0, separatorPosition); 698 699 if (!isValidProtocol(newProtocol)) 700 return false; 701 702 if (!m_isValid) { 703 parse(newProtocol + ':' + m_string); 704 return true; 705 } 706 707 parse(newProtocol + m_string.substring(m_schemeEnd)); 708 return true; 709} 710 711void URL::setHost(const String& s) 712{ 713 if (!m_isValid) 714 return; 715 716 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 717 // and to avoid changing more than just the host. 718 719 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; 720 721 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd)); 722} 723 724void URL::removePort() 725{ 726 if (m_hostEnd == m_portEnd) 727 return; 728 parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd)); 729} 730 731void URL::setPort(unsigned short i) 732{ 733 if (!m_isValid) 734 return; 735 736 bool colonNeeded = m_portEnd == m_hostEnd; 737 int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1); 738 739 parse(m_string.left(portStart) + (colonNeeded ? 
":" : "") + String::number(i) + m_string.substring(m_portEnd)); 740} 741 742void URL::setHostAndPort(const String& hostAndPort) 743{ 744 if (!m_isValid) 745 return; 746 747 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 748 // and to avoid changing more than just host and port. 749 750 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; 751 752 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd)); 753} 754 755void URL::setUser(const String& user) 756{ 757 if (!m_isValid) 758 return; 759 760 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 761 // and to avoid changing more than just the user login. 762 763 int end = m_userEnd; 764 if (!user.isEmpty()) { 765 String u = user; 766 if (m_userStart == m_schemeEnd + 1) 767 u = "//" + u; 768 // Add '@' if we didn't have one before. 769 if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@')) 770 u.append('@'); 771 parse(m_string.left(m_userStart) + u + m_string.substring(end)); 772 } else { 773 // Remove '@' if we now have neither user nor password. 774 if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@') 775 end += 1; 776 // We don't want to parse in the extremely common case where we are not going to make a change. 777 if (m_userStart != end) 778 parse(m_string.left(m_userStart) + m_string.substring(end)); 779 } 780} 781 782void URL::setPass(const String& password) 783{ 784 if (!m_isValid) 785 return; 786 787 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 788 // and to avoid changing more than just the user password. 789 790 int end = m_passwordEnd; 791 if (!password.isEmpty()) { 792 String p = ":" + password + "@"; 793 if (m_userEnd == m_schemeEnd + 1) 794 p = "//" + p; 795 // Eat the existing '@' since we are going to add our own. 
796 if (end != m_hostEnd && m_string[end] == '@') 797 end += 1; 798 parse(m_string.left(m_userEnd) + p + m_string.substring(end)); 799 } else { 800 // Remove '@' if we now have neither user nor password. 801 if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@') 802 end += 1; 803 // We don't want to parse in the extremely common case where we are not going to make a change. 804 if (m_userEnd != end) 805 parse(m_string.left(m_userEnd) + m_string.substring(end)); 806 } 807} 808 809void URL::setFragmentIdentifier(const String& s) 810{ 811 if (!m_isValid) 812 return; 813 814 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations. 815 parse(m_string.left(m_queryEnd) + "#" + s); 816} 817 818void URL::removeFragmentIdentifier() 819{ 820 if (!m_isValid) 821 return; 822 parse(m_string.left(m_queryEnd)); 823} 824 825void URL::setQuery(const String& query) 826{ 827 if (!m_isValid) 828 return; 829 830 // FIXME: '#' and non-ASCII characters must be encoded and escaped. 831 // Usually, the query is encoded using document encoding, not UTF-8, but we don't have 832 // access to the document in this function. 833 if ((query.isEmpty() || query[0] != '?') && !query.isNull()) 834 parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd)); 835 else 836 parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd)); 837 838} 839 840void URL::setPath(const String& s) 841{ 842 if (!m_isValid) 843 return; 844 845 // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts 846 // may be inadvertently affected. 
847 String path = s; 848 if (path.isEmpty() || path[0] != '/') 849 path = "/" + path; 850 851 parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd)); 852} 853 854String decodeURLEscapeSequences(const String& string) 855{ 856 return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding()); 857} 858 859String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding) 860{ 861 return decodeEscapeSequences<URLEscapeSequence>(string, encoding); 862} 863 864// Caution: This function does not bounds check. 865static void appendEscapedChar(char*& buffer, unsigned char c) 866{ 867 *buffer++ = '%'; 868 placeByteAsHex(c, buffer); 869} 870 871static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length) 872{ 873 char* p = buffer; 874 875 const char* str = strStart; 876 const char* strEnd = strStart + length; 877 while (str < strEnd) { 878 unsigned char c = *str++; 879 if (isBadChar(c)) { 880 if (c == '%' || c == '?') 881 *p++ = c; 882 else if (c != 0x09 && c != 0x0a && c != 0x0d) 883 appendEscapedChar(p, c); 884 } else 885 *p++ = c; 886 } 887 888 buffer = p; 889} 890 891static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length) 892{ 893 char* p = buffer; 894 895 const char* str = strStart; 896 const char* strEnd = strStart + length; 897 while (str < strEnd) { 898 unsigned char c = *str++; 899 // Strip CR, LF and Tab from fragments, per: 900 // https://bugs.webkit.org/show_bug.cgi?id=8770 901 if (c == 0x09 || c == 0x0a || c == 0x0d) 902 continue; 903 904 // Chrome and IE allow non-ascii characters in fragments, however doing 905 // so would hit an ASSERT in checkEncodedString, so for now we don't. 906 if (c < 0x20 || c >= 127) { 907 appendEscapedChar(p, c); 908 continue; 909 } 910 *p++ = c; 911 } 912 913 buffer = p; 914} 915 916// copy a path, accounting for "." and ".." 
segments 917static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd) 918{ 919 char* bufferPathStart = dst; 920 921 // empty path is a special case, and need not have a leading slash 922 if (srcStart != srcEnd) { 923 const char* baseStringStart = src + srcStart; 924 const char* baseStringEnd = src + srcEnd; 925 const char* baseStringPos = baseStringStart; 926 927 // this code is unprepared for paths that do not begin with a 928 // slash and we should always have one in the source string 929 ASSERT(baseStringPos[0] == '/'); 930 931 // copy the leading slash into the destination 932 *dst = *baseStringPos; 933 baseStringPos++; 934 dst++; 935 936 while (baseStringPos < baseStringEnd) { 937 if (baseStringPos[0] == '.' && dst[-1] == '/') { 938 if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) { 939 // skip over "." segment 940 baseStringPos += 2; 941 continue; 942 } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' || 943 baseStringPos + 2 == baseStringEnd)) { 944 // skip over ".." segment and rewind the last segment 945 // the RFC leaves it up to the app to decide what to do with excess 946 // ".." segments - we choose to drop them since some web content 947 // relies on this. 948 baseStringPos += 3; 949 if (dst > bufferPathStart + 1) 950 dst--; 951 while (dst > bufferPathStart && dst[-1] != '/') 952 dst--; 953 continue; 954 } 955 } 956 957 *dst = *baseStringPos; 958 baseStringPos++; 959 dst++; 960 } 961 } 962 *dst = '\0'; 963 return dst - bufferPathStart; 964} 965 966static inline bool hasSlashDotOrDotDot(const char* str) 967{ 968 const unsigned char* p = reinterpret_cast<const unsigned char*>(str); 969 if (!*p) 970 return false; 971 unsigned char pc = *p; 972 while (unsigned char c = *++p) { 973 if (c == '.' 
&& (pc == '/' || pc == '.')) 974 return true; 975 pc = c; 976 } 977 return false; 978} 979 980void URL::parse(const String& string) 981{ 982 checkEncodedString(string); 983 984 CharBuffer buffer(string.length() + 1); 985 copyASCII(string, buffer.data()); 986 buffer[string.length()] = '\0'; 987 parse(buffer.data(), &string); 988} 989 990#if PLATFORM(IOS) 991static bool shouldCanonicalizeScheme = true; 992 993void enableURLSchemeCanonicalization(bool enableSchemeCanonicalization) 994{ 995 shouldCanonicalizeScheme = enableSchemeCanonicalization; 996} 997#endif 998 999template<size_t length> 1000static inline bool equal(const char* a, const char (&b)[length]) 1001{ 1002#if PLATFORM(IOS) 1003 if (!shouldCanonicalizeScheme) { 1004 for (size_t i = 0; i < length; ++i) { 1005 if (toASCIILower(a[i]) != b[i]) 1006 return false; 1007 } 1008 return true; 1009 } 1010#endif 1011 for (size_t i = 0; i < length; ++i) { 1012 if (a[i] != b[i]) 1013 return false; 1014 } 1015 return true; 1016} 1017 1018template<size_t lengthB> 1019static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB]) 1020{ 1021 return lengthA == lengthB && equal(stringA, stringB); 1022} 1023 1024// List of default schemes is taken from google-url: 1025// http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120 1026static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength) 1027{ 1028 // This switch is theoretically a performance optimization. It came over when 1029 // the code was moved from google-url, but may be removed later. 
1030 switch (schemeLength) { 1031 case 2: 1032 return equal(scheme, wsScheme) && equal(port, portLength, httpPort); 1033 case 3: 1034 if (equal(scheme, ftpScheme)) 1035 return equal(port, portLength, ftpPort); 1036 if (equal(scheme, wssScheme)) 1037 return equal(port, portLength, httpsPort); 1038 break; 1039 case 4: 1040 return equal(scheme, httpScheme) && equal(port, portLength, httpPort); 1041 case 5: 1042 return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort); 1043 case 6: 1044 return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort); 1045 } 1046 return false; 1047} 1048 1049static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userinfoEndChar) 1050{ 1051 return userinfoEndChar == '@' && hostStart == portEnd; 1052} 1053 1054static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength) 1055{ 1056 switch (schemeLength) { 1057 case 2: 1058 return equal(scheme, wsScheme); 1059 case 3: 1060 return equal(scheme, ftpScheme) || equal(scheme, wssScheme); 1061 case 4: 1062 return equal(scheme, httpScheme); 1063 case 5: 1064 return equal(scheme, httpsScheme); 1065 case 6: 1066 return equal(scheme, gopherScheme); 1067 } 1068 return false; 1069} 1070 1071static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength) 1072{ 1073 switch (schemeLength) { 1074 case 2: 1075 return equal(scheme, wsScheme); 1076 case 3: 1077 return equal(scheme, ftpScheme) || equal(scheme, wssScheme); 1078 case 4: 1079 return equal(scheme, httpScheme) || equal(scheme, fileScheme); 1080 case 5: 1081 return equal(scheme, httpsScheme); 1082 case 6: 1083 return equal(scheme, gopherScheme); 1084 } 1085 return false; 1086} 1087 1088void URL::parse(const char* url, const String* originalString) 1089{ 1090 if (!url || url[0] == '\0') { 1091 // valid URL must be non-empty 1092 m_string = originalString ? 
*originalString : url; 1093 invalidate(); 1094 return; 1095 } 1096 1097 if (!isSchemeFirstChar(url[0])) { 1098 // scheme must start with an alphabetic character 1099 m_string = originalString ? *originalString : url; 1100 invalidate(); 1101 return; 1102 } 1103 1104 int schemeEnd = 0; 1105 while (isSchemeChar(url[schemeEnd])) 1106 schemeEnd++; 1107 1108 if (url[schemeEnd] != ':') { 1109 m_string = originalString ? *originalString : url; 1110 invalidate(); 1111 return; 1112 } 1113 1114 int userStart = schemeEnd + 1; 1115 int userEnd; 1116 int passwordStart; 1117 int passwordEnd; 1118 int hostStart; 1119 int hostEnd; 1120 int portStart; 1121 int portEnd; 1122 1123 bool hierarchical = url[schemeEnd + 1] == '/'; 1124 bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/'; 1125 1126 bool isFile = schemeEnd == 4 1127 && isLetterMatchIgnoringCase(url[0], 'f') 1128 && isLetterMatchIgnoringCase(url[1], 'i') 1129 && isLetterMatchIgnoringCase(url[2], 'l') 1130 && isLetterMatchIgnoringCase(url[3], 'e'); 1131 1132 m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h') 1133 && isLetterMatchIgnoringCase(url[1], 't') 1134 && isLetterMatchIgnoringCase(url[2], 't') 1135 && isLetterMatchIgnoringCase(url[3], 'p') 1136 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':')); 1137 1138 if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) { 1139 // The part after the scheme is either a net_path or an abs_path whose first path segment is empty. 1140 // Attempt to find an authority. 1141 // FIXME: Authority characters may be scanned twice, and it would be nice to be faster. 
1142 1143 if (hierarchical) 1144 userStart++; 1145 if (hasSecondSlash) 1146 userStart++; 1147 userEnd = userStart; 1148 1149 int colonPos = 0; 1150 while (isUserInfoChar(url[userEnd])) { 1151 if (url[userEnd] == ':' && colonPos == 0) 1152 colonPos = userEnd; 1153 userEnd++; 1154 } 1155 1156 if (url[userEnd] == '@') { 1157 // actual end of the userinfo, start on the host 1158 if (colonPos != 0) { 1159 passwordEnd = userEnd; 1160 userEnd = colonPos; 1161 passwordStart = colonPos + 1; 1162 } else 1163 passwordStart = passwordEnd = userEnd; 1164 1165 hostStart = passwordEnd + 1; 1166 } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) { 1167 // hit the end of the authority, must have been no user 1168 // or looks like an IPv6 hostname 1169 // either way, try to parse it as a hostname 1170 userEnd = userStart; 1171 passwordStart = passwordEnd = userEnd; 1172 hostStart = userStart; 1173 } else { 1174 // invalid character 1175 m_string = originalString ? *originalString : url; 1176 invalidate(); 1177 return; 1178 } 1179 1180 hostEnd = hostStart; 1181 1182 // IPV6 IP address 1183 if (url[hostEnd] == '[') { 1184 hostEnd++; 1185 while (isIPv6Char(url[hostEnd])) 1186 hostEnd++; 1187 if (url[hostEnd] == ']') 1188 hostEnd++; 1189 else { 1190 // invalid character 1191 m_string = originalString ? *originalString : url; 1192 invalidate(); 1193 return; 1194 } 1195 } else { 1196 while (isHostnameChar(url[hostEnd])) 1197 hostEnd++; 1198 } 1199 1200 if (url[hostEnd] == ':') { 1201 portStart = portEnd = hostEnd + 1; 1202 1203 // possible start of port 1204 portEnd = portStart; 1205 while (isASCIIDigit(url[portEnd])) 1206 portEnd++; 1207 } else 1208 portStart = portEnd = hostEnd; 1209 1210 if (!isPathSegmentEndChar(url[portEnd])) { 1211 // invalid character 1212 m_string = originalString ? 
*originalString : url; 1213 invalidate(); 1214 return; 1215 } 1216 1217 if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[passwordEnd])) { 1218 m_string = originalString ? *originalString : url; 1219 invalidate(); 1220 return; 1221 } 1222 1223 if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) { 1224 // No authority found, which means that this is not a net_path, but rather an abs_path whose first two 1225 // path segments are empty. For file, http and https only, an empty authority is allowed. 1226 userStart -= 2; 1227 userEnd = userStart; 1228 passwordStart = userEnd; 1229 passwordEnd = passwordStart; 1230 hostStart = passwordEnd; 1231 hostEnd = hostStart; 1232 portStart = hostEnd; 1233 portEnd = hostEnd; 1234 } 1235 } else { 1236 // the part after the scheme must be an opaque_part or an abs_path 1237 userEnd = userStart; 1238 passwordStart = passwordEnd = userEnd; 1239 hostStart = hostEnd = passwordEnd; 1240 portStart = portEnd = hostEnd; 1241 } 1242 1243 int pathStart = portEnd; 1244 int pathEnd = pathStart; 1245 while (url[pathEnd] && url[pathEnd] != '?' 
&& url[pathEnd] != '#') 1246 pathEnd++; 1247 1248 int queryStart = pathEnd; 1249 int queryEnd = queryStart; 1250 if (url[queryStart] == '?') { 1251 while (url[queryEnd] && url[queryEnd] != '#') 1252 queryEnd++; 1253 } 1254 1255 int fragmentStart = queryEnd; 1256 int fragmentEnd = fragmentStart; 1257 if (url[fragmentStart] == '#') { 1258 fragmentStart++; 1259 fragmentEnd = fragmentStart; 1260 while (url[fragmentEnd]) 1261 fragmentEnd++; 1262 } 1263 1264 // assemble it all, remembering the real ranges 1265 1266 Vector<char, 4096> buffer(fragmentEnd * 3 + 1); 1267 1268 char *p = buffer.data(); 1269 const char *strPtr = url; 1270 1271 // copy in the scheme 1272 const char *schemeEndPtr = url + schemeEnd; 1273#if PLATFORM(IOS) 1274 if (shouldCanonicalizeScheme || m_protocolIsInHTTPFamily) { 1275 while (strPtr < schemeEndPtr) 1276 *p++ = toASCIILower(*strPtr++); 1277 } else { 1278 while (strPtr < schemeEndPtr) 1279 *p++ = *strPtr++; 1280 } 1281#else 1282 while (strPtr < schemeEndPtr) 1283 *p++ = toASCIILower(*strPtr++); 1284#endif 1285 m_schemeEnd = p - buffer.data(); 1286 1287 bool hostIsLocalHost = portEnd - userStart == 9 1288 && isLetterMatchIgnoringCase(url[userStart], 'l') 1289 && isLetterMatchIgnoringCase(url[userStart+1], 'o') 1290 && isLetterMatchIgnoringCase(url[userStart+2], 'c') 1291 && isLetterMatchIgnoringCase(url[userStart+3], 'a') 1292 && isLetterMatchIgnoringCase(url[userStart+4], 'l') 1293 && isLetterMatchIgnoringCase(url[userStart+5], 'h') 1294 && isLetterMatchIgnoringCase(url[userStart+6], 'o') 1295 && isLetterMatchIgnoringCase(url[userStart+7], 's') 1296 && isLetterMatchIgnoringCase(url[userStart+8], 't'); 1297 1298 // File URLs need a host part unless it is just file:// or file://localhost 1299 bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost); 1300 1301 // We drop empty credentials, but keep a colon in an empty host/port pair. 
1302 // Removing hostname completely would change the structure of the URL on re-parsing. 1303 bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || hostEnd != portEnd; 1304 1305 // add ":" after scheme 1306 *p++ = ':'; 1307 1308 // if we have at least one authority part or a file URL - add "//" and authority 1309 if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) { 1310 *p++ = '/'; 1311 *p++ = '/'; 1312 1313 m_userStart = p - buffer.data(); 1314 1315 // copy in the user 1316 strPtr = url + userStart; 1317 const char* userEndPtr = url + userEnd; 1318 while (strPtr < userEndPtr) { 1319 char c = *strPtr++; 1320 ASSERT(isUserInfoChar(c)); 1321 *p++ = c; 1322 } 1323 m_userEnd = p - buffer.data(); 1324 1325 // copy in the password 1326 if (passwordEnd != passwordStart) { 1327 *p++ = ':'; 1328 strPtr = url + passwordStart; 1329 const char* passwordEndPtr = url + passwordEnd; 1330 while (strPtr < passwordEndPtr) { 1331 char c = *strPtr++; 1332 ASSERT(isUserInfoChar(c)); 1333 *p++ = c; 1334 } 1335 } 1336 m_passwordEnd = p - buffer.data(); 1337 1338 // If we had any user info, add "@" 1339 if (p - buffer.data() != m_userStart) 1340 *p++ = '@'; 1341 1342 // copy in the host, except in the case of a file URL with authority="localhost" 1343 if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) { 1344 strPtr = url + hostStart; 1345 const char* hostEndPtr = url + hostEnd; 1346 if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) { 1347 while (strPtr < hostEndPtr) { 1348 char c = toASCIILower(*strPtr++); 1349 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':'); 1350 *p++ = c; 1351 } 1352 } else { 1353 while (strPtr < hostEndPtr) { 1354 char c = *strPtr++; 1355 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':'); 1356 *p++ = c; 1357 } 1358 } 1359 } 1360 m_hostEnd = p - buffer.data(); 1361 1362 // Copy in the port if the URL has one (and it's not default). 
Also, copy it if there was no hostname, so that there is still something in authority component. 1363 if (hostEnd != portStart) { 1364 const char* portStr = url + portStart; 1365 size_t portLength = portEnd - portStart; 1366 if ((portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd)) 1367 || (hostStart == hostEnd && hostEnd != portStart)) { 1368 *p++ = ':'; 1369 const char* portEndPtr = url + portEnd; 1370 while (portStr < portEndPtr) 1371 *p++ = *portStr++; 1372 } 1373 } 1374 m_portEnd = p - buffer.data(); 1375 } else { 1376 if (isFile) { 1377 ASSERT(degenerateFilePath); 1378 *p++ = '/'; 1379 *p++ = '/'; 1380 } 1381 m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data(); 1382 } 1383 1384 // For canonicalization, ensure we have a '/' for no path. 1385 // Do this only for URL with protocol file, http or https. 1386 if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart) 1387 *p++ = '/'; 1388 1389 // add path, escaping bad characters 1390 if (!hierarchical) 1391 escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart); 1392 else if (!hasSlashDotOrDotDot(url)) 1393 appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart); 1394 else { 1395 CharBuffer pathBuffer(pathEnd - pathStart + 1); 1396 size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd); 1397 appendEscapingBadChars(p, pathBuffer.data(), length); 1398 } 1399 1400 m_pathEnd = p - buffer.data(); 1401 1402 // Find the position after the last slash in the path, or 1403 // the position before the path if there are no slashes in it. 
1404 int i; 1405 for (i = m_pathEnd; i > m_portEnd; --i) { 1406 if (buffer[i - 1] == '/') 1407 break; 1408 } 1409 m_pathAfterLastSlash = i; 1410 1411 // add query, escaping bad characters 1412 appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart); 1413 m_queryEnd = p - buffer.data(); 1414 1415 // add fragment, escaping bad characters 1416 if (fragmentEnd != queryEnd) { 1417 *p++ = '#'; 1418 escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart); 1419 } 1420 m_fragmentEnd = p - buffer.data(); 1421 1422 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size())); 1423 ASSERT(buffer.size() > 0); 1424 1425 // If we didn't end up actually changing the original string and 1426 // it was already in a String, reuse it to avoid extra allocation. 1427 if (originalString && equal(originalString->impl(), buffer.data(), m_fragmentEnd)) 1428 m_string = *originalString; 1429 else 1430 m_string = String(buffer.data(), m_fragmentEnd); 1431 1432 m_isValid = true; 1433} 1434 1435bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b) 1436{ 1437 if (a.m_queryEnd != b.m_queryEnd) 1438 return false; 1439 unsigned queryLength = a.m_queryEnd; 1440 for (unsigned i = 0; i < queryLength; ++i) 1441 if (a.string()[i] != b.string()[i]) 1442 return false; 1443 return true; 1444} 1445 1446bool protocolHostAndPortAreEqual(const URL& a, const URL& b) 1447{ 1448 if (a.m_schemeEnd != b.m_schemeEnd) 1449 return false; 1450 1451 int hostStartA = a.hostStart(); 1452 int hostLengthA = a.hostEnd() - hostStartA; 1453 int hostStartB = b.hostStart(); 1454 int hostLengthB = b.hostEnd() - b.hostStart(); 1455 if (hostLengthA != hostLengthB) 1456 return false; 1457 1458 // Check the scheme 1459 for (int i = 0; i < a.m_schemeEnd; ++i) 1460 if (a.string()[i] != b.string()[i]) 1461 return false; 1462 1463 // And the host 1464 for (int i = 0; i < hostLengthA; ++i) 1465 if (a.string()[hostStartA + i] != b.string()[hostStartB + i]) 1466 return false; 1467 1468 if 
(a.port() != b.port()) 1469 return false; 1470 1471 return true; 1472} 1473 1474String encodeWithURLEscapeSequences(const String& notEncodedString) 1475{ 1476 CString asUTF8 = notEncodedString.utf8(); 1477 1478 CharBuffer buffer(asUTF8.length() * 3 + 1); 1479 char* p = buffer.data(); 1480 1481 const char* str = asUTF8.data(); 1482 const char* strEnd = str + asUTF8.length(); 1483 while (str < strEnd) { 1484 unsigned char c = *str++; 1485 if (isBadChar(c)) 1486 appendEscapedChar(p, c); 1487 else 1488 *p++ = c; 1489 } 1490 1491 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size())); 1492 1493 return String(buffer.data(), p - buffer.data()); 1494} 1495 1496static bool containsOnlyASCII(StringView string) 1497{ 1498 if (string.is8Bit()) 1499 return charactersAreAllASCII(string.characters8(), string.length()); 1500 return charactersAreAllASCII(string.characters16(), string.length()); 1501} 1502 1503static bool protocolIs(StringView stringURL, const char* protocol) 1504{ 1505 assertProtocolIsGood(protocol); 1506 unsigned length = stringURL.length(); 1507 for (unsigned i = 0; i < length; ++i) { 1508 if (!protocol[i]) 1509 return stringURL[i] == ':'; 1510 if (!isLetterMatchIgnoringCase(stringURL[i], protocol[i])) 1511 return false; 1512 } 1513 return false; 1514} 1515 1516// Appends the punycoded hostname identified by the given string and length to 1517// the output buffer. The result will not be null terminated. 1518static void appendEncodedHostname(UCharBuffer& buffer, StringView string) 1519{ 1520 // Needs to be big enough to hold an IDN-encoded name. 1521 // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK. 
1522 const unsigned hostnameBufferLength = 2048; 1523 1524 if (string.length() > hostnameBufferLength || containsOnlyASCII(string)) { 1525 append(buffer, string); 1526 return; 1527 } 1528 1529 UChar hostnameBuffer[hostnameBufferLength]; 1530 UErrorCode error = U_ZERO_ERROR; 1531 int32_t numCharactersConverted = uidna_IDNToASCII(string.upconvertedCharacters(), string.length(), hostnameBuffer, 1532 hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error); 1533 if (error == U_ZERO_ERROR) 1534 buffer.append(hostnameBuffer, numCharactersConverted); 1535} 1536 1537static void findHostnamesInMailToURL(StringView string, Vector<std::pair<int, int>>& nameRanges) 1538{ 1539 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character. 1540 // Skip quoted strings so that characters in them don't confuse us. 1541 // When we find a '?' character, we are past the part of the URL that contains host names. 1542 1543 nameRanges.clear(); 1544 1545 int p = 0; 1546 while (1) { 1547 // Find start of host name or of quoted string. 1548 int hostnameOrStringStart = findFirstOf(string, p, "\"@?"); 1549 if (hostnameOrStringStart == -1) 1550 return; 1551 UChar c = string[hostnameOrStringStart]; 1552 p = hostnameOrStringStart + 1; 1553 1554 if (c == '?') 1555 return; 1556 1557 if (c == '@') { 1558 // Find end of host name. 1559 int hostnameStart = p; 1560 int hostnameEnd = findFirstOf(string, p, ">,?"); 1561 bool done; 1562 if (hostnameEnd == -1) { 1563 hostnameEnd = string.length(); 1564 done = true; 1565 } else { 1566 p = hostnameEnd; 1567 done = false; 1568 } 1569 1570 nameRanges.append(std::make_pair(hostnameStart, hostnameEnd)); 1571 1572 if (done) 1573 return; 1574 } else { 1575 // Skip quoted string. 
1576 ASSERT(c == '"'); 1577 while (1) { 1578 int escapedCharacterOrStringEnd = findFirstOf(string, p, "\"\\"); 1579 if (escapedCharacterOrStringEnd == -1) 1580 return; 1581 1582 c = string[escapedCharacterOrStringEnd]; 1583 p = escapedCharacterOrStringEnd + 1; 1584 1585 // If we are the end of the string, then break from the string loop back to the host name loop. 1586 if (c == '"') 1587 break; 1588 1589 // Skip escaped character. 1590 ASSERT(c == '\\'); 1591 if (p == static_cast<int>(string.length())) 1592 return; 1593 1594 ++p; 1595 } 1596 } 1597 } 1598} 1599 1600static bool findHostnameInHierarchicalURL(StringView string, int& startOffset, int& endOffset) 1601{ 1602 // Find the host name in a hierarchical URL. 1603 // It comes after a "://" sequence, with scheme characters preceding, and 1604 // this should be the first colon in the string. 1605 // It ends with the end of the string or a ":" or a path segment ending character. 1606 // If there is a "@" character, the host part is just the part after the "@". 1607 int separator = findFirstOf(string, 0, ":"); 1608 if (separator == -1 || separator + 2 >= static_cast<int>(string.length()) || string[separator + 1] != '/' || string[separator + 2] != '/') 1609 return false; 1610 1611 // Check that all characters before the :// are valid scheme characters. 1612 if (!isSchemeFirstChar(string[0])) 1613 return false; 1614 for (int i = 1; i < separator; ++i) { 1615 if (!isSchemeChar(string[i])) 1616 return false; 1617 } 1618 1619 // Start after the separator. 1620 int authorityStart = separator + 3; 1621 1622 // Find terminating character. 1623 int hostnameEnd = string.length(); 1624 for (int i = authorityStart; i < hostnameEnd; ++i) { 1625 UChar c = string[i]; 1626 if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) { 1627 hostnameEnd = i; 1628 break; 1629 } 1630 } 1631 1632 // Find "@" for the start of the host name. 
1633 int userInfoTerminator = findFirstOf(string, authorityStart, "@"); 1634 int hostnameStart; 1635 if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd) 1636 hostnameStart = authorityStart; 1637 else 1638 hostnameStart = userInfoTerminator + 1; 1639 1640 startOffset = hostnameStart; 1641 endOffset = hostnameEnd; 1642 return true; 1643} 1644 1645// Converts all hostnames found in the given input to punycode, preserving the 1646// rest of the URL unchanged. The output will NOT be null-terminated. 1647static void encodeHostnames(StringView string, UCharBuffer& buffer) 1648{ 1649 buffer.clear(); 1650 1651 if (protocolIs(string, "mailto")) { 1652 Vector<std::pair<int, int>> hostnameRanges; 1653 findHostnamesInMailToURL(string, hostnameRanges); 1654 int n = hostnameRanges.size(); 1655 int p = 0; 1656 for (int i = 0; i < n; ++i) { 1657 const std::pair<int, int>& r = hostnameRanges[i]; 1658 append(buffer, string.substring(p, r.first - p)); 1659 appendEncodedHostname(buffer, string.substring(r.first, r.second - r.first)); 1660 p = r.second; 1661 } 1662 // This will copy either everything after the last hostname, or the 1663 // whole thing if there is no hostname. 1664 append(buffer, string.substring(p)); 1665 } else { 1666 int hostStart, hostEnd; 1667 if (findHostnameInHierarchicalURL(string, hostStart, hostEnd)) { 1668 append(buffer, string.substring(0, hostStart)); // Before hostname. 1669 appendEncodedHostname(buffer, string.substring(hostStart, hostEnd - hostStart)); 1670 append(buffer, string.substring(hostEnd)); // After hostname. 1671 } else { 1672 // No hostname to encode, return the input. 1673 append(buffer, string); 1674 } 1675 } 1676} 1677 1678static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output) 1679{ 1680 UCharBuffer s; 1681 encodeHostnames(rel, s); 1682 1683 TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme. 
1684 1685 int pathEnd = -1; 1686 if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) { 1687 // Find the first instance of either # or ?, keep pathEnd at -1 otherwise. 1688 pathEnd = findFirstOf(StringView(s.data(), s.size()), 0, "#?"); 1689 } 1690 1691 if (pathEnd == -1) { 1692 CString decoded = pathEncoding.encode(StringView(s.data(), s.size()), URLEncodedEntitiesForUnencodables); 1693 output.resize(decoded.length()); 1694 memcpy(output.data(), decoded.data(), decoded.length()); 1695 } else { 1696 CString pathDecoded = pathEncoding.encode(StringView(s.data(), pathEnd), URLEncodedEntitiesForUnencodables); 1697 // Unencodable characters in URLs are represented by converting 1698 // them to XML entities and escaping non-alphanumeric characters. 1699 CString otherDecoded = encoding.encode(StringView(s.data() + pathEnd, s.size() - pathEnd), URLEncodedEntitiesForUnencodables); 1700 1701 output.resize(pathDecoded.length() + otherDecoded.length()); 1702 memcpy(output.data(), pathDecoded.data(), pathDecoded.length()); 1703 memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length()); 1704 } 1705 output.append('\0'); // null-terminate the output. 
1706} 1707 1708static String substituteBackslashes(const String& string) 1709{ 1710 size_t questionPos = string.find('?'); 1711 size_t hashPos = string.find('#'); 1712 unsigned pathEnd; 1713 1714 if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos)) 1715 pathEnd = hashPos; 1716 else if (questionPos != notFound) 1717 pathEnd = questionPos; 1718 else 1719 pathEnd = string.length(); 1720 1721 return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd); 1722} 1723 1724bool URL::isHierarchical() const 1725{ 1726 if (!m_isValid) 1727 return false; 1728 ASSERT(m_string[m_schemeEnd] == ':'); 1729 return m_string[m_schemeEnd + 1] == '/'; 1730} 1731 1732void URL::copyToBuffer(Vector<char, 512>& buffer) const 1733{ 1734 // FIXME: This throws away the high bytes of all the characters in the string! 1735 // That's fine for a valid URL, which is all ASCII, but not for invalid URLs. 1736 buffer.resize(m_string.length()); 1737 copyASCII(m_string, buffer.data()); 1738} 1739 1740bool protocolIs(const String& url, const char* protocol) 1741{ 1742 // Do the comparison without making a new string object. 1743 assertProtocolIsGood(protocol); 1744 for (int i = 0; ; ++i) { 1745 if (!protocol[i]) 1746 return url[i] == ':'; 1747 if (!isLetterMatchIgnoringCase(url[i], protocol[i])) 1748 return false; 1749 } 1750} 1751 1752bool isValidProtocol(const String& protocol) 1753{ 1754 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." 
) 1755 if (protocol.isEmpty()) 1756 return false; 1757 if (!isSchemeFirstChar(protocol[0])) 1758 return false; 1759 unsigned protocolLength = protocol.length(); 1760 for (unsigned i = 1; i < protocolLength; i++) { 1761 if (!isSchemeChar(protocol[i])) 1762 return false; 1763 } 1764 return true; 1765} 1766 1767#ifndef NDEBUG 1768void URL::print() const 1769{ 1770 printf("%s\n", m_string.utf8().data()); 1771} 1772#endif 1773 1774String URL::strippedForUseAsReferrer() const 1775{ 1776 URL referrer(*this); 1777 referrer.setUser(String()); 1778 referrer.setPass(String()); 1779 referrer.removeFragmentIdentifier(); 1780 return referrer.string(); 1781} 1782 1783bool URL::isLocalFile() const 1784{ 1785 // Including feed here might be a bad idea since drag and drop uses this check 1786 // and including feed would allow feeds to potentially let someone's blog 1787 // read the contents of the clipboard on a drag, even without a drop. 1788 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function. 1789 return protocolIs("file"); 1790} 1791 1792bool protocolIsJavaScript(const String& url) 1793{ 1794 return protocolIs(url, "javascript"); 1795} 1796 1797bool protocolIsInHTTPFamily(const String& url) 1798{ 1799 // Do the comparison without making a new string object. 
1800 return isLetterMatchIgnoringCase(url[0], 'h') 1801 && isLetterMatchIgnoringCase(url[1], 't') 1802 && isLetterMatchIgnoringCase(url[2], 't') 1803 && isLetterMatchIgnoringCase(url[3], 'p') 1804 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':')); 1805} 1806 1807const URL& blankURL() 1808{ 1809 DEPRECATED_DEFINE_STATIC_LOCAL(URL, staticBlankURL, (ParsedURLString, "about:blank")); 1810 return staticBlankURL; 1811} 1812 1813bool URL::isBlankURL() const 1814{ 1815 return protocolIs("about"); 1816} 1817 1818bool isDefaultPortForProtocol(unsigned short port, const String& protocol) 1819{ 1820 if (protocol.isEmpty()) 1821 return false; 1822 1823 typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap; 1824 DEPRECATED_DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ()); 1825 if (defaultPorts.isEmpty()) { 1826 defaultPorts.set("http", 80); 1827 defaultPorts.set("https", 443); 1828 defaultPorts.set("ftp", 21); 1829 defaultPorts.set("ftps", 990); 1830 } 1831 return defaultPorts.get(protocol) == port; 1832} 1833 1834bool portAllowed(const URL& url) 1835{ 1836 unsigned short port = url.port(); 1837 1838 // Since most URLs don't have a port, return early for the "no port" case. 1839 if (!port) 1840 return true; 1841 1842 // This blocked port list matches the port blocking that Mozilla implements. 1843 // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information. 
1844 static const unsigned short blockedPortList[] = { 1845 1, // tcpmux 1846 7, // echo 1847 9, // discard 1848 11, // systat 1849 13, // daytime 1850 15, // netstat 1851 17, // qotd 1852 19, // chargen 1853 20, // FTP-data 1854 21, // FTP-control 1855 22, // SSH 1856 23, // telnet 1857 25, // SMTP 1858 37, // time 1859 42, // name 1860 43, // nicname 1861 53, // domain 1862 77, // priv-rjs 1863 79, // finger 1864 87, // ttylink 1865 95, // supdup 1866 101, // hostriame 1867 102, // iso-tsap 1868 103, // gppitnp 1869 104, // acr-nema 1870 109, // POP2 1871 110, // POP3 1872 111, // sunrpc 1873 113, // auth 1874 115, // SFTP 1875 117, // uucp-path 1876 119, // nntp 1877 123, // NTP 1878 135, // loc-srv / epmap 1879 139, // netbios 1880 143, // IMAP2 1881 179, // BGP 1882 389, // LDAP 1883 465, // SMTP+SSL 1884 512, // print / exec 1885 513, // login 1886 514, // shell 1887 515, // printer 1888 526, // tempo 1889 530, // courier 1890 531, // Chat 1891 532, // netnews 1892 540, // UUCP 1893 556, // remotefs 1894 563, // NNTP+SSL 1895 587, // ESMTP 1896 601, // syslog-conn 1897 636, // LDAP+SSL 1898 993, // IMAP+SSL 1899 995, // POP3+SSL 1900 2049, // NFS 1901 3659, // apple-sasl / PasswordServer [Apple addition] 1902 4045, // lockd 1903 6000, // X11 1904 6665, // Alternate IRC [Apple addition] 1905 6666, // Alternate IRC [Apple addition] 1906 6667, // Standard IRC [Apple addition] 1907 6668, // Alternate IRC [Apple addition] 1908 6669, // Alternate IRC [Apple addition] 1909 invalidPortNumber, // Used to block all invalid port numbers 1910 }; 1911 const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList); 1912 1913#ifndef NDEBUG 1914 // The port list must be sorted for binary_search to work. 
1915 static bool checkedPortList = false; 1916 if (!checkedPortList) { 1917 for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p) 1918 ASSERT(*p < *(p + 1)); 1919 checkedPortList = true; 1920 } 1921#endif 1922 1923 // If the port is not in the blocked port list, allow it. 1924 if (!std::binary_search(blockedPortList, blockedPortListEnd, port)) 1925 return true; 1926 1927 // Allow ports 21 and 22 for FTP URLs, as Mozilla does. 1928 if ((port == 21 || port == 22) && url.protocolIs("ftp")) 1929 return true; 1930 1931 // Allow any port number in a file URL, since the port number is ignored. 1932 if (url.protocolIs("file")) 1933 return true; 1934 1935 return false; 1936} 1937 1938String mimeTypeFromDataURL(const String& url) 1939{ 1940 ASSERT(protocolIs(url, "data")); 1941 size_t index = url.find(';'); 1942 if (index == notFound) 1943 index = url.find(','); 1944 if (index != notFound) { 1945 if (index > 5) 1946 return url.substring(5, index - 5).lower(); 1947 return "text/plain"; // Data URLs with no MIME type are considered text/plain. 1948 } 1949 return ""; 1950} 1951 1952String mimeTypeFromURL(const URL& url) 1953{ 1954 String decodedPath = decodeURLEscapeSequences(url.path()); 1955 String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1); 1956 1957 // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure 1958 return MIMETypeRegistry::getMIMETypeForExtension(extension); 1959} 1960 1961bool URL::isSafeToSendToAnotherThread() const 1962{ 1963 return m_string.isSafeToSendToAnotherThread(); 1964} 1965 1966String URL::stringCenterEllipsizedToLength(unsigned length) const 1967{ 1968 if (string().length() <= length) 1969 return string(); 1970 1971 return string().left(length / 2 - 1) + "..." 
+ string().right(length / 2 - 2); 1972} 1973 1974URL URL::fakeURLWithRelativePart(const String& relativePart) 1975{ 1976 return URL(URL(), "webkit-fake-url://" + createCanonicalUUIDString() + '/' + relativePart); 1977} 1978 1979} 1980