1/*
2 * Copyright (C) 2004, 2007, 2008, 2011, 2012, 2013 Apple Inc. All rights reserved.
3 * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "KURL.h"
29
30#include "DecodeEscapeSequences.h"
31#include "MIMETypeRegistry.h"
32#include "TextEncoding.h"
33#include <stdio.h>
34#include <wtf/HashMap.h>
35#include <wtf/HexNumber.h>
36#include <wtf/StdLibExtras.h>
37#include <wtf/text/CString.h>
38#include <wtf/text/StringBuilder.h>
39#include <wtf/text/StringHash.h>
40
41#if USE(ICU_UNICODE)
42#include <unicode/uidna.h>
43#endif
44
45// FIXME: This file makes too much use of the + operator on String.
46// We either have to optimize that operator so it doesn't involve
47// so many allocations, or change this to use StringBuffer instead.
48
49using namespace std;
50using namespace WTF;
51
52namespace WebCore {
53
54typedef Vector<char, 512> CharBuffer;
55typedef Vector<UChar, 512> UCharBuffer;
56
57static const unsigned maximumValidPortNumber = 0xFFFE;
58static const unsigned invalidPortNumber = 0xFFFF;
59
60static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
61{
62    ASSERT(isASCIILower(lowercaseLetter));
63    return (character | 0x20) == lowercaseLetter;
64}
65
// Scheme names, spelled as character arrays. These are brace-initialized on
// purpose: unlike string literals they carry no trailing NUL, so the array
// size is exactly the number of characters.
static const char wsScheme[] = {'w', 's'};
static const char wssScheme[] = {'w', 's', 's'};
static const char ftpScheme[] = {'f', 't', 'p'};
static const char fileScheme[] = {'f', 'i', 'l', 'e'};
static const char httpScheme[] = {'h', 't', 't', 'p'};
static const char httpsScheme[] = {'h', 't', 't', 'p', 's'};
static const char gopherScheme[] = {'g', 'o', 'p', 'h', 'e', 'r'};

// Port numbers as decimal digit arrays (also without a trailing NUL), named
// after the scheme they accompany.
static const char ftpPort[] = {'2', '1'};
static const char httpPort[] = {'8', '0'};
static const char httpsPort[] = {'4', '4', '3'};
static const char gopherPort[] = {'7', '0'};
77
78static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
79{
80    ASSERT(isASCIILower(lowercaseLetter));
81    return (character | 0x20) == lowercaseLetter;
82}
83
// Bit flags used by characterClassTable. A table entry is the bitwise OR of
// every class the corresponding byte value belongs to.
enum URLCharacterClasses {
    // alpha
    SchemeFirstChar = 0x01,

    // ( alpha | digit | "+" | "-" | "." )
    SchemeChar = 0x02,

    // mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    // unreserved  = alphanum | mark
    // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 0x04,

    // alnum | "." | "-" | "%"
    // That is what the specification says; to match existing practice we are
    // lenient and additionally allow:
    // "_"
    HostnameChar = 0x08,

    // hexdigit | ":" | "%"
    IPv6Char = 0x10,

    // "#" | "?" | "/" | nul
    PathSegmentEndChar = 0x20,

    // not allowed in path
    BadChar = 0x40
};
111
112static const unsigned char characterClassTable[256] = {
113    /* 0 nul */ PathSegmentEndChar,    /* 1 soh */ BadChar,
114    /* 2 stx */ BadChar,    /* 3 etx */ BadChar,
115    /* 4 eot */ BadChar,    /* 5 enq */ BadChar,    /* 6 ack */ BadChar,    /* 7 bel */ BadChar,
116    /* 8 bs */ BadChar,     /* 9 ht */ BadChar,     /* 10 nl */ BadChar,    /* 11 vt */ BadChar,
117    /* 12 np */ BadChar,    /* 13 cr */ BadChar,    /* 14 so */ BadChar,    /* 15 si */ BadChar,
118    /* 16 dle */ BadChar,   /* 17 dc1 */ BadChar,   /* 18 dc2 */ BadChar,   /* 19 dc3 */ BadChar,
119    /* 20 dc4 */ BadChar,   /* 21 nak */ BadChar,   /* 22 syn */ BadChar,   /* 23 etb */ BadChar,
120    /* 24 can */ BadChar,   /* 25 em */ BadChar,    /* 26 sub */ BadChar,   /* 27 esc */ BadChar,
121    /* 28 fs */ BadChar,    /* 29 gs */ BadChar,    /* 30 rs */ BadChar,    /* 31 us */ BadChar,
122    /* 32 sp */ BadChar,    /* 33  ! */ UserInfoChar,
123    /* 34  " */ BadChar,    /* 35  # */ PathSegmentEndChar | BadChar,
124    /* 36  $ */ UserInfoChar,    /* 37  % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
125    /* 38  & */ UserInfoChar,    /* 39  ' */ UserInfoChar,
126    /* 40  ( */ UserInfoChar,    /* 41  ) */ UserInfoChar,
127    /* 42  * */ UserInfoChar,    /* 43  + */ SchemeChar | UserInfoChar,
128    /* 44  , */ UserInfoChar,
129    /* 45  - */ SchemeChar | UserInfoChar | HostnameChar,
130    /* 46  . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
131    /* 47  / */ PathSegmentEndChar,
132    /* 48  0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
133    /* 49  1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
134    /* 50  2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
135    /* 51  3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
136    /* 52  4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
137    /* 53  5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
138    /* 54  6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
139    /* 55  7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
140    /* 56  8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
141    /* 57  9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
142    /* 58  : */ UserInfoChar | IPv6Char,    /* 59  ; */ UserInfoChar,
143    /* 60  < */ BadChar,    /* 61  = */ UserInfoChar,
144    /* 62  > */ BadChar,    /* 63  ? */ PathSegmentEndChar | BadChar,
145    /* 64  @ */ 0,
146    /* 65  A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
147    /* 66  B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
148    /* 67  C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
149    /* 68  D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
150    /* 69  E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
151    /* 70  F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
152    /* 71  G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
153    /* 72  H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
154    /* 73  I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
155    /* 74  J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
156    /* 75  K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
157    /* 76  L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
158    /* 77  M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
159    /* 78  N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
160    /* 79  O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
161    /* 80  P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
162    /* 81  Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
163    /* 82  R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
164    /* 83  S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
165    /* 84  T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
166    /* 85  U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
167    /* 86  V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
168    /* 87  W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
169    /* 88  X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
170    /* 89  Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
171    /* 90  Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
172    /* 91  [ */ 0,
173    /* 92  \ */ 0,    /* 93  ] */ 0,
174    /* 94  ^ */ 0,
175    /* 95  _ */ UserInfoChar | HostnameChar,
176    /* 96  ` */ 0,
177    /* 97  a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
178    /* 98  b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
179    /* 99  c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
180    /* 100  d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
181    /* 101  e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
182    /* 102  f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
183    /* 103  g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
184    /* 104  h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
185    /* 105  i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
186    /* 106  j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
187    /* 107  k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
188    /* 108  l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
189    /* 109  m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
190    /* 110  n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
191    /* 111  o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
192    /* 112  p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
193    /* 113  q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
194    /* 114  r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
195    /* 115  s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
196    /* 116  t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
197    /* 117  u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
198    /* 118  v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
199    /* 119  w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
200    /* 120  x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
201    /* 121  y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
202    /* 122  z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
203    /* 123  { */ 0,
204    /* 124  | */ 0,   /* 125  } */ 0,   /* 126  ~ */ UserInfoChar,   /* 127 del */ BadChar,
205    /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
206    /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
207    /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
208    /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
209    /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
210    /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
211    /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
212    /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
213    /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
214    /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
215    /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
216    /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
217    /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
218    /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
219    /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
220    /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
221    /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
222    /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
223    /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
224    /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
225    /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
226    /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
227    /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
228    /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
229    /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
230    /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
231    /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
232    /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
233    /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
234    /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
235    /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
236    /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
237};
238
239static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
240static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
241static String substituteBackslashes(const String&);
242
243static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
244static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
245static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
246static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
247static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
248static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
249static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
250static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
251static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
252static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }
253
254static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
255{
256    ASSERT(isSchemeChar(character));
257    ASSERT(schemeCharacter & 0x20);
258    ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
259    return (character | 0x20) == schemeCharacter;
260}
261
262// Copies the source to the destination, assuming all the source characters are
263// ASCII. The destination buffer must be large enough. Null characters are allowed
264// in the source string, and no attempt is made to null-terminate the result.
265static void copyASCII(const String& string, char* dest)
266{
267    if (string.isEmpty())
268        return;
269
270    if (string.is8Bit())
271        memcpy(dest, string.characters8(), string.length());
272    else {
273        const UChar* src = string.characters16();
274        size_t length = string.length();
275        for (size_t i = 0; i < length; i++)
276            dest[i] = static_cast<char>(src[i]);
277    }
278}
279
280static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer)
281{
282    buffer.resize(base.length() + len + 1);
283    copyASCII(base, buffer.data());
284    memcpy(buffer.data() + base.length(), rel, len);
285    buffer[buffer.size() - 1] = '\0';
286}
287
288// FIXME: Move to WTFString.h eventually.
289// Returns the index of the first index in string |s| of any of the characters
290// in |toFind|. |toFind| should be a null-terminated string, all characters up
291// to the null will be searched. Returns int if not found.
292static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind)
293{
294    for (int i = startPos; i < sLen; i++) {
295        const char* cur = toFind;
296        while (*cur) {
297            if (s[i] == *(cur++))
298                return i;
299        }
300    }
301    return -1;
302}
303
// Debug-only sanity check for an already-encoded URL string: it must contain
// only ASCII characters and, unless empty, must begin with a character that
// can start a scheme. ASSERT_UNUSED is given |url| as the variable name for
// both checks.
static inline void checkEncodedString(const String& url)
{
    ASSERT_UNUSED(url, url.containsOnlyASCII());
    ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0]));
}
309
310inline bool KURL::protocolIs(const String& string, const char* protocol)
311{
312    return WebCore::protocolIs(string, protocol);
313}
314
315void KURL::invalidate()
316{
317    m_isValid = false;
318    m_protocolIsInHTTPFamily = false;
319    m_schemeEnd = 0;
320    m_userStart = 0;
321    m_userEnd = 0;
322    m_passwordEnd = 0;
323    m_hostEnd = 0;
324    m_portEnd = 0;
325    m_pathEnd = 0;
326    m_pathAfterLastSlash = 0;
327    m_queryEnd = 0;
328    m_fragmentEnd = 0;
329}
330
331KURL::KURL(ParsedURLStringTag, const String& url)
332{
333    parse(url);
334    ASSERT(url == m_string);
335}
336
337KURL::KURL(const KURL& base, const String& relative)
338{
339    init(base, relative, UTF8Encoding());
340}
341
342KURL::KURL(const KURL& base, const String& relative, const TextEncoding& encoding)
343{
344    // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
345    // we do when submitting a form. A form with GET method
346    // has its contents added to a URL as query params and it makes sense
347    // to be consistent.
348    init(base, relative, encoding.encodingForFormSubmission());
349}
350
// Returns true for bytes that are stripped from both ends of a URL: the space
// character and everything below it (the control characters). Browsers ignore
// such leading/trailing characters. The parameter is deliberately *unsigned*,
// so bytes with the high bit set never compare as "small".
static bool shouldTrimFromURL(unsigned char c)
{
    static const unsigned char lastTrimmedCharacter = ' ';
    return c <= lastTrimmedCharacter;
}
358
359void KURL::init(const KURL& base, const String& relative, const TextEncoding& encoding)
360{
361    // Allow resolutions with a null or empty base URL, but not with any other invalid one.
362    // FIXME: Is this a good rule?
363    if (!base.m_isValid && !base.isEmpty()) {
364        m_string = relative;
365        invalidate();
366        return;
367    }
368
369    // For compatibility with Win IE, treat backslashes as if they were slashes,
370    // as long as we're not dealing with javascript: or data: URLs.
371    String rel = relative;
372    if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data")))
373        rel = substituteBackslashes(rel);
374
375    bool allASCII = rel.containsOnlyASCII();
376    CharBuffer strBuffer;
377    char* str;
378    size_t len;
379    if (allASCII) {
380        len = rel.length();
381        strBuffer.resize(len + 1);
382        copyASCII(rel, strBuffer.data());
383        strBuffer[len] = 0;
384        str = strBuffer.data();
385    } else {
386        encodeRelativeString(rel, encoding, strBuffer);
387        str = strBuffer.data();
388        len = strlen(str);
389    }
390
391    // Get rid of leading whitespace and control characters.
392    while (len && shouldTrimFromURL(*str)) {
393        str++;
394        --len;
395    }
396
397    // Get rid of trailing whitespace and control characters.
398    while (len && shouldTrimFromURL(str[len - 1]))
399        str[--len] = '\0';
400
401    // According to the RFC, the reference should be interpreted as an
402    // absolute URI if possible, using the "leftmost, longest"
403    // algorithm. If the URI reference is absolute it will have a
404    // scheme, meaning that it will have a colon before the first
405    // non-scheme element.
406    bool absolute = false;
407    char* p = str;
408    if (isSchemeFirstChar(*p)) {
409        ++p;
410        while (isSchemeChar(*p)) {
411            ++p;
412        }
413        if (*p == ':') {
414            if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical())
415                str = p + 1;
416            else
417                absolute = true;
418        }
419    }
420
421    CharBuffer parseBuffer;
422
423    if (absolute) {
424        parse(str, &relative);
425    } else {
426        // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid
427        // unless the relative URL is a single fragment.
428        if (!base.isHierarchical()) {
429            if (str[0] == '#') {
430                appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
431                parse(parseBuffer.data(), &relative);
432            } else {
433                m_string = relative;
434                invalidate();
435            }
436            return;
437        }
438
439        switch (str[0]) {
440        case '\0':
441            // The reference is empty, so this is a reference to the same document with any fragment identifier removed.
442            *this = base;
443            removeFragmentIdentifier();
444            break;
445        case '#': {
446            // must be fragment-only reference
447            appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
448            parse(parseBuffer.data(), &relative);
449            break;
450        }
451        case '?': {
452            // query-only reference, special case needed for non-URL results
453            appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer);
454            parse(parseBuffer.data(), &relative);
455            break;
456        }
457        case '/':
458            // must be net-path or absolute-path reference
459            if (str[1] == '/') {
460                // net-path
461                appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer);
462                parse(parseBuffer.data(), &relative);
463            } else {
464                // abs-path
465                appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer);
466                parse(parseBuffer.data(), &relative);
467            }
468            break;
469        default:
470            {
471                // must be relative-path reference
472
473                // Base part plus relative part plus one possible slash added in between plus terminating \0 byte.
474                const size_t bufferSize = base.m_pathEnd + 1 + len + 1;
475                parseBuffer.resize(bufferSize);
476
477                char* bufferPos = parseBuffer.data();
478                char* bufferStart = bufferPos;
479
480                // first copy everything before the path from the base
481                CharBuffer baseStringBuffer(base.m_string.length());
482                copyASCII(base.m_string, baseStringBuffer.data());
483                const char* baseString = baseStringBuffer.data();
484                const char* baseStringStart = baseString;
485                const char* pathStart = baseStringStart + base.m_portEnd;
486                while (baseStringStart < pathStart)
487                    *bufferPos++ = *baseStringStart++;
488                char* bufferPathStart = bufferPos;
489
490                // now copy the base path
491                const char* baseStringEnd = baseString + base.m_pathEnd;
492
493                // go back to the last slash
494                while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/')
495                    baseStringEnd--;
496
497                if (baseStringEnd == baseStringStart) {
498                    // no path in base, add a path separator if necessary
499                    if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#')
500                        *bufferPos++ = '/';
501                } else {
502                    bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart);
503                }
504
505                const char* relStringStart = str;
506                const char* relStringPos = relStringStart;
507
508                while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') {
509                    if (relStringPos[0] == '.' && bufferPos[-1] == '/') {
510                        if (isPathSegmentEndChar(relStringPos[1])) {
511                            // skip over "." segment
512                            relStringPos += 1;
513                            if (relStringPos[0] == '/')
514                                relStringPos++;
515                            continue;
516                        } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) {
517                            // skip over ".." segment and rewind the last segment
518                            // the RFC leaves it up to the app to decide what to do with excess
519                            // ".." segments - we choose to drop them since some web content
520                            // relies on this.
521                            relStringPos += 2;
522                            if (relStringPos[0] == '/')
523                                relStringPos++;
524                            if (bufferPos > bufferPathStart + 1)
525                                bufferPos--;
526                            while (bufferPos > bufferPathStart + 1  && bufferPos[-1] != '/')
527                                bufferPos--;
528                            continue;
529                        }
530                    }
531
532                    *bufferPos = *relStringPos;
533                    relStringPos++;
534                    bufferPos++;
535                }
536
537                // all done with the path work, now copy any remainder
538                // of the relative reference; this will also add a null terminator
539                strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart));
540
541                parse(parseBuffer.data(), &relative);
542
543                ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size());
544                break;
545            }
546        }
547    }
548}
549
550KURL KURL::copy() const
551{
552    KURL result = *this;
553    result.m_string = result.m_string.isolatedCopy();
554    return result;
555}
556
557String KURL::lastPathComponent() const
558{
559    if (!hasPath())
560        return String();
561
562    unsigned end = m_pathEnd - 1;
563    if (m_string[end] == '/')
564        --end;
565
566    size_t start = m_string.reverseFind('/', end);
567    if (start < static_cast<unsigned>(m_portEnd))
568        return String();
569    ++start;
570
571    return m_string.substring(start, end - start + 1);
572}
573
574String KURL::protocol() const
575{
576    return m_string.left(m_schemeEnd);
577}
578
579String KURL::host() const
580{
581    int start = hostStart();
582    String substring = m_string.substring(start, m_hostEnd - start);
583    return substring.isNull() ? emptyString() : substring;
584}
585
586unsigned short KURL::port() const
587{
588    // We return a port of 0 if there is no port specified. This can happen in two situations:
589    // 1) The URL contains no colon after the host name and before the path component of the URL.
590    // 2) The URL contains a colon but there's no port number before the path component of the URL begins.
591    if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1)
592        return 0;
593
594    const UChar* stringData = m_string.characters();
595    bool ok = false;
596    unsigned number = charactersToUIntStrict(stringData + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
597    if (!ok || number > maximumValidPortNumber)
598        return invalidPortNumber;
599    return number;
600}
601
602String KURL::pass() const
603{
604    if (m_passwordEnd == m_userEnd)
605        return String();
606
607    return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
608}
609
610String KURL::user() const
611{
612    return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
613}
614
615String KURL::fragmentIdentifier() const
616{
617    if (m_fragmentEnd == m_queryEnd)
618        return String();
619
620    return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
621}
622
623bool KURL::hasFragmentIdentifier() const
624{
625    return m_fragmentEnd != m_queryEnd;
626}
627
628String KURL::baseAsString() const
629{
630    return m_string.left(m_pathAfterLastSlash);
631}
632
633#if !PLATFORM(QT) && !USE(CF)
634String KURL::fileSystemPath() const
635{
636    if (!isValid() || !isLocalFile())
637        return String();
638
639    return decodeURLEscapeSequences(path());
640}
641#endif
642
643#ifdef NDEBUG
644
645static inline void assertProtocolIsGood(const char*)
646{
647}
648
649#else
650
651static void assertProtocolIsGood(const char* protocol)
652{
653    const char* p = protocol;
654    while (*p) {
655        ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
656        ++p;
657    }
658}
659
660#endif
661
662bool KURL::protocolIs(const char* protocol) const
663{
664    assertProtocolIsGood(protocol);
665
666    // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid.
667    // The free function protocolIsJavaScript() should be used instead.
668    ASSERT(!equalIgnoringCase(protocol, String("javascript")));
669
670    if (!m_isValid)
671        return false;
672
673    // Do the comparison without making a new string object.
674    for (int i = 0; i < m_schemeEnd; ++i) {
675        if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i]))
676            return false;
677    }
678    return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
679}
680
681String KURL::query() const
682{
683    if (m_queryEnd == m_pathEnd)
684        return String();
685
686    return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
687}
688
689String KURL::path() const
690{
691    return m_string.substring(m_portEnd, m_pathEnd - m_portEnd);
692}
693
694bool KURL::setProtocol(const String& s)
695{
696    // Firefox and IE remove everything after the first ':'.
697    size_t separatorPosition = s.find(':');
698    String newProtocol = s.substring(0, separatorPosition);
699
700    if (!isValidProtocol(newProtocol))
701        return false;
702
703    if (!m_isValid) {
704        parse(newProtocol + ':' + m_string);
705        return true;
706    }
707
708    parse(newProtocol + m_string.substring(m_schemeEnd));
709    return true;
710}
711
712void KURL::setHost(const String& s)
713{
714    if (!m_isValid)
715        return;
716
717    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
718    // and to avoid changing more than just the host.
719
720    bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
721
722    parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd));
723}
724
725void KURL::removePort()
726{
727    if (m_hostEnd == m_portEnd)
728        return;
729    parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
730}
731
732void KURL::setPort(unsigned short i)
733{
734    if (!m_isValid)
735        return;
736
737    bool colonNeeded = m_portEnd == m_hostEnd;
738    int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
739
740    parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd));
741}
742
743void KURL::setHostAndPort(const String& hostAndPort)
744{
745    if (!m_isValid)
746        return;
747
748    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
749    // and to avoid changing more than just host and port.
750
751    bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
752
753    parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd));
754}
755
756void KURL::setUser(const String& user)
757{
758    if (!m_isValid)
759        return;
760
761    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
762    // and to avoid changing more than just the user login.
763
764    int end = m_userEnd;
765    if (!user.isEmpty()) {
766        String u = user;
767        if (m_userStart == m_schemeEnd + 1)
768            u = "//" + u;
769        // Add '@' if we didn't have one before.
770        if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
771            u.append('@');
772        parse(m_string.left(m_userStart) + u + m_string.substring(end));
773    } else {
774        // Remove '@' if we now have neither user nor password.
775        if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
776            end += 1;
777        // We don't want to parse in the extremely common case where we are not going to make a change.
778        if (m_userStart != end)
779            parse(m_string.left(m_userStart) + m_string.substring(end));
780    }
781}
782
783void KURL::setPass(const String& password)
784{
785    if (!m_isValid)
786        return;
787
788    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
789    // and to avoid changing more than just the user password.
790
791    int end = m_passwordEnd;
792    if (!password.isEmpty()) {
793        String p = ":" + password + "@";
794        if (m_userEnd == m_schemeEnd + 1)
795            p = "//" + p;
796        // Eat the existing '@' since we are going to add our own.
797        if (end != m_hostEnd && m_string[end] == '@')
798            end += 1;
799        parse(m_string.left(m_userEnd) + p + m_string.substring(end));
800    } else {
801        // Remove '@' if we now have neither user nor password.
802        if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
803            end += 1;
804        // We don't want to parse in the extremely common case where we are not going to make a change.
805        if (m_userEnd != end)
806            parse(m_string.left(m_userEnd) + m_string.substring(end));
807    }
808}
809
810void KURL::setFragmentIdentifier(const String& s)
811{
812    if (!m_isValid)
813        return;
814
815    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
816    parse(m_string.left(m_queryEnd) + "#" + s);
817}
818
819void KURL::removeFragmentIdentifier()
820{
821    if (!m_isValid)
822        return;
823    parse(m_string.left(m_queryEnd));
824}
825
826void KURL::setQuery(const String& query)
827{
828    if (!m_isValid)
829        return;
830
831    // FIXME: '#' and non-ASCII characters must be encoded and escaped.
832    // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
833    // access to the document in this function.
834    if ((query.isEmpty() || query[0] != '?') && !query.isNull())
835        parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
836    else
837        parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));
838
839}
840
841void KURL::setPath(const String& s)
842{
843    if (!m_isValid)
844        return;
845
846    // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
847    // may be inadvertently affected.
848    String path = s;
849    if (path.isEmpty() || path[0] != '/')
850        path = "/" + path;
851
852    parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd));
853}
854
855String decodeURLEscapeSequences(const String& string)
856{
857    return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
858}
859
860String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
861{
862    return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
863}
864
865// Caution: This function does not bounds check.
866static void appendEscapedChar(char*& buffer, unsigned char c)
867{
868    *buffer++ = '%';
869    placeByteAsHex(c, buffer);
870}
871
872static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length)
873{
874    char* p = buffer;
875
876    const char* str = strStart;
877    const char* strEnd = strStart + length;
878    while (str < strEnd) {
879        unsigned char c = *str++;
880        if (isBadChar(c)) {
881            if (c == '%' || c == '?')
882                *p++ = c;
883            else if (c != 0x09 && c != 0x0a && c != 0x0d)
884                appendEscapedChar(p, c);
885        } else
886            *p++ = c;
887    }
888
889    buffer = p;
890}
891
892static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length)
893{
894    char* p = buffer;
895
896    const char* str = strStart;
897    const char* strEnd = strStart + length;
898    while (str < strEnd) {
899        unsigned char c = *str++;
900        // Strip CR, LF and Tab from fragments, per:
901        // https://bugs.webkit.org/show_bug.cgi?id=8770
902        if (c == 0x09 || c == 0x0a || c == 0x0d)
903            continue;
904
905        // Chrome and IE allow non-ascii characters in fragments, however doing
906        // so would hit an ASSERT in checkEncodedString, so for now we don't.
907        if (c < 0x20 || c >= 127) {
908            appendEscapedChar(p, c);
909            continue;
910        }
911        *p++ = c;
912    }
913
914    buffer = p;
915}
916
// Copies the path src[srcStart, srcEnd) into |dst|, collapsing "." and ".." segments,
// NUL-terminates the result and returns its length (excluding the terminator).
// |dst| must have room for (srcEnd - srcStart + 1) characters. A non-empty path must
// begin with '/'. Characters at or beyond src[srcEnd] are never read.
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd)
{
    char* bufferPathStart = dst;

    // empty path is a special case, and need not have a leading slash
    if (srcStart != srcEnd) {
        const char* baseStringStart = src + srcStart;
        const char* baseStringEnd = src + srcEnd;
        const char* baseStringPos = baseStringStart;

        // this code is unprepared for paths that do not begin with a
        // slash and we should always have one in the source string
        ASSERT(baseStringPos[0] == '/');

        // copy the leading slash into the destination
        *dst = *baseStringPos;
        baseStringPos++;
        dst++;

        while (baseStringPos < baseStringEnd) {
            // A '.' directly after a '/' may start a "." or ".." segment.
            if (baseStringPos[0] == '.' && dst[-1] == '/') {
                // Check the range end before dereferencing so we never look past srcEnd.
                if (baseStringPos + 1 == baseStringEnd || baseStringPos[1] == '/') {
                    // skip over "." segment
                    baseStringPos += 2;
                    continue;
                }
                // Here baseStringPos + 1 < baseStringEnd, so baseStringPos[1] is in range.
                if (baseStringPos[1] == '.' && (baseStringPos + 2 == baseStringEnd || baseStringPos[2] == '/')) {
                    // skip over ".." segment and rewind the last segment
                    // the RFC leaves it up to the app to decide what to do with excess
                    // ".." segments - we choose to drop them since some web content
                    // relies on this.
                    baseStringPos += 3;
                    // Step back over the '/' that ends the previous segment (never the leading one)...
                    if (dst > bufferPathStart + 1)
                        dst--;
                    // ...and then over that segment's characters.
                    while (dst > bufferPathStart && dst[-1] != '/')
                        dst--;
                    continue;
                }
            }

            *dst = *baseStringPos;
            baseStringPos++;
            dst++;
        }
    }
    *dst = '\0';
    return dst - bufferPathStart;
}
966
// Returns true when the NUL-terminated |str| contains a '.' that is immediately
// preceded by '/' or by another '.'. The first character is never treated as a match.
static inline bool hasSlashDotOrDotDot(const char* str)
{
    if (str[0] == '\0')
        return false;

    for (const char* cursor = str + 1; *cursor; ++cursor) {
        if (*cursor != '.')
            continue;
        const char previous = cursor[-1];
        if (previous == '/' || previous == '.')
            return true;
    }
    return false;
}
980
981void KURL::parse(const String& string)
982{
983    checkEncodedString(string);
984
985    CharBuffer buffer(string.length() + 1);
986    copyASCII(string, buffer.data());
987    buffer[string.length()] = '\0';
988    parse(buffer.data(), &string);
989}
990
// Compares the first |length| characters of |a| with the array |b| (all |length| elements
// of |b| take part, so |a| must be readable up to the first mismatch).
template<size_t length>
static inline bool equal(const char* a, const char (&b)[length])
{
    size_t matched = 0;
    while (matched < length && a[matched] == b[matched])
        ++matched;
    return matched == length;
}
1000
// Length-checked comparison: true only when |lengthA| equals the array size of |stringB|
// and the characters match.
template<size_t lengthB>
static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
{
    if (lengthA != lengthB)
        return false;
    return equal(stringA, stringB);
}
1006
1007// List of default schemes is taken from google-url:
1008// http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120
1009static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength)
1010{
1011    // This switch is theoretically a performance optimization.  It came over when
1012    // the code was moved from google-url, but may be removed later.
1013    switch (schemeLength) {
1014    case 2:
1015        return equal(scheme, wsScheme) && equal(port, portLength, httpPort);
1016    case 3:
1017        if (equal(scheme, ftpScheme))
1018            return equal(port, portLength, ftpPort);
1019        if (equal(scheme, wssScheme))
1020            return equal(port, portLength, httpsPort);
1021        break;
1022    case 4:
1023        return equal(scheme, httpScheme) && equal(port, portLength, httpPort);
1024    case 5:
1025        return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort);
1026    case 6:
1027        return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort);
1028    }
1029    return false;
1030}
1031
// True when the userinfo was terminated by '@' but the host/port range that follows is empty.
static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userinfoEndChar)
{
    if (userinfoEndChar != '@')
        return false;
    return hostStart == portEnd;
}
1036
1037static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength)
1038{
1039    switch (schemeLength) {
1040    case 2:
1041        return equal(scheme, wsScheme);
1042    case 3:
1043        return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1044    case 4:
1045        return equal(scheme, httpScheme);
1046    case 5:
1047        return equal(scheme, httpsScheme);
1048    case 6:
1049        return equal(scheme, gopherScheme);
1050    }
1051    return false;
1052}
1053
1054static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength)
1055{
1056    switch (schemeLength) {
1057    case 2:
1058        return equal(scheme, wsScheme);
1059    case 3:
1060        return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1061    case 4:
1062        return equal(scheme, httpScheme) || equal(scheme, fileScheme);
1063    case 5:
1064        return equal(scheme, httpsScheme);
1065    case 6:
1066        return equal(scheme, gopherScheme);
1067    }
1068    return false;
1069}
1070
1071void KURL::parse(const char* url, const String* originalString)
1072{
1073    if (!url || url[0] == '\0') {
1074        // valid URL must be non-empty
1075        m_string = originalString ? *originalString : url;
1076        invalidate();
1077        return;
1078    }
1079
1080    if (!isSchemeFirstChar(url[0])) {
1081        // scheme must start with an alphabetic character
1082        m_string = originalString ? *originalString : url;
1083        invalidate();
1084        return;
1085    }
1086
1087    int schemeEnd = 0;
1088    while (isSchemeChar(url[schemeEnd]))
1089        schemeEnd++;
1090
1091    if (url[schemeEnd] != ':') {
1092        m_string = originalString ? *originalString : url;
1093        invalidate();
1094        return;
1095    }
1096
1097    int userStart = schemeEnd + 1;
1098    int userEnd;
1099    int passwordStart;
1100    int passwordEnd;
1101    int hostStart;
1102    int hostEnd;
1103    int portStart;
1104    int portEnd;
1105
1106    bool hierarchical = url[schemeEnd + 1] == '/';
1107    bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/';
1108
1109    bool isFile = schemeEnd == 4
1110        && isLetterMatchIgnoringCase(url[0], 'f')
1111        && isLetterMatchIgnoringCase(url[1], 'i')
1112        && isLetterMatchIgnoringCase(url[2], 'l')
1113        && isLetterMatchIgnoringCase(url[3], 'e');
1114
1115#if PLATFORM(BLACKBERRY)
1116    // Parse local: urls the same as file: urls.
1117    if (!isFile)
1118        isFile = schemeEnd == 5
1119            && isLetterMatchIgnoringCase(url[0], 'l')
1120            && isLetterMatchIgnoringCase(url[1], 'o')
1121            && isLetterMatchIgnoringCase(url[2], 'c')
1122            && isLetterMatchIgnoringCase(url[3], 'a')
1123            && isLetterMatchIgnoringCase(url[4], 'l');
1124#endif
1125
1126    m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h')
1127        && isLetterMatchIgnoringCase(url[1], 't')
1128        && isLetterMatchIgnoringCase(url[2], 't')
1129        && isLetterMatchIgnoringCase(url[3], 'p')
1130        && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
1131
1132    if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) {
1133        // The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
1134        // Attempt to find an authority.
1135        // FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
1136
1137        if (hierarchical)
1138            userStart++;
1139        if (hasSecondSlash)
1140            userStart++;
1141        userEnd = userStart;
1142
1143        int colonPos = 0;
1144        while (isUserInfoChar(url[userEnd])) {
1145            if (url[userEnd] == ':' && colonPos == 0)
1146                colonPos = userEnd;
1147            userEnd++;
1148        }
1149
1150        if (url[userEnd] == '@') {
1151            // actual end of the userinfo, start on the host
1152            if (colonPos != 0) {
1153                passwordEnd = userEnd;
1154                userEnd = colonPos;
1155                passwordStart = colonPos + 1;
1156            } else
1157                passwordStart = passwordEnd = userEnd;
1158
1159            hostStart = passwordEnd + 1;
1160        } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
1161            // hit the end of the authority, must have been no user
1162            // or looks like an IPv6 hostname
1163            // either way, try to parse it as a hostname
1164            userEnd = userStart;
1165            passwordStart = passwordEnd = userEnd;
1166            hostStart = userStart;
1167        } else {
1168            // invalid character
1169            m_string = originalString ? *originalString : url;
1170            invalidate();
1171            return;
1172        }
1173
1174        hostEnd = hostStart;
1175
1176        // IPV6 IP address
1177        if (url[hostEnd] == '[') {
1178            hostEnd++;
1179            while (isIPv6Char(url[hostEnd]))
1180                hostEnd++;
1181            if (url[hostEnd] == ']')
1182                hostEnd++;
1183            else {
1184                // invalid character
1185                m_string = originalString ? *originalString : url;
1186                invalidate();
1187                return;
1188            }
1189        } else {
1190            while (isHostnameChar(url[hostEnd]))
1191                hostEnd++;
1192        }
1193
1194        if (url[hostEnd] == ':') {
1195            portStart = portEnd = hostEnd + 1;
1196
1197            // possible start of port
1198            portEnd = portStart;
1199            while (isASCIIDigit(url[portEnd]))
1200                portEnd++;
1201        } else
1202            portStart = portEnd = hostEnd;
1203
1204        if (!isPathSegmentEndChar(url[portEnd])) {
1205            // invalid character
1206            m_string = originalString ? *originalString : url;
1207            invalidate();
1208            return;
1209        }
1210
1211        if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[passwordEnd])) {
1212            m_string = originalString ? *originalString : url;
1213            invalidate();
1214            return;
1215        }
1216
1217        if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) {
1218            // No authority found, which means that this is not a net_path, but rather an abs_path whose first two
1219            // path segments are empty. For file, http and https only, an empty authority is allowed.
1220            userStart -= 2;
1221            userEnd = userStart;
1222            passwordStart = userEnd;
1223            passwordEnd = passwordStart;
1224            hostStart = passwordEnd;
1225            hostEnd = hostStart;
1226            portStart = hostEnd;
1227            portEnd = hostEnd;
1228        }
1229    } else {
1230        // the part after the scheme must be an opaque_part or an abs_path
1231        userEnd = userStart;
1232        passwordStart = passwordEnd = userEnd;
1233        hostStart = hostEnd = passwordEnd;
1234        portStart = portEnd = hostEnd;
1235    }
1236
1237    int pathStart = portEnd;
1238    int pathEnd = pathStart;
1239    while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
1240        pathEnd++;
1241
1242    int queryStart = pathEnd;
1243    int queryEnd = queryStart;
1244    if (url[queryStart] == '?') {
1245        while (url[queryEnd] && url[queryEnd] != '#')
1246            queryEnd++;
1247    }
1248
1249    int fragmentStart = queryEnd;
1250    int fragmentEnd = fragmentStart;
1251    if (url[fragmentStart] == '#') {
1252        fragmentStart++;
1253        fragmentEnd = fragmentStart;
1254        while (url[fragmentEnd])
1255            fragmentEnd++;
1256    }
1257
1258    // assemble it all, remembering the real ranges
1259
1260    Vector<char, 4096> buffer(fragmentEnd * 3 + 1);
1261
1262    char *p = buffer.data();
1263    const char *strPtr = url;
1264
1265    // copy in the scheme
1266    const char *schemeEndPtr = url + schemeEnd;
1267    while (strPtr < schemeEndPtr)
1268        *p++ = toASCIILower(*strPtr++);
1269    m_schemeEnd = p - buffer.data();
1270
1271    bool hostIsLocalHost = portEnd - userStart == 9
1272        && isLetterMatchIgnoringCase(url[userStart], 'l')
1273        && isLetterMatchIgnoringCase(url[userStart+1], 'o')
1274        && isLetterMatchIgnoringCase(url[userStart+2], 'c')
1275        && isLetterMatchIgnoringCase(url[userStart+3], 'a')
1276        && isLetterMatchIgnoringCase(url[userStart+4], 'l')
1277        && isLetterMatchIgnoringCase(url[userStart+5], 'h')
1278        && isLetterMatchIgnoringCase(url[userStart+6], 'o')
1279        && isLetterMatchIgnoringCase(url[userStart+7], 's')
1280        && isLetterMatchIgnoringCase(url[userStart+8], 't');
1281
1282    // File URLs need a host part unless it is just file:// or file://localhost
1283    bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);
1284
1285    // We drop empty credentials, but keep a colon in an empty host/port pair.
1286    // Removing hostname completely would change the structure of the URL on re-parsing.
1287    bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || hostEnd != portEnd;
1288
1289    // add ":" after scheme
1290    *p++ = ':';
1291
1292    // if we have at least one authority part or a file URL - add "//" and authority
1293    if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
1294        *p++ = '/';
1295        *p++ = '/';
1296
1297        m_userStart = p - buffer.data();
1298
1299        // copy in the user
1300        strPtr = url + userStart;
1301        const char* userEndPtr = url + userEnd;
1302        while (strPtr < userEndPtr) {
1303            char c = *strPtr++;
1304            ASSERT(isUserInfoChar(c));
1305            *p++ = c;
1306        }
1307        m_userEnd = p - buffer.data();
1308
1309        // copy in the password
1310        if (passwordEnd != passwordStart) {
1311            *p++ = ':';
1312            strPtr = url + passwordStart;
1313            const char* passwordEndPtr = url + passwordEnd;
1314            while (strPtr < passwordEndPtr) {
1315                char c = *strPtr++;
1316                ASSERT(isUserInfoChar(c));
1317                *p++ = c;
1318            }
1319        }
1320        m_passwordEnd = p - buffer.data();
1321
1322        // If we had any user info, add "@"
1323        if (p - buffer.data() != m_userStart)
1324            *p++ = '@';
1325
1326        // copy in the host, except in the case of a file URL with authority="localhost"
1327        if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
1328            strPtr = url + hostStart;
1329            const char* hostEndPtr = url + hostEnd;
1330            if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) {
1331                while (strPtr < hostEndPtr) {
1332                    char c = toASCIILower(*strPtr++);
1333                    ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1334                    *p++ = c;
1335                }
1336            } else {
1337                while (strPtr < hostEndPtr) {
1338                    char c = *strPtr++;
1339                    ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1340                    *p++ = c;
1341                }
1342            }
1343        }
1344        m_hostEnd = p - buffer.data();
1345
1346        // Copy in the port if the URL has one (and it's not default). Also, copy it if there was no hostname, so that there is still something in authority component.
1347        if (hostEnd != portStart) {
1348            const char* portStr = url + portStart;
1349            size_t portLength = portEnd - portStart;
1350            if ((portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd))
1351                || (hostStart == hostEnd && hostEnd != portStart)) {
1352                *p++ = ':';
1353                const char* portEndPtr = url + portEnd;
1354                while (portStr < portEndPtr)
1355                    *p++ = *portStr++;
1356            }
1357        }
1358        m_portEnd = p - buffer.data();
1359    } else {
1360        if (isFile) {
1361            ASSERT(degenerateFilePath);
1362            *p++ = '/';
1363            *p++ = '/';
1364        }
1365        m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();
1366    }
1367
1368    // For canonicalization, ensure we have a '/' for no path.
1369    // Do this only for URL with protocol file, http or https.
1370    if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart)
1371        *p++ = '/';
1372
1373    // add path, escaping bad characters
1374    if (!hierarchical)
1375        escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart);
1376    else if (!hasSlashDotOrDotDot(url))
1377        appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
1378    else {
1379        CharBuffer pathBuffer(pathEnd - pathStart + 1);
1380        size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
1381        appendEscapingBadChars(p, pathBuffer.data(), length);
1382    }
1383
1384    m_pathEnd = p - buffer.data();
1385
1386    // Find the position after the last slash in the path, or
1387    // the position before the path if there are no slashes in it.
1388    int i;
1389    for (i = m_pathEnd; i > m_portEnd; --i) {
1390        if (buffer[i - 1] == '/')
1391            break;
1392    }
1393    m_pathAfterLastSlash = i;
1394
1395    // add query, escaping bad characters
1396    appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
1397    m_queryEnd = p - buffer.data();
1398
1399    // add fragment, escaping bad characters
1400    if (fragmentEnd != queryEnd) {
1401        *p++ = '#';
1402        escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart);
1403    }
1404    m_fragmentEnd = p - buffer.data();
1405
1406    ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1407    ASSERT(buffer.size() > 0);
1408
1409    // If we didn't end up actually changing the original string and
1410    // it was already in a String, reuse it to avoid extra allocation.
1411    if (originalString && equal(originalString->impl(), buffer.data(), m_fragmentEnd))
1412        m_string = *originalString;
1413    else
1414        m_string = String(buffer.data(), m_fragmentEnd);
1415
1416    m_isValid = true;
1417}
1418
1419bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
1420{
1421    if (a.m_queryEnd != b.m_queryEnd)
1422        return false;
1423    unsigned queryLength = a.m_queryEnd;
1424    for (unsigned i = 0; i < queryLength; ++i)
1425        if (a.string()[i] != b.string()[i])
1426            return false;
1427    return true;
1428}
1429
1430bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b)
1431{
1432    if (a.m_schemeEnd != b.m_schemeEnd)
1433        return false;
1434
1435    int hostStartA = a.hostStart();
1436    int hostLengthA = a.hostEnd() - hostStartA;
1437    int hostStartB = b.hostStart();
1438    int hostLengthB = b.hostEnd() - b.hostStart();
1439    if (hostLengthA != hostLengthB)
1440        return false;
1441
1442    // Check the scheme
1443    for (int i = 0; i < a.m_schemeEnd; ++i)
1444        if (a.string()[i] != b.string()[i])
1445            return false;
1446
1447    // And the host
1448    for (int i = 0; i < hostLengthA; ++i)
1449        if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
1450            return false;
1451
1452    if (a.port() != b.port())
1453        return false;
1454
1455    return true;
1456}
1457
1458String encodeWithURLEscapeSequences(const String& notEncodedString)
1459{
1460    CString asUTF8 = notEncodedString.utf8();
1461
1462    CharBuffer buffer(asUTF8.length() * 3 + 1);
1463    char* p = buffer.data();
1464
1465    const char* str = asUTF8.data();
1466    const char* strEnd = str + asUTF8.length();
1467    while (str < strEnd) {
1468        unsigned char c = *str++;
1469        if (isBadChar(c))
1470            appendEscapedChar(p, c);
1471        else
1472            *p++ = c;
1473    }
1474
1475    ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1476
1477    return String(buffer.data(), p - buffer.data());
1478}
1479
1480// Appends the punycoded hostname identified by the given string and length to
1481// the output buffer. The result will not be null terminated.
1482static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen)
1483{
1484    // Needs to be big enough to hold an IDN-encoded name.
1485    // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
1486    const unsigned hostnameBufferLength = 2048;
1487
1488    if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) {
1489        buffer.append(str, strLen);
1490        return;
1491    }
1492
1493#if USE(ICU_UNICODE)
1494    UChar hostnameBuffer[hostnameBufferLength];
1495    UErrorCode error = U_ZERO_ERROR;
1496    int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer,
1497        hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
1498    if (error == U_ZERO_ERROR)
1499        buffer.append(hostnameBuffer, numCharactersConverted);
1500#endif
1501}
1502
1503static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector<pair<int, int> >& nameRanges)
1504{
1505    // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character.
1506    // Skip quoted strings so that characters in them don't confuse us.
1507    // When we find a '?' character, we are past the part of the URL that contains host names.
1508
1509    nameRanges.clear();
1510
1511    int p = 0;
1512    while (1) {
1513        // Find start of host name or of quoted string.
1514        int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?");
1515        if (hostnameOrStringStart == -1)
1516            return;
1517        UChar c = str[hostnameOrStringStart];
1518        p = hostnameOrStringStart + 1;
1519
1520        if (c == '?')
1521            return;
1522
1523        if (c == '@') {
1524            // Find end of host name.
1525            int hostnameStart = p;
1526            int hostnameEnd = findFirstOf(str, strLen, p, ">,?");
1527            bool done;
1528            if (hostnameEnd == -1) {
1529                hostnameEnd = strLen;
1530                done = true;
1531            } else {
1532                p = hostnameEnd;
1533                done = false;
1534            }
1535
1536            nameRanges.append(make_pair(hostnameStart, hostnameEnd));
1537
1538            if (done)
1539                return;
1540        } else {
1541            // Skip quoted string.
1542            ASSERT(c == '"');
1543            while (1) {
1544                int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\");
1545                if (escapedCharacterOrStringEnd == -1)
1546                    return;
1547
1548                c = str[escapedCharacterOrStringEnd];
1549                p = escapedCharacterOrStringEnd + 1;
1550
1551                // If we are the end of the string, then break from the string loop back to the host name loop.
1552                if (c == '"')
1553                    break;
1554
1555                // Skip escaped character.
1556                ASSERT(c == '\\');
1557                if (p == strLen)
1558                    return;
1559
1560                ++p;
1561            }
1562        }
1563    }
1564}
1565
1566static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset)
1567{
1568    // Find the host name in a hierarchical URL.
1569    // It comes after a "://" sequence, with scheme characters preceding, and
1570    // this should be the first colon in the string.
1571    // It ends with the end of the string or a ":" or a path segment ending character.
1572    // If there is a "@" character, the host part is just the part after the "@".
1573    int separator = findFirstOf(str, strLen, 0, ":");
1574    if (separator == -1 || separator + 2 >= strLen ||
1575        str[separator + 1] != '/' || str[separator + 2] != '/')
1576        return false;
1577
1578    // Check that all characters before the :// are valid scheme characters.
1579    if (!isSchemeFirstChar(str[0]))
1580        return false;
1581    for (int i = 1; i < separator; ++i) {
1582        if (!isSchemeChar(str[i]))
1583            return false;
1584    }
1585
1586    // Start after the separator.
1587    int authorityStart = separator + 3;
1588
1589    // Find terminating character.
1590    int hostnameEnd = strLen;
1591    for (int i = authorityStart; i < strLen; ++i) {
1592        UChar c = str[i];
1593        if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) {
1594            hostnameEnd = i;
1595            break;
1596        }
1597    }
1598
1599    // Find "@" for the start of the host name.
1600    int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@");
1601    int hostnameStart;
1602    if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd)
1603        hostnameStart = authorityStart;
1604    else
1605        hostnameStart = userInfoTerminator + 1;
1606
1607    startOffset = hostnameStart;
1608    endOffset = hostnameEnd;
1609    return true;
1610}
1611
1612// Converts all hostnames found in the given input to punycode, preserving the
1613// rest of the URL unchanged. The output will NOT be null-terminated.
1614static void encodeHostnames(const String& str, UCharBuffer& output)
1615{
1616    output.clear();
1617
1618    if (protocolIs(str, "mailto")) {
1619        Vector<pair<int, int> > hostnameRanges;
1620        findHostnamesInMailToURL(str.characters(), str.length(), hostnameRanges);
1621        int n = hostnameRanges.size();
1622        int p = 0;
1623        for (int i = 0; i < n; ++i) {
1624            const pair<int, int>& r = hostnameRanges[i];
1625            output.append(&str.characters()[p], r.first - p);
1626            appendEncodedHostname(output, &str.characters()[r.first], r.second - r.first);
1627            p = r.second;
1628        }
1629        // This will copy either everything after the last hostname, or the
1630        // whole thing if there is no hostname.
1631        output.append(&str.characters()[p], str.length() - p);
1632    } else {
1633        int hostStart, hostEnd;
1634        if (findHostnameInHierarchicalURL(str.characters(), str.length(), hostStart, hostEnd)) {
1635            output.append(str.characters(), hostStart); // Before hostname.
1636            appendEncodedHostname(output, &str.characters()[hostStart], hostEnd - hostStart);
1637            output.append(&str.characters()[hostEnd], str.length() - hostEnd); // After hostname.
1638        } else {
1639            // No hostname to encode, return the input.
1640            output.append(str.characters(), str.length());
1641        }
1642    }
1643}
1644
1645static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output)
1646{
1647    UCharBuffer s;
1648    encodeHostnames(rel, s);
1649
1650    TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme.
1651
1652    int pathEnd = -1;
1653    if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) {
1654        // Find the first instance of either # or ?, keep pathEnd at -1 otherwise.
1655        pathEnd = findFirstOf(s.data(), s.size(), 0, "#?");
1656    }
1657
1658    if (pathEnd == -1) {
1659        CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables);
1660        output.resize(decoded.length());
1661        memcpy(output.data(), decoded.data(), decoded.length());
1662    } else {
1663        CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables);
1664        // Unencodable characters in URLs are represented by converting
1665        // them to XML entities and escaping non-alphanumeric characters.
1666        CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables);
1667
1668        output.resize(pathDecoded.length() + otherDecoded.length());
1669        memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
1670        memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length());
1671    }
1672    output.append('\0'); // null-terminate the output.
1673}
1674
1675static String substituteBackslashes(const String& string)
1676{
1677    size_t questionPos = string.find('?');
1678    size_t hashPos = string.find('#');
1679    unsigned pathEnd;
1680
1681    if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos))
1682        pathEnd = hashPos;
1683    else if (questionPos != notFound)
1684        pathEnd = questionPos;
1685    else
1686        pathEnd = string.length();
1687
1688    return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd);
1689}
1690
1691bool KURL::isHierarchical() const
1692{
1693    if (!m_isValid)
1694        return false;
1695    ASSERT(m_string[m_schemeEnd] == ':');
1696    return m_string[m_schemeEnd + 1] == '/';
1697}
1698
1699void KURL::copyToBuffer(Vector<char, 512>& buffer) const
1700{
1701    // FIXME: This throws away the high bytes of all the characters in the string!
1702    // That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
1703    buffer.resize(m_string.length());
1704    copyASCII(m_string, buffer.data());
1705}
1706
1707bool protocolIs(const String& url, const char* protocol)
1708{
1709    // Do the comparison without making a new string object.
1710    assertProtocolIsGood(protocol);
1711    for (int i = 0; ; ++i) {
1712        if (!protocol[i])
1713            return url[i] == ':';
1714        if (!isLetterMatchIgnoringCase(url[i], protocol[i]))
1715            return false;
1716    }
1717}
1718
1719bool isValidProtocol(const String& protocol)
1720{
1721    // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
1722    if (protocol.isEmpty())
1723        return false;
1724    if (!isSchemeFirstChar(protocol[0]))
1725        return false;
1726    unsigned protocolLength = protocol.length();
1727    for (unsigned i = 1; i < protocolLength; i++) {
1728        if (!isSchemeChar(protocol[i]))
1729            return false;
1730    }
1731    return true;
1732}
1733
1734#ifndef NDEBUG
1735void KURL::print() const
1736{
1737    printf("%s\n", m_string.utf8().data());
1738}
1739#endif
1740
1741String KURL::strippedForUseAsReferrer() const
1742{
1743    KURL referrer(*this);
1744    referrer.setUser(String());
1745    referrer.setPass(String());
1746    referrer.removeFragmentIdentifier();
1747    return referrer.string();
1748}
1749
1750bool KURL::isLocalFile() const
1751{
1752    // Including feed here might be a bad idea since drag and drop uses this check
1753    // and including feed would allow feeds to potentially let someone's blog
1754    // read the contents of the clipboard on a drag, even without a drop.
1755    // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
1756    return protocolIs("file");
1757}
1758
1759bool protocolIsJavaScript(const String& url)
1760{
1761    return protocolIs(url, "javascript");
1762}
1763
1764const KURL& blankURL()
1765{
1766    DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank"));
1767    return staticBlankURL;
1768}
1769
1770bool KURL::isBlankURL() const
1771{
1772    return protocolIs("about");
1773}
1774
1775bool isDefaultPortForProtocol(unsigned short port, const String& protocol)
1776{
1777    if (protocol.isEmpty())
1778        return false;
1779
1780    typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap;
1781    DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ());
1782    if (defaultPorts.isEmpty()) {
1783        defaultPorts.set("http", 80);
1784        defaultPorts.set("https", 443);
1785        defaultPorts.set("ftp", 21);
1786        defaultPorts.set("ftps", 990);
1787    }
1788    return defaultPorts.get(protocol) == port;
1789}
1790
1791bool portAllowed(const KURL& url)
1792{
1793    unsigned short port = url.port();
1794
1795    // Since most URLs don't have a port, return early for the "no port" case.
1796    if (!port)
1797        return true;
1798
1799    // This blocked port list matches the port blocking that Mozilla implements.
1800    // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
1801    static const unsigned short blockedPortList[] = {
1802        1,    // tcpmux
1803        7,    // echo
1804        9,    // discard
1805        11,   // systat
1806        13,   // daytime
1807        15,   // netstat
1808        17,   // qotd
1809        19,   // chargen
1810        20,   // FTP-data
1811        21,   // FTP-control
1812        22,   // SSH
1813        23,   // telnet
1814        25,   // SMTP
1815        37,   // time
1816        42,   // name
1817        43,   // nicname
1818        53,   // domain
1819        77,   // priv-rjs
1820        79,   // finger
1821        87,   // ttylink
1822        95,   // supdup
1823        101,  // hostriame
1824        102,  // iso-tsap
1825        103,  // gppitnp
1826        104,  // acr-nema
1827        109,  // POP2
1828        110,  // POP3
1829        111,  // sunrpc
1830        113,  // auth
1831        115,  // SFTP
1832        117,  // uucp-path
1833        119,  // nntp
1834        123,  // NTP
1835        135,  // loc-srv / epmap
1836        139,  // netbios
1837        143,  // IMAP2
1838        179,  // BGP
1839        389,  // LDAP
1840        465,  // SMTP+SSL
1841        512,  // print / exec
1842        513,  // login
1843        514,  // shell
1844        515,  // printer
1845        526,  // tempo
1846        530,  // courier
1847        531,  // Chat
1848        532,  // netnews
1849        540,  // UUCP
1850        556,  // remotefs
1851        563,  // NNTP+SSL
1852        587,  // ESMTP
1853        601,  // syslog-conn
1854        636,  // LDAP+SSL
1855        993,  // IMAP+SSL
1856        995,  // POP3+SSL
1857        2049, // NFS
1858        3659, // apple-sasl / PasswordServer [Apple addition]
1859        4045, // lockd
1860        6000, // X11
1861        6665, // Alternate IRC [Apple addition]
1862        6666, // Alternate IRC [Apple addition]
1863        6667, // Standard IRC [Apple addition]
1864        6668, // Alternate IRC [Apple addition]
1865        6669, // Alternate IRC [Apple addition]
1866        invalidPortNumber, // Used to block all invalid port numbers
1867    };
1868    const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList);
1869
1870#ifndef NDEBUG
1871    // The port list must be sorted for binary_search to work.
1872    static bool checkedPortList = false;
1873    if (!checkedPortList) {
1874        for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p)
1875            ASSERT(*p < *(p + 1));
1876        checkedPortList = true;
1877    }
1878#endif
1879
1880    // If the port is not in the blocked port list, allow it.
1881    if (!binary_search(blockedPortList, blockedPortListEnd, port))
1882        return true;
1883
1884    // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
1885    if ((port == 21 || port == 22) && url.protocolIs("ftp"))
1886        return true;
1887
1888    // Allow any port number in a file URL, since the port number is ignored.
1889    if (url.protocolIs("file"))
1890        return true;
1891
1892#if PLATFORM(BLACKBERRY)
1893    if (url.protocolIs("local"))
1894        return true;
1895#endif
1896
1897    return false;
1898}
1899
1900String mimeTypeFromDataURL(const String& url)
1901{
1902    ASSERT(protocolIs(url, "data"));
1903    size_t index = url.find(';');
1904    if (index == notFound)
1905        index = url.find(',');
1906    if (index != notFound) {
1907        if (index > 5)
1908            return url.substring(5, index - 5).lower();
1909        return "text/plain"; // Data URLs with no MIME type are considered text/plain.
1910    }
1911    return "";
1912}
1913
1914String mimeTypeFromURL(const KURL& url)
1915{
1916    String decodedPath = decodeURLEscapeSequences(url.path());
1917    String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1);
1918
1919    // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure
1920    return MIMETypeRegistry::getMIMETypeForExtension(extension);
1921}
1922
1923bool KURL::isSafeToSendToAnotherThread() const
1924{
1925    return m_string.isSafeToSendToAnotherThread();
1926}
1927
1928String KURL::stringCenterEllipsizedToLength(unsigned length) const
1929{
1930    if (string().length() <= length)
1931        return string();
1932
1933    return string().left(length / 2 - 1) + "..." + string().right(length / 2 - 2);
1934}
1935
1936}
1937