1/*
2 * Copyright (C) 2004, 2007, 2008, 2011, 2012, 2013 Apple Inc. All rights reserved.
3 * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "URL.h"
29
30#include "DecodeEscapeSequences.h"
31#include "MIMETypeRegistry.h"
32#include "TextEncoding.h"
33#include "UUID.h"
34#include <stdio.h>
35#include <unicode/uidna.h>
36#include <wtf/HashMap.h>
37#include <wtf/HexNumber.h>
38#include <wtf/StdLibExtras.h>
39#include <wtf/text/CString.h>
40#include <wtf/text/StringBuilder.h>
41#include <wtf/text/StringHash.h>
42
43// FIXME: This file makes too much use of the + operator on String.
44// We either have to optimize that operator so it doesn't involve
45// so many allocations, or change this to use StringBuffer instead.
46
47using namespace WTF;
48
49namespace WebCore {
50
51typedef Vector<char, 512> CharBuffer;
52typedef Vector<UChar, 512> UCharBuffer;
53
54static const unsigned maximumValidPortNumber = 0xFFFE;
55static const unsigned invalidPortNumber = 0xFFFF;
56
57static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
58{
59    ASSERT(isASCIILower(lowercaseLetter));
60    return (character | 0x20) == lowercaseLetter;
61}
62
// Scheme names and default port numbers spelled out as character arrays.
// The brace initializers contain no terminating NUL, so sizeof() of each array equals
// the number of characters listed; these are not C strings.
// NOTE(review): presumably consumers take the length from the array size — confirm
// against the users further down in this file.
static const char wsScheme[] = {'w', 's'};
static const char ftpScheme[] = {'f', 't', 'p'};
static const char ftpPort[] = {'2', '1'};
static const char wssScheme[] = {'w', 's', 's'};
static const char fileScheme[] = {'f', 'i', 'l', 'e'};
static const char httpScheme[] = {'h', 't', 't', 'p'};
static const char httpPort[] = {'8', '0'};
static const char httpsScheme[] = {'h', 't', 't', 'p', 's'};
static const char httpsPort[] = {'4', '4', '3'};
static const char gopherScheme[] = {'g', 'o', 'p', 'h', 'e', 'r'};
static const char gopherPort[] = {'7', '0'};
74
75static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
76{
77    ASSERT(isASCIILower(lowercaseLetter));
78    return (character | 0x20) == lowercaseLetter;
79}
80
// Bit flags describing which URL grammar classes a character belongs to.
// Each enumerator occupies its own bit so several classes can be OR-ed together
// in a single table entry.
enum URLCharacterClasses {
    // Valid as the first character of a scheme: alpha.
    SchemeFirstChar = 0x01,

    // Valid inside a scheme: ( alpha | digit | "+" | "-" | "." ).
    SchemeChar = 0x02,

    // Valid in the user-info component:
    //   mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    //   unreserved  = alphanum | mark
    //   ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 0x04,

    // Valid in a host name: alnum | "." | "-" | "%".
    // That is what the specification says; to match existing practice we are
    // lenient and additionally allow "_".
    HostnameChar = 0x08,

    // Valid in an IPv6 literal: hexdigit | ":" | "%".
    IPv6Char = 0x10,

    // Terminates a path segment: "#" | "?" | "/" | nul.
    PathSegmentEndChar = 0x20,

    // Not allowed in a path.
    BadChar = 0x40
};
108
// Lookup table with one entry per 8-bit character value (0-255). Each entry is the
// bitwise OR of the URLCharacterClasses flags that apply to that character; an entry
// of 0 means the character belongs to none of the classes. The comment in front of
// each entry gives the character's decimal value and its glyph or control-code name.
static const unsigned char characterClassTable[256] = {
    /* 0 nul */ PathSegmentEndChar,    /* 1 soh */ BadChar,
    /* 2 stx */ BadChar,    /* 3 etx */ BadChar,
    /* 4 eot */ BadChar,    /* 5 enq */ BadChar,    /* 6 ack */ BadChar,    /* 7 bel */ BadChar,
    /* 8 bs */ BadChar,     /* 9 ht */ BadChar,     /* 10 nl */ BadChar,    /* 11 vt */ BadChar,
    /* 12 np */ BadChar,    /* 13 cr */ BadChar,    /* 14 so */ BadChar,    /* 15 si */ BadChar,
    /* 16 dle */ BadChar,   /* 17 dc1 */ BadChar,   /* 18 dc2 */ BadChar,   /* 19 dc3 */ BadChar,
    /* 20 dc4 */ BadChar,   /* 21 nak */ BadChar,   /* 22 syn */ BadChar,   /* 23 etb */ BadChar,
    /* 24 can */ BadChar,   /* 25 em */ BadChar,    /* 26 sub */ BadChar,   /* 27 esc */ BadChar,
    /* 28 fs */ BadChar,    /* 29 gs */ BadChar,    /* 30 rs */ BadChar,    /* 31 us */ BadChar,
    /* 32 sp */ BadChar,    /* 33  ! */ UserInfoChar,
    /* 34  " */ BadChar,    /* 35  # */ PathSegmentEndChar | BadChar,
    /* 36  $ */ UserInfoChar,    /* 37  % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
    /* 38  & */ UserInfoChar,    /* 39  ' */ UserInfoChar,
    /* 40  ( */ UserInfoChar,    /* 41  ) */ UserInfoChar,
    /* 42  * */ UserInfoChar,    /* 43  + */ SchemeChar | UserInfoChar,
    /* 44  , */ UserInfoChar,
    /* 45  - */ SchemeChar | UserInfoChar | HostnameChar,
    /* 46  . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 47  / */ PathSegmentEndChar,
    /* 48  0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 49  1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 50  2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 51  3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 52  4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 53  5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 54  6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 55  7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 56  8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 57  9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 58  : */ UserInfoChar | IPv6Char,    /* 59  ; */ UserInfoChar,
    /* 60  < */ BadChar,    /* 61  = */ UserInfoChar,
    /* 62  > */ BadChar,    /* 63  ? */ PathSegmentEndChar | BadChar,
    /* 64  @ */ 0,
    /* 65  A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 66  B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 67  C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 68  D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 69  E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 70  F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 71  G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 72  H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 73  I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 74  J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 75  K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 76  L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 77  M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 78  N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 79  O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 80  P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 81  Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 82  R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 83  S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 84  T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 85  U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 86  V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 87  W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 88  X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 89  Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 90  Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 91  [ */ 0,
    /* 92  \ */ 0,    /* 93  ] */ 0,
    /* 94  ^ */ 0,
    /* 95  _ */ UserInfoChar | HostnameChar,
    /* 96  ` */ 0,
    /* 97  a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 98  b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 99  c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 100  d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 101  e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 102  f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 103  g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 104  h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 105  i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 106  j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 107  k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 108  l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 109  m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 110  n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 111  o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 112  p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 113  q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 114  r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 115  s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 116  t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 117  u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 118  v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 119  w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 120  x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 121  y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 122  z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 123  { */ 0,
    /* 124  | */ 0,   /* 125  } */ 0,   /* 126  ~ */ UserInfoChar,   /* 127 del */ BadChar,
    /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
    /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
    /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
    /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
    /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
    /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
    /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
    /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
    /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
    /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
    /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
    /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
    /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
    /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
    /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
    /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
    /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
    /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
    /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
    /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
    /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
    /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
    /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
    /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
    /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
    /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
    /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
    /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
    /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
    /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
    /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
    /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
};
235
236static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
237static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
238static String substituteBackslashes(const String&);
239
240static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
241static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
242static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
243static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
244static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
245static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
246static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
247static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
248static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
249static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }
250
251static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
252{
253    ASSERT(isSchemeChar(character));
254    ASSERT(schemeCharacter & 0x20);
255    ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
256    return (character | 0x20) == schemeCharacter;
257}
258
259// Copies the source to the destination, assuming all the source characters are
260// ASCII. The destination buffer must be large enough. Null characters are allowed
261// in the source string, and no attempt is made to null-terminate the result.
262static void copyASCII(const String& string, char* dest)
263{
264    if (string.isEmpty())
265        return;
266
267    if (string.is8Bit())
268        memcpy(dest, string.characters8(), string.length());
269    else {
270        const UChar* src = string.characters16();
271        size_t length = string.length();
272        for (size_t i = 0; i < length; i++)
273            dest[i] = static_cast<char>(src[i]);
274    }
275}
276
277static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer)
278{
279    buffer.resize(base.length() + len + 1);
280    copyASCII(base, buffer.data());
281    memcpy(buffer.data() + base.length(), rel, len);
282    buffer[buffer.size() - 1] = '\0';
283}
284
285// FIXME: Move to WTFString.h eventually.
286// Returns the index of the first index in string |s| of any of the characters
287// in |toFind|. |toFind| should be a null-terminated string, all characters up
288// to the null will be searched. Returns int if not found.
289static int findFirstOf(StringView string, unsigned startPosition, const char* target)
290{
291    unsigned length = string.length();
292    for (unsigned i = startPosition; i < length; ++i) {
293        for (unsigned j = 0; target[j]; ++j) {
294            if (string[i] == target[j])
295                return i;
296        }
297    }
298    return -1;
299}
300
// Assertion-only sanity check for a URL string: it must contain only ASCII characters,
// and must either be empty or begin with a character that can start a scheme.
// ASSERT_UNUSED names |url| as its first argument in both checks.
static inline void checkEncodedString(const String& url)
{
    ASSERT_UNUSED(url, url.containsOnlyASCII());
    ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0]));
}
306
// Member overload that forwards to the namespace-level WebCore::protocolIs(), which
// checks |string| against |protocol|. The explicit qualification selects the free
// function rather than one of the member overloads.
inline bool URL::protocolIs(const String& string, const char* protocol)
{
    return WebCore::protocolIs(string, protocol);
}
311
312void URL::invalidate()
313{
314    m_isValid = false;
315    m_protocolIsInHTTPFamily = false;
316    m_schemeEnd = 0;
317    m_userStart = 0;
318    m_userEnd = 0;
319    m_passwordEnd = 0;
320    m_hostEnd = 0;
321    m_portEnd = 0;
322    m_pathEnd = 0;
323    m_pathAfterLastSlash = 0;
324    m_queryEnd = 0;
325    m_fragmentEnd = 0;
326}
327
// Constructs a URL from a string that the caller tags as already parsed. The string is
// passed to parse(), and the assertion checks that parsing left it unchanged.
URL::URL(ParsedURLStringTag, const String& url)
{
    parse(url);
    ASSERT(url == m_string);
}
333
// Constructs a URL by resolving |relative| against |base|, passing UTF-8 as the
// text encoding to init().
URL::URL(const URL& base, const String& relative)
{
    init(base, relative, UTF8Encoding());
}
338
339URL::URL(const URL& base, const String& relative, const TextEncoding& encoding)
340{
341    // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
342    // we do when submitting a form. A form with GET method
343    // has its contents added to a URL as query params and it makes sense
344    // to be consistent.
345    init(base, relative, encoding.encodingForFormSubmission());
346}
347
// Returns true for characters that are stripped from the ends of a URL: the space
// character and everything below it (the control characters). Browsers ignore such
// leading/trailing characters. Because |c| is unsigned, bytes of 0x80 and above are
// never treated as trimmable.
static bool shouldTrimFromURL(unsigned char c)
{
    static const unsigned char lastTrimmedCharacter = ' ';
    return !(c > lastTrimmedCharacter);
}
355
356void URL::init(const URL& base, const String& relative, const TextEncoding& encoding)
357{
358    // Allow resolutions with a null or empty base URL, but not with any other invalid one.
359    // FIXME: Is this a good rule?
360    if (!base.m_isValid && !base.isEmpty()) {
361        m_string = relative;
362        invalidate();
363        return;
364    }
365
366    // For compatibility with Win IE, treat backslashes as if they were slashes,
367    // as long as we're not dealing with javascript: or data: URLs.
368    String rel = relative;
369    if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data")))
370        rel = substituteBackslashes(rel);
371
372    bool allASCII = rel.containsOnlyASCII();
373    CharBuffer strBuffer;
374    char* str;
375    size_t len;
376    if (allASCII) {
377        len = rel.length();
378        strBuffer.resize(len + 1);
379        copyASCII(rel, strBuffer.data());
380        strBuffer[len] = 0;
381        str = strBuffer.data();
382    } else {
383        encodeRelativeString(rel, encoding, strBuffer);
384        str = strBuffer.data();
385        len = strlen(str);
386    }
387
388    // Get rid of leading whitespace and control characters.
389    while (len && shouldTrimFromURL(*str)) {
390        str++;
391        --len;
392    }
393
394    // Get rid of trailing whitespace and control characters.
395    while (len && shouldTrimFromURL(str[len - 1]))
396        str[--len] = '\0';
397
398    // According to the RFC, the reference should be interpreted as an
399    // absolute URI if possible, using the "leftmost, longest"
400    // algorithm. If the URI reference is absolute it will have a
401    // scheme, meaning that it will have a colon before the first
402    // non-scheme element.
403    bool absolute = false;
404    char* p = str;
405    if (isSchemeFirstChar(*p)) {
406        ++p;
407        while (isSchemeChar(*p)) {
408            ++p;
409        }
410        if (*p == ':') {
411            if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical())
412                str = p + 1;
413            else
414                absolute = true;
415        }
416    }
417
418    CharBuffer parseBuffer;
419
420    if (absolute) {
421        parse(str, &relative);
422    } else {
423        // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid
424        // unless the relative URL is a single fragment.
425        if (!base.isHierarchical()) {
426            if (str[0] == '#') {
427                appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
428                parse(parseBuffer.data(), &relative);
429            } else {
430                m_string = relative;
431                invalidate();
432            }
433            return;
434        }
435
436        switch (str[0]) {
437        case '\0':
438            // The reference is empty, so this is a reference to the same document with any fragment identifier removed.
439            *this = base;
440            removeFragmentIdentifier();
441            break;
442        case '#': {
443            // must be fragment-only reference
444            appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
445            parse(parseBuffer.data(), &relative);
446            break;
447        }
448        case '?': {
449            // query-only reference, special case needed for non-URL results
450            appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer);
451            parse(parseBuffer.data(), &relative);
452            break;
453        }
454        case '/':
455            // must be net-path or absolute-path reference
456            if (str[1] == '/') {
457                // net-path
458                appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer);
459                parse(parseBuffer.data(), &relative);
460            } else {
461                // abs-path
462                appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer);
463                parse(parseBuffer.data(), &relative);
464            }
465            break;
466        default:
467            {
468                // must be relative-path reference
469
470                // Base part plus relative part plus one possible slash added in between plus terminating \0 byte.
471                const size_t bufferSize = base.m_pathEnd + 1 + len + 1;
472                parseBuffer.resize(bufferSize);
473
474                char* bufferPos = parseBuffer.data();
475                char* bufferStart = bufferPos;
476
477                // first copy everything before the path from the base
478                CharBuffer baseStringBuffer(base.m_string.length());
479                copyASCII(base.m_string, baseStringBuffer.data());
480                const char* baseString = baseStringBuffer.data();
481                const char* baseStringStart = baseString;
482                const char* pathStart = baseStringStart + base.m_portEnd;
483                while (baseStringStart < pathStart)
484                    *bufferPos++ = *baseStringStart++;
485                char* bufferPathStart = bufferPos;
486
487                // now copy the base path
488                const char* baseStringEnd = baseString + base.m_pathEnd;
489
490                // go back to the last slash
491                while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/')
492                    baseStringEnd--;
493
494                if (baseStringEnd == baseStringStart) {
495                    // no path in base, add a path separator if necessary
496                    if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#')
497                        *bufferPos++ = '/';
498                } else {
499                    bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart);
500                }
501
502                const char* relStringStart = str;
503                const char* relStringPos = relStringStart;
504
505                while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') {
506                    if (relStringPos[0] == '.' && bufferPos[-1] == '/') {
507                        if (isPathSegmentEndChar(relStringPos[1])) {
508                            // skip over "." segment
509                            relStringPos += 1;
510                            if (relStringPos[0] == '/')
511                                relStringPos++;
512                            continue;
513                        } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) {
514                            // skip over ".." segment and rewind the last segment
515                            // the RFC leaves it up to the app to decide what to do with excess
516                            // ".." segments - we choose to drop them since some web content
517                            // relies on this.
518                            relStringPos += 2;
519                            if (relStringPos[0] == '/')
520                                relStringPos++;
521                            if (bufferPos > bufferPathStart + 1)
522                                bufferPos--;
523                            while (bufferPos > bufferPathStart + 1  && bufferPos[-1] != '/')
524                                bufferPos--;
525                            continue;
526                        }
527                    }
528
529                    *bufferPos = *relStringPos;
530                    relStringPos++;
531                    bufferPos++;
532                }
533
534                // all done with the path work, now copy any remainder
535                // of the relative reference; this will also add a null terminator
536                strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart));
537
538                parse(parseBuffer.data(), &relative);
539
540                ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size());
541                break;
542            }
543        }
544    }
545}
546
547URL URL::copy() const
548{
549    URL result = *this;
550    result.m_string = result.m_string.isolatedCopy();
551    return result;
552}
553
554String URL::lastPathComponent() const
555{
556    if (!hasPath())
557        return String();
558
559    unsigned end = m_pathEnd - 1;
560    if (m_string[end] == '/')
561        --end;
562
563    size_t start = m_string.reverseFind('/', end);
564    if (start < static_cast<unsigned>(m_portEnd))
565        return String();
566    ++start;
567
568    return m_string.substring(start, end - start + 1);
569}
570
// Returns the scheme: the first m_schemeEnd characters of the URL string.
String URL::protocol() const
{
    return m_string.left(m_schemeEnd);
}
575
576String URL::host() const
577{
578    int start = hostStart();
579    return m_string.substring(start, m_hostEnd - start);
580}
581
582unsigned short URL::port() const
583{
584    // We return a port of 0 if there is no port specified. This can happen in two situations:
585    // 1) The URL contains no colon after the host name and before the path component of the URL.
586    // 2) The URL contains a colon but there's no port number before the path component of the URL begins.
587    if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1)
588        return 0;
589
590    bool ok = false;
591    unsigned number;
592    if (m_string.is8Bit())
593        number = charactersToUIntStrict(m_string.characters8() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
594    else
595        number = charactersToUIntStrict(m_string.characters16() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
596    if (!ok || number > maximumValidPortNumber)
597        return invalidPortNumber;
598    return number;
599}
600
601String URL::pass() const
602{
603    if (m_passwordEnd == m_userEnd)
604        return String();
605
606    return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
607}
608
609String URL::user() const
610{
611    return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
612}
613
614String URL::fragmentIdentifier() const
615{
616    if (m_fragmentEnd == m_queryEnd)
617        return String();
618
619    return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
620}
621
// Returns true when the fragment section is non-empty, i.e. when the end of the
// fragment differs from the end of the query.
bool URL::hasFragmentIdentifier() const
{
    return m_fragmentEnd != m_queryEnd;
}
626
// Returns the leading part of the URL string up to m_pathAfterLastSlash, i.e.
// everything before the last path component.
String URL::baseAsString() const
{
    return m_string.left(m_pathAfterLastSlash);
}
631
632#if !USE(CF)
633String URL::fileSystemPath() const
634{
635    if (!isValid() || !isLocalFile())
636        return String();
637
638    return decodeURLEscapeSequences(path());
639}
640#endif
641
642#ifdef NDEBUG
643
644static inline void assertProtocolIsGood(const char*)
645{
646}
647
648#else
649
650static void assertProtocolIsGood(const char* protocol)
651{
652    const char* p = protocol;
653    while (*p) {
654        ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
655        ++p;
656    }
657}
658
659#endif
660
661bool URL::protocolIs(const char* protocol) const
662{
663    assertProtocolIsGood(protocol);
664
665    // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid.
666    // The free function protocolIsJavaScript() should be used instead.
667    ASSERT(!equalIgnoringCase(protocol, String("javascript")));
668
669    if (!m_isValid)
670        return false;
671
672    // Do the comparison without making a new string object.
673    for (int i = 0; i < m_schemeEnd; ++i) {
674        if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i]))
675            return false;
676    }
677    return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
678}
679
680String URL::query() const
681{
682    if (m_queryEnd == m_pathEnd)
683        return String();
684
685    return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
686}
687
688String URL::path() const
689{
690    return m_string.substring(m_portEnd, m_pathEnd - m_portEnd);
691}
692
693bool URL::setProtocol(const String& s)
694{
695    // Firefox and IE remove everything after the first ':'.
696    size_t separatorPosition = s.find(':');
697    String newProtocol = s.substring(0, separatorPosition);
698
699    if (!isValidProtocol(newProtocol))
700        return false;
701
702    if (!m_isValid) {
703        parse(newProtocol + ':' + m_string);
704        return true;
705    }
706
707    parse(newProtocol + m_string.substring(m_schemeEnd));
708    return true;
709}
710
711void URL::setHost(const String& s)
712{
713    if (!m_isValid)
714        return;
715
716    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
717    // and to avoid changing more than just the host.
718
719    bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
720
721    parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd));
722}
723
724void URL::removePort()
725{
726    if (m_hostEnd == m_portEnd)
727        return;
728    parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
729}
730
731void URL::setPort(unsigned short i)
732{
733    if (!m_isValid)
734        return;
735
736    bool colonNeeded = m_portEnd == m_hostEnd;
737    int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
738
739    parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd));
740}
741
742void URL::setHostAndPort(const String& hostAndPort)
743{
744    if (!m_isValid)
745        return;
746
747    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
748    // and to avoid changing more than just host and port.
749
750    bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
751
752    parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd));
753}
754
755void URL::setUser(const String& user)
756{
757    if (!m_isValid)
758        return;
759
760    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
761    // and to avoid changing more than just the user login.
762
763    int end = m_userEnd;
764    if (!user.isEmpty()) {
765        String u = user;
766        if (m_userStart == m_schemeEnd + 1)
767            u = "//" + u;
768        // Add '@' if we didn't have one before.
769        if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
770            u.append('@');
771        parse(m_string.left(m_userStart) + u + m_string.substring(end));
772    } else {
773        // Remove '@' if we now have neither user nor password.
774        if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
775            end += 1;
776        // We don't want to parse in the extremely common case where we are not going to make a change.
777        if (m_userStart != end)
778            parse(m_string.left(m_userStart) + m_string.substring(end));
779    }
780}
781
782void URL::setPass(const String& password)
783{
784    if (!m_isValid)
785        return;
786
787    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
788    // and to avoid changing more than just the user password.
789
790    int end = m_passwordEnd;
791    if (!password.isEmpty()) {
792        String p = ":" + password + "@";
793        if (m_userEnd == m_schemeEnd + 1)
794            p = "//" + p;
795        // Eat the existing '@' since we are going to add our own.
796        if (end != m_hostEnd && m_string[end] == '@')
797            end += 1;
798        parse(m_string.left(m_userEnd) + p + m_string.substring(end));
799    } else {
800        // Remove '@' if we now have neither user nor password.
801        if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
802            end += 1;
803        // We don't want to parse in the extremely common case where we are not going to make a change.
804        if (m_userEnd != end)
805            parse(m_string.left(m_userEnd) + m_string.substring(end));
806    }
807}
808
809void URL::setFragmentIdentifier(const String& s)
810{
811    if (!m_isValid)
812        return;
813
814    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
815    parse(m_string.left(m_queryEnd) + "#" + s);
816}
817
818void URL::removeFragmentIdentifier()
819{
820    if (!m_isValid)
821        return;
822    parse(m_string.left(m_queryEnd));
823}
824
825void URL::setQuery(const String& query)
826{
827    if (!m_isValid)
828        return;
829
830    // FIXME: '#' and non-ASCII characters must be encoded and escaped.
831    // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
832    // access to the document in this function.
833    if ((query.isEmpty() || query[0] != '?') && !query.isNull())
834        parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
835    else
836        parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));
837
838}
839
840void URL::setPath(const String& s)
841{
842    if (!m_isValid)
843        return;
844
845    // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
846    // may be inadvertently affected.
847    String path = s;
848    if (path.isEmpty() || path[0] != '/')
849        path = "/" + path;
850
851    parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd));
852}
853
854String decodeURLEscapeSequences(const String& string)
855{
856    return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
857}
858
// Decodes URL escape sequences in |string| by forwarding to
// decodeEscapeSequences<URLEscapeSequence> with the caller-supplied |encoding|.
String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
{
    return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
}
863
864// Caution: This function does not bounds check.
865static void appendEscapedChar(char*& buffer, unsigned char c)
866{
867    *buffer++ = '%';
868    placeByteAsHex(c, buffer);
869}
870
871static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length)
872{
873    char* p = buffer;
874
875    const char* str = strStart;
876    const char* strEnd = strStart + length;
877    while (str < strEnd) {
878        unsigned char c = *str++;
879        if (isBadChar(c)) {
880            if (c == '%' || c == '?')
881                *p++ = c;
882            else if (c != 0x09 && c != 0x0a && c != 0x0d)
883                appendEscapedChar(p, c);
884        } else
885            *p++ = c;
886    }
887
888    buffer = p;
889}
890
891static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length)
892{
893    char* p = buffer;
894
895    const char* str = strStart;
896    const char* strEnd = strStart + length;
897    while (str < strEnd) {
898        unsigned char c = *str++;
899        // Strip CR, LF and Tab from fragments, per:
900        // https://bugs.webkit.org/show_bug.cgi?id=8770
901        if (c == 0x09 || c == 0x0a || c == 0x0d)
902            continue;
903
904        // Chrome and IE allow non-ascii characters in fragments, however doing
905        // so would hit an ASSERT in checkEncodedString, so for now we don't.
906        if (c < 0x20 || c >= 127) {
907            appendEscapedChar(p, c);
908            continue;
909        }
910        *p++ = c;
911    }
912
913    buffer = p;
914}
915
// Copies the path src[srcStart, srcEnd) into |dst|, dropping "." segments and
// resolving ".." segments against what has already been written. The output is
// NUL-terminated and its length (excluding the terminator) is returned.
// The look-ahead reads src[srcEnd] (and may read one past it is never needed
// because of short-circuiting on the end checks), so |src| must be readable at
// srcEnd.
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd)
{
    char* const out = dst;
    int written = 0;

    // An empty path is a special case, and need not have a leading slash.
    if (srcStart != srcEnd) {
        // This code is unprepared for paths that do not begin with a slash;
        // the source string is expected to always provide one.
        ASSERT(src[srcStart] == '/');

        // Copy the leading slash into the destination.
        out[written] = src[srcStart];
        ++written;

        int pos = srcStart + 1;
        while (pos < srcEnd) {
            const bool dotStartsSegment = src[pos] == '.' && out[written - 1] == '/';
            if (dotStartsSegment) {
                if (src[pos + 1] == '/' || pos + 1 == srcEnd) {
                    // "." segment: skip it together with its trailing slash.
                    pos += 2;
                    continue;
                }
                if (src[pos + 1] == '.' && (src[pos + 2] == '/' || pos + 2 == srcEnd)) {
                    // ".." segment: skip it and rewind the output by one segment.
                    // The RFC leaves it up to the app to decide what to do with
                    // excess ".." segments - they are dropped here since some web
                    // content relies on this.
                    pos += 3;
                    if (written > 1)
                        --written;
                    while (written > 0 && out[written - 1] != '/')
                        --written;
                    continue;
                }
            }

            out[written] = src[pos];
            ++written;
            ++pos;
        }
    }

    out[written] = '\0';
    return written;
}
965
// Returns true when the NUL-terminated |str| contains a '.' that is directly
// preceded by '/' or by another '.'. The first character is never reported
// because it has no predecessor.
static inline bool hasSlashDotOrDotDot(const char* str)
{
    if (!str[0])
        return false;

    for (const char* cursor = str + 1; *cursor; ++cursor) {
        if (*cursor != '.')
            continue;
        const char previous = cursor[-1];
        if (previous == '/' || previous == '.')
            return true;
    }
    return false;
}
979
980void URL::parse(const String& string)
981{
982    checkEncodedString(string);
983
984    CharBuffer buffer(string.length() + 1);
985    copyASCII(string, buffer.data());
986    buffer[string.length()] = '\0';
987    parse(buffer.data(), &string);
988}
989
#if PLATFORM(IOS)
// iOS-only flag, true by default. It is consulted by equal() and by
// URL::parse(const char*, const String*) in this file when comparing and
// copying scheme characters.
static bool shouldCanonicalizeScheme = true;

// Sets shouldCanonicalizeScheme to |enableSchemeCanonicalization|.
void enableURLSchemeCanonicalization(bool enableSchemeCanonicalization)
{
    shouldCanonicalizeScheme = enableSchemeCanonicalization;
}
#endif
998
// Compares the first |length| characters of |a| against the char array |b|.
// Every element of |b| takes part in the comparison, so |a| must be readable up
// to the first mismatch or for |length| characters. On iOS, when scheme
// canonicalization is disabled, |a| is lowercased before each comparison.
template<size_t length>
static inline bool equal(const char* a, const char (&b)[length])
{
#if PLATFORM(IOS)
    if (!shouldCanonicalizeScheme) {
        size_t matched = 0;
        while (matched < length && toASCIILower(a[matched]) == b[matched])
            ++matched;
        return matched == length;
    }
#endif
    size_t position = 0;
    while (position < length && a[position] == b[position])
        ++position;
    return position == length;
}
1017
// Length-checked variant: true only when |lengthA| equals the array size of
// |stringB| and the characters match according to the two-argument equal().
template<size_t lengthB>
static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
{
    if (lengthA != lengthB)
        return false;
    return equal(stringA, stringB);
}
1023
1024// List of default schemes is taken from google-url:
1025// http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120
1026static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength)
1027{
1028    // This switch is theoretically a performance optimization.  It came over when
1029    // the code was moved from google-url, but may be removed later.
1030    switch (schemeLength) {
1031    case 2:
1032        return equal(scheme, wsScheme) && equal(port, portLength, httpPort);
1033    case 3:
1034        if (equal(scheme, ftpScheme))
1035            return equal(port, portLength, ftpPort);
1036        if (equal(scheme, wssScheme))
1037            return equal(port, portLength, httpsPort);
1038        break;
1039    case 4:
1040        return equal(scheme, httpScheme) && equal(port, portLength, httpPort);
1041    case 5:
1042        return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort);
1043    case 6:
1044        return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort);
1045    }
1046    return false;
1047}
1048
// True when the userinfo was terminated by '@' but the host/port range that
// follows it is empty (hostStart == portEnd).
static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userinfoEndChar)
{
    if (userinfoEndChar != '@')
        return false;
    return hostStart == portEnd;
}
1053
1054static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength)
1055{
1056    switch (schemeLength) {
1057    case 2:
1058        return equal(scheme, wsScheme);
1059    case 3:
1060        return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1061    case 4:
1062        return equal(scheme, httpScheme);
1063    case 5:
1064        return equal(scheme, httpsScheme);
1065    case 6:
1066        return equal(scheme, gopherScheme);
1067    }
1068    return false;
1069}
1070
1071static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength)
1072{
1073    switch (schemeLength) {
1074    case 2:
1075        return equal(scheme, wsScheme);
1076    case 3:
1077        return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1078    case 4:
1079        return equal(scheme, httpScheme) || equal(scheme, fileScheme);
1080    case 5:
1081        return equal(scheme, httpsScheme);
1082    case 6:
1083        return equal(scheme, gopherScheme);
1084    }
1085    return false;
1086}
1087
1088void URL::parse(const char* url, const String* originalString)
1089{
1090    if (!url || url[0] == '\0') {
1091        // valid URL must be non-empty
1092        m_string = originalString ? *originalString : url;
1093        invalidate();
1094        return;
1095    }
1096
1097    if (!isSchemeFirstChar(url[0])) {
1098        // scheme must start with an alphabetic character
1099        m_string = originalString ? *originalString : url;
1100        invalidate();
1101        return;
1102    }
1103
1104    int schemeEnd = 0;
1105    while (isSchemeChar(url[schemeEnd]))
1106        schemeEnd++;
1107
1108    if (url[schemeEnd] != ':') {
1109        m_string = originalString ? *originalString : url;
1110        invalidate();
1111        return;
1112    }
1113
1114    int userStart = schemeEnd + 1;
1115    int userEnd;
1116    int passwordStart;
1117    int passwordEnd;
1118    int hostStart;
1119    int hostEnd;
1120    int portStart;
1121    int portEnd;
1122
1123    bool hierarchical = url[schemeEnd + 1] == '/';
1124    bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/';
1125
1126    bool isFile = schemeEnd == 4
1127        && isLetterMatchIgnoringCase(url[0], 'f')
1128        && isLetterMatchIgnoringCase(url[1], 'i')
1129        && isLetterMatchIgnoringCase(url[2], 'l')
1130        && isLetterMatchIgnoringCase(url[3], 'e');
1131
1132    m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h')
1133        && isLetterMatchIgnoringCase(url[1], 't')
1134        && isLetterMatchIgnoringCase(url[2], 't')
1135        && isLetterMatchIgnoringCase(url[3], 'p')
1136        && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
1137
1138    if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) {
1139        // The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
1140        // Attempt to find an authority.
1141        // FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
1142
1143        if (hierarchical)
1144            userStart++;
1145        if (hasSecondSlash)
1146            userStart++;
1147        userEnd = userStart;
1148
1149        int colonPos = 0;
1150        while (isUserInfoChar(url[userEnd])) {
1151            if (url[userEnd] == ':' && colonPos == 0)
1152                colonPos = userEnd;
1153            userEnd++;
1154        }
1155
1156        if (url[userEnd] == '@') {
1157            // actual end of the userinfo, start on the host
1158            if (colonPos != 0) {
1159                passwordEnd = userEnd;
1160                userEnd = colonPos;
1161                passwordStart = colonPos + 1;
1162            } else
1163                passwordStart = passwordEnd = userEnd;
1164
1165            hostStart = passwordEnd + 1;
1166        } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
1167            // hit the end of the authority, must have been no user
1168            // or looks like an IPv6 hostname
1169            // either way, try to parse it as a hostname
1170            userEnd = userStart;
1171            passwordStart = passwordEnd = userEnd;
1172            hostStart = userStart;
1173        } else {
1174            // invalid character
1175            m_string = originalString ? *originalString : url;
1176            invalidate();
1177            return;
1178        }
1179
1180        hostEnd = hostStart;
1181
1182        // IPV6 IP address
1183        if (url[hostEnd] == '[') {
1184            hostEnd++;
1185            while (isIPv6Char(url[hostEnd]))
1186                hostEnd++;
1187            if (url[hostEnd] == ']')
1188                hostEnd++;
1189            else {
1190                // invalid character
1191                m_string = originalString ? *originalString : url;
1192                invalidate();
1193                return;
1194            }
1195        } else {
1196            while (isHostnameChar(url[hostEnd]))
1197                hostEnd++;
1198        }
1199
1200        if (url[hostEnd] == ':') {
1201            portStart = portEnd = hostEnd + 1;
1202
1203            // possible start of port
1204            portEnd = portStart;
1205            while (isASCIIDigit(url[portEnd]))
1206                portEnd++;
1207        } else
1208            portStart = portEnd = hostEnd;
1209
1210        if (!isPathSegmentEndChar(url[portEnd])) {
1211            // invalid character
1212            m_string = originalString ? *originalString : url;
1213            invalidate();
1214            return;
1215        }
1216
1217        if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[passwordEnd])) {
1218            m_string = originalString ? *originalString : url;
1219            invalidate();
1220            return;
1221        }
1222
1223        if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) {
1224            // No authority found, which means that this is not a net_path, but rather an abs_path whose first two
1225            // path segments are empty. For file, http and https only, an empty authority is allowed.
1226            userStart -= 2;
1227            userEnd = userStart;
1228            passwordStart = userEnd;
1229            passwordEnd = passwordStart;
1230            hostStart = passwordEnd;
1231            hostEnd = hostStart;
1232            portStart = hostEnd;
1233            portEnd = hostEnd;
1234        }
1235    } else {
1236        // the part after the scheme must be an opaque_part or an abs_path
1237        userEnd = userStart;
1238        passwordStart = passwordEnd = userEnd;
1239        hostStart = hostEnd = passwordEnd;
1240        portStart = portEnd = hostEnd;
1241    }
1242
1243    int pathStart = portEnd;
1244    int pathEnd = pathStart;
1245    while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
1246        pathEnd++;
1247
1248    int queryStart = pathEnd;
1249    int queryEnd = queryStart;
1250    if (url[queryStart] == '?') {
1251        while (url[queryEnd] && url[queryEnd] != '#')
1252            queryEnd++;
1253    }
1254
1255    int fragmentStart = queryEnd;
1256    int fragmentEnd = fragmentStart;
1257    if (url[fragmentStart] == '#') {
1258        fragmentStart++;
1259        fragmentEnd = fragmentStart;
1260        while (url[fragmentEnd])
1261            fragmentEnd++;
1262    }
1263
1264    // assemble it all, remembering the real ranges
1265
1266    Vector<char, 4096> buffer(fragmentEnd * 3 + 1);
1267
1268    char *p = buffer.data();
1269    const char *strPtr = url;
1270
1271    // copy in the scheme
1272    const char *schemeEndPtr = url + schemeEnd;
1273#if PLATFORM(IOS)
1274    if (shouldCanonicalizeScheme || m_protocolIsInHTTPFamily) {
1275        while (strPtr < schemeEndPtr)
1276            *p++ = toASCIILower(*strPtr++);
1277    } else {
1278        while (strPtr < schemeEndPtr)
1279            *p++ = *strPtr++;
1280    }
1281#else
1282    while (strPtr < schemeEndPtr)
1283        *p++ = toASCIILower(*strPtr++);
1284#endif
1285    m_schemeEnd = p - buffer.data();
1286
1287    bool hostIsLocalHost = portEnd - userStart == 9
1288        && isLetterMatchIgnoringCase(url[userStart], 'l')
1289        && isLetterMatchIgnoringCase(url[userStart+1], 'o')
1290        && isLetterMatchIgnoringCase(url[userStart+2], 'c')
1291        && isLetterMatchIgnoringCase(url[userStart+3], 'a')
1292        && isLetterMatchIgnoringCase(url[userStart+4], 'l')
1293        && isLetterMatchIgnoringCase(url[userStart+5], 'h')
1294        && isLetterMatchIgnoringCase(url[userStart+6], 'o')
1295        && isLetterMatchIgnoringCase(url[userStart+7], 's')
1296        && isLetterMatchIgnoringCase(url[userStart+8], 't');
1297
1298    // File URLs need a host part unless it is just file:// or file://localhost
1299    bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);
1300
1301    // We drop empty credentials, but keep a colon in an empty host/port pair.
1302    // Removing hostname completely would change the structure of the URL on re-parsing.
1303    bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || hostEnd != portEnd;
1304
1305    // add ":" after scheme
1306    *p++ = ':';
1307
1308    // if we have at least one authority part or a file URL - add "//" and authority
1309    if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
1310        *p++ = '/';
1311        *p++ = '/';
1312
1313        m_userStart = p - buffer.data();
1314
1315        // copy in the user
1316        strPtr = url + userStart;
1317        const char* userEndPtr = url + userEnd;
1318        while (strPtr < userEndPtr) {
1319            char c = *strPtr++;
1320            ASSERT(isUserInfoChar(c));
1321            *p++ = c;
1322        }
1323        m_userEnd = p - buffer.data();
1324
1325        // copy in the password
1326        if (passwordEnd != passwordStart) {
1327            *p++ = ':';
1328            strPtr = url + passwordStart;
1329            const char* passwordEndPtr = url + passwordEnd;
1330            while (strPtr < passwordEndPtr) {
1331                char c = *strPtr++;
1332                ASSERT(isUserInfoChar(c));
1333                *p++ = c;
1334            }
1335        }
1336        m_passwordEnd = p - buffer.data();
1337
1338        // If we had any user info, add "@"
1339        if (p - buffer.data() != m_userStart)
1340            *p++ = '@';
1341
1342        // copy in the host, except in the case of a file URL with authority="localhost"
1343        if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
1344            strPtr = url + hostStart;
1345            const char* hostEndPtr = url + hostEnd;
1346            if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) {
1347                while (strPtr < hostEndPtr) {
1348                    char c = toASCIILower(*strPtr++);
1349                    ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1350                    *p++ = c;
1351                }
1352            } else {
1353                while (strPtr < hostEndPtr) {
1354                    char c = *strPtr++;
1355                    ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1356                    *p++ = c;
1357                }
1358            }
1359        }
1360        m_hostEnd = p - buffer.data();
1361
1362        // Copy in the port if the URL has one (and it's not default). Also, copy it if there was no hostname, so that there is still something in authority component.
1363        if (hostEnd != portStart) {
1364            const char* portStr = url + portStart;
1365            size_t portLength = portEnd - portStart;
1366            if ((portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd))
1367                || (hostStart == hostEnd && hostEnd != portStart)) {
1368                *p++ = ':';
1369                const char* portEndPtr = url + portEnd;
1370                while (portStr < portEndPtr)
1371                    *p++ = *portStr++;
1372            }
1373        }
1374        m_portEnd = p - buffer.data();
1375    } else {
1376        if (isFile) {
1377            ASSERT(degenerateFilePath);
1378            *p++ = '/';
1379            *p++ = '/';
1380        }
1381        m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();
1382    }
1383
1384    // For canonicalization, ensure we have a '/' for no path.
1385    // Do this only for URL with protocol file, http or https.
1386    if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart)
1387        *p++ = '/';
1388
1389    // add path, escaping bad characters
1390    if (!hierarchical)
1391        escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart);
1392    else if (!hasSlashDotOrDotDot(url))
1393        appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
1394    else {
1395        CharBuffer pathBuffer(pathEnd - pathStart + 1);
1396        size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
1397        appendEscapingBadChars(p, pathBuffer.data(), length);
1398    }
1399
1400    m_pathEnd = p - buffer.data();
1401
1402    // Find the position after the last slash in the path, or
1403    // the position before the path if there are no slashes in it.
1404    int i;
1405    for (i = m_pathEnd; i > m_portEnd; --i) {
1406        if (buffer[i - 1] == '/')
1407            break;
1408    }
1409    m_pathAfterLastSlash = i;
1410
1411    // add query, escaping bad characters
1412    appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
1413    m_queryEnd = p - buffer.data();
1414
1415    // add fragment, escaping bad characters
1416    if (fragmentEnd != queryEnd) {
1417        *p++ = '#';
1418        escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart);
1419    }
1420    m_fragmentEnd = p - buffer.data();
1421
1422    ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1423    ASSERT(buffer.size() > 0);
1424
1425    // If we didn't end up actually changing the original string and
1426    // it was already in a String, reuse it to avoid extra allocation.
1427    if (originalString && equal(originalString->impl(), buffer.data(), m_fragmentEnd))
1428        m_string = *originalString;
1429    else
1430        m_string = String(buffer.data(), m_fragmentEnd);
1431
1432    m_isValid = true;
1433}
1434
1435bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b)
1436{
1437    if (a.m_queryEnd != b.m_queryEnd)
1438        return false;
1439    unsigned queryLength = a.m_queryEnd;
1440    for (unsigned i = 0; i < queryLength; ++i)
1441        if (a.string()[i] != b.string()[i])
1442            return false;
1443    return true;
1444}
1445
1446bool protocolHostAndPortAreEqual(const URL& a, const URL& b)
1447{
1448    if (a.m_schemeEnd != b.m_schemeEnd)
1449        return false;
1450
1451    int hostStartA = a.hostStart();
1452    int hostLengthA = a.hostEnd() - hostStartA;
1453    int hostStartB = b.hostStart();
1454    int hostLengthB = b.hostEnd() - b.hostStart();
1455    if (hostLengthA != hostLengthB)
1456        return false;
1457
1458    // Check the scheme
1459    for (int i = 0; i < a.m_schemeEnd; ++i)
1460        if (a.string()[i] != b.string()[i])
1461            return false;
1462
1463    // And the host
1464    for (int i = 0; i < hostLengthA; ++i)
1465        if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
1466            return false;
1467
1468    if (a.port() != b.port())
1469        return false;
1470
1471    return true;
1472}
1473
1474String encodeWithURLEscapeSequences(const String& notEncodedString)
1475{
1476    CString asUTF8 = notEncodedString.utf8();
1477
1478    CharBuffer buffer(asUTF8.length() * 3 + 1);
1479    char* p = buffer.data();
1480
1481    const char* str = asUTF8.data();
1482    const char* strEnd = str + asUTF8.length();
1483    while (str < strEnd) {
1484        unsigned char c = *str++;
1485        if (isBadChar(c))
1486            appendEscapedChar(p, c);
1487        else
1488            *p++ = c;
1489    }
1490
1491    ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1492
1493    return String(buffer.data(), p - buffer.data());
1494}
1495
1496static bool containsOnlyASCII(StringView string)
1497{
1498    if (string.is8Bit())
1499        return charactersAreAllASCII(string.characters8(), string.length());
1500    return charactersAreAllASCII(string.characters16(), string.length());
1501}
1502
1503static bool protocolIs(StringView stringURL, const char* protocol)
1504{
1505    assertProtocolIsGood(protocol);
1506    unsigned length = stringURL.length();
1507    for (unsigned i = 0; i < length; ++i) {
1508        if (!protocol[i])
1509            return stringURL[i] == ':';
1510        if (!isLetterMatchIgnoringCase(stringURL[i], protocol[i]))
1511            return false;
1512    }
1513    return false;
1514}
1515
1516// Appends the punycoded hostname identified by the given string and length to
1517// the output buffer. The result will not be null terminated.
1518static void appendEncodedHostname(UCharBuffer& buffer, StringView string)
1519{
1520    // Needs to be big enough to hold an IDN-encoded name.
1521    // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
1522    const unsigned hostnameBufferLength = 2048;
1523
1524    if (string.length() > hostnameBufferLength || containsOnlyASCII(string)) {
1525        append(buffer, string);
1526        return;
1527    }
1528
1529    UChar hostnameBuffer[hostnameBufferLength];
1530    UErrorCode error = U_ZERO_ERROR;
1531    int32_t numCharactersConverted = uidna_IDNToASCII(string.upconvertedCharacters(), string.length(), hostnameBuffer,
1532        hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
1533    if (error == U_ZERO_ERROR)
1534        buffer.append(hostnameBuffer, numCharactersConverted);
1535}
1536
// Fills |nameRanges| with the [start, end) offsets of every host name found in
// |string|, which is treated as a mailto: URL. Any previous contents of
// |nameRanges| are discarded.
static void findHostnamesInMailToURL(StringView string, Vector<std::pair<int, int>>& nameRanges)
{
    // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character.
    // Skip quoted strings so that characters in them don't confuse us.
    // When we find a '?' character, we are past the part of the URL that contains host names.

    nameRanges.clear();

    // |p| is the scan position; every search below starts from it.
    int p = 0;
    while (1) {
        // Find start of host name or of quoted string.
        // A result of -1 is treated as "no such character from p onwards".
        int hostnameOrStringStart = findFirstOf(string, p, "\"@?");
        if (hostnameOrStringStart == -1)
            return;
        UChar c = string[hostnameOrStringStart];
        p = hostnameOrStringStart + 1;

        if (c == '?')
            return;

        if (c == '@') {
            // Find end of host name.
            int hostnameStart = p;
            int hostnameEnd = findFirstOf(string, p, ">,?");
            bool done;
            if (hostnameEnd == -1) {
                // No terminator: the host name runs to the end of the string.
                hostnameEnd = string.length();
                done = true;
            } else {
                // Resume scanning at the terminator itself so that a '?' there ends the outer loop.
                p = hostnameEnd;
                done = false;
            }

            nameRanges.append(std::make_pair(hostnameStart, hostnameEnd));

            if (done)
                return;
        } else {
            // Skip quoted string.
            ASSERT(c == '"');
            while (1) {
                int escapedCharacterOrStringEnd = findFirstOf(string, p, "\"\\");
                if (escapedCharacterOrStringEnd == -1)
                    return;

                c = string[escapedCharacterOrStringEnd];
                p = escapedCharacterOrStringEnd + 1;

                // If we are the end of the string, then break from the string loop back to the host name loop.
                if (c == '"')
                    break;

                // Skip escaped character.
                ASSERT(c == '\\');
                // A backslash as the very last character has nothing to escape.
                if (p == static_cast<int>(string.length()))
                    return;

                ++p;
            }
        }
    }
}
1599
1600static bool findHostnameInHierarchicalURL(StringView string, int& startOffset, int& endOffset)
1601{
1602    // Find the host name in a hierarchical URL.
1603    // It comes after a "://" sequence, with scheme characters preceding, and
1604    // this should be the first colon in the string.
1605    // It ends with the end of the string or a ":" or a path segment ending character.
1606    // If there is a "@" character, the host part is just the part after the "@".
1607    int separator = findFirstOf(string, 0, ":");
1608    if (separator == -1 || separator + 2 >= static_cast<int>(string.length()) || string[separator + 1] != '/' || string[separator + 2] != '/')
1609        return false;
1610
1611    // Check that all characters before the :// are valid scheme characters.
1612    if (!isSchemeFirstChar(string[0]))
1613        return false;
1614    for (int i = 1; i < separator; ++i) {
1615        if (!isSchemeChar(string[i]))
1616            return false;
1617    }
1618
1619    // Start after the separator.
1620    int authorityStart = separator + 3;
1621
1622    // Find terminating character.
1623    int hostnameEnd = string.length();
1624    for (int i = authorityStart; i < hostnameEnd; ++i) {
1625        UChar c = string[i];
1626        if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) {
1627            hostnameEnd = i;
1628            break;
1629        }
1630    }
1631
1632    // Find "@" for the start of the host name.
1633    int userInfoTerminator = findFirstOf(string, authorityStart, "@");
1634    int hostnameStart;
1635    if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd)
1636        hostnameStart = authorityStart;
1637    else
1638        hostnameStart = userInfoTerminator + 1;
1639
1640    startOffset = hostnameStart;
1641    endOffset = hostnameEnd;
1642    return true;
1643}
1644
1645// Converts all hostnames found in the given input to punycode, preserving the
1646// rest of the URL unchanged. The output will NOT be null-terminated.
1647static void encodeHostnames(StringView string, UCharBuffer& buffer)
1648{
1649    buffer.clear();
1650
1651    if (protocolIs(string, "mailto")) {
1652        Vector<std::pair<int, int>> hostnameRanges;
1653        findHostnamesInMailToURL(string, hostnameRanges);
1654        int n = hostnameRanges.size();
1655        int p = 0;
1656        for (int i = 0; i < n; ++i) {
1657            const std::pair<int, int>& r = hostnameRanges[i];
1658            append(buffer, string.substring(p, r.first - p));
1659            appendEncodedHostname(buffer, string.substring(r.first, r.second - r.first));
1660            p = r.second;
1661        }
1662        // This will copy either everything after the last hostname, or the
1663        // whole thing if there is no hostname.
1664        append(buffer, string.substring(p));
1665    } else {
1666        int hostStart, hostEnd;
1667        if (findHostnameInHierarchicalURL(string, hostStart, hostEnd)) {
1668            append(buffer, string.substring(0, hostStart)); // Before hostname.
1669            appendEncodedHostname(buffer, string.substring(hostStart, hostEnd - hostStart));
1670            append(buffer, string.substring(hostEnd)); // After hostname.
1671        } else {
1672            // No hostname to encode, return the input.
1673            append(buffer, string);
1674        }
1675    }
1676}
1677
1678static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output)
1679{
1680    UCharBuffer s;
1681    encodeHostnames(rel, s);
1682
1683    TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme.
1684
1685    int pathEnd = -1;
1686    if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) {
1687        // Find the first instance of either # or ?, keep pathEnd at -1 otherwise.
1688        pathEnd = findFirstOf(StringView(s.data(), s.size()), 0, "#?");
1689    }
1690
1691    if (pathEnd == -1) {
1692        CString decoded = pathEncoding.encode(StringView(s.data(), s.size()), URLEncodedEntitiesForUnencodables);
1693        output.resize(decoded.length());
1694        memcpy(output.data(), decoded.data(), decoded.length());
1695    } else {
1696        CString pathDecoded = pathEncoding.encode(StringView(s.data(), pathEnd), URLEncodedEntitiesForUnencodables);
1697        // Unencodable characters in URLs are represented by converting
1698        // them to XML entities and escaping non-alphanumeric characters.
1699        CString otherDecoded = encoding.encode(StringView(s.data() + pathEnd, s.size() - pathEnd), URLEncodedEntitiesForUnencodables);
1700
1701        output.resize(pathDecoded.length() + otherDecoded.length());
1702        memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
1703        memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length());
1704    }
1705    output.append('\0'); // null-terminate the output.
1706}
1707
1708static String substituteBackslashes(const String& string)
1709{
1710    size_t questionPos = string.find('?');
1711    size_t hashPos = string.find('#');
1712    unsigned pathEnd;
1713
1714    if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos))
1715        pathEnd = hashPos;
1716    else if (questionPos != notFound)
1717        pathEnd = questionPos;
1718    else
1719        pathEnd = string.length();
1720
1721    return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd);
1722}
1723
1724bool URL::isHierarchical() const
1725{
1726    if (!m_isValid)
1727        return false;
1728    ASSERT(m_string[m_schemeEnd] == ':');
1729    return m_string[m_schemeEnd + 1] == '/';
1730}
1731
1732void URL::copyToBuffer(Vector<char, 512>& buffer) const
1733{
1734    // FIXME: This throws away the high bytes of all the characters in the string!
1735    // That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
1736    buffer.resize(m_string.length());
1737    copyASCII(m_string, buffer.data());
1738}
1739
1740bool protocolIs(const String& url, const char* protocol)
1741{
1742    // Do the comparison without making a new string object.
1743    assertProtocolIsGood(protocol);
1744    for (int i = 0; ; ++i) {
1745        if (!protocol[i])
1746            return url[i] == ':';
1747        if (!isLetterMatchIgnoringCase(url[i], protocol[i]))
1748            return false;
1749    }
1750}
1751
1752bool isValidProtocol(const String& protocol)
1753{
1754    // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
1755    if (protocol.isEmpty())
1756        return false;
1757    if (!isSchemeFirstChar(protocol[0]))
1758        return false;
1759    unsigned protocolLength = protocol.length();
1760    for (unsigned i = 1; i < protocolLength; i++) {
1761        if (!isSchemeChar(protocol[i]))
1762            return false;
1763    }
1764    return true;
1765}
1766
1767#ifndef NDEBUG
1768void URL::print() const
1769{
1770    printf("%s\n", m_string.utf8().data());
1771}
1772#endif
1773
1774String URL::strippedForUseAsReferrer() const
1775{
1776    URL referrer(*this);
1777    referrer.setUser(String());
1778    referrer.setPass(String());
1779    referrer.removeFragmentIdentifier();
1780    return referrer.string();
1781}
1782
1783bool URL::isLocalFile() const
1784{
1785    // Including feed here might be a bad idea since drag and drop uses this check
1786    // and including feed would allow feeds to potentially let someone's blog
1787    // read the contents of the clipboard on a drag, even without a drop.
1788    // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
1789    return protocolIs("file");
1790}
1791
1792bool protocolIsJavaScript(const String& url)
1793{
1794    return protocolIs(url, "javascript");
1795}
1796
1797bool protocolIsInHTTPFamily(const String& url)
1798{
1799    // Do the comparison without making a new string object.
1800    return isLetterMatchIgnoringCase(url[0], 'h')
1801        && isLetterMatchIgnoringCase(url[1], 't')
1802        && isLetterMatchIgnoringCase(url[2], 't')
1803        && isLetterMatchIgnoringCase(url[3], 'p')
1804        && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
1805}
1806
1807const URL& blankURL()
1808{
1809    DEPRECATED_DEFINE_STATIC_LOCAL(URL, staticBlankURL, (ParsedURLString, "about:blank"));
1810    return staticBlankURL;
1811}
1812
1813bool URL::isBlankURL() const
1814{
1815    return protocolIs("about");
1816}
1817
1818bool isDefaultPortForProtocol(unsigned short port, const String& protocol)
1819{
1820    if (protocol.isEmpty())
1821        return false;
1822
1823    typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap;
1824    DEPRECATED_DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ());
1825    if (defaultPorts.isEmpty()) {
1826        defaultPorts.set("http", 80);
1827        defaultPorts.set("https", 443);
1828        defaultPorts.set("ftp", 21);
1829        defaultPorts.set("ftps", 990);
1830    }
1831    return defaultPorts.get(protocol) == port;
1832}
1833
1834bool portAllowed(const URL& url)
1835{
1836    unsigned short port = url.port();
1837
1838    // Since most URLs don't have a port, return early for the "no port" case.
1839    if (!port)
1840        return true;
1841
1842    // This blocked port list matches the port blocking that Mozilla implements.
1843    // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
1844    static const unsigned short blockedPortList[] = {
1845        1,    // tcpmux
1846        7,    // echo
1847        9,    // discard
1848        11,   // systat
1849        13,   // daytime
1850        15,   // netstat
1851        17,   // qotd
1852        19,   // chargen
1853        20,   // FTP-data
1854        21,   // FTP-control
1855        22,   // SSH
1856        23,   // telnet
1857        25,   // SMTP
1858        37,   // time
1859        42,   // name
1860        43,   // nicname
1861        53,   // domain
1862        77,   // priv-rjs
1863        79,   // finger
1864        87,   // ttylink
1865        95,   // supdup
1866        101,  // hostriame
1867        102,  // iso-tsap
1868        103,  // gppitnp
1869        104,  // acr-nema
1870        109,  // POP2
1871        110,  // POP3
1872        111,  // sunrpc
1873        113,  // auth
1874        115,  // SFTP
1875        117,  // uucp-path
1876        119,  // nntp
1877        123,  // NTP
1878        135,  // loc-srv / epmap
1879        139,  // netbios
1880        143,  // IMAP2
1881        179,  // BGP
1882        389,  // LDAP
1883        465,  // SMTP+SSL
1884        512,  // print / exec
1885        513,  // login
1886        514,  // shell
1887        515,  // printer
1888        526,  // tempo
1889        530,  // courier
1890        531,  // Chat
1891        532,  // netnews
1892        540,  // UUCP
1893        556,  // remotefs
1894        563,  // NNTP+SSL
1895        587,  // ESMTP
1896        601,  // syslog-conn
1897        636,  // LDAP+SSL
1898        993,  // IMAP+SSL
1899        995,  // POP3+SSL
1900        2049, // NFS
1901        3659, // apple-sasl / PasswordServer [Apple addition]
1902        4045, // lockd
1903        6000, // X11
1904        6665, // Alternate IRC [Apple addition]
1905        6666, // Alternate IRC [Apple addition]
1906        6667, // Standard IRC [Apple addition]
1907        6668, // Alternate IRC [Apple addition]
1908        6669, // Alternate IRC [Apple addition]
1909        invalidPortNumber, // Used to block all invalid port numbers
1910    };
1911    const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList);
1912
1913#ifndef NDEBUG
1914    // The port list must be sorted for binary_search to work.
1915    static bool checkedPortList = false;
1916    if (!checkedPortList) {
1917        for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p)
1918            ASSERT(*p < *(p + 1));
1919        checkedPortList = true;
1920    }
1921#endif
1922
1923    // If the port is not in the blocked port list, allow it.
1924    if (!std::binary_search(blockedPortList, blockedPortListEnd, port))
1925        return true;
1926
1927    // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
1928    if ((port == 21 || port == 22) && url.protocolIs("ftp"))
1929        return true;
1930
1931    // Allow any port number in a file URL, since the port number is ignored.
1932    if (url.protocolIs("file"))
1933        return true;
1934
1935    return false;
1936}
1937
1938String mimeTypeFromDataURL(const String& url)
1939{
1940    ASSERT(protocolIs(url, "data"));
1941    size_t index = url.find(';');
1942    if (index == notFound)
1943        index = url.find(',');
1944    if (index != notFound) {
1945        if (index > 5)
1946            return url.substring(5, index - 5).lower();
1947        return "text/plain"; // Data URLs with no MIME type are considered text/plain.
1948    }
1949    return "";
1950}
1951
1952String mimeTypeFromURL(const URL& url)
1953{
1954    String decodedPath = decodeURLEscapeSequences(url.path());
1955    String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1);
1956
1957    // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure
1958    return MIMETypeRegistry::getMIMETypeForExtension(extension);
1959}
1960
1961bool URL::isSafeToSendToAnotherThread() const
1962{
1963    return m_string.isSafeToSendToAnotherThread();
1964}
1965
1966String URL::stringCenterEllipsizedToLength(unsigned length) const
1967{
1968    if (string().length() <= length)
1969        return string();
1970
1971    return string().left(length / 2 - 1) + "..." + string().right(length / 2 - 2);
1972}
1973
1974URL URL::fakeURLWithRelativePart(const String& relativePart)
1975{
1976    return URL(URL(), "webkit-fake-url://" + createCanonicalUUIDString() + '/' + relativePart);
1977}
1978
1979}
1980