uri.d revision 1.1.1.2
1// Written in the D programming language.
2
3/**
4 * Encode and decode Uniform Resource Identifiers (URIs).
5 * URIs are used in internet transfer protocols.
6 * Valid URI characters consist of letters, digits,
7 * and the characters $(B ;/?:@&=+$,-_.!~*'())
8 * Reserved URI characters are $(B ;/?:@&=+$,)
9 * Escape sequences consist of $(B %) followed by two hex digits.
10 *
11 * See_Also:
12 *  $(LINK2 https://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
13 *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
14 * Copyright: Copyright The D Language Foundation 2000 - 2009.
15 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
16 * Authors:   $(HTTP digitalmars.com, Walter Bright)
17 * Source:    $(PHOBOSSRC std/uri.d)
18 */
19/*          Copyright The D Language Foundation 2000 - 2009.
20 * Distributed under the Boost Software License, Version 1.0.
21 *    (See accompanying file LICENSE_1_0.txt or copy at
22 *          http://www.boost.org/LICENSE_1_0.txt)
23 */
24module std.uri;
25
26//debug=uri;        // uncomment to turn on debugging writefln's
27debug(uri) import std.stdio;
28import std.traits : isSomeChar;
29
/**
 * Exception type raised by this module whenever encoding or decoding a
 * URI fails.
 */
class URIException : Exception
{
    /**
     * Params:
     *      msg  = description of the failure
     *      file = source file of the throw site
     *      line = source line of the throw site
     *      next = optional exception this one is chained to
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    /// Ditto, with the chained exception given directly after the message.
    this(string msg, Throwable next, string file = __FILE__,
         size_t line = __LINE__) @nogc @safe pure nothrow
    {
        super(msg, next, file, line);
    }
}
38
///
@safe unittest
{
    import std.exception : assertThrown;

    // 0xAB has the bit pattern of a UTF-8 continuation byte, so it cannot
    // begin an escaped character and decoding must fail.
    assertThrown!URIException(decode("%ab"));
}
45
// Character-class bits stored in `uri_flags`. They are OR-ed together to
// describe a set of character classes.
private enum
{
    URI_Alpha    = 0x01,
    URI_Reserved = 0x02,
    URI_Mark     = 0x04,
    URI_Digit    = 0x08,
    URI_Hash     = 0x10,    // '#'
}

// Upper-case hexadecimal digits, indexed by nibble value.
private immutable char[16] hex2ascii = "0123456789ABCDEF";

// Class bits of every ASCII character, indexed by character code.
// The table is computed once, at compile time.
private immutable ubyte[128] uri_flags = (() {
    ubyte[128] table;

    // Letters: upper and lower case share the same offset from 'A' / 'a'.
    foreach (offset; 0 .. 26)
    {
        table['A' + offset] |= URI_Alpha;
        table['a' + offset] |= URI_Alpha;
    }

    foreach (digit; 0 .. 10)
        table['0' + digit] |= URI_Digit;

    foreach (ch; "-_.!~*'()")
        table[ch] |= URI_Mark;

    foreach (ch; ";/?:@&=+$,")
        table[ch] |= URI_Reserved;

    table['#'] |= URI_Hash;

    return table;
})();
74
/*
 * Percent-encodes a UTF-32 string.
 *
 * Every code point of `str` that is ASCII and whose class bits in
 * `uri_flags` intersect `unescapedSet` is copied through unchanged. Every
 * other code point is converted to UTF-8 and each resulting octet is written
 * as '%' followed by two upper-case hexadecimal digits.
 *
 * Params:
 *      str          = the text to encode
 *      unescapedSet = OR-ed `URI_*` class bits of the characters to keep as-is
 *
 * Returns:
 *      The encoded text as a newly allocated string.
 *
 * Throws:
 *      URIException if `str` contains a value above U+10FFFF, which has no
 *      UTF-8 representation.
 */
private string URI_Encode(dstring str, uint unescapedSet) @safe pure
{
    // The result is assembled in a small local buffer first and moved to a
    // larger, freshly allocated one only when it no longer fits.
    char[50] buffer = void;
    char[] R = buffer[];
    size_t Rlen = 0;                // characters written so far
    size_t Rsize = buffer.length;   // current capacity of R

    foreach (immutable dchar C; str)
    {
        if (C < uri_flags.length && (uri_flags[C] & unescapedSet) != 0)
        {
            // Character is in the unescaped set: copy it verbatim.
            if (Rlen == Rsize)
            {
                Rsize *= 2;
                auto R2 = new char[Rsize];
                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }
            R[Rlen] = cast(char) C;
            Rlen++;
            continue;
        }

        // Transform the code point into its UTF-8 octets.
        char[4] Octet;
        size_t L;

        if (C <= 0x7F)
        {
            Octet[0] = cast(char) C;
            L = 1;
        }
        else if (C <= 0x7FF)
        {
            Octet[0] = cast(char)(0xC0 | (C >> 6));
            Octet[1] = cast(char)(0x80 | (C & 0x3F));
            L = 2;
        }
        else if (C <= 0xFFFF)
        {
            Octet[0] = cast(char)(0xE0 | (C >> 12));
            Octet[1] = cast(char)(0x80 | ((C >> 6) & 0x3F));
            Octet[2] = cast(char)(0x80 | (C & 0x3F));
            L = 3;
        }
        else if (C <= 0x10FFFF)
        {
            // U+10FFFF is the last code point; anything above it would
            // produce octets that URI_Decode refuses to read back.
            Octet[0] = cast(char)(0xF0 | (C >> 18));
            Octet[1] = cast(char)(0x80 | ((C >> 12) & 0x3F));
            Octet[2] = cast(char)(0x80 | ((C >> 6) & 0x3F));
            Octet[3] = cast(char)(0x80 | (C & 0x3F));
            L = 4;
        }
        else
        {
            throw new URIException("Undefined UTF-32 code point");
        }

        // Each octet expands to three output characters ("%XX").
        if (Rlen + L * 3 > Rsize)
        {
            Rsize = 2 * (Rlen + L * 3);
            auto R2 = new char[Rsize];
            R2[0 .. Rlen] = R[0 .. Rlen];
            R = R2;
        }

        foreach (j; 0 .. L)
        {
            R[Rlen] = '%';
            R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
            R[Rlen + 2] = hex2ascii[Octet[j] & 15];
            Rlen += 3;
        }
    }

    return R[0 .. Rlen].idup;
}
174
@safe pure unittest
{
    import std.exception : assertThrown;

    // Nothing in, nothing out.
    assert(URI_Encode("", 0) == "");

    // With an empty unescaped set every character becomes a three-character
    // escape; 60 input characters also force the result buffer to grow.
    enum dstring tenAs = "aaaaaaaaaa";
    enum dstring sixtyAs = tenAs ~ tenAs ~ tenAs ~ tenAs ~ tenAs ~ tenAs;
    assert(URI_Encode(sixtyAs, 0).length == 3 * 60);

    // A four-octet sequence survives a decode/encode round trip.
    enum fourOctets = "%F0%BF%BF%BF";
    assert(URI_Encode(URI_Decode(fourOctets, 0), 0) == fourOctets);

    // A value that is not a code point at all is refused.
    dstring invalid;
    invalid ~= cast(dchar) 0xFFFFFFFF;
    assertThrown(URI_Encode(invalid, 0));
}
186
// Converts one hexadecimal digit character to its numeric value.
// The argument is not validated; callers check it with isHexDigit first.
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    if (c <= '9')
        return c - '0';         // decimal digit
    if (c <= 'F')
        return c - 'A' + 10;    // upper-case letter
    return c - 'a' + 10;        // lower-case letter
}
193
/*
 * Decodes the "%XX" escape sequences of `uri` into UTF-32.
 *
 * Characters other than '%' are copied through. An escape whose octet is
 * below 0x80 stands for that ASCII character; otherwise it starts a UTF-8
 * sequence of two to four escaped octets that is combined into a single code
 * point. If the decoded character is ASCII and its class bits in `uri_flags`
 * intersect `reservedSet`, the original escape text is kept instead of the
 * decoded character.
 *
 * Params:
 *      uri         = the text to decode
 *      reservedSet = OR-ed `URI_*` class bits of characters whose escapes
 *                    must be left untouched
 *
 * Returns:
 *      The decoded text as a newly allocated dstring.
 *
 * Throws:
 *      URIException if an escape is truncated, is not followed by two
 *      hexadecimal digits, or does not form a well-formed UTF-8 sequence
 *      (bad lead or continuation octet, overlong form, or a value above
 *      U+10FFFF).
 */
private dstring URI_Decode(Char)(scope const(Char)[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;

    immutable len = uri.length;

    // Every input character yields at most one output character, so `len`
    // elements are always enough.
    auto R = new dchar[len];
    size_t Rlen = 0;

    for (size_t k = 0; k != len; k++)
    {
        dchar C = uri[k];
        if (C != '%')
        {
            R[Rlen] = C;
            Rlen++;
            continue;
        }

        immutable start = k;    // remembered so the escape can be copied verbatim
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        char B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
        k += 2;

        if ((B & 0x80) == 0)
        {
            // Single octet: plain ASCII.
            C = B;
        }
        else
        {
            // n = number of leading 1 bits of the first octet, which is the
            // number of octets in the sequence.
            uint n;
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    // A single leading 1 bit marks a continuation octet,
                    // which cannot start a sequence.
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off the (7 - n) payload bits of the first octet.
            uint V = B & ((1 << (7 - n)) - 1);

            // The remaining n - 1 octets need three characters each.
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            for (uint j = 1; j != n; j++)
            {
                k++;
                if (uri[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }

            // Reject overlong forms: a value that fits in a shorter sequence
            // must not be smuggled in through a longer one (e.g. "%C0%AF"
            // standing in for '/').
            immutable uint smallest = (n == 2) ? 0x80 : (n == 3) ? 0x800 : 0x1_0000;
            if (V < smallest)
                throw new URIException("Overlong UTF-8 escape sequence");
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");
            C = cast(dchar) V;
        }

        if (C < uri_flags.length && (uri_flags[C] & reservedSet) != 0)
        {
            // Reserved character: keep the escape text uri[start .. k + 1].
            immutable width = (k + 1) - start;
            foreach (ii; 0 .. width)
                R[Rlen + ii] = uri[start + ii];
            Rlen += width;
        }
        else
        {
            R[Rlen] = C;
            Rlen++;
        }
    }
    assert(Rlen <= R.length);  // enforce our preallocation size guarantee

    return R[0 .. Rlen].idup;
}
292
@safe pure unittest
{
    import std.exception : assertThrown;

    assert(URI_Decode("", 0) == "");

    // An escape that resolves to a character of the reserved set is kept.
    assert(URI_Decode("%23", URI_Hash) == "%23");

    // Truncated escapes, non-hex digits and broken UTF-8 sequences.
    static immutable string[] malformed = [
        "%",
        "%xx",
        "%FF",
        "%C0",
        "%C0000000",
        "%C0%xx0000",
        "%C0%C00000",
        "%F7%BF%BF%BF",
    ];
    foreach (input; malformed)
        assertThrown!URIException(URI_Decode(input, 0));
}
308
/*************************************
 * Decodes the escape sequences of a complete URI and returns the result as
 * a UTF-8 string.
 *
 * Escape sequences that stand for a reserved URI character or for '#' are
 * left in place; all others are replaced by the character they encode.
 *
 * Params:
 *      encodedURI = the URI text to decode
 *
 * Returns:
 *      The decoded URI.
 */
string decode(Char)(scope const(Char)[] encodedURI)
if (isSomeChar!Char)
{
    import std.utf : encode;

    char[] utf8;
    foreach (dchar codePoint; URI_Decode(encodedURI, URI_Reserved | URI_Hash))
        encode(utf8, codePoint);
    return utf8;
}
324
///
@safe unittest
{
    assert("foo%20bar".decode == "foo bar");
    // %E2%84%A2 is the UTF-8 encoding of U+2122, the trade mark sign.
    assert("%3C%3E.@.%E2%84%A2".decode == "<>.@.™");
    assert("foo&/".decode == "foo&/");
    assert("!@#$&*(".decode == "!@#$&*(");
}
333
/*******************************
 * Decodes the escape sequences of a URI component and returns the result as
 * a UTF-8 string. Unlike $(LREF decode), every escape sequence is replaced,
 * including those that stand for reserved characters.
 *
 * Params:
 *      encodedURIComponent = the component text to decode
 *
 * Returns:
 *      The decoded component.
 */
string decodeComponent(Char)(scope const(Char)[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.utf : encode;

    char[] utf8;
    foreach (dchar codePoint; URI_Decode(encodedURIComponent, 0))
        encode(utf8, codePoint);
    return utf8;
}
348
///
@safe unittest
{
    assert("foo%2F%26".decodeComponent == "foo/&");
    // %C3%A4 and %C3%B6 are the UTF-8 encodings of 'ä' and 'ö'.
    assert("dl%C3%A4ng%20r%C3%B6cks".decodeComponent == "dläng röcks");
    assert("!%40%23%24%25%5E%26*(".decodeComponent == "!@#$%^&*(");
}
356
/*****************************
 * Encodes a complete URI. Letters, digits, the mark characters
 * $(B -_.!~*'()), the reserved characters $(B ;/?:@&=+$,) and '#' are kept;
 * every other character is replaced by the escape sequences of its UTF-8
 * octets.
 *
 * Params:
 *      uri = the text to encode
 *
 * Returns:
 *      The encoded URI.
 */
string encode(Char)(scope const(Char)[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Character classes that pass through unescaped.
    enum uint keep = URI_Alpha | URI_Digit | URI_Mark | URI_Reserved | URI_Hash;
    return URI_Encode(toUTF32(uri), keep);
}
368
///
@safe unittest
{
    assert("foo bar".encode == "foo%20bar");
    // U+2122 (trade mark sign) is escaped as its three UTF-8 octets.
    assert("<>.@.™".encode == "%3C%3E.@.%E2%84%A2");
    assert("foo/#?a=1&b=2".encode == "foo/#?a=1&b=2");
    assert("dlang+rocks!".encode == "dlang+rocks!");
    assert("!@#$%^&*(".encode == "!@#$%25%5E&*(");
}
378
/********************************
 * Encodes a URI component. Only letters, digits and the mark characters
 * $(B -_.!~*'()) are kept; every other character, including the reserved
 * ones, is replaced by the escape sequences of its UTF-8 octets.
 *
 * Params:
 *      uriComponent = the text to encode
 *
 * Returns:
 *      The encoded component.
 */
string encodeComponent(Char)(scope const(Char)[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Character classes that pass through unescaped.
    enum uint keep = URI_Alpha | URI_Digit | URI_Mark;
    return URI_Encode(toUTF32(uriComponent), keep);
}
390
///
@safe unittest
{
    assert("!@#$%^&*(".encodeComponent == "!%40%23%24%25%5E%26*(");
    // Non-ASCII characters are escaped as their UTF-8 octets.
    assert("<>.@.™".encodeComponent == "%3C%3E.%40.%E2%84%A2");
    assert("foo/&".encodeComponent == "foo%2F%26");
    assert("dläng röcks".encodeComponent == "dl%C3%A4ng%20r%C3%B6cks");
    assert("dlang+rocks!".encodeComponent == "dlang%2Brocks!");
}
400
/* Builds a "name=value&name=value" string from an associative array.
 * Names and values are each passed through encodeComponent.
 *
 * Params:
 *      values = the name/value pairs to encode
 *
 * Returns:
 *      The encoded pairs joined with '&', or the empty string when `values`
 *      has no entries.
 */
package string urlEncode(scope string[string] values) @safe pure
{
    import std.array : Appender;

    if (values.length == 0)
        return "";

    Appender!string enc;
    enc.reserve(values.length * 128);

    size_t emitted = 0;
    foreach (name, value; values)
    {
        // Separator goes in front of every pair except the first.
        if (emitted != 0)
            enc.put('&');
        enc.put(encodeComponent(name));
        enc.put('=');
        enc.put(encodeComponent(value));
        emitted++;
    }
    return enc.data;
}
430
@safe pure unittest
{
    // No entries: empty result.
    string[string] none;
    assert(urlEncode(none) == "");

    // A single pair has no separator.
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");

    // Two pairs are joined with '&'; either ordering of the pairs is accepted.
    immutable pair = urlEncode(["name1" : "value1", "name2" : "value2"]);
    assert(pair == "name1=value1&name2=value2" || pair == "name2=value2&name1=value1");
}
441
/***************************
 * Does string s[] start with a URL?
 *
 * A URL is recognised when s[] begins, ignoring case, with "http://" or
 * "https://", is followed by at least one more character, and the run of
 * URL characters after the prefix contains at least one '.'.
 *
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 */
ptrdiff_t uriLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    if (s.length <= 4)
        return -1;

    // Skip the scheme prefix.
    ptrdiff_t pos;
    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
        pos = 7;
    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
        pos = 8;
    else
        return -1;

    // Advance over everything that may belong to the URL.
    bool sawDot = false;
    scan: for (; pos < s.length; pos++)
    {
        immutable c = s[pos];
        switch (c)
        {
            case '.':
                sawDot = true;
                break;

            case '-', '_', '?', '=', '%', '&', '/', '+', '#', '~', '$':
                break;

            default:
                if (!isAlphaNum(c))
                    break scan;     // first character that is not part of the URL
                break;
        }
    }

    return sawDot ? pos : -1;
}
500
///
@safe pure unittest
{
    // The URL ends at the first character that cannot belong to it.
    immutable withTrailingText = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(withTrailingText.uriLength == 49);

    immutable plainText = "no uri here";
    assert(plainText.uriLength == -1);

    assert("issue 14924".uriLength < 0);
}
510
@safe pure nothrow @nogc unittest
{
    import std.meta : AliasSeq;

    // Empty input, and a prefix that is never followed by a '.'.
    static foreach (rejected; AliasSeq!("", "https://www"))
        assert(uriLength(rejected) == -1);
}
516
/***************************
 * Does string s[] start with an email address?
 *
 * The address must begin with a letter, continue with letters, digits,
 * '-', '_' or '.' up to an '@', and be followed by a domain made of letters,
 * digits, '-', '_' and '.' whose part after the last '.' is two or three
 * characters long.
 *
 * Returns:
 *  -1    it does not
 *  len   it does, and s[0 .. len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    if (s.length == 0 || !isAlpha(s[0]))
        return -1;

    // Part before the '@'; on exit `pos` is just past the '@'.
    ptrdiff_t pos = 1;
    while (true)
    {
        if (pos == s.length)
            return -1;          // ran out of input without seeing '@'
        immutable c = s[pos];
        pos++;
        if (c == '@')
            break;
        if (!(isAlphaNum(c) || c == '-' || c == '_' || c == '.'))
            return -1;
    }

    // Part after the '@'; remember where the last '.' was.
    ptrdiff_t dotAt = 0;
    for (; pos < s.length; pos++)
    {
        immutable c = s[pos];
        if (c == '.')
            dotAt = pos;
        else if (!(isAlphaNum(c) || c == '-' || c == '_'))
            break;
    }

    // Distance from the last '.' to the end, the '.' itself included.
    immutable tail = pos - dotAt;
    return (dotAt != 0 && (tail == 3 || tail == 4)) ? pos : -1;
}
575
///
@safe pure unittest
{
    // The address ends at the first character that cannot belong to it.
    immutable withTrailingText = "my.e-mail@www.example-domain.com with garbage added";
    assert(withTrailingText.emailLength == 32);

    immutable plainText = "no email address here";
    assert(plainText.emailLength == -1);

    assert("issue 14924".emailLength < 0);
}
585
@safe pure unittest
{
    debug(uri) writeln("uri.encodeURI.unittest");

    string plain   = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string escaped = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    // encode and decode map the two forms onto each other.
    auto encoded = encode(plain);
    debug(uri) writefln("result = '%s'", encoded);
    assert(encoded == escaped);

    auto decoded = decode(escaped);
    debug(uri) writefln("result = '%s'", decoded);
    assert(decoded == plain);

    // Multi-octet escapes survive a decode/encode round trip.
    enum threeOctetEscapes = "%E3%81%82%E3%81%82";
    assert(encode(decode(threeOctetEscapes)) == threeOctetEscapes);

    // '+' is not in the unescaped set of encodeComponent.
    assert(encodeComponent("c++") == "c%2B%2B");

    // A very long input that needs no escaping comes back unchanged.
    auto manyAs = new char[10_000_000];
    manyAs[] = 'A';
    foreach (char c; encodeComponent(manyAs))
        assert(c == 'A');

    auto letters = decode("%41%42%43");
    debug(uri) writeln(letters);

    // Every string type is accepted, and the argument is left untouched.
    import std.meta : AliasSeq;
    static foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {{
        import std.conv : to;

        StringType plainIn = plain.to!StringType;
        string encodedOut = encode(plainIn);
        assert(plainIn == plain.to!StringType); // check that `plainIn` wasn't changed
        assert(encodedOut == escaped);
        assert(plainIn == decode(encodedOut).to!StringType);

        StringType escapedIn = escaped.to!StringType;
        string decodedOut = decode(escapedIn);
        assert(escapedIn == escaped.to!StringType); // check that `escapedIn` wasn't changed
        assert(decodedOut == plain);
        assert(escapedIn == encode(decodedOut).to!StringType);
    }}
}
633
@safe pure nothrow @nogc unittest
{
    import std.meta : AliasSeq;

    // Inputs that are not a complete email address.
    static foreach (rejected; AliasSeq!(
            "",
            "@",
            "abcd",
            "blah@blub",
            "blah@blub.",
            "blah@blub.domain"))
    {
        assert(emailLength(rejected) == -1);
    }
}
643