uri.d revision 1.1.1.1
1// Written in the D programming language.
2
3/**
4 * Encode and decode Uniform Resource Identifiers (URIs).
5 * URIs are used in internet transfer protocols.
6 * Valid URI characters consist of letters, digits,
7 * and the characters $(B ;/?:@&=+$,-_.!~*'())
8 * Reserved URI characters are $(B ;/?:@&=+$,)
9 * Escape sequences consist of $(B %) followed by two hex digits.
10 *
11 * See_Also:
12 *  $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
13 *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
14 * Copyright: Copyright Digital Mars 2000 - 2009.
15 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
16 * Authors:   $(HTTP digitalmars.com, Walter Bright)
17 * Source:    $(PHOBOSSRC std/_uri.d)
18 */
19/*          Copyright Digital Mars 2000 - 2009.
20 * Distributed under the Boost Software License, Version 1.0.
21 *    (See accompanying file LICENSE_1_0.txt or copy at
22 *          http://www.boost.org/LICENSE_1_0.txt)
23 */
24module std.uri;
25
26//debug=uri;        // uncomment to turn on debugging writefln's
27debug(uri) import std.stdio;
28import std.traits : isSomeChar;
29
/**
 * The exception type thrown by this module when a URI cannot be
 * encoded or decoded.
 *
 * It offers the usual exception constructors (message, optional
 * chained `Throwable`, file and line), generated by
 * `std.exception.basicExceptionCtors`.
 */
class URIException : Exception
{
    import std.exception : basicExceptionCtors;

    // Generates the conventional set of exception constructors.
    mixin basicExceptionCtors;
}
38
// Character-class bits stored in `uri_flags`. Several may be OR-ed together
// to form a set of classes.
private enum
{
    URI_Alpha    = 1 << 0,  // ASCII letters
    URI_Reserved = 1 << 1,  // ;/?:@&=+$,
    URI_Mark     = 1 << 2,  // -_.!~*'()
    URI_Digit    = 1 << 3,  // ASCII digits
    URI_Hash     = 1 << 4,  // '#'
}

// Maps a nibble (0 .. 15) to its upper-case hexadecimal digit.
private immutable char[16] hex2ascii = "0123456789ABCDEF";

// Character classes of every ASCII character, indexed by the character's
// code. The table is computed once, at compile time.
private immutable ubyte[128] uri_flags =
    () {
        ubyte[128] table;

        foreach (offset; 0 .. 26)
        {
            table['A' + offset] |= URI_Alpha;
            table['a' + offset] |= URI_Alpha;
        }
        foreach (offset; 0 .. 10)
            table['0' + offset] |= URI_Digit;

        foreach (char reserved; ";/?:@&=+$,")
            table[reserved] |= URI_Reserved;
        foreach (char mark; "-_.!~*'()")
            table[mark] |= URI_Mark;

        table['#'] |= URI_Hash;
        return table;
    }();
67
/*
 * Percent-encodes a UTF-32 string for use in a URI.
 *
 * Params:
 *      str          = the text to encode, one code point per element
 *      unescapedSet = bitwise OR of URI_* flags; an ASCII character whose
 *                     `uri_flags` entry has any of these bits set is copied
 *                     to the result unchanged
 *
 * Returns:
 *      A newly allocated string in which every other code point has been
 *      replaced by its UTF-8 octets, each written as '%' followed by two
 *      upper-case hexadecimal digits.
 *
 * Throws:
 *      URIException if `str` contains a value above 0x10FFFF, which is not
 *      a Unicode code point.
 */
private string URI_Encode(dstring str, uint unescapedSet)
{
    import std.array : Appender;

    Appender!string result;
    // The output is never shorter than the input; escapes make it grow.
    result.reserve(str.length);

    foreach (dchar c; str)
    {
        if (c < uri_flags.length && (uri_flags[c] & unescapedSet))
        {
            // Only ASCII can reach this branch, so the narrowing is lossless.
            result.put(cast(char) c);
            continue;
        }

        // Transform the code point into its UTF-8 octets.
        char[4] octets = void;
        size_t count;

        if (c <= 0x7F)
        {
            octets[0] = cast(char) c;
            count = 1;
        }
        else if (c <= 0x7FF)
        {
            octets[0] = cast(char)(0xC0 | (c >> 6));
            octets[1] = cast(char)(0x80 | (c & 0x3F));
            count = 2;
        }
        else if (c <= 0xFFFF)
        {
            octets[0] = cast(char)(0xE0 | (c >> 12));
            octets[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            octets[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else if (c <= 0x10FFFF)
        {
            octets[0] = cast(char)(0xF0 | (c >> 18));
            octets[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            octets[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            octets[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
        else
        {
            throw new URIException("Undefined UTF-32 code point");
        }

        // Emit each octet as %XX.
        foreach (char octet; octets[0 .. count])
        {
            result.put('%');
            result.put(hex2ascii[octet >> 4]);
            result.put(hex2ascii[octet & 15]);
        }
    }

    return result.data;
}
188
// Converts a hexadecimal digit character to its numeric value (0 .. 15).
// The argument is not validated: callers are expected to pass only
// characters that are hexadecimal digits ('0'-'9', 'A'-'F' or 'a'-'f').
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    if (c <= '9')
        return c - '0';
    if (c <= 'F')
        return c - 'A' + 10;
    return c - 'a' + 10;
}
195
/*
 * Decodes the percent-escapes of a URI and returns the text as UTF-32.
 *
 * Characters that are not part of an escape are decoded as whole code
 * points (so multi-unit UTF-8 / UTF-16 input is preserved) and copied to
 * the result. A run of %XX escapes is interpreted as the UTF-8 encoding of
 * one code point.
 *
 * Params:
 *      uri         = the encoded text
 *      reservedSet = bitwise OR of URI_* flags; an escape that decodes to an
 *                    ASCII character whose `uri_flags` entry has any of
 *                    these bits set is copied verbatim instead of decoded
 *
 * Returns:
 *      The decoded text as a newly allocated dstring.
 *
 * Throws:
 *      URIException if an escape is truncated, is not followed by two
 *      hexadecimal digits, or does not form a well-formed UTF-8 sequence
 *      (including overlong forms and encoded surrogates), or if the
 *      unescaped part of `uri` is not valid UTF.
 */
private dstring URI_Decode(Char)(in Char[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;
    // Renamed so it cannot be confused with this module's own `decode`.
    import std.utf : UTFException, decodeCodePoint = decode;

    immutable len = uri.length;

    // Every element written below consumes at least one input code unit (a
    // literal code point, a decoded escape, or an escape copied verbatim),
    // so `len` elements are always enough.
    auto result = new dchar[len];
    size_t used;

    // Value of the two hex digits after the '%' at index `at`.
    // The caller guarantees that at + 2 < len.
    uint hexOctet(size_t at)
    {
        if (!isHexDigit(uri[at + 1]) || !isHexDigit(uri[at + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        return (ascii2hex(uri[at + 1]) << 4) + ascii2hex(uri[at + 2]);
    }

    // Smallest code point that needs an n-octet UTF-8 sequence; anything
    // below it is an overlong encoding.
    static immutable uint[5] smallestFor = [0, 0, 0x80, 0x800, 0x1_0000];

    size_t k = 0;
    while (k < len)
    {
        if (uri[k] != '%')
        {
            // Literal text: take one whole code point, not one code unit.
            dchar literal;
            try
            {
                literal = decodeCodePoint(uri, k); // advances k
            }
            catch (UTFException e)
            {
                throw new URIException("Invalid UTF sequence in URI", e);
            }
            result[used++] = literal;
            continue;
        }

        immutable start = k;
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        immutable uint lead = hexOctet(k);
        k += 2; // k is now on the last hex digit of the escape

        dchar decoded;
        if ((lead & 0x80) == 0)
        {
            decoded = cast(dchar) lead;
        }
        else
        {
            // The number of leading 1 bits is the length of the sequence.
            uint n;
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((lead << n) & 0x80) == 0)
                {
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off the (7 - n) significant bits of the lead octet.
            uint value = lead & ((1 << (7 - n)) - 1);

            // All n - 1 continuation escapes must fit in the input.
            if (k + 3 * (n - 1) >= len)
                throw new URIException("UTF-32 unaligned String");
            foreach (j; 1 .. n)
            {
                k++;
                if (uri[k] != '%')
                    throw new URIException("Expected: '%'");
                immutable uint continuation = hexOctet(k);
                if ((continuation & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                value = (value << 6) | (continuation & 0x3F);
            }
            if (value > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");
            // RFC 3629: overlong forms and surrogates are not valid UTF-8.
            if (value < smallestFor[n])
                throw new URIException("Overlong UTF-8 sequence");
            if (value >= 0xD800 && value <= 0xDFFF)
                throw new URIException("Surrogate code point in UTF-8 sequence");
            decoded = cast(dchar) value;
        }

        if (decoded < uri_flags.length && (uri_flags[decoded] & reservedSet))
        {
            // Keep the escape exactly as written: result ~= uri[start .. k + 1]
            foreach (i; start .. k + 1)
                result[used++] = uri[i];
        }
        else
        {
            result[used++] = decoded;
        }
        k++; // step past the last hex digit
    }
    assert(used <= len); // enforce our preallocation size guarantee

    return result[0 .. used].idup;
}
309
/**
 * Decodes the escape sequences of a complete URI.
 *
 * Escape sequences that stand for a reserved URI character or for '#' are
 * left in place; all other escape sequences are replaced by the character
 * they represent.
 *
 * Params:
 *      encodedURI = the URI to decode
 *
 * Returns:
 *      The decoded URI as a UTF-8 string.
 *
 * Throws:
 *      $(LREF URIException) if `encodedURI` contains a malformed escape
 *      sequence.
 */
string decode(Char)(in Char[] encodedURI)
if (isSomeChar!Char)
{
    import std.utf : encode;

    // Escapes for these character classes must survive decoding.
    enum keepEscaped = URI_Reserved | URI_Hash;

    char[] utf8;
    foreach (dchar codePoint; URI_Decode(encodedURI, keepEscaped))
        encode(utf8, codePoint);
    return utf8;
}
326
/**
 * Decodes the escape sequences of a single URI component.
 *
 * Unlike $(LREF decode), every escape sequence is replaced by the
 * character it represents, including reserved characters and '#'.
 *
 * Params:
 *      encodedURIComponent = the URI component to decode
 *
 * Returns:
 *      The decoded component as a UTF-8 string.
 *
 * Throws:
 *      $(LREF URIException) if `encodedURIComponent` contains a malformed
 *      escape sequence.
 */
string decodeComponent(Char)(in Char[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.utf : encode;

    char[] utf8;
    // An empty reserved set means that no escape is kept as written.
    foreach (dchar codePoint; URI_Decode(encodedURIComponent, 0))
        encode(utf8, codePoint);
    return utf8;
}
342
/**
 * Encodes a string as a complete URI.
 *
 * Letters, digits, the marks $(B -_.!~*'()), the reserved characters
 * $(B ;/?:@&=+$,) and '#' are kept as they are; every other character is
 * replaced by escape sequences.
 *
 * Params:
 *      uri = the text to encode
 *
 * Returns:
 *      The encoded URI.
 */
string encode(Char)(in Char[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Character classes that are copied to the result without escaping.
    enum passThrough = URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark;

    return URI_Encode(toUTF32(uri), passThrough);
}
355
/**
 * Encodes a string as a single URI component.
 *
 * Only letters, digits and the marks $(B -_.!~*'()) are kept as they are;
 * every other character, including the reserved URI characters and '#',
 * is replaced by escape sequences.
 *
 * Params:
 *      uriComponent = the text to encode
 *
 * Returns:
 *      The encoded URI component.
 */
string encodeComponent(Char)(in Char[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Character classes that are copied to the result without escaping.
    enum passThrough = URI_Alpha | URI_Digit | URI_Mark;

    return URI_Encode(toUTF32(uriComponent), passThrough);
}
368
/* Builds a www-form-urlencoded string from an associative array.
 *
 * Each entry becomes `key=value`, with key and value passed through
 * encodeComponent; entries are joined with '&' in the iteration order of
 * the associative array.
 *
 * Params:
 *      values = the key/value pairs to encode
 *
 * Returns:
 *      The encoded string, or "" when `values` is empty.
 */
package string urlEncode(in string[string] values)
{
    import std.array : Appender;

    if (values.length == 0)
        return "";

    Appender!string encoded;
    // Rough up-front guess of the space needed per entry.
    encoded.reserve(values.length * 128);

    size_t written;
    foreach (key, value; values)
    {
        // Separator goes before every entry except the first one.
        if (written != 0)
            encoded.put('&');
        encoded.put(encodeComponent(key));
        encoded.put('=');
        encoded.put(encodeComponent(value));
        written++;
    }
    return encoded.data;
}
398
@system unittest
{
    // @system because urlEncode -> encodeComponent -> URI_Encode,
    // and URI_Encode is not @safe.

    // An empty associative array encodes to the empty string.
    string[string] nothing;
    assert(urlEncode(nothing) == "");

    // A single pair has no separator.
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");

    // The iteration order of an associative array is unspecified, so either
    // ordering of the two pairs is acceptable.
    auto two = urlEncode(["name1" : "value1", "name2" : "value2"]);
    assert(two == "name1=value1&name2=value2" ||
           two == "name2=value2&name1=value1");
}
409
/**
 * Checks whether a string starts with a URL.
 *
 * The string must begin with `http://` or `https://` (compared without
 * regard to case) followed by at least one more character. The URL then
 * extends over letters, digits and the characters $(B -_?=%&/+#~$.), and
 * must contain at least one '.' after the scheme.
 *
 * Params:
 *      s = the text to examine
 *
 * Returns:
 *      -1 if `s` does not start with a URL; otherwise the length `len` of
 *      the URL, so that `s[0 .. len]` is the slice of `s` holding it.
 */
ptrdiff_t uriLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    if (s.length <= 4)
        return -1;

    // Skip over the scheme; anything but http:// or https:// is rejected.
    ptrdiff_t pos;
    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
        pos = 7;
    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
        pos = 8;
    else
        return -1;

    // Index of the most recent '.'; stays 0 while none has been seen, which
    // is unambiguous because scanning starts after the scheme.
    ptrdiff_t dotAt;

    scan:
    for (; pos < s.length; pos++)
    {
        switch (s[pos])
        {
            case '.':
                dotAt = pos;
                break;

            case '-', '_', '?', '=', '%', '&', '/', '+', '#', '~', '$':
                break;

            default:
                // The first character that cannot be part of a URL ends it.
                if (!isAlphaNum(s[pos]))
                    break scan;
                break;
        }
    }

    return dotAt ? pos : -1;
}
468
///
@safe unittest
{
    // The URL stops at the first character that cannot belong to it
    // (here the space before "end!").
    string withUrl = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(uriLength(withUrl) == 49);

    // Text that does not begin with a URL yields a negative result.
    string withoutUrl = "no uri here";
    assert(uriLength(withoutUrl) == -1);
    assert(uriLength("issue 14924") < 0);
}
478
479
/**
 * Checks whether a string starts with an email address.
 *
 * The local part must begin with a letter and may continue with letters,
 * digits and the characters $(B -_.) up to the '@'. The domain part
 * consists of letters, digits and the characters $(B -_.), must contain a
 * '.', and its last '.' must be followed by exactly two or three
 * characters.
 *
 * Params:
 *      s = the text to examine; may be empty
 *
 * Returns:
 *      -1 if `s` does not start with an email address; otherwise the length
 *      `len` of the address, so that `s[0 .. len]` is the slice of `s`
 *      holding it.
 *
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    // An empty string holds no address; checking this first also keeps
    // s[0] below from indexing out of bounds.
    if (s.length == 0)
        return -1;

    if (!isAlpha(s[0]))
        return -1;

    ptrdiff_t i;

    // Scan the local part up to and including the '@'.
    for (i = 1; 1; i++)
    {
        if (i == s.length)
            return -1;      // ran out of input before finding '@'
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return -1;
        i++;
        break;
    }

    /* Now do the part past the '@'
     */
    ptrdiff_t lastdot;      // 0 means that no '.' has been seen
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }

    // i - lastdot counts the '.' itself plus what follows it, so 3 and 4
    // correspond to a final label of two or three characters.
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return -1;

    return i;
}
535
///
@safe unittest
{
    // The address stops at the first character that cannot belong to it
    // (here the space before "with").
    string withAddress = "my.e-mail@www.example-domain.com with garbage added";
    assert(emailLength(withAddress) == 32);

    // Text that does not begin with an address yields a negative result.
    string withoutAddress = "no email address here";
    assert(emailLength(withoutAddress) == -1);
    assert(emailLength("issue 14924") < 0);
}
545
546
@system unittest
{
    // @system because encode -> URI_Encode, and URI_Encode is not @safe.
    import std.conv : to;
    import std.meta : AliasSeq;

    debug(uri) writeln("uri.encodeURI.unittest");

    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    // encode and decode are inverses of each other on this URI.
    auto result = encode(source);
    debug(uri) writefln("result = '%s'", result);
    assert(result == target);

    result = decode(target);
    debug(uri) writefln("result = '%s'", result);
    assert(result == source);

    // Escaped multi-byte UTF-8 sequences survive a round trip.
    result = encode(decode("%E3%81%82%E3%81%82"));
    assert(result == "%E3%81%82%E3%81%82");

    // '+' is a reserved character, so it is escaped inside a component.
    result = encodeComponent("c++");
    assert(result == "c%2B%2B");

    // A very large input that needs no escaping.
    auto big = new char[10_000_000];
    big[] = 'A';
    result = encodeComponent(big);
    foreach (char c; result)
        assert(c == 'A');

    result = decode("%41%42%43");
    debug(uri) writeln(result);

    // The same round trips must work for every string type, and must leave
    // their arguments untouched.
    foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {
        StringType plain = source.to!StringType;
        string plainEncoded = encode(plain);
        assert(plain == source.to!StringType); // `plain` wasn't changed
        assert(plainEncoded == target);
        assert(plain == decode(plainEncoded).to!StringType);

        StringType escaped = target.to!StringType;
        string escapedDecoded = decode(escaped);
        assert(escaped == target.to!StringType); // `escaped` wasn't changed
        assert(escapedDecoded == source);
        assert(escaped == encode(escapedDecoded).to!StringType);
    }
}
594