src/std/uri.d

// Written in the D programming language.

/**
 * Encode and decode Uniform Resource Identifiers (URIs).
 * URIs are used in internet transfer protocols.
 * Valid URI characters consist of letters, digits,
 * and the characters $(B ;/?:@&amp;=+$,-_.!~*'())
 * Reserved URI characters are $(B ;/?:@&amp;=+$,)
 * Escape sequences consist of $(B %) followed by two hex digits.
 *
 * See_Also:
 *  $(LINK2 https://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
 *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
 * Copyright: Copyright The D Language Foundation 2000 - 2009.
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 * Source:    $(PHOBOSSRC std/uri.d)
 */
/*          Copyright The D Language Foundation 2000 - 2009.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE_1_0.txt or copy at
 *          http://www.boost.org/LICENSE_1_0.txt)
 */
module std.uri;

//debug=uri;        // uncomment to turn on debugging writefln's
debug(uri) import std.stdio;
import std.traits : isSomeChar;

/** This Exception is thrown if something goes wrong when encoding or
decoding a URI.
*/
class URIException : Exception
{
    import std.exception : basicExceptionCtors;
    mixin basicExceptionCtors;
}

///
@safe unittest
{
    import std.exception : assertThrown;
    assertThrown!URIException("%ab".decode);
}

private enum
{
    URI_Alpha = 1,
    URI_Reserved = 2,
    URI_Mark = 4,
    URI_Digit = 8,
    URI_Hash = 0x10,        // '#'
}

private immutable char[16] hex2ascii = "0123456789ABCDEF";

private immutable ubyte[128] uri_flags =      // indexed by character
    ({
        ubyte[128] uflags;

        // Compile time initialize
        uflags['#'] |= URI_Hash;

        foreach (c; 'A' .. 'Z' + 1)
        {
            uflags[c] |= URI_Alpha;
            uflags[c + 0x20] |= URI_Alpha;   // lowercase letters
        }
        foreach (c; '0' .. '9' + 1) uflags[c] |= URI_Digit;
        foreach (c; ";/?:@&=+$,")   uflags[c] |= URI_Reserved;
        foreach (c; "-_.!~*'()")    uflags[c] |= URI_Mark;
        return uflags;
    })();

private string URI_Encode(dstring str, uint unescapedSet) @safe pure
{
    uint j;
    uint k;
    dchar V;
    dchar C;

    // result buffer
    char[50] buffer = void;
    char[] R;
    uint Rlen;
    uint Rsize; // alloc'd size

    immutable len = str.length;

    R = buffer[];
    Rsize = buffer.length;
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        C = str[k];
        // if (C in unescapedSet)
        if (C < uri_flags.length && uri_flags[C] & unescapedSet)
        {
            if (Rlen == Rsize)
            {
                char[] R2;

                Rsize *= 2;
                R2 = new char[Rsize];
                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }
            R[Rlen] = cast(char) C;
            Rlen++;
        }
        else
        {
            char[6] Octet;
            uint L;

            V = C;

            // Transform V into octets
            if (V <= 0x7F)
            {
                Octet[0] = cast(char) V;
                L = 1;
            }
            else if (V <= 0x7FF)
            {
                Octet[0] = cast(char)(0xC0 | (V >> 6));
                Octet[1] = cast(char)(0x80 | (V & 0x3F));
                L = 2;
            }
            else if (V <= 0xFFFF)
            {
                Octet[0] = cast(char)(0xE0 | (V >> 12));
                Octet[1] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[2] = cast(char)(0x80 | (V & 0x3F));
                L = 3;
            }
            else if (V <= 0x1FFFFF)
            {
                Octet[0] = cast(char)(0xF0 | (V >> 18));
                Octet[1] = cast(char)(0x80 | ((V >> 12) & 0x3F));
                Octet[2] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[3] = cast(char)(0x80 | (V & 0x3F));
                L = 4;
            }
            else
            {
                throw new URIException("Undefined UTF-32 code point");
            }

            if (Rlen + L * 3 > Rsize)
            {
                char[] R2;

                Rsize = 2 * (Rlen + L * 3);
                R2 = new char[Rsize];
                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }

            for (j = 0; j < L; j++)
            {
                R[Rlen] = '%';
                R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
                R[Rlen + 2] = hex2ascii[Octet[j] & 15];

                Rlen += 3;
            }
        }
    }

    return R[0 .. Rlen].idup;
}

@safe pure unittest
{
    import std.exception : assertThrown;

    assert(URI_Encode("", 0) == "");
    assert(URI_Encode(URI_Decode("%F0%BF%BF%BF", 0), 0) == "%F0%BF%BF%BF");
    dstring a;
    a ~= cast(dchar) 0xFFFFFFFF;
    assertThrown(URI_Encode(a, 0));
    assert(URI_Encode("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0).length == 3 * 60);
}

private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    return (c <= '9') ? c - '0' :
        (c <= 'F') ? c - 'A' + 10 :
        c - 'a' + 10;
}

private dstring URI_Decode(Char)(scope const(Char)[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;

    uint j;
    uint k;
    uint V;
    dchar C;

    uint Rlen;
    immutable len = uri.length;
    auto s = uri;

    auto Rsize = len;
    dchar[] R = new dchar[Rsize];
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        char B;
        uint start;

        C = s[k];
        if (C != '%')
        {
            R[Rlen] = C;
            Rlen++;
            continue;
        }
        start = k;
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
        k += 2;
        if ((B & 0x80) == 0)
        {
            C = B;
        }
        else
        {
            uint n;

            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = B & ((1 << (7 - n)) - 1);   // (!!!)

            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            for (j = 1; j != n; j++)
            {
                k++;
                if (s[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");
            C = V;
        }
        if (C < uri_flags.length && uri_flags[C] & reservedSet)
        {
            // R ~= s[start .. k + 1];
            immutable width = (k + 1) - start;
            for (int ii = 0; ii < width; ii++)
                R[Rlen + ii] = s[start + ii];
            Rlen += width;
        }
        else
        {
            R[Rlen] = C;
            Rlen++;
        }
    }
    assert(Rlen <= Rsize);  // enforce our preallocation size guarantee

    // Copy array on stack to array in memory
    return R[0 .. Rlen].idup;
}

@safe pure unittest
{
    import std.exception : assertThrown;

    assert(URI_Decode("", 0) == "");
    assertThrown!URIException(URI_Decode("%", 0));
    assertThrown!URIException(URI_Decode("%xx", 0));
    assertThrown!URIException(URI_Decode("%FF", 0));
    assertThrown!URIException(URI_Decode("%C0", 0));
    assertThrown!URIException(URI_Decode("%C0000000", 0));
    assertThrown!URIException(URI_Decode("%C0%xx0000", 0));
    assertThrown!URIException(URI_Decode("%C0%C00000", 0));
    assertThrown!URIException(URI_Decode("%F7%BF%BF%BF", 0));
    assert(URI_Decode("%23", URI_Hash) == "%23");
}

/*************************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it.
 * Escape sequences that resolve to reserved URI characters are not replaced.
 * Escape sequences that resolve to the '#' character are not replaced.
 */
string decode(Char)(scope const(Char)[] encodedURI)
if (isSomeChar!Char)
{
    import std.algorithm.iteration : each;
    import std.utf : encode;
    auto s = URI_Decode(encodedURI, URI_Reserved | URI_Hash);
    char[] r;
    s.each!(c => encode(r, c));
    return r;
}

///
@safe unittest
{
    assert("foo%20bar".decode == "foo bar");
    assert("%3C%3E.@.%E2%84%A2".decode == "<>.@.���");
    assert("foo&/".decode == "foo&/");
    assert("!@#$&*(".decode == "!@#$&*(");
}

/*******************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it. All
 * escape sequences are decoded.
 */
string decodeComponent(Char)(scope const(Char)[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.algorithm.iteration : each;
    import std.utf : encode;
    auto s = URI_Decode(encodedURIComponent, 0);
    char[] r;
    s.each!(c => encode(r, c));
    return r;
}

///
@safe unittest
{
    assert("foo%2F%26".decodeComponent == "foo/&");
    assert("dl%C3%A4ng%20r%C3%B6cks".decodeComponent == "dl��ng r��cks");
    assert("!%40%23%24%25%5E%26*(".decodeComponent == "!@#$%^&*(");
}

/*****************************
 * Encodes the UTF-8 string uri into a URI and returns that URI. Any character
 * not a valid URI character is escaped. The '#' character is not escaped.
 */
string encode(Char)(scope const(Char)[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;
    auto s = toUTF32(uri);
    return URI_Encode(s, URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark);
}

///
@safe unittest
{
    assert("foo bar".encode == "foo%20bar");
    assert("<>.@.���".encode == "%3C%3E.@.%E2%84%A2");
    assert("foo/#?a=1&b=2".encode == "foo/#?a=1&b=2");
    assert("dlang+rocks!".encode == "dlang+rocks!");
    assert("!@#$%^&*(".encode == "!@#$%25%5E&*(");
}

/********************************
 * Encodes the UTF-8 string uriComponent into a URI and returns that URI.
 * Any character not a letter, digit, or one of -_.!~*'() is escaped.
 */
string encodeComponent(Char)(scope const(Char)[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;
    auto s = toUTF32(uriComponent);
    return URI_Encode(s, URI_Alpha | URI_Digit | URI_Mark);
}

///
@safe unittest
{
    assert("!@#$%^&*(".encodeComponent == "!%40%23%24%25%5E%26*(");
    assert("<>.@.���".encodeComponent == "%3C%3E.%40.%E2%84%A2");
    assert("foo/&".encodeComponent == "foo%2F%26");
    assert("dl��ng r��cks".encodeComponent == "dl%C3%A4ng%20r%C3%B6cks");
    assert("dlang+rocks!".encodeComponent == "dlang%2Brocks!");
}

/* Encode associative array using www-form-urlencoding
 *
 * Params:
 *      values = an associative array containing the values to be encoded.
 *
 * Returns:
 *      A string encoded using www-form-urlencoding.
 */
package string urlEncode(scope string[string] values) @safe pure
{
    if (values.length == 0)
        return "";

    import std.array : Appender;
    import std.format.write : formattedWrite;

    Appender!string enc;
    enc.reserve(values.length * 128);

    bool first = true;
    foreach (k, v; values)
    {
        if (!first)
            enc.put('&');
        formattedWrite(enc, "%s=%s", encodeComponent(k), encodeComponent(v));
        first = false;
    }
    return enc.data;
}

@safe pure unittest
{
    // @system because urlEncode -> encodeComponent -> URI_Encode
    // URI_Encode uses alloca and pointer slicing
    string[string] a;
    assert(urlEncode(a) == "");
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");
    auto enc = urlEncode(["name1" : "value1", "name2" : "value2"]);
    assert(enc == "name1=value1&name2=value2" || enc == "name2=value2&name1=value1");
}

/***************************
 * Does string s[] start with a URL?
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 */

ptrdiff_t uriLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    /* Must start with one of:
     *  http://
     *  https://
     *  www.
     */
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    ptrdiff_t i;

    if (s.length <= 4)
        return -1;

    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
    {
        i = 7;
    }
    else
    {
        if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
            i = 8;
        else
            return -1;
    }

    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '?' ||
                c == '=' || c == '%' || c == '&' ||
                c == '/' || c == '+' || c == '#' ||
                c == '~' || c == '$')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    if (!lastdot)
        return -1;

    return i;
}

///
@safe pure unittest
{
    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(uriLength(s1) == 49);
    string s2 = "no uri here";
    assert(uriLength(s2) == -1);
    assert(uriLength("issue 14924") < 0);
}

@safe pure nothrow @nogc unittest
{
    assert(uriLength("") == -1);
    assert(uriLength("https://www") == -1);
}

/***************************
 * Does string s[] start with an email address?
 * Returns:
 *  -1    it does not
 *  len   it does, and s[0 .. i] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    ptrdiff_t i;

    if (s.length == 0)
        return -1;

    if (!isAlpha(s[0]))
        return -1;

    for (i = 1; 1; i++)
    {
        if (i == s.length)
            return -1;
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return -1;
        i++;
        break;
    }

    /* Now do the part past the '@'
     */
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return -1;

    return i;
}

///
@safe pure unittest
{
    string s1 = "my.e-mail@www.example-domain.com with garbage added";
    assert(emailLength(s1) == 32);
    string s2 = "no email address here";
    assert(emailLength(s2) == -1);
    assert(emailLength("issue 14924") < 0);
}

@safe pure unittest
{
    //@system because of encode -> URI_Encode
    debug(uri) writeln("uri.encodeURI.unittest");

    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    auto result = encode(source);
    debug(uri) writefln("result = '%s'", result);
    assert(result == target);
    result = decode(target);
    debug(uri) writefln("result = '%s'", result);
    assert(result == source);

    result = encode(decode("%E3%81%82%E3%81%82"));
    assert(result == "%E3%81%82%E3%81%82");

    result = encodeComponent("c++");
    assert(result == "c%2B%2B");

    auto str = new char[10_000_000];
    str[] = 'A';
    result = encodeComponent(str);
    foreach (char c; result)
        assert(c == 'A');

    result = decode("%41%42%43");
    debug(uri) writeln(result);

    import std.meta : AliasSeq;
    static foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {{
        import std.conv : to;
        StringType decoded1 = source.to!StringType;
        string encoded1 = encode(decoded1);
        assert(decoded1 == source.to!StringType); // check that `decoded1` wasn't changed
        assert(encoded1 == target);
        assert(decoded1 == decode(encoded1).to!StringType);

        StringType encoded2 = target.to!StringType;
        string decoded2 = decode(encoded2);
        assert(encoded2 == target.to!StringType); // check that `encoded2` wasn't changed
        assert(decoded2 == source);
        assert(encoded2 == encode(decoded2).to!StringType);
    }}
}

@safe pure nothrow @nogc unittest
{
    assert(emailLength("") == -1);
    assert(emailLength("@") == -1);
    assert(emailLength("abcd") == -1);
    assert(emailLength("blah@blub") == -1);
    assert(emailLength("blah@blub.") == -1);
    assert(emailLength("blah@blub.domain") == -1);
}