1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements.  See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License.  You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "apr.h"
18#include "apr_private.h"
19#include "apr_errno.h"
20#include "apr_arch_utf8.h"
21
22/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
23 * with particular attention to canonical translation forms (see section 10
24 * "Security Considerations" of the RFC for more info).
25 *
26 * Since several architectures including Windows support unicode, with UCS2
 * used as the actual storage conventions by that architecture, these functions
28 * exist to transform or validate UCS2 strings into APR's 'char' type
29 * convention.  It is left up to the operating system to determine the
 * validity of the string, e.g. normative forms, in the context of
31 * its native language support.  Other file systems which support filename
32 * characters of 0x80-0xff but have no explicit requirement for Unicode
33 * will find this function useful only for validating the character sequences
34 * and rejecting poorly encoded UTF8 sequences.
35 *
36 * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
37 * 1:2 00000000-0000007F 0xxxxxxx
38 * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
39 * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
40 * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
41 *     00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
42 *     04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
43 *
44 * One of the X bits must be 1 to avoid overlong representation of ucs2 values.
45 *
46 * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
47 * and the final two forms are used only by full ucs4, per RFC 3629;
48 *
49 *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
50 *   Unicode parlance), being actually UCS-4 characters transformed
51 *   through UTF-16, need special treatment: the UTF-16 transformation
52 *   must be undone, yielding a UCS-4 character that is then transformed
53 *   as above."
54 *
55 * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
56 *
57 *  U' = U - 0x10000
58 *  U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
59 *                    W1 = 110110yy yyyyyyyy
60 *                    W2 = 110111xx xxxxxxxx
61 *  Max U' = 0000 00001111 11111111 11111111
62 *  Max U  = 0000 00010000 11111111 11111111
63 *
 * Len in the table above is a mapping of bytes used for utf8:ucs2 values,
65 * which results in these conclusions of maximum allocations;
66 *
67 *  apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
68 *  apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
69 */
70
71APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
72                                                apr_size_t *inbytes,
73                                                apr_wchar_t *out,
74                                                apr_size_t *outwords)
75{
76    apr_int64_t newch, mask;
77    apr_size_t expect, eating;
78    int ch;
79
80    while (*inbytes && *outwords)
81    {
82        ch = (unsigned char)(*in++);
83        if (!(ch & 0200)) {
84            /* US-ASCII-7 plain text
85             */
86            --*inbytes;
87            --*outwords;
88            *(out++) = ch;
89        }
90        else
91        {
92            if ((ch & 0300) != 0300) {
93                /* Multibyte Continuation is out of place
94                 */
95                return APR_EINVAL;
96            }
97            else
98            {
99                /* Multibyte Sequence Lead Character
100                 *
101                 * Compute the expected bytes while adjusting
102                 * or lead byte and leading zeros mask.
103                 */
104                mask = 0340;
105                expect = 1;
106                while ((ch & mask) == mask) {
107                    mask |= mask >> 1;
108                    if (++expect > 3) /* (truly 5 for ucs-4) */
109                        return APR_EINVAL;
110                }
111                newch = ch & ~mask;
112                eating = expect + 1;
113                if (*inbytes <= expect)
114                    return APR_INCOMPLETE;
115                /* Reject values of excessive leading 0 bits
116                 * utf-8 _demands_ the shortest possible byte length
117                 */
118                if (expect == 1) {
119                    if (!(newch & 0036))
120                        return APR_EINVAL;
121                }
122                else {
123                    /* Reject values of excessive leading 0 bits
124                     */
125                    if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
126                        return APR_EINVAL;
127                    if (expect == 2) {
128                        /* Reject values D800-DFFF when not utf16 encoded
129                         * (may not be an appropriate restriction for ucs-4)
130                         */
131                        if (newch == 0015 && ((unsigned char)*in & 0040))
132                            return APR_EINVAL;
133                    }
134                    else if (expect == 3) {
135                        /* Short circuit values > 110000
136                         */
137                        if (newch > 4)
138                            return APR_EINVAL;
139                        if (newch == 4 && ((unsigned char)*in & 0060))
140                            return APR_EINVAL;
141                    }
142                }
143                /* Where the boolean (expect > 2) is true, we will need
144                 * an extra word for the output.
145                 */
146                if (*outwords < (apr_size_t)(expect > 2) + 1)
147                    break; /* buffer full */
148                while (expect--)
149                {
150                    /* Multibyte Continuation must be legal */
151                    if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
152                        return APR_EINVAL;
153                    newch <<= 6;
154                    newch |= (ch & 0077);
155                }
156                *inbytes -= eating;
157                /* newch is now a true ucs-4 character
158                 *
159                 * now we need to fold to ucs-2
160                 */
161                if (newch < 0x10000)
162                {
163                    --*outwords;
164                    *(out++) = (apr_wchar_t) newch;
165                }
166                else
167                {
168                    *outwords -= 2;
169                    newch -= 0x10000;
170                    *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10));
171                    *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF));
172                }
173            }
174        }
175    }
176    /* Buffer full 'errors' aren't errors, the client must inspect both
177     * the inbytes and outwords values
178     */
179    return APR_SUCCESS;
180}
181
182APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in,
183                                                apr_size_t *inwords,
184                                                char *out,
185                                                apr_size_t *outbytes)
186{
187    apr_int64_t newch, require;
188    apr_size_t need;
189    char *invout;
190    int ch;
191
192    while (*inwords && *outbytes)
193    {
194        ch = (unsigned short)(*in++);
195        if (ch < 0x80)
196        {
197            --*inwords;
198            --*outbytes;
199            *(out++) = (unsigned char) ch;
200        }
201        else
202        {
203            if ((ch & 0xFC00) == 0xDC00) {
204                /* Invalid Leading ucs-2 Multiword Continuation Character
205                 */
206                return APR_EINVAL;
207            }
208            if ((ch & 0xFC00) == 0xD800) {
209                /* Leading ucs-2 Multiword Character
210                 */
211                if (*inwords < 2) {
212                    /* Missing ucs-2 Multiword Continuation Character
213                     */
214                    return APR_INCOMPLETE;
215                }
216                if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
217                    /* Invalid ucs-2 Multiword Continuation Character
218                     */
219                    return APR_EINVAL;
220                }
221                newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
222                newch += 0x10000;
223            }
224            else {
225                /* ucs-2 Single Word Character
226                 */
227                newch = ch;
228            }
229            /* Determine the absolute minimum utf-8 bytes required
230             */
231            require = newch >> 11;
232            need = 1;
233            while (require)
234                require >>= 5, ++need;
235            if (need >= *outbytes)
236                break; /* Insufficient buffer */
237            *inwords -= (need > 2) + 1;
238            *outbytes -= need + 1;
239            /* Compute the utf-8 characters in last to first order,
240             * calculating the lead character length bits along the way.
241             */
242            ch = 0200;
243            out += need + 1;
244            invout = out;
245            while (need--) {
246                ch |= ch >> 1;
247                *(--invout) = (unsigned char)(0200 | (newch & 0077));
248                newch >>= 6;
249            }
250            /* Compute the lead utf-8 character and move the dest offset
251             */
252            *(--invout) = (unsigned char)(ch | newch);
253        }
254    }
255    /* Buffer full 'errors' aren't errors, the client must inspect both
256     * the inwords and outbytes values
257     */
258    return APR_SUCCESS;
259}
260