1/* Licensed to the Apache Software Foundation (ASF) under one or more 2 * contributor license agreements. See the NOTICE file distributed with 3 * this work for additional information regarding copyright ownership. 4 * The ASF licenses this file to You under the Apache License, Version 2.0 5 * (the "License"); you may not use this file except in compliance with 6 * the License. You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "apr.h" 18#include "apr_private.h" 19#include "apr_errno.h" 20#include "apr_arch_utf8.h" 21 22/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646" 23 * with particular attention to canonical translation forms (see section 10 24 * "Security Considerations" of the RFC for more info). 25 * 26 * Since several architectures including Windows support unicode, with UCS2 27 * used as the actual storage conventions by that archicture, these functions 28 * exist to transform or validate UCS2 strings into APR's 'char' type 29 * convention. It is left up to the operating system to determine the 30 * validitity of the string, e.g. normative forms, in the context of 31 * its native language support. Other file systems which support filename 32 * characters of 0x80-0xff but have no explicit requirement for Unicode 33 * will find this function useful only for validating the character sequences 34 * and rejecting poorly encoded UTF8 sequences. 35 * 36 * Len UCS-4 range (hex) UTF-8 octet sequence (binary) 37 * 1:2 00000000-0000007F 0xxxxxxx 38 * 2:2 00000080-000007FF 110XXXXx 10xxxxxx 39 * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx 40 * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx 41 * 00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx 42 * 04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 43 * 44 * One of the X bits must be 1 to avoid overlong representation of ucs2 values. 45 * 46 * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF, 47 * and the final two forms are used only by full ucs4, per RFC 3629; 48 * 49 * "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in 50 * Unicode parlance), being actually UCS-4 characters transformed 51 * through UTF-16, need special treatment: the UTF-16 transformation 52 * must be undone, yielding a UCS-4 character that is then transformed 53 * as above." 54 * 55 * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask 56 * 57 * U' = U - 0x10000 58 * U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx 59 * W1 = 110110yy yyyyyyyy 60 * W2 = 110111xx xxxxxxxx 61 * Max U' = 0000 00001111 11111111 11111111 62 * Max U = 0000 00010000 11111111 11111111 63 * 64 * Len is the table above is a mapping of bytes used for utf8:ucs2 values, 65 * which results in these conclusions of maximum allocations; 66 * 67 * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2 68 * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2 69 */ 70 71APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in, 72 apr_size_t *inbytes, 73 apr_wchar_t *out, 74 apr_size_t *outwords) 75{ 76 apr_int64_t newch, mask; 77 apr_size_t expect, eating; 78 int ch; 79 80 while (*inbytes && *outwords) 81 { 82 ch = (unsigned char)(*in++); 83 if (!(ch & 0200)) { 84 /* US-ASCII-7 plain text 85 */ 86 --*inbytes; 87 --*outwords; 88 *(out++) = ch; 89 } 90 else 91 { 92 if ((ch & 0300) != 0300) { 93 /* Multibyte Continuation is out of place 94 */ 95 return APR_EINVAL; 96 } 97 else 98 { 99 /* Multibyte Sequence Lead Character 100 * 101 * Compute the expected bytes while adjusting 102 * or lead byte and leading zeros mask. 103 */ 104 mask = 0340; 105 expect = 1; 106 while ((ch & mask) == mask) { 107 mask |= mask >> 1; 108 if (++expect > 3) /* (truly 5 for ucs-4) */ 109 return APR_EINVAL; 110 } 111 newch = ch & ~mask; 112 eating = expect + 1; 113 if (*inbytes <= expect) 114 return APR_INCOMPLETE; 115 /* Reject values of excessive leading 0 bits 116 * utf-8 _demands_ the shortest possible byte length 117 */ 118 if (expect == 1) { 119 if (!(newch & 0036)) 120 return APR_EINVAL; 121 } 122 else { 123 /* Reject values of excessive leading 0 bits 124 */ 125 if (!newch && !((unsigned char)*in & 0077 & (mask << 1))) 126 return APR_EINVAL; 127 if (expect == 2) { 128 /* Reject values D800-DFFF when not utf16 encoded 129 * (may not be an appropriate restriction for ucs-4) 130 */ 131 if (newch == 0015 && ((unsigned char)*in & 0040)) 132 return APR_EINVAL; 133 } 134 else if (expect == 3) { 135 /* Short circuit values > 110000 136 */ 137 if (newch > 4) 138 return APR_EINVAL; 139 if (newch == 4 && ((unsigned char)*in & 0060)) 140 return APR_EINVAL; 141 } 142 } 143 /* Where the boolean (expect > 2) is true, we will need 144 * an extra word for the output. 145 */ 146 if (*outwords < (apr_size_t)(expect > 2) + 1) 147 break; /* buffer full */ 148 while (expect--) 149 { 150 /* Multibyte Continuation must be legal */ 151 if (((ch = (unsigned char)*(in++)) & 0300) != 0200) 152 return APR_EINVAL; 153 newch <<= 6; 154 newch |= (ch & 0077); 155 } 156 *inbytes -= eating; 157 /* newch is now a true ucs-4 character 158 * 159 * now we need to fold to ucs-2 160 */ 161 if (newch < 0x10000) 162 { 163 --*outwords; 164 *(out++) = (apr_wchar_t) newch; 165 } 166 else 167 { 168 *outwords -= 2; 169 newch -= 0x10000; 170 *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10)); 171 *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF)); 172 } 173 } 174 } 175 } 176 /* Buffer full 'errors' aren't errors, the client must inspect both 177 * the inbytes and outwords values 178 */ 179 return APR_SUCCESS; 180} 181 182APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in, 183 apr_size_t *inwords, 184 char *out, 185 apr_size_t *outbytes) 186{ 187 apr_int64_t newch, require; 188 apr_size_t need; 189 char *invout; 190 int ch; 191 192 while (*inwords && *outbytes) 193 { 194 ch = (unsigned short)(*in++); 195 if (ch < 0x80) 196 { 197 --*inwords; 198 --*outbytes; 199 *(out++) = (unsigned char) ch; 200 } 201 else 202 { 203 if ((ch & 0xFC00) == 0xDC00) { 204 /* Invalid Leading ucs-2 Multiword Continuation Character 205 */ 206 return APR_EINVAL; 207 } 208 if ((ch & 0xFC00) == 0xD800) { 209 /* Leading ucs-2 Multiword Character 210 */ 211 if (*inwords < 2) { 212 /* Missing ucs-2 Multiword Continuation Character 213 */ 214 return APR_INCOMPLETE; 215 } 216 if (((unsigned short)(*in) & 0xFC00) != 0xDC00) { 217 /* Invalid ucs-2 Multiword Continuation Character 218 */ 219 return APR_EINVAL; 220 } 221 newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF); 222 newch += 0x10000; 223 } 224 else { 225 /* ucs-2 Single Word Character 226 */ 227 newch = ch; 228 } 229 /* Determine the absolute minimum utf-8 bytes required 230 */ 231 require = newch >> 11; 232 need = 1; 233 while (require) 234 require >>= 5, ++need; 235 if (need >= *outbytes) 236 break; /* Insufficient buffer */ 237 *inwords -= (need > 2) + 1; 238 *outbytes -= need + 1; 239 /* Compute the utf-8 characters in last to first order, 240 * calculating the lead character length bits along the way. 241 */ 242 ch = 0200; 243 out += need + 1; 244 invout = out; 245 while (need--) { 246 ch |= ch >> 1; 247 *(--invout) = (unsigned char)(0200 | (newch & 0077)); 248 newch >>= 6; 249 } 250 /* Compute the lead utf-8 character and move the dest offset 251 */ 252 *(--invout) = (unsigned char)(ch | newch); 253 } 254 } 255 /* Buffer full 'errors' aren't errors, the client must inspect both 256 * the inwords and outbytes values 257 */ 258 return APR_SUCCESS; 259} 260