1/*- 2 * Copyright (c) 2002-2004 Tim J. Robbins 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/param.h> 28__FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $"); 29 30#include "xlocale_private.h" 31 32#include <errno.h> 33#include <limits.h> 34#include <runetype.h> 35#include <stdlib.h> 36#include <string.h> 37#include <wchar.h> 38#include "mblocal.h" 39 40/* 41 * 10952550: detect ill-formed UTF-8 42 * Unicode 6.0, section D92, mandates specific byte sequences for well- 43 * formed UTF-8. UTF-8 sequences are now limited to 4 bytes, while the 44 * FreeBSD code originally handled up to 6. Illegal surrogate code point 45 * sequences are now detected. And while "non-shortest forms" were detected, 46 * this only happened after completing the sequence. Now, all ill-formed 47 * sequences are detected at the earliest point. 48 * 49 * Table 3-7. Well-Formed UTF-8 Byte Sequences 50 * 51 * Code Points 1st 2nd 3rd 4th Byte 52 * U+0000..U+007F 00..7F 53 * U+0080..U+07FF C2..DF 80..BF 54 * U+0800..U+0FFF E0 A0..BF 80..BF 55 * U+1000..U+CFFF E1..EC 80..BF 80..BF 56 * U+D000..U+D7FF ED 80..9F 80..BF 57 * U+E000..U+FFFF EE..EF 80..BF 80..BF 58 * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 59 * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 60 * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 61 * 62 * Note that while any 3rd and 4th byte can be in the range 80..BF, the 63 * second byte is often limited to a smaller range. 64 */ 65 66typedef struct { 67 unsigned char lowerbound; 68 unsigned char upperbound; 69} SecondByte; 70static SecondByte sb_00_00 = {0x00, 0x00}; 71static SecondByte sb_80_8F = {0x80, 0x8F}; 72static SecondByte sb_80_9F = {0x80, 0x9F}; 73static SecondByte sb_80_BF = {0x80, 0xBF}; 74static SecondByte sb_90_BF = {0x90, 0xBF}; 75static SecondByte sb_A0_BF = {0xA0, 0xBF}; 76 77#define UTF8_MB_CUR_MAX 4 78 79static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 80 size_t, mbstate_t * __restrict, locale_t); 81static int _UTF8_mbsinit(const mbstate_t *, locale_t); 82static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 83 const char ** __restrict, size_t, size_t, 84 mbstate_t * __restrict, locale_t); 85static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 86 mbstate_t * __restrict, locale_t); 87static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 88 size_t, size_t, mbstate_t * __restrict, locale_t); 89 90typedef struct { 91 wchar_t ch; 92 int want; 93 SecondByte sb; 94} _UTF8State; 95 96int 97_UTF8_init(struct __xlocale_st_runelocale *xrl) 98{ 99 100 xrl->__mbrtowc = _UTF8_mbrtowc; 101 xrl->__wcrtomb = _UTF8_wcrtomb; 102 xrl->__mbsinit = _UTF8_mbsinit; 103 xrl->__mbsnrtowcs = _UTF8_mbsnrtowcs; 104 xrl->__wcsnrtombs = _UTF8_wcsnrtombs; 105 xrl->__mb_cur_max = UTF8_MB_CUR_MAX; 106 /* 107 * UCS-4 encoding used as the internal representation, so 108 * slots 0x0080-0x00FF are occuped and must be excluded 109 * from the single byte ctype by setting the limit. 110 */ 111 xrl->__mb_sb_limit = 128; 112 113 return (0); 114} 115 116static int 117_UTF8_mbsinit(const mbstate_t *ps, locale_t loc) 118{ 119 120 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 121} 122 123static size_t 124_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 125 mbstate_t * __restrict ps, locale_t loc) 126{ 127 _UTF8State *us; 128 int ch, i, mask, want; 129 wchar_t wch; 130 SecondByte sb; 131 132 us = (_UTF8State *)ps; 133 134 if (us->want < 0 || us->want > UTF8_MB_CUR_MAX) { 135 errno = EINVAL; 136 return ((size_t)-1); 137 } 138 139 if (s == NULL) { 140 s = ""; 141 n = 1; 142 pwc = NULL; 143 } 144 145 if (n == 0) 146 /* Incomplete multibyte sequence */ 147 return ((size_t)-2); 148 149 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 150 /* Fast path for plain ASCII characters. */ 151 if (pwc != NULL) 152 *pwc = ch; 153 return (ch != '\0' ? 1 : 0); 154 } 155 156 if (us->want == 0) { 157 /* 158 * Determine the number of octets that make up this character 159 * from the first octet, and a mask that extracts the 160 * interesting bits of the first octet. We already know 161 * the character is at least two bytes long. 162 * 163 * We detect if the first byte is illegal, and set sb to 164 * the legal range of the second byte. 165 */ 166 ch = (unsigned char)*s; 167 if ((ch & 0x80) == 0) { 168 mask = 0x7f; 169 want = 1; 170 sb = sb_00_00; 171 } else if ((ch & 0xe0) == 0xc0) { 172 if (ch < 0xc2) goto malformed; 173 mask = 0x1f; 174 want = 2; 175 sb = sb_80_BF; 176 } else if ((ch & 0xf0) == 0xe0) { 177 mask = 0x0f; 178 want = 3; 179 switch (ch) { 180 case 0xe0: 181 sb = sb_A0_BF; 182 break; 183 case 0xed: 184 sb = sb_80_9F; 185 break; 186 default: 187 sb = sb_80_BF; 188 break; 189 } 190 } else if ((ch & 0xf8) == 0xf0) { 191 if (ch > 0xf4) goto malformed; 192 mask = 0x07; 193 want = 4; 194 switch (ch) { 195 case 0xf0: 196 sb = sb_90_BF; 197 break; 198 case 0xf4: 199 sb = sb_80_8F; 200 break; 201 default: 202 sb = sb_80_BF; 203 break; 204 } 205 } else { 206malformed: 207 /* 208 * Malformed input; input is not UTF-8. 209 */ 210 errno = EILSEQ; 211 return ((size_t)-1); 212 } 213 } else { 214 want = us->want; 215 sb = us->sb; 216 } 217 218 /* 219 * Decode the octet sequence representing the character in chunks 220 * of 6 bits, most significant first. 221 */ 222 if (us->want == 0) 223 wch = (unsigned char)*s++ & mask; 224 else 225 wch = us->ch; 226 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 227 if (sb.lowerbound) { 228 if ((unsigned char)*s < sb.lowerbound || 229 (unsigned char)*s > sb.upperbound) goto malformed; 230 sb = sb_00_00; 231 } else if ((*s & 0xc0) != 0x80) goto malformed; 232 wch <<= 6; 233 wch |= *s++ & 0x3f; 234 } 235 if (i < want) { 236 /* Incomplete multibyte sequence. */ 237 us->want = want - i; 238 us->sb = sb; 239 us->ch = wch; 240 return ((size_t)-2); 241 } 242 if (pwc != NULL) 243 *pwc = wch; 244 us->want = 0; 245 return (wch == L'\0' ? 0 : want); 246} 247 248static size_t 249_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 250 size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc) 251{ 252 _UTF8State *us; 253 const char *s; 254 size_t nchr; 255 wchar_t wc; 256 size_t nb; 257 258 us = (_UTF8State *)ps; 259 260 s = *src; 261 nchr = 0; 262 263 if (dst == NULL) { 264 /* 265 * The fast path in the loop below is not safe if an ASCII 266 * character appears as anything but the first byte of a 267 * multibyte sequence. Check now to avoid doing it in the loop. 268 */ 269 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 270 errno = EILSEQ; 271 return ((size_t)-1); 272 } 273 for (;;) { 274 if (nms > 0 && (signed char)*s > 0) 275 /* 276 * Fast path for plain ASCII characters 277 * excluding NUL. 278 */ 279 nb = 1; 280 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps, loc)) == 281 (size_t)-1) 282 /* Invalid sequence - mbrtowc() sets errno. */ 283 return ((size_t)-1); 284 else if (nb == 0 || nb == (size_t)-2) 285 return (nchr); 286 s += nb; 287 nms -= nb; 288 nchr++; 289 } 290 /*NOTREACHED*/ 291 } 292 293 /* 294 * The fast path in the loop below is not safe if an ASCII 295 * character appears as anything but the first byte of a 296 * multibyte sequence. Check now to avoid doing it in the loop. 297 */ 298 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 299 errno = EILSEQ; 300 return ((size_t)-1); 301 } 302 while (len-- > 0) { 303 if (nms > 0 && (signed char)*s > 0) { 304 /* 305 * Fast path for plain ASCII characters 306 * excluding NUL. 307 */ 308 *dst = (wchar_t)*s; 309 nb = 1; 310 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, loc)) == 311 (size_t)-1) { 312 *src = s; 313 return ((size_t)-1); 314 } else if (nb == (size_t)-2) { 315 *src = s + nms; 316 return (nchr); 317 } else if (nb == 0) { 318 *src = NULL; 319 return (nchr); 320 } 321 s += nb; 322 nms -= nb; 323 nchr++; 324 dst++; 325 } 326 *src = s; 327 return (nchr); 328} 329 330static size_t 331_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc) 332{ 333 _UTF8State *us; 334 unsigned char lead; 335 int i, len; 336 337 us = (_UTF8State *)ps; 338 339 if (us->want != 0) { 340 errno = EINVAL; 341 return ((size_t)-1); 342 } 343 344 if (s == NULL) 345 /* Reset to initial shift state (no-op) */ 346 return (1); 347 348 if ((wc & ~0x7f) == 0) { 349 /* Fast path for plain ASCII characters. */ 350 *s = (char)wc; 351 return (1); 352 } 353 354 /* 355 * Determine the number of octets needed to represent this character. 356 * We always output the shortest sequence possible. Also specify the 357 * first few bits of the first octet, which contains the information 358 * about the sequence length. 359 */ 360 if ((wc & ~0x7f) == 0) { 361 lead = 0; 362 len = 1; 363 } else if ((wc & ~0x7ff) == 0) { 364 lead = 0xc0; 365 len = 2; 366 } else if ((wc & ~0xffff) == 0) { 367 if (wc >= 0xd800 && wc <= 0xdfff) goto illegal; 368 lead = 0xe0; 369 len = 3; 370 } else if ((wc & ~0x1fffff) == 0) { 371 if (wc > 0x10ffff) goto illegal; 372 lead = 0xf0; 373 len = 4; 374 } else { 375illegal: 376 errno = EILSEQ; 377 return ((size_t)-1); 378 } 379 380 /* 381 * Output the octets representing the character in chunks 382 * of 6 bits, least significant last. The first octet is 383 * a special case because it contains the sequence length 384 * information. 385 */ 386 for (i = len - 1; i > 0; i--) { 387 s[i] = (wc & 0x3f) | 0x80; 388 wc >>= 6; 389 } 390 *s = (wc & 0xff) | lead; 391 392 return (len); 393} 394 395static size_t 396_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 397 size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc) 398{ 399 _UTF8State *us; 400 char buf[MB_LEN_MAX]; 401 const wchar_t *s; 402 size_t nbytes; 403 size_t nb; 404 405 us = (_UTF8State *)ps; 406 407 if (us->want != 0) { 408 errno = EINVAL; 409 return ((size_t)-1); 410 } 411 412 s = *src; 413 nbytes = 0; 414 415 if (dst == NULL) { 416 while (nwc-- > 0) { 417 if (0 <= *s && *s < 0x80) 418 /* Fast path for plain ASCII characters. */ 419 nb = 1; 420 else if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == 421 (size_t)-1) 422 /* Invalid character - wcrtomb() sets errno. */ 423 return ((size_t)-1); 424 if (*s == L'\0') 425 return (nbytes + nb - 1); 426 s++; 427 nbytes += nb; 428 } 429 return (nbytes); 430 } 431 432 while (len > 0 && nwc-- > 0) { 433 if (0 <= *s && *s < 0x80) { 434 /* Fast path for plain ASCII characters. */ 435 nb = 1; 436 *dst = *s; 437 } else if (len > (size_t)UTF8_MB_CUR_MAX) { 438 /* Enough space to translate in-place. */ 439 if ((nb = _UTF8_wcrtomb(dst, *s, ps, loc)) == (size_t)-1) { 440 *src = s; 441 return ((size_t)-1); 442 } 443 } else { 444 /* 445 * May not be enough space; use temp. buffer. 446 */ 447 if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) { 448 *src = s; 449 return ((size_t)-1); 450 } 451 if (nb > (int)len) 452 /* MB sequence for character won't fit. */ 453 break; 454 memcpy(dst, buf, nb); 455 } 456 if (*s == L'\0') { 457 *src = NULL; 458 return (nbytes + nb - 1); 459 } 460 s++; 461 dst += nb; 462 len -= nb; 463 nbytes += nb; 464 } 465 *src = s; 466 return (nbytes); 467} 468