1/*- 2 * Copyright (c) 2002-2004 Tim J. Robbins 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27// MWW: Generated by applying utf2.c.patch to utf8.c in the FreeBSD patch sets. 28 29#include <sys/param.h> 30__FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $"); 31 32#include "xlocale_private.h" 33 34#include <errno.h> 35#include <limits.h> 36#include <runetype.h> 37#include <stdlib.h> 38#include <string.h> 39#include <wchar.h> 40#include "mblocal.h" 41 42#define UTF2_MB_CUR_MAX 3 43 44static size_t _UTF2_mbrtowc(wchar_t * __restrict, const char * __restrict, 45 size_t, mbstate_t * __restrict, locale_t); 46static int _UTF2_mbsinit(const mbstate_t *, locale_t); 47static size_t _UTF2_mbsnrtowcs(wchar_t * __restrict, 48 const char ** __restrict, size_t, size_t, 49 mbstate_t * __restrict, locale_t); 50static size_t _UTF2_wcrtomb(char * __restrict, wchar_t, 51 mbstate_t * __restrict, locale_t); 52static size_t _UTF2_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 53 size_t, size_t, mbstate_t * __restrict, locale_t); 54 55typedef struct { 56 wchar_t ch; 57 int want; 58 wchar_t lbound; 59} _UTF2State; 60 61__private_extern__ int 62_UTF2_init(struct __xlocale_st_runelocale *xrl) 63{ 64 65 xrl->__mbrtowc = _UTF2_mbrtowc; 66 xrl->__wcrtomb = _UTF2_wcrtomb; 67 xrl->__mbsinit = _UTF2_mbsinit; 68 xrl->__mbsnrtowcs = _UTF2_mbsnrtowcs; 69 xrl->__wcsnrtombs = _UTF2_wcsnrtombs; 70 xrl->__mb_cur_max = UTF2_MB_CUR_MAX; 71 /* 72 * UCS-4 encoding used as the internal representation, so 73 * slots 0x0080-0x00FF are occuped and must be excluded 74 * from the single byte ctype by setting the limit. 75 */ 76 xrl->__mb_sb_limit = 128; 77 78 return (0); 79} 80 81static int 82_UTF2_mbsinit(const mbstate_t *ps, locale_t loc) 83{ 84 85 return (ps == NULL || ((const _UTF2State *)ps)->want == 0); 86} 87 88static size_t 89_UTF2_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 90 mbstate_t * __restrict ps, locale_t loc) 91{ 92 _UTF2State *us; 93 int ch, i, mask, want; 94 wchar_t lbound, wch; 95 96 us = (_UTF2State *)ps; 97 98 if (us->want < 0 || us->want > 6) { 99 errno = EINVAL; 100 return ((size_t)-1); 101 } 102 103 if (s == NULL) { 104 s = ""; 105 n = 1; 106 pwc = NULL; 107 } 108 109 if (n == 0) 110 /* Incomplete multibyte sequence */ 111 return ((size_t)-2); 112 113 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 114 /* Fast path for plain ASCII characters. */ 115 if (pwc != NULL) 116 *pwc = ch; 117 return (ch != '\0' ? 1 : 0); 118 } 119 120 if (us->want == 0) { 121 /* 122 * Determine the number of octets that make up this character 123 * from the first octet, and a mask that extracts the 124 * interesting bits of the first octet. We already know 125 * the character is at least two bytes long. 126 * 127 * We also specify a lower bound for the character code to 128 * detect redundant, non-"shortest form" encodings. For 129 * example, the sequence C0 80 is _not_ a legal representation 130 * of the null character. This enforces a 1-to-1 mapping 131 * between character codes and their multibyte representations. 132 */ 133 ch = (unsigned char)*s; 134 if ((ch & 0x80) == 0) { 135 mask = 0x7f; 136 want = 1; 137 lbound = 0; 138 } else if ((ch & 0xe0) == 0xc0) { 139 mask = 0x1f; 140 want = 2; 141 lbound = 0x80; 142 } else if ((ch & 0xf0) == 0xe0) { 143 mask = 0x0f; 144 want = 3; 145 lbound = 0x800; 146 } else { 147 /* 148 * Malformed input; input is not UTF2. 149 */ 150 errno = EILSEQ; 151 return ((size_t)-1); 152 } 153 } else { 154 want = us->want; 155 lbound = us->lbound; 156 } 157 158 /* 159 * Decode the octet sequence representing the character in chunks 160 * of 6 bits, most significant first. 161 */ 162 if (us->want == 0) 163 wch = (unsigned char)*s++ & mask; 164 else 165 wch = us->ch; 166 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 167 if ((*s & 0xc0) != 0x80) { 168 /* 169 * Malformed input; bad characters in the middle 170 * of a character. 171 */ 172 errno = EILSEQ; 173 return ((size_t)-1); 174 } 175 wch <<= 6; 176 wch |= *s++ & 0x3f; 177 } 178 if (i < want) { 179 /* Incomplete multibyte sequence. */ 180 us->want = want - i; 181 us->lbound = lbound; 182 us->ch = wch; 183 return ((size_t)-2); 184 } 185 if (wch < lbound) { 186 /* 187 * Malformed input; redundant encoding. 188 */ 189 errno = EILSEQ; 190 return ((size_t)-1); 191 } 192 if (pwc != NULL) 193 *pwc = wch; 194 us->want = 0; 195 return (wch == L'\0' ? 0 : want); 196} 197 198static size_t 199_UTF2_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 200 size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc) 201{ 202 _UTF2State *us; 203 const char *s; 204 size_t nchr; 205 wchar_t wc; 206 size_t nb; 207 208 us = (_UTF2State *)ps; 209 210 s = *src; 211 nchr = 0; 212 213 if (dst == NULL) { 214 /* 215 * The fast path in the loop below is not safe if an ASCII 216 * character appears as anything but the first byte of a 217 * multibyte sequence. Check now to avoid doing it in the loop. 218 */ 219 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 220 errno = EILSEQ; 221 return ((size_t)-1); 222 } 223 for (;;) { 224 if (nms > 0 && (signed char)*s > 0) 225 /* 226 * Fast path for plain ASCII characters 227 * excluding NUL. 228 */ 229 nb = 1; 230 else if ((nb = _UTF2_mbrtowc(&wc, s, nms, ps, loc)) == 231 (size_t)-1) 232 /* Invalid sequence - mbrtowc() sets errno. */ 233 return ((size_t)-1); 234 else if (nb == 0 || nb == (size_t)-2) 235 return (nchr); 236 s += nb; 237 nms -= nb; 238 nchr++; 239 } 240 /*NOTREACHED*/ 241 } 242 243 /* 244 * The fast path in the loop below is not safe if an ASCII 245 * character appears as anything but the first byte of a 246 * multibyte sequence. Check now to avoid doing it in the loop. 247 */ 248 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 249 errno = EILSEQ; 250 return ((size_t)-1); 251 } 252 while (len-- > 0) { 253 if (nms > 0 && (signed char)*s > 0) { 254 /* 255 * Fast path for plain ASCII characters 256 * excluding NUL. 257 */ 258 *dst = (wchar_t)*s; 259 nb = 1; 260 } else if ((nb = _UTF2_mbrtowc(dst, s, nms, ps, loc)) == 261 (size_t)-1) { 262 *src = s; 263 return ((size_t)-1); 264 } else if (nb == (size_t)-2) { 265 *src = s + nms; 266 return (nchr); 267 } else if (nb == 0) { 268 *src = NULL; 269 return (nchr); 270 } 271 s += nb; 272 nms -= nb; 273 nchr++; 274 dst++; 275 } 276 *src = s; 277 return (nchr); 278} 279 280static size_t 281_UTF2_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc) 282{ 283 _UTF2State *us; 284 unsigned char lead; 285 int i, len; 286 287 us = (_UTF2State *)ps; 288 289 if (us->want != 0) { 290 errno = EINVAL; 291 return ((size_t)-1); 292 } 293 294 if (s == NULL) 295 /* Reset to initial shift state (no-op) */ 296 return (1); 297 298 if ((wc & ~0x7f) == 0) { 299 /* Fast path for plain ASCII characters. */ 300 *s = (char)wc; 301 return (1); 302 } 303 304 /* 305 * Determine the number of octets needed to represent this character. 306 * We always output the shortest sequence possible. Also specify the 307 * first few bits of the first octet, which contains the information 308 * about the sequence length. 309 */ 310 if ((wc & ~0x7f) == 0) { 311 lead = 0; 312 len = 1; 313 } else if ((wc & ~0x7ff) == 0) { 314 lead = 0xc0; 315 len = 2; 316 } else if ((wc & ~0xffff) == 0) { 317 lead = 0xe0; 318 len = 3; 319 } else { 320 errno = EILSEQ; 321 return ((size_t)-1); 322 } 323 324 /* 325 * Output the octets representing the character in chunks 326 * of 6 bits, least significant last. The first octet is 327 * a special case because it contains the sequence length 328 * information. 329 */ 330 for (i = len - 1; i > 0; i--) { 331 s[i] = (wc & 0x3f) | 0x80; 332 wc >>= 6; 333 } 334 *s = (wc & 0xff) | lead; 335 336 return (len); 337} 338 339static size_t 340_UTF2_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 341 size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc) 342{ 343 _UTF2State *us; 344 char buf[MB_LEN_MAX]; 345 const wchar_t *s; 346 size_t nbytes; 347 size_t nb; 348 349 us = (_UTF2State *)ps; 350 351 if (us->want != 0) { 352 errno = EINVAL; 353 return ((size_t)-1); 354 } 355 356 s = *src; 357 nbytes = 0; 358 359 if (dst == NULL) { 360 while (nwc-- > 0) { 361 if (0 <= *s && *s < 0x80) 362 /* Fast path for plain ASCII characters. */ 363 nb = 1; 364 else if ((nb = _UTF2_wcrtomb(buf, *s, ps, loc)) == 365 (size_t)-1) 366 /* Invalid character - wcrtomb() sets errno. */ 367 return ((size_t)-1); 368 if (*s == L'\0') 369 return (nbytes + nb - 1); 370 s++; 371 nbytes += nb; 372 } 373 return (nbytes); 374 } 375 376 while (len > 0 && nwc-- > 0) { 377 if (0 <= *s && *s < 0x80) { 378 /* Fast path for plain ASCII characters. */ 379 nb = 1; 380 *dst = *s; 381 } else if (len > (size_t)UTF2_MB_CUR_MAX) { 382 /* Enough space to translate in-place. */ 383 if ((nb = _UTF2_wcrtomb(dst, *s, ps, loc)) == (size_t)-1) { 384 *src = s; 385 return ((size_t)-1); 386 } 387 } else { 388 /* 389 * May not be enough space; use temp. buffer. 390 */ 391 if ((nb = _UTF2_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) { 392 *src = s; 393 return ((size_t)-1); 394 } 395 if (nb > (int)len) 396 /* MB sequence for character won't fit. */ 397 break; 398 memcpy(dst, buf, nb); 399 } 400 if (*s == L'\0') { 401 *src = NULL; 402 return (nbytes + nb - 1); 403 } 404 s++; 405 dst += nb; 406 len -= nb; 407 nbytes += nb; 408 } 409 *src = s; 410 return (nbytes); 411} 412