1104828Stjr/*- 2268571Spfg * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3128004Stjr * Copyright (c) 2002-2004 Tim J. Robbins 4104828Stjr * All rights reserved. 5104828Stjr * 6227753Stheraven * Copyright (c) 2011 The FreeBSD Foundation 7227753Stheraven * All rights reserved. 8227753Stheraven * Portions of this software were developed by David Chisnall 9227753Stheraven * under sponsorship from the FreeBSD Foundation. 10227753Stheraven * 11104828Stjr * Redistribution and use in source and binary forms, with or without 12104828Stjr * modification, are permitted provided that the following conditions 13104828Stjr * are met: 14104828Stjr * 1. Redistributions of source code must retain the above copyright 15104828Stjr * notice, this list of conditions and the following disclaimer. 16104828Stjr * 2. Redistributions in binary form must reproduce the above copyright 17104828Stjr * notice, this list of conditions and the following disclaimer in the 18104828Stjr * documentation and/or other materials provided with the distribution. 19104828Stjr * 20104828Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21104828Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22104828Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23104828Stjr * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24104828Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25104828Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26104828Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27104828Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28104828Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29104828Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30104828Stjr * SUCH DAMAGE. 31104828Stjr */ 32104828Stjr 33128004Stjr#include <sys/param.h> 34104828Stjr__FBSDID("$FreeBSD$"); 35104828Stjr 36121893Stjr#include <errno.h> 37132687Stjr#include <limits.h> 38121893Stjr#include <runetype.h> 39104828Stjr#include <stdlib.h> 40128004Stjr#include <string.h> 41121893Stjr#include <wchar.h> 42129153Stjr#include "mblocal.h" 43104828Stjr 44172619Sacheextern int __mb_sb_limit; 45172619Sache 46142654Sphantomstatic size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 47142654Sphantom size_t, mbstate_t * __restrict); 48142654Sphantomstatic int _UTF8_mbsinit(const mbstate_t *); 49142654Sphantomstatic size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 50142654Sphantom const char ** __restrict, size_t, size_t, 51142654Sphantom mbstate_t * __restrict); 52142654Sphantomstatic size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 53142654Sphantom mbstate_t * __restrict); 54142654Sphantomstatic size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 55142654Sphantom size_t, size_t, mbstate_t * __restrict); 56121893Stjr 57128004Stjrtypedef struct { 58129336Stjr wchar_t ch; 59129336Stjr int want; 60129336Stjr wchar_t lbound; 61128004Stjr} _UTF8State; 62128004Stjr 63104828Stjrint 64227753Stheraven_UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 65104828Stjr{ 66104828Stjr 67227753Stheraven l->__mbrtowc = _UTF8_mbrtowc; 68227753Stheraven l->__wcrtomb = _UTF8_wcrtomb; 69227753Stheraven l->__mbsinit = _UTF8_mbsinit; 70227753Stheraven l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 71227753Stheraven l->__wcsnrtombs = _UTF8_wcsnrtombs; 72227753Stheraven l->runes = rl; 73227753Stheraven l->__mb_cur_max = 6; 74172661Sache /* 75172661Sache * UCS-4 encoding used as the internal representation, so 76172661Sache * slots 0x0080-0x00FF are occuped and must be excluded 77172661Sache * from the single byte ctype by setting the limit. 78172661Sache */ 79227753Stheraven l->__mb_sb_limit = 128; 80104828Stjr 81104828Stjr return (0); 82104828Stjr} 83104828Stjr 84142654Sphantomstatic int 85128004Stjr_UTF8_mbsinit(const mbstate_t *ps) 86128004Stjr{ 87128004Stjr 88129336Stjr return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 89128004Stjr} 90128004Stjr 91142654Sphantomstatic size_t 92121893Stjr_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 93128004Stjr mbstate_t * __restrict ps) 94104828Stjr{ 95128004Stjr _UTF8State *us; 96129336Stjr int ch, i, mask, want; 97121893Stjr wchar_t lbound, wch; 98104828Stjr 99128004Stjr us = (_UTF8State *)ps; 100128004Stjr 101129336Stjr if (us->want < 0 || us->want > 6) { 102128155Stjr errno = EINVAL; 103128155Stjr return ((size_t)-1); 104128155Stjr } 105128155Stjr 106128004Stjr if (s == NULL) { 107128004Stjr s = ""; 108128004Stjr n = 1; 109128004Stjr pwc = NULL; 110128004Stjr } 111128004Stjr 112121893Stjr if (n == 0) 113121893Stjr /* Incomplete multibyte sequence */ 114121893Stjr return ((size_t)-2); 115104828Stjr 116129336Stjr if (us->want == 0) { 117104828Stjr /* 118129336Stjr * Determine the number of octets that make up this character 119129336Stjr * from the first octet, and a mask that extracts the 120129336Stjr * interesting bits of the first octet. We already know 121129336Stjr * the character is at least two bytes long. 122129336Stjr * 123129336Stjr * We also specify a lower bound for the character code to 124129336Stjr * detect redundant, non-"shortest form" encodings. For 125129336Stjr * example, the sequence C0 80 is _not_ a legal representation 126129336Stjr * of the null character. This enforces a 1-to-1 mapping 127129336Stjr * between character codes and their multibyte representations. 128104828Stjr */ 129129336Stjr ch = (unsigned char)*s; 130129336Stjr if ((ch & 0x80) == 0) { 131268571Spfg /* Fast path for plain ASCII characters. */ 132268571Spfg if (pwc != NULL) 133268571Spfg *pwc = ch; 134268571Spfg return (ch != '\0' ? 1 : 0); 135268571Spfg } 136268571Spfg if ((ch & 0xe0) == 0xc0) { 137129336Stjr mask = 0x1f; 138129336Stjr want = 2; 139129336Stjr lbound = 0x80; 140129336Stjr } else if ((ch & 0xf0) == 0xe0) { 141129336Stjr mask = 0x0f; 142129336Stjr want = 3; 143129336Stjr lbound = 0x800; 144129336Stjr } else if ((ch & 0xf8) == 0xf0) { 145129336Stjr mask = 0x07; 146129336Stjr want = 4; 147129336Stjr lbound = 0x10000; 148129336Stjr } else { 149129336Stjr /* 150129336Stjr * Malformed input; input is not UTF-8. 151129336Stjr */ 152129336Stjr errno = EILSEQ; 153129336Stjr return ((size_t)-1); 154129336Stjr } 155129336Stjr } else { 156129336Stjr want = us->want; 157129336Stjr lbound = us->lbound; 158104828Stjr } 159104828Stjr 160104828Stjr /* 161104828Stjr * Decode the octet sequence representing the character in chunks 162104828Stjr * of 6 bits, most significant first. 163104828Stjr */ 164129336Stjr if (us->want == 0) 165129336Stjr wch = (unsigned char)*s++ & mask; 166129336Stjr else 167129336Stjr wch = us->ch; 168129336Stjr for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 169121893Stjr if ((*s & 0xc0) != 0x80) { 170104828Stjr /* 171104828Stjr * Malformed input; bad characters in the middle 172104828Stjr * of a character. 173104828Stjr */ 174121893Stjr errno = EILSEQ; 175121893Stjr return ((size_t)-1); 176104828Stjr } 177104828Stjr wch <<= 6; 178121893Stjr wch |= *s++ & 0x3f; 179104828Stjr } 180129336Stjr if (i < want) { 181129336Stjr /* Incomplete multibyte sequence. */ 182129336Stjr us->want = want - i; 183129336Stjr us->lbound = lbound; 184129336Stjr us->ch = wch; 185129336Stjr return ((size_t)-2); 186129336Stjr } 187121893Stjr if (wch < lbound) { 188104828Stjr /* 189104828Stjr * Malformed input; redundant encoding. 190104828Stjr */ 191121893Stjr errno = EILSEQ; 192121893Stjr return ((size_t)-1); 193121893Stjr } 194287393Sbapt if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) { 195265361Spfg /* 196265361Spfg * Malformed input; invalid code points. 197265361Spfg */ 198265361Spfg errno = EILSEQ; 199265361Spfg return ((size_t)-1); 200265361Spfg } 201121893Stjr if (pwc != NULL) 202121893Stjr *pwc = wch; 203129336Stjr us->want = 0; 204129336Stjr return (wch == L'\0' ? 0 : want); 205104828Stjr} 206104828Stjr 207142654Sphantomstatic size_t 208132687Stjr_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 209132687Stjr size_t nms, size_t len, mbstate_t * __restrict ps) 210132687Stjr{ 211132687Stjr _UTF8State *us; 212132687Stjr const char *s; 213132687Stjr size_t nchr; 214132687Stjr wchar_t wc; 215132687Stjr size_t nb; 216132687Stjr 217132687Stjr us = (_UTF8State *)ps; 218132687Stjr 219132687Stjr s = *src; 220132687Stjr nchr = 0; 221132687Stjr 222132687Stjr if (dst == NULL) { 223132687Stjr /* 224132687Stjr * The fast path in the loop below is not safe if an ASCII 225132687Stjr * character appears as anything but the first byte of a 226132687Stjr * multibyte sequence. Check now to avoid doing it in the loop. 227132687Stjr */ 228132687Stjr if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 229132687Stjr errno = EILSEQ; 230132687Stjr return ((size_t)-1); 231132687Stjr } 232132687Stjr for (;;) { 233132687Stjr if (nms > 0 && (signed char)*s > 0) 234132687Stjr /* 235132687Stjr * Fast path for plain ASCII characters 236132687Stjr * excluding NUL. 237132687Stjr */ 238132687Stjr nb = 1; 239132687Stjr else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 240132687Stjr (size_t)-1) 241132687Stjr /* Invalid sequence - mbrtowc() sets errno. */ 242132687Stjr return ((size_t)-1); 243132687Stjr else if (nb == 0 || nb == (size_t)-2) 244132687Stjr return (nchr); 245132687Stjr s += nb; 246132687Stjr nms -= nb; 247132687Stjr nchr++; 248132687Stjr } 249132687Stjr /*NOTREACHED*/ 250132687Stjr } 251132687Stjr 252132687Stjr /* 253132687Stjr * The fast path in the loop below is not safe if an ASCII 254132687Stjr * character appears as anything but the first byte of a 255132687Stjr * multibyte sequence. Check now to avoid doing it in the loop. 256132687Stjr */ 257132687Stjr if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 258132687Stjr errno = EILSEQ; 259132687Stjr return ((size_t)-1); 260132687Stjr } 261132687Stjr while (len-- > 0) { 262132687Stjr if (nms > 0 && (signed char)*s > 0) { 263132687Stjr /* 264132687Stjr * Fast path for plain ASCII characters 265132687Stjr * excluding NUL. 266132687Stjr */ 267132687Stjr *dst = (wchar_t)*s; 268132687Stjr nb = 1; 269132687Stjr } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 270132687Stjr (size_t)-1) { 271132687Stjr *src = s; 272132687Stjr return ((size_t)-1); 273132687Stjr } else if (nb == (size_t)-2) { 274132687Stjr *src = s + nms; 275132687Stjr return (nchr); 276132687Stjr } else if (nb == 0) { 277132687Stjr *src = NULL; 278132687Stjr return (nchr); 279132687Stjr } 280132687Stjr s += nb; 281132687Stjr nms -= nb; 282132687Stjr nchr++; 283132687Stjr dst++; 284132687Stjr } 285132687Stjr *src = s; 286132687Stjr return (nchr); 287132687Stjr} 288132687Stjr 289142654Sphantomstatic size_t 290128155Stjr_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 291104828Stjr{ 292128155Stjr _UTF8State *us; 293104828Stjr unsigned char lead; 294104828Stjr int i, len; 295104828Stjr 296128155Stjr us = (_UTF8State *)ps; 297128155Stjr 298129336Stjr if (us->want != 0) { 299128155Stjr errno = EINVAL; 300128155Stjr return ((size_t)-1); 301128155Stjr } 302128155Stjr 303121893Stjr if (s == NULL) 304121893Stjr /* Reset to initial shift state (no-op) */ 305121893Stjr return (1); 306121893Stjr 307104828Stjr /* 308104828Stjr * Determine the number of octets needed to represent this character. 309104828Stjr * We always output the shortest sequence possible. Also specify the 310104828Stjr * first few bits of the first octet, which contains the information 311104828Stjr * about the sequence length. 312104828Stjr */ 313121893Stjr if ((wc & ~0x7f) == 0) { 314268571Spfg /* Fast path for plain ASCII characters. */ 315268571Spfg *s = (char)wc; 316268571Spfg return (1); 317121893Stjr } else if ((wc & ~0x7ff) == 0) { 318104828Stjr lead = 0xc0; 319104828Stjr len = 2; 320121893Stjr } else if ((wc & ~0xffff) == 0) { 321287393Sbapt if (wc >= 0xd800 && wc <= 0xdfff) { 322287393Sbapt errno = EILSEQ; 323287393Sbapt return ((size_t)-1); 324287393Sbapt } 325104828Stjr lead = 0xe0; 326104828Stjr len = 3; 327287393Sbapt } else if (wc >= 0 && wc <= 0x10ffff) { 328104828Stjr lead = 0xf0; 329104828Stjr len = 4; 330104828Stjr } else { 331121893Stjr errno = EILSEQ; 332121893Stjr return ((size_t)-1); 333104828Stjr } 334104828Stjr 335121893Stjr /* 336121893Stjr * Output the octets representing the character in chunks 337121893Stjr * of 6 bits, least significant last. The first octet is 338121893Stjr * a special case because it contains the sequence length 339121893Stjr * information. 340121893Stjr */ 341121893Stjr for (i = len - 1; i > 0; i--) { 342121893Stjr s[i] = (wc & 0x3f) | 0x80; 343121893Stjr wc >>= 6; 344104828Stjr } 345121893Stjr *s = (wc & 0xff) | lead; 346104828Stjr 347104828Stjr return (len); 348104828Stjr} 349132687Stjr 350142654Sphantomstatic size_t 351132687Stjr_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 352132687Stjr size_t nwc, size_t len, mbstate_t * __restrict ps) 353132687Stjr{ 354132687Stjr _UTF8State *us; 355132687Stjr char buf[MB_LEN_MAX]; 356132687Stjr const wchar_t *s; 357132687Stjr size_t nbytes; 358132687Stjr size_t nb; 359132687Stjr 360132687Stjr us = (_UTF8State *)ps; 361132687Stjr 362132687Stjr if (us->want != 0) { 363132687Stjr errno = EINVAL; 364132687Stjr return ((size_t)-1); 365132687Stjr } 366132687Stjr 367132687Stjr s = *src; 368132687Stjr nbytes = 0; 369132687Stjr 370132687Stjr if (dst == NULL) { 371132687Stjr while (nwc-- > 0) { 372132687Stjr if (0 <= *s && *s < 0x80) 373132687Stjr /* Fast path for plain ASCII characters. */ 374132687Stjr nb = 1; 375132687Stjr else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 376132687Stjr (size_t)-1) 377132687Stjr /* Invalid character - wcrtomb() sets errno. */ 378132687Stjr return ((size_t)-1); 379132687Stjr if (*s == L'\0') 380132687Stjr return (nbytes + nb - 1); 381132687Stjr s++; 382132687Stjr nbytes += nb; 383132687Stjr } 384132687Stjr return (nbytes); 385132687Stjr } 386132687Stjr 387132687Stjr while (len > 0 && nwc-- > 0) { 388132687Stjr if (0 <= *s && *s < 0x80) { 389132687Stjr /* Fast path for plain ASCII characters. */ 390132687Stjr nb = 1; 391132687Stjr *dst = *s; 392132687Stjr } else if (len > (size_t)MB_CUR_MAX) { 393132687Stjr /* Enough space to translate in-place. */ 394141716Sstefanf if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 395132687Stjr *src = s; 396132687Stjr return ((size_t)-1); 397132687Stjr } 398132687Stjr } else { 399132687Stjr /* 400132687Stjr * May not be enough space; use temp. buffer. 401132687Stjr */ 402141716Sstefanf if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 403132687Stjr *src = s; 404132687Stjr return ((size_t)-1); 405132687Stjr } 406132687Stjr if (nb > (int)len) 407132687Stjr /* MB sequence for character won't fit. */ 408132687Stjr break; 409132687Stjr memcpy(dst, buf, nb); 410132687Stjr } 411132687Stjr if (*s == L'\0') { 412132687Stjr *src = NULL; 413132687Stjr return (nbytes + nb - 1); 414132687Stjr } 415132687Stjr s++; 416132687Stjr dst += nb; 417132687Stjr len -= nb; 418132687Stjr nbytes += nb; 419132687Stjr } 420132687Stjr *src = s; 421132687Stjr return (nbytes); 422132687Stjr} 423