1104828Stjr/*- 2290494Sbapt * Copyright 2013 Garrett D'Amore <garrett@damore.org> 3268272Spfg * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 4128004Stjr * Copyright (c) 2002-2004 Tim J. Robbins 5104828Stjr * All rights reserved. 6104828Stjr * 7227753Stheraven * Copyright (c) 2011 The FreeBSD Foundation 8227753Stheraven * All rights reserved. 9227753Stheraven * Portions of this software were developed by David Chisnall 10227753Stheraven * under sponsorship from the FreeBSD Foundation. 11227753Stheraven * 12104828Stjr * Redistribution and use in source and binary forms, with or without 13104828Stjr * modification, are permitted provided that the following conditions 14104828Stjr * are met: 15104828Stjr * 1. Redistributions of source code must retain the above copyright 16104828Stjr * notice, this list of conditions and the following disclaimer. 17104828Stjr * 2. Redistributions in binary form must reproduce the above copyright 18104828Stjr * notice, this list of conditions and the following disclaimer in the 19104828Stjr * documentation and/or other materials provided with the distribution. 20104828Stjr * 21104828Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 22104828Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23104828Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24104828Stjr * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 25104828Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26104828Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27104828Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28104828Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29104828Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30104828Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31104828Stjr * SUCH DAMAGE. 32104828Stjr */ 33104828Stjr 34128004Stjr#include <sys/param.h> 35104828Stjr__FBSDID("$FreeBSD$"); 36104828Stjr 37121893Stjr#include <errno.h> 38132687Stjr#include <limits.h> 39121893Stjr#include <runetype.h> 40104828Stjr#include <stdlib.h> 41128004Stjr#include <string.h> 42121893Stjr#include <wchar.h> 43129153Stjr#include "mblocal.h" 44104828Stjr 45172619Sacheextern int __mb_sb_limit; 46172619Sache 47142654Sphantomstatic size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 48142654Sphantom size_t, mbstate_t * __restrict); 49142654Sphantomstatic int _UTF8_mbsinit(const mbstate_t *); 50142654Sphantomstatic size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 51142654Sphantom const char ** __restrict, size_t, size_t, 52142654Sphantom mbstate_t * __restrict); 53142654Sphantomstatic size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 54142654Sphantom mbstate_t * __restrict); 55142654Sphantomstatic size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 56142654Sphantom size_t, size_t, mbstate_t * __restrict); 57121893Stjr 58128004Stjrtypedef struct { 59129336Stjr wchar_t ch; 60129336Stjr int want; 61129336Stjr wchar_t lbound; 62128004Stjr} _UTF8State; 63128004Stjr 64104828Stjrint 65227753Stheraven_UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 66104828Stjr{ 67104828Stjr 68227753Stheraven l->__mbrtowc = _UTF8_mbrtowc; 69227753Stheraven l->__wcrtomb = _UTF8_wcrtomb; 70227753Stheraven l->__mbsinit = _UTF8_mbsinit; 71227753Stheraven l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 72227753Stheraven l->__wcsnrtombs = _UTF8_wcsnrtombs; 73227753Stheraven l->runes = rl; 74290494Sbapt l->__mb_cur_max = 4; 75172661Sache /* 76172661Sache * UCS-4 encoding used as the internal representation, so 77172661Sache * slots 0x0080-0x00FF are occuped and must be excluded 78172661Sache * from the single byte ctype by setting the limit. 79172661Sache */ 80227753Stheraven l->__mb_sb_limit = 128; 81104828Stjr 82104828Stjr return (0); 83104828Stjr} 84104828Stjr 85142654Sphantomstatic int 86128004Stjr_UTF8_mbsinit(const mbstate_t *ps) 87128004Stjr{ 88128004Stjr 89129336Stjr return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 90128004Stjr} 91128004Stjr 92142654Sphantomstatic size_t 93121893Stjr_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 94128004Stjr mbstate_t * __restrict ps) 95104828Stjr{ 96128004Stjr _UTF8State *us; 97129336Stjr int ch, i, mask, want; 98121893Stjr wchar_t lbound, wch; 99104828Stjr 100128004Stjr us = (_UTF8State *)ps; 101128004Stjr 102129336Stjr if (us->want < 0 || us->want > 6) { 103128155Stjr errno = EINVAL; 104128155Stjr return ((size_t)-1); 105128155Stjr } 106128155Stjr 107128004Stjr if (s == NULL) { 108128004Stjr s = ""; 109128004Stjr n = 1; 110128004Stjr pwc = NULL; 111128004Stjr } 112128004Stjr 113121893Stjr if (n == 0) 114121893Stjr /* Incomplete multibyte sequence */ 115121893Stjr return ((size_t)-2); 116104828Stjr 117129336Stjr if (us->want == 0) { 118104828Stjr /* 119129336Stjr * Determine the number of octets that make up this character 120129336Stjr * from the first octet, and a mask that extracts the 121129336Stjr * interesting bits of the first octet. We already know 122129336Stjr * the character is at least two bytes long. 123129336Stjr * 124129336Stjr * We also specify a lower bound for the character code to 125129336Stjr * detect redundant, non-"shortest form" encodings. For 126129336Stjr * example, the sequence C0 80 is _not_ a legal representation 127129336Stjr * of the null character. This enforces a 1-to-1 mapping 128129336Stjr * between character codes and their multibyte representations. 129104828Stjr */ 130129336Stjr ch = (unsigned char)*s; 131129336Stjr if ((ch & 0x80) == 0) { 132268272Spfg /* Fast path for plain ASCII characters. */ 133268272Spfg if (pwc != NULL) 134268272Spfg *pwc = ch; 135268272Spfg return (ch != '\0' ? 1 : 0); 136268272Spfg } 137268272Spfg if ((ch & 0xe0) == 0xc0) { 138129336Stjr mask = 0x1f; 139129336Stjr want = 2; 140129336Stjr lbound = 0x80; 141129336Stjr } else if ((ch & 0xf0) == 0xe0) { 142129336Stjr mask = 0x0f; 143129336Stjr want = 3; 144129336Stjr lbound = 0x800; 145129336Stjr } else if ((ch & 0xf8) == 0xf0) { 146129336Stjr mask = 0x07; 147129336Stjr want = 4; 148129336Stjr lbound = 0x10000; 149129336Stjr } else { 150129336Stjr /* 151129336Stjr * Malformed input; input is not UTF-8. 152129336Stjr */ 153129336Stjr errno = EILSEQ; 154129336Stjr return ((size_t)-1); 155129336Stjr } 156129336Stjr } else { 157129336Stjr want = us->want; 158129336Stjr lbound = us->lbound; 159104828Stjr } 160104828Stjr 161104828Stjr /* 162104828Stjr * Decode the octet sequence representing the character in chunks 163104828Stjr * of 6 bits, most significant first. 164104828Stjr */ 165129336Stjr if (us->want == 0) 166129336Stjr wch = (unsigned char)*s++ & mask; 167129336Stjr else 168129336Stjr wch = us->ch; 169290494Sbapt 170129336Stjr for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 171121893Stjr if ((*s & 0xc0) != 0x80) { 172104828Stjr /* 173104828Stjr * Malformed input; bad characters in the middle 174104828Stjr * of a character. 175104828Stjr */ 176121893Stjr errno = EILSEQ; 177121893Stjr return ((size_t)-1); 178104828Stjr } 179104828Stjr wch <<= 6; 180121893Stjr wch |= *s++ & 0x3f; 181104828Stjr } 182129336Stjr if (i < want) { 183129336Stjr /* Incomplete multibyte sequence. */ 184129336Stjr us->want = want - i; 185129336Stjr us->lbound = lbound; 186129336Stjr us->ch = wch; 187129336Stjr return ((size_t)-2); 188129336Stjr } 189121893Stjr if (wch < lbound) { 190104828Stjr /* 191104828Stjr * Malformed input; redundant encoding. 192104828Stjr */ 193121893Stjr errno = EILSEQ; 194121893Stjr return ((size_t)-1); 195121893Stjr } 196287125Sed if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) { 197265095Spfg /* 198265095Spfg * Malformed input; invalid code points. 199265095Spfg */ 200265095Spfg errno = EILSEQ; 201265095Spfg return ((size_t)-1); 202265095Spfg } 203121893Stjr if (pwc != NULL) 204121893Stjr *pwc = wch; 205129336Stjr us->want = 0; 206129336Stjr return (wch == L'\0' ? 0 : want); 207104828Stjr} 208104828Stjr 209142654Sphantomstatic size_t 210132687Stjr_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 211132687Stjr size_t nms, size_t len, mbstate_t * __restrict ps) 212132687Stjr{ 213132687Stjr _UTF8State *us; 214132687Stjr const char *s; 215132687Stjr size_t nchr; 216132687Stjr wchar_t wc; 217132687Stjr size_t nb; 218132687Stjr 219132687Stjr us = (_UTF8State *)ps; 220132687Stjr 221132687Stjr s = *src; 222132687Stjr nchr = 0; 223132687Stjr 224132687Stjr if (dst == NULL) { 225132687Stjr /* 226132687Stjr * The fast path in the loop below is not safe if an ASCII 227132687Stjr * character appears as anything but the first byte of a 228132687Stjr * multibyte sequence. Check now to avoid doing it in the loop. 229132687Stjr */ 230132687Stjr if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 231132687Stjr errno = EILSEQ; 232132687Stjr return ((size_t)-1); 233132687Stjr } 234132687Stjr for (;;) { 235132687Stjr if (nms > 0 && (signed char)*s > 0) 236132687Stjr /* 237132687Stjr * Fast path for plain ASCII characters 238132687Stjr * excluding NUL. 239132687Stjr */ 240132687Stjr nb = 1; 241132687Stjr else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 242132687Stjr (size_t)-1) 243132687Stjr /* Invalid sequence - mbrtowc() sets errno. */ 244132687Stjr return ((size_t)-1); 245132687Stjr else if (nb == 0 || nb == (size_t)-2) 246132687Stjr return (nchr); 247132687Stjr s += nb; 248132687Stjr nms -= nb; 249132687Stjr nchr++; 250132687Stjr } 251132687Stjr /*NOTREACHED*/ 252132687Stjr } 253132687Stjr 254132687Stjr /* 255132687Stjr * The fast path in the loop below is not safe if an ASCII 256132687Stjr * character appears as anything but the first byte of a 257132687Stjr * multibyte sequence. Check now to avoid doing it in the loop. 258132687Stjr */ 259132687Stjr if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 260132687Stjr errno = EILSEQ; 261132687Stjr return ((size_t)-1); 262132687Stjr } 263132687Stjr while (len-- > 0) { 264132687Stjr if (nms > 0 && (signed char)*s > 0) { 265132687Stjr /* 266132687Stjr * Fast path for plain ASCII characters 267132687Stjr * excluding NUL. 268132687Stjr */ 269132687Stjr *dst = (wchar_t)*s; 270132687Stjr nb = 1; 271132687Stjr } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 272132687Stjr (size_t)-1) { 273132687Stjr *src = s; 274132687Stjr return ((size_t)-1); 275132687Stjr } else if (nb == (size_t)-2) { 276132687Stjr *src = s + nms; 277132687Stjr return (nchr); 278132687Stjr } else if (nb == 0) { 279132687Stjr *src = NULL; 280132687Stjr return (nchr); 281132687Stjr } 282132687Stjr s += nb; 283132687Stjr nms -= nb; 284132687Stjr nchr++; 285132687Stjr dst++; 286132687Stjr } 287132687Stjr *src = s; 288132687Stjr return (nchr); 289132687Stjr} 290132687Stjr 291142654Sphantomstatic size_t 292128155Stjr_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 293104828Stjr{ 294128155Stjr _UTF8State *us; 295104828Stjr unsigned char lead; 296104828Stjr int i, len; 297104828Stjr 298128155Stjr us = (_UTF8State *)ps; 299128155Stjr 300129336Stjr if (us->want != 0) { 301128155Stjr errno = EINVAL; 302128155Stjr return ((size_t)-1); 303128155Stjr } 304128155Stjr 305121893Stjr if (s == NULL) 306121893Stjr /* Reset to initial shift state (no-op) */ 307121893Stjr return (1); 308121893Stjr 309104828Stjr /* 310104828Stjr * Determine the number of octets needed to represent this character. 311104828Stjr * We always output the shortest sequence possible. Also specify the 312104828Stjr * first few bits of the first octet, which contains the information 313104828Stjr * about the sequence length. 314104828Stjr */ 315121893Stjr if ((wc & ~0x7f) == 0) { 316268272Spfg /* Fast path for plain ASCII characters. */ 317268272Spfg *s = (char)wc; 318268272Spfg return (1); 319121893Stjr } else if ((wc & ~0x7ff) == 0) { 320104828Stjr lead = 0xc0; 321104828Stjr len = 2; 322121893Stjr } else if ((wc & ~0xffff) == 0) { 323287125Sed if (wc >= 0xd800 && wc <= 0xdfff) { 324287125Sed errno = EILSEQ; 325287125Sed return ((size_t)-1); 326287125Sed } 327104828Stjr lead = 0xe0; 328104828Stjr len = 3; 329286491Sbapt } else if (wc >= 0 && wc <= 0x10ffff) { 330104828Stjr lead = 0xf0; 331104828Stjr len = 4; 332104828Stjr } else { 333121893Stjr errno = EILSEQ; 334121893Stjr return ((size_t)-1); 335104828Stjr } 336104828Stjr 337121893Stjr /* 338121893Stjr * Output the octets representing the character in chunks 339121893Stjr * of 6 bits, least significant last. The first octet is 340121893Stjr * a special case because it contains the sequence length 341121893Stjr * information. 342121893Stjr */ 343121893Stjr for (i = len - 1; i > 0; i--) { 344121893Stjr s[i] = (wc & 0x3f) | 0x80; 345121893Stjr wc >>= 6; 346104828Stjr } 347121893Stjr *s = (wc & 0xff) | lead; 348104828Stjr 349104828Stjr return (len); 350104828Stjr} 351132687Stjr 352142654Sphantomstatic size_t 353132687Stjr_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 354132687Stjr size_t nwc, size_t len, mbstate_t * __restrict ps) 355132687Stjr{ 356132687Stjr _UTF8State *us; 357132687Stjr char buf[MB_LEN_MAX]; 358132687Stjr const wchar_t *s; 359132687Stjr size_t nbytes; 360132687Stjr size_t nb; 361132687Stjr 362132687Stjr us = (_UTF8State *)ps; 363132687Stjr 364132687Stjr if (us->want != 0) { 365132687Stjr errno = EINVAL; 366132687Stjr return ((size_t)-1); 367132687Stjr } 368132687Stjr 369132687Stjr s = *src; 370132687Stjr nbytes = 0; 371132687Stjr 372132687Stjr if (dst == NULL) { 373132687Stjr while (nwc-- > 0) { 374132687Stjr if (0 <= *s && *s < 0x80) 375132687Stjr /* Fast path for plain ASCII characters. */ 376132687Stjr nb = 1; 377132687Stjr else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 378132687Stjr (size_t)-1) 379132687Stjr /* Invalid character - wcrtomb() sets errno. */ 380132687Stjr return ((size_t)-1); 381132687Stjr if (*s == L'\0') 382132687Stjr return (nbytes + nb - 1); 383132687Stjr s++; 384132687Stjr nbytes += nb; 385132687Stjr } 386132687Stjr return (nbytes); 387132687Stjr } 388132687Stjr 389132687Stjr while (len > 0 && nwc-- > 0) { 390132687Stjr if (0 <= *s && *s < 0x80) { 391132687Stjr /* Fast path for plain ASCII characters. */ 392132687Stjr nb = 1; 393132687Stjr *dst = *s; 394132687Stjr } else if (len > (size_t)MB_CUR_MAX) { 395132687Stjr /* Enough space to translate in-place. */ 396141716Sstefanf if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 397132687Stjr *src = s; 398132687Stjr return ((size_t)-1); 399132687Stjr } 400132687Stjr } else { 401132687Stjr /* 402132687Stjr * May not be enough space; use temp. buffer. 403132687Stjr */ 404141716Sstefanf if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 405132687Stjr *src = s; 406132687Stjr return ((size_t)-1); 407132687Stjr } 408132687Stjr if (nb > (int)len) 409132687Stjr /* MB sequence for character won't fit. */ 410132687Stjr break; 411132687Stjr memcpy(dst, buf, nb); 412132687Stjr } 413132687Stjr if (*s == L'\0') { 414132687Stjr *src = NULL; 415132687Stjr return (nbytes + nb - 1); 416132687Stjr } 417132687Stjr s++; 418132687Stjr dst += nb; 419132687Stjr len -= nb; 420132687Stjr nbytes += nb; 421132687Stjr } 422132687Stjr *src = s; 423132687Stjr return (nbytes); 424132687Stjr} 425