1104828Stjr/*- 2128004Stjr * Copyright (c) 2002-2004 Tim J. Robbins 3104828Stjr * All rights reserved. 4104828Stjr * 5235785Stheraven * Copyright (c) 2011 The FreeBSD Foundation 6235785Stheraven * All rights reserved. 7235785Stheraven * Portions of this software were developed by David Chisnall 8235785Stheraven * under sponsorship from the FreeBSD Foundation. 9235785Stheraven * 10104828Stjr * Redistribution and use in source and binary forms, with or without 11104828Stjr * modification, are permitted provided that the following conditions 12104828Stjr * are met: 13104828Stjr * 1. Redistributions of source code must retain the above copyright 14104828Stjr * notice, this list of conditions and the following disclaimer. 15104828Stjr * 2. Redistributions in binary form must reproduce the above copyright 16104828Stjr * notice, this list of conditions and the following disclaimer in the 17104828Stjr * documentation and/or other materials provided with the distribution. 18104828Stjr * 19104828Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20104828Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21104828Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22104828Stjr * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23104828Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24104828Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25104828Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26104828Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27104828Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28104828Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29104828Stjr * SUCH DAMAGE. 30104828Stjr */ 31104828Stjr 32128004Stjr#include <sys/param.h> 33104828Stjr__FBSDID("$FreeBSD$"); 34104828Stjr 35121893Stjr#include <errno.h> 36132687Stjr#include <limits.h> 37121893Stjr#include <runetype.h> 38104828Stjr#include <stdlib.h> 39128004Stjr#include <string.h> 40121893Stjr#include <wchar.h> 41129153Stjr#include "mblocal.h" 42104828Stjr 43172619Sacheextern int __mb_sb_limit; 44172619Sache 45142654Sphantomstatic size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 46142654Sphantom size_t, mbstate_t * __restrict); 47142654Sphantomstatic int _UTF8_mbsinit(const mbstate_t *); 48142654Sphantomstatic size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 49142654Sphantom const char ** __restrict, size_t, size_t, 50142654Sphantom mbstate_t * __restrict); 51142654Sphantomstatic size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 52142654Sphantom mbstate_t * __restrict); 53142654Sphantomstatic size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 54142654Sphantom size_t, size_t, mbstate_t * __restrict); 55121893Stjr 56128004Stjrtypedef struct { 57129336Stjr wchar_t ch; 58129336Stjr int want; 59129336Stjr wchar_t lbound; 60128004Stjr} _UTF8State; 61128004Stjr 62104828Stjrint 63235785Stheraven_UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 64104828Stjr{ 65104828Stjr 66235785Stheraven l->__mbrtowc = _UTF8_mbrtowc; 67235785Stheraven l->__wcrtomb = _UTF8_wcrtomb; 68235785Stheraven l->__mbsinit = _UTF8_mbsinit; 69235785Stheraven l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 70235785Stheraven l->__wcsnrtombs = _UTF8_wcsnrtombs; 71235785Stheraven l->runes = rl; 72235785Stheraven l->__mb_cur_max = 6; 73172661Sache /* 74172661Sache * UCS-4 encoding used as the internal representation, so 75172661Sache * slots 0x0080-0x00FF are occuped and must be excluded 76172661Sache * from the single byte ctype by setting the limit. 77172661Sache */ 78235785Stheraven l->__mb_sb_limit = 128; 79104828Stjr 80104828Stjr return (0); 81104828Stjr} 82104828Stjr 83142654Sphantomstatic int 84128004Stjr_UTF8_mbsinit(const mbstate_t *ps) 85128004Stjr{ 86128004Stjr 87129336Stjr return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 88128004Stjr} 89128004Stjr 90142654Sphantomstatic size_t 91121893Stjr_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 92128004Stjr mbstate_t * __restrict ps) 93104828Stjr{ 94128004Stjr _UTF8State *us; 95129336Stjr int ch, i, mask, want; 96121893Stjr wchar_t lbound, wch; 97104828Stjr 98128004Stjr us = (_UTF8State *)ps; 99128004Stjr 100129336Stjr if (us->want < 0 || us->want > 6) { 101128155Stjr errno = EINVAL; 102128155Stjr return ((size_t)-1); 103128155Stjr } 104128155Stjr 105128004Stjr if (s == NULL) { 106128004Stjr s = ""; 107128004Stjr n = 1; 108128004Stjr pwc = NULL; 109128004Stjr } 110128004Stjr 111121893Stjr if (n == 0) 112121893Stjr /* Incomplete multibyte sequence */ 113121893Stjr return ((size_t)-2); 114104828Stjr 115131881Stjr if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 116131881Stjr /* Fast path for plain ASCII characters. */ 117131881Stjr if (pwc != NULL) 118131881Stjr *pwc = ch; 119131881Stjr return (ch != '\0' ? 1 : 0); 120131881Stjr } 121131881Stjr 122129336Stjr if (us->want == 0) { 123104828Stjr /* 124129336Stjr * Determine the number of octets that make up this character 125129336Stjr * from the first octet, and a mask that extracts the 126129336Stjr * interesting bits of the first octet. We already know 127129336Stjr * the character is at least two bytes long. 128129336Stjr * 129129336Stjr * We also specify a lower bound for the character code to 130129336Stjr * detect redundant, non-"shortest form" encodings. For 131129336Stjr * example, the sequence C0 80 is _not_ a legal representation 132129336Stjr * of the null character. This enforces a 1-to-1 mapping 133129336Stjr * between character codes and their multibyte representations. 134104828Stjr */ 135129336Stjr ch = (unsigned char)*s; 136129336Stjr if ((ch & 0x80) == 0) { 137129336Stjr mask = 0x7f; 138129336Stjr want = 1; 139129336Stjr lbound = 0; 140129336Stjr } else if ((ch & 0xe0) == 0xc0) { 141129336Stjr mask = 0x1f; 142129336Stjr want = 2; 143129336Stjr lbound = 0x80; 144129336Stjr } else if ((ch & 0xf0) == 0xe0) { 145129336Stjr mask = 0x0f; 146129336Stjr want = 3; 147129336Stjr lbound = 0x800; 148129336Stjr } else if ((ch & 0xf8) == 0xf0) { 149129336Stjr mask = 0x07; 150129336Stjr want = 4; 151129336Stjr lbound = 0x10000; 152129336Stjr } else if ((ch & 0xfc) == 0xf8) { 153129336Stjr mask = 0x03; 154129336Stjr want = 5; 155129336Stjr lbound = 0x200000; 156157289Strhodes } else if ((ch & 0xfe) == 0xfc) { 157129336Stjr mask = 0x01; 158129336Stjr want = 6; 159129336Stjr lbound = 0x4000000; 160129336Stjr } else { 161129336Stjr /* 162129336Stjr * Malformed input; input is not UTF-8. 163129336Stjr */ 164129336Stjr errno = EILSEQ; 165129336Stjr return ((size_t)-1); 166129336Stjr } 167129336Stjr } else { 168129336Stjr want = us->want; 169129336Stjr lbound = us->lbound; 170104828Stjr } 171104828Stjr 172104828Stjr /* 173104828Stjr * Decode the octet sequence representing the character in chunks 174104828Stjr * of 6 bits, most significant first. 175104828Stjr */ 176129336Stjr if (us->want == 0) 177129336Stjr wch = (unsigned char)*s++ & mask; 178129336Stjr else 179129336Stjr wch = us->ch; 180129336Stjr for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 181121893Stjr if ((*s & 0xc0) != 0x80) { 182104828Stjr /* 183104828Stjr * Malformed input; bad characters in the middle 184104828Stjr * of a character. 185104828Stjr */ 186121893Stjr errno = EILSEQ; 187121893Stjr return ((size_t)-1); 188104828Stjr } 189104828Stjr wch <<= 6; 190121893Stjr wch |= *s++ & 0x3f; 191104828Stjr } 192129336Stjr if (i < want) { 193129336Stjr /* Incomplete multibyte sequence. */ 194129336Stjr us->want = want - i; 195129336Stjr us->lbound = lbound; 196129336Stjr us->ch = wch; 197129336Stjr return ((size_t)-2); 198129336Stjr } 199121893Stjr if (wch < lbound) { 200104828Stjr /* 201104828Stjr * Malformed input; redundant encoding. 202104828Stjr */ 203121893Stjr errno = EILSEQ; 204121893Stjr return ((size_t)-1); 205121893Stjr } 206121893Stjr if (pwc != NULL) 207121893Stjr *pwc = wch; 208129336Stjr us->want = 0; 209129336Stjr return (wch == L'\0' ? 0 : want); 210104828Stjr} 211104828Stjr 212142654Sphantomstatic size_t 213132687Stjr_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 214132687Stjr size_t nms, size_t len, mbstate_t * __restrict ps) 215132687Stjr{ 216132687Stjr _UTF8State *us; 217132687Stjr const char *s; 218132687Stjr size_t nchr; 219132687Stjr wchar_t wc; 220132687Stjr size_t nb; 221132687Stjr 222132687Stjr us = (_UTF8State *)ps; 223132687Stjr 224132687Stjr s = *src; 225132687Stjr nchr = 0; 226132687Stjr 227132687Stjr if (dst == NULL) { 228132687Stjr /* 229132687Stjr * The fast path in the loop below is not safe if an ASCII 230132687Stjr * character appears as anything but the first byte of a 231132687Stjr * multibyte sequence. Check now to avoid doing it in the loop. 232132687Stjr */ 233132687Stjr if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 234132687Stjr errno = EILSEQ; 235132687Stjr return ((size_t)-1); 236132687Stjr } 237132687Stjr for (;;) { 238132687Stjr if (nms > 0 && (signed char)*s > 0) 239132687Stjr /* 240132687Stjr * Fast path for plain ASCII characters 241132687Stjr * excluding NUL. 242132687Stjr */ 243132687Stjr nb = 1; 244132687Stjr else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 245132687Stjr (size_t)-1) 246132687Stjr /* Invalid sequence - mbrtowc() sets errno. */ 247132687Stjr return ((size_t)-1); 248132687Stjr else if (nb == 0 || nb == (size_t)-2) 249132687Stjr return (nchr); 250132687Stjr s += nb; 251132687Stjr nms -= nb; 252132687Stjr nchr++; 253132687Stjr } 254132687Stjr /*NOTREACHED*/ 255132687Stjr } 256132687Stjr 257132687Stjr /* 258132687Stjr * The fast path in the loop below is not safe if an ASCII 259132687Stjr * character appears as anything but the first byte of a 260132687Stjr * multibyte sequence. Check now to avoid doing it in the loop. 261132687Stjr */ 262132687Stjr if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 263132687Stjr errno = EILSEQ; 264132687Stjr return ((size_t)-1); 265132687Stjr } 266132687Stjr while (len-- > 0) { 267132687Stjr if (nms > 0 && (signed char)*s > 0) { 268132687Stjr /* 269132687Stjr * Fast path for plain ASCII characters 270132687Stjr * excluding NUL. 271132687Stjr */ 272132687Stjr *dst = (wchar_t)*s; 273132687Stjr nb = 1; 274132687Stjr } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 275132687Stjr (size_t)-1) { 276132687Stjr *src = s; 277132687Stjr return ((size_t)-1); 278132687Stjr } else if (nb == (size_t)-2) { 279132687Stjr *src = s + nms; 280132687Stjr return (nchr); 281132687Stjr } else if (nb == 0) { 282132687Stjr *src = NULL; 283132687Stjr return (nchr); 284132687Stjr } 285132687Stjr s += nb; 286132687Stjr nms -= nb; 287132687Stjr nchr++; 288132687Stjr dst++; 289132687Stjr } 290132687Stjr *src = s; 291132687Stjr return (nchr); 292132687Stjr} 293132687Stjr 294142654Sphantomstatic size_t 295128155Stjr_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 296104828Stjr{ 297128155Stjr _UTF8State *us; 298104828Stjr unsigned char lead; 299104828Stjr int i, len; 300104828Stjr 301128155Stjr us = (_UTF8State *)ps; 302128155Stjr 303129336Stjr if (us->want != 0) { 304128155Stjr errno = EINVAL; 305128155Stjr return ((size_t)-1); 306128155Stjr } 307128155Stjr 308121893Stjr if (s == NULL) 309121893Stjr /* Reset to initial shift state (no-op) */ 310121893Stjr return (1); 311121893Stjr 312131881Stjr if ((wc & ~0x7f) == 0) { 313131881Stjr /* Fast path for plain ASCII characters. */ 314131881Stjr *s = (char)wc; 315131881Stjr return (1); 316131881Stjr } 317131881Stjr 318104828Stjr /* 319104828Stjr * Determine the number of octets needed to represent this character. 320104828Stjr * We always output the shortest sequence possible. Also specify the 321104828Stjr * first few bits of the first octet, which contains the information 322104828Stjr * about the sequence length. 323104828Stjr */ 324121893Stjr if ((wc & ~0x7f) == 0) { 325104828Stjr lead = 0; 326104828Stjr len = 1; 327121893Stjr } else if ((wc & ~0x7ff) == 0) { 328104828Stjr lead = 0xc0; 329104828Stjr len = 2; 330121893Stjr } else if ((wc & ~0xffff) == 0) { 331104828Stjr lead = 0xe0; 332104828Stjr len = 3; 333121893Stjr } else if ((wc & ~0x1fffff) == 0) { 334104828Stjr lead = 0xf0; 335104828Stjr len = 4; 336121893Stjr } else if ((wc & ~0x3ffffff) == 0) { 337104828Stjr lead = 0xf8; 338104828Stjr len = 5; 339121893Stjr } else if ((wc & ~0x7fffffff) == 0) { 340104828Stjr lead = 0xfc; 341104828Stjr len = 6; 342104828Stjr } else { 343121893Stjr errno = EILSEQ; 344121893Stjr return ((size_t)-1); 345104828Stjr } 346104828Stjr 347121893Stjr /* 348121893Stjr * Output the octets representing the character in chunks 349121893Stjr * of 6 bits, least significant last. The first octet is 350121893Stjr * a special case because it contains the sequence length 351121893Stjr * information. 352121893Stjr */ 353121893Stjr for (i = len - 1; i > 0; i--) { 354121893Stjr s[i] = (wc & 0x3f) | 0x80; 355121893Stjr wc >>= 6; 356104828Stjr } 357121893Stjr *s = (wc & 0xff) | lead; 358104828Stjr 359104828Stjr return (len); 360104828Stjr} 361132687Stjr 362142654Sphantomstatic size_t 363132687Stjr_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 364132687Stjr size_t nwc, size_t len, mbstate_t * __restrict ps) 365132687Stjr{ 366132687Stjr _UTF8State *us; 367132687Stjr char buf[MB_LEN_MAX]; 368132687Stjr const wchar_t *s; 369132687Stjr size_t nbytes; 370132687Stjr size_t nb; 371132687Stjr 372132687Stjr us = (_UTF8State *)ps; 373132687Stjr 374132687Stjr if (us->want != 0) { 375132687Stjr errno = EINVAL; 376132687Stjr return ((size_t)-1); 377132687Stjr } 378132687Stjr 379132687Stjr s = *src; 380132687Stjr nbytes = 0; 381132687Stjr 382132687Stjr if (dst == NULL) { 383132687Stjr while (nwc-- > 0) { 384132687Stjr if (0 <= *s && *s < 0x80) 385132687Stjr /* Fast path for plain ASCII characters. */ 386132687Stjr nb = 1; 387132687Stjr else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 388132687Stjr (size_t)-1) 389132687Stjr /* Invalid character - wcrtomb() sets errno. */ 390132687Stjr return ((size_t)-1); 391132687Stjr if (*s == L'\0') 392132687Stjr return (nbytes + nb - 1); 393132687Stjr s++; 394132687Stjr nbytes += nb; 395132687Stjr } 396132687Stjr return (nbytes); 397132687Stjr } 398132687Stjr 399132687Stjr while (len > 0 && nwc-- > 0) { 400132687Stjr if (0 <= *s && *s < 0x80) { 401132687Stjr /* Fast path for plain ASCII characters. */ 402132687Stjr nb = 1; 403132687Stjr *dst = *s; 404132687Stjr } else if (len > (size_t)MB_CUR_MAX) { 405132687Stjr /* Enough space to translate in-place. */ 406141716Sstefanf if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 407132687Stjr *src = s; 408132687Stjr return ((size_t)-1); 409132687Stjr } 410132687Stjr } else { 411132687Stjr /* 412132687Stjr * May not be enough space; use temp. buffer. 413132687Stjr */ 414141716Sstefanf if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 415132687Stjr *src = s; 416132687Stjr return ((size_t)-1); 417132687Stjr } 418132687Stjr if (nb > (int)len) 419132687Stjr /* MB sequence for character won't fit. */ 420132687Stjr break; 421132687Stjr memcpy(dst, buf, nb); 422132687Stjr } 423132687Stjr if (*s == L'\0') { 424132687Stjr *src = NULL; 425132687Stjr return (nbytes + nb - 1); 426132687Stjr } 427132687Stjr s++; 428132687Stjr dst += nb; 429132687Stjr len -= nb; 430132687Stjr nbytes += nb; 431132687Stjr } 432132687Stjr *src = s; 433132687Stjr return (nbytes); 434132687Stjr} 435