/*-
 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2002-2004 Tim J. Robbins
 * All rights reserved.
 *
 * Copyright (c) 2011 The FreeBSD Foundation
 * All rights reserved.
 * Portions of this software were developed by David Chisnall
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
33104828Stjr
34128004Stjr#include <sys/param.h>
35104828Stjr__FBSDID("$FreeBSD$");
36104828Stjr
37121893Stjr#include <errno.h>
38132687Stjr#include <limits.h>
39121893Stjr#include <runetype.h>
40104828Stjr#include <stdlib.h>
41128004Stjr#include <string.h>
42121893Stjr#include <wchar.h>
43129153Stjr#include "mblocal.h"
44104828Stjr
/* NOTE(review): not referenced by the code visible here - confirm it is still needed. */
extern int __mb_sb_limit;

/*
 * Forward declarations of the file-local UTF-8 conversion routines that
 * _UTF8_init() installs into the locale.
 */
static int	_UTF8_mbsinit(const mbstate_t *ps);
static size_t	_UTF8_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
57121893Stjr
/*
 * Conversion state kept inside an mbstate_t between calls when a multibyte
 * character is split across input buffers.
 */
typedef struct {
	wchar_t	ch;	/* bits of the character assembled so far */
	int	want;	/* bytes still needed; 0 means initial state */
	wchar_t	lbound;	/* smallest code point valid for this length */
} _UTF8State;
63128004Stjr
64104828Stjrint
65227753Stheraven_UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
66104828Stjr{
67104828Stjr
68227753Stheraven	l->__mbrtowc = _UTF8_mbrtowc;
69227753Stheraven	l->__wcrtomb = _UTF8_wcrtomb;
70227753Stheraven	l->__mbsinit = _UTF8_mbsinit;
71227753Stheraven	l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
72227753Stheraven	l->__wcsnrtombs = _UTF8_wcsnrtombs;
73227753Stheraven	l->runes = rl;
74290494Sbapt	l->__mb_cur_max = 4;
75172661Sache	/*
76172661Sache	 * UCS-4 encoding used as the internal representation, so
77172661Sache	 * slots 0x0080-0x00FF are occuped and must be excluded
78172661Sache	 * from the single byte ctype by setting the limit.
79172661Sache	 */
80227753Stheraven	l->__mb_sb_limit = 128;
81104828Stjr
82104828Stjr	return (0);
83104828Stjr}
84104828Stjr
85142654Sphantomstatic int
86128004Stjr_UTF8_mbsinit(const mbstate_t *ps)
87128004Stjr{
88128004Stjr
89129336Stjr	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
90128004Stjr}
91128004Stjr
92142654Sphantomstatic size_t
93121893Stjr_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
94128004Stjr    mbstate_t * __restrict ps)
95104828Stjr{
96128004Stjr	_UTF8State *us;
97129336Stjr	int ch, i, mask, want;
98121893Stjr	wchar_t lbound, wch;
99104828Stjr
100128004Stjr	us = (_UTF8State *)ps;
101128004Stjr
102129336Stjr	if (us->want < 0 || us->want > 6) {
103128155Stjr		errno = EINVAL;
104128155Stjr		return ((size_t)-1);
105128155Stjr	}
106128155Stjr
107128004Stjr	if (s == NULL) {
108128004Stjr		s = "";
109128004Stjr		n = 1;
110128004Stjr		pwc = NULL;
111128004Stjr	}
112128004Stjr
113121893Stjr	if (n == 0)
114121893Stjr		/* Incomplete multibyte sequence */
115121893Stjr		return ((size_t)-2);
116104828Stjr
117129336Stjr	if (us->want == 0) {
118104828Stjr		/*
119129336Stjr		 * Determine the number of octets that make up this character
120129336Stjr		 * from the first octet, and a mask that extracts the
121129336Stjr		 * interesting bits of the first octet. We already know
122129336Stjr		 * the character is at least two bytes long.
123129336Stjr		 *
124129336Stjr		 * We also specify a lower bound for the character code to
125129336Stjr		 * detect redundant, non-"shortest form" encodings. For
126129336Stjr		 * example, the sequence C0 80 is _not_ a legal representation
127129336Stjr		 * of the null character. This enforces a 1-to-1 mapping
128129336Stjr		 * between character codes and their multibyte representations.
129104828Stjr		 */
130129336Stjr		ch = (unsigned char)*s;
131129336Stjr		if ((ch & 0x80) == 0) {
132268272Spfg			/* Fast path for plain ASCII characters. */
133268272Spfg			if (pwc != NULL)
134268272Spfg				*pwc = ch;
135268272Spfg			return (ch != '\0' ? 1 : 0);
136268272Spfg		}
137268272Spfg		if ((ch & 0xe0) == 0xc0) {
138129336Stjr			mask = 0x1f;
139129336Stjr			want = 2;
140129336Stjr			lbound = 0x80;
141129336Stjr		} else if ((ch & 0xf0) == 0xe0) {
142129336Stjr			mask = 0x0f;
143129336Stjr			want = 3;
144129336Stjr			lbound = 0x800;
145129336Stjr		} else if ((ch & 0xf8) == 0xf0) {
146129336Stjr			mask = 0x07;
147129336Stjr			want = 4;
148129336Stjr			lbound = 0x10000;
149129336Stjr		} else {
150129336Stjr			/*
151129336Stjr			 * Malformed input; input is not UTF-8.
152129336Stjr			 */
153129336Stjr			errno = EILSEQ;
154129336Stjr			return ((size_t)-1);
155129336Stjr		}
156129336Stjr	} else {
157129336Stjr		want = us->want;
158129336Stjr		lbound = us->lbound;
159104828Stjr	}
160104828Stjr
161104828Stjr	/*
162104828Stjr	 * Decode the octet sequence representing the character in chunks
163104828Stjr	 * of 6 bits, most significant first.
164104828Stjr	 */
165129336Stjr	if (us->want == 0)
166129336Stjr		wch = (unsigned char)*s++ & mask;
167129336Stjr	else
168129336Stjr		wch = us->ch;
169290494Sbapt
170129336Stjr	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
171121893Stjr		if ((*s & 0xc0) != 0x80) {
172104828Stjr			/*
173104828Stjr			 * Malformed input; bad characters in the middle
174104828Stjr			 * of a character.
175104828Stjr			 */
176121893Stjr			errno = EILSEQ;
177121893Stjr			return ((size_t)-1);
178104828Stjr		}
179104828Stjr		wch <<= 6;
180121893Stjr		wch |= *s++ & 0x3f;
181104828Stjr	}
182129336Stjr	if (i < want) {
183129336Stjr		/* Incomplete multibyte sequence. */
184129336Stjr		us->want = want - i;
185129336Stjr		us->lbound = lbound;
186129336Stjr		us->ch = wch;
187129336Stjr		return ((size_t)-2);
188129336Stjr	}
189121893Stjr	if (wch < lbound) {
190104828Stjr		/*
191104828Stjr		 * Malformed input; redundant encoding.
192104828Stjr		 */
193121893Stjr		errno = EILSEQ;
194121893Stjr		return ((size_t)-1);
195121893Stjr	}
196287125Sed	if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) {
197265095Spfg		/*
198265095Spfg		 * Malformed input; invalid code points.
199265095Spfg		 */
200265095Spfg		errno = EILSEQ;
201265095Spfg		return ((size_t)-1);
202265095Spfg	}
203121893Stjr	if (pwc != NULL)
204121893Stjr		*pwc = wch;
205129336Stjr	us->want = 0;
206129336Stjr	return (wch == L'\0' ? 0 : want);
207104828Stjr}
208104828Stjr
209142654Sphantomstatic size_t
210132687Stjr_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
211132687Stjr    size_t nms, size_t len, mbstate_t * __restrict ps)
212132687Stjr{
213132687Stjr	_UTF8State *us;
214132687Stjr	const char *s;
215132687Stjr	size_t nchr;
216132687Stjr	wchar_t wc;
217132687Stjr	size_t nb;
218132687Stjr
219132687Stjr	us = (_UTF8State *)ps;
220132687Stjr
221132687Stjr	s = *src;
222132687Stjr	nchr = 0;
223132687Stjr
224132687Stjr	if (dst == NULL) {
225132687Stjr		/*
226132687Stjr		 * The fast path in the loop below is not safe if an ASCII
227132687Stjr		 * character appears as anything but the first byte of a
228132687Stjr		 * multibyte sequence. Check now to avoid doing it in the loop.
229132687Stjr		 */
230132687Stjr		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
231132687Stjr			errno = EILSEQ;
232132687Stjr			return ((size_t)-1);
233132687Stjr		}
234132687Stjr		for (;;) {
235132687Stjr			if (nms > 0 && (signed char)*s > 0)
236132687Stjr				/*
237132687Stjr				 * Fast path for plain ASCII characters
238132687Stjr				 * excluding NUL.
239132687Stjr				 */
240132687Stjr				nb = 1;
241132687Stjr			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
242132687Stjr			    (size_t)-1)
243132687Stjr				/* Invalid sequence - mbrtowc() sets errno. */
244132687Stjr				return ((size_t)-1);
245132687Stjr			else if (nb == 0 || nb == (size_t)-2)
246132687Stjr				return (nchr);
247132687Stjr			s += nb;
248132687Stjr			nms -= nb;
249132687Stjr			nchr++;
250132687Stjr		}
251132687Stjr		/*NOTREACHED*/
252132687Stjr	}
253132687Stjr
254132687Stjr	/*
255132687Stjr	 * The fast path in the loop below is not safe if an ASCII
256132687Stjr	 * character appears as anything but the first byte of a
257132687Stjr	 * multibyte sequence. Check now to avoid doing it in the loop.
258132687Stjr	 */
259132687Stjr	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
260132687Stjr		errno = EILSEQ;
261132687Stjr		return ((size_t)-1);
262132687Stjr	}
263132687Stjr	while (len-- > 0) {
264132687Stjr		if (nms > 0 && (signed char)*s > 0) {
265132687Stjr			/*
266132687Stjr			 * Fast path for plain ASCII characters
267132687Stjr			 * excluding NUL.
268132687Stjr			 */
269132687Stjr			*dst = (wchar_t)*s;
270132687Stjr			nb = 1;
271132687Stjr		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
272132687Stjr		    (size_t)-1) {
273132687Stjr			*src = s;
274132687Stjr			return ((size_t)-1);
275132687Stjr		} else if (nb == (size_t)-2) {
276132687Stjr			*src = s + nms;
277132687Stjr			return (nchr);
278132687Stjr		} else if (nb == 0) {
279132687Stjr			*src = NULL;
280132687Stjr			return (nchr);
281132687Stjr		}
282132687Stjr		s += nb;
283132687Stjr		nms -= nb;
284132687Stjr		nchr++;
285132687Stjr		dst++;
286132687Stjr	}
287132687Stjr	*src = s;
288132687Stjr	return (nchr);
289132687Stjr}
290132687Stjr
291142654Sphantomstatic size_t
292128155Stjr_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
293104828Stjr{
294128155Stjr	_UTF8State *us;
295104828Stjr	unsigned char lead;
296104828Stjr	int i, len;
297104828Stjr
298128155Stjr	us = (_UTF8State *)ps;
299128155Stjr
300129336Stjr	if (us->want != 0) {
301128155Stjr		errno = EINVAL;
302128155Stjr		return ((size_t)-1);
303128155Stjr	}
304128155Stjr
305121893Stjr	if (s == NULL)
306121893Stjr		/* Reset to initial shift state (no-op) */
307121893Stjr		return (1);
308121893Stjr
309104828Stjr	/*
310104828Stjr	 * Determine the number of octets needed to represent this character.
311104828Stjr	 * We always output the shortest sequence possible. Also specify the
312104828Stjr	 * first few bits of the first octet, which contains the information
313104828Stjr	 * about the sequence length.
314104828Stjr	 */
315121893Stjr	if ((wc & ~0x7f) == 0) {
316268272Spfg		/* Fast path for plain ASCII characters. */
317268272Spfg		*s = (char)wc;
318268272Spfg		return (1);
319121893Stjr	} else if ((wc & ~0x7ff) == 0) {
320104828Stjr		lead = 0xc0;
321104828Stjr		len = 2;
322121893Stjr	} else if ((wc & ~0xffff) == 0) {
323287125Sed		if (wc >= 0xd800 && wc <= 0xdfff) {
324287125Sed			errno = EILSEQ;
325287125Sed			return ((size_t)-1);
326287125Sed		}
327104828Stjr		lead = 0xe0;
328104828Stjr		len = 3;
329286491Sbapt	} else if (wc >= 0 && wc <= 0x10ffff) {
330104828Stjr		lead = 0xf0;
331104828Stjr		len = 4;
332104828Stjr	} else {
333121893Stjr		errno = EILSEQ;
334121893Stjr		return ((size_t)-1);
335104828Stjr	}
336104828Stjr
337121893Stjr	/*
338121893Stjr	 * Output the octets representing the character in chunks
339121893Stjr	 * of 6 bits, least significant last. The first octet is
340121893Stjr	 * a special case because it contains the sequence length
341121893Stjr	 * information.
342121893Stjr	 */
343121893Stjr	for (i = len - 1; i > 0; i--) {
344121893Stjr		s[i] = (wc & 0x3f) | 0x80;
345121893Stjr		wc >>= 6;
346104828Stjr	}
347121893Stjr	*s = (wc & 0xff) | lead;
348104828Stjr
349104828Stjr	return (len);
350104828Stjr}
351132687Stjr
352142654Sphantomstatic size_t
353132687Stjr_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
354132687Stjr    size_t nwc, size_t len, mbstate_t * __restrict ps)
355132687Stjr{
356132687Stjr	_UTF8State *us;
357132687Stjr	char buf[MB_LEN_MAX];
358132687Stjr	const wchar_t *s;
359132687Stjr	size_t nbytes;
360132687Stjr	size_t nb;
361132687Stjr
362132687Stjr	us = (_UTF8State *)ps;
363132687Stjr
364132687Stjr	if (us->want != 0) {
365132687Stjr		errno = EINVAL;
366132687Stjr		return ((size_t)-1);
367132687Stjr	}
368132687Stjr
369132687Stjr	s = *src;
370132687Stjr	nbytes = 0;
371132687Stjr
372132687Stjr	if (dst == NULL) {
373132687Stjr		while (nwc-- > 0) {
374132687Stjr			if (0 <= *s && *s < 0x80)
375132687Stjr				/* Fast path for plain ASCII characters. */
376132687Stjr				nb = 1;
377132687Stjr			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
378132687Stjr			    (size_t)-1)
379132687Stjr				/* Invalid character - wcrtomb() sets errno. */
380132687Stjr				return ((size_t)-1);
381132687Stjr			if (*s == L'\0')
382132687Stjr				return (nbytes + nb - 1);
383132687Stjr			s++;
384132687Stjr			nbytes += nb;
385132687Stjr		}
386132687Stjr		return (nbytes);
387132687Stjr	}
388132687Stjr
389132687Stjr	while (len > 0 && nwc-- > 0) {
390132687Stjr		if (0 <= *s && *s < 0x80) {
391132687Stjr			/* Fast path for plain ASCII characters. */
392132687Stjr			nb = 1;
393132687Stjr			*dst = *s;
394132687Stjr		} else if (len > (size_t)MB_CUR_MAX) {
395132687Stjr			/* Enough space to translate in-place. */
396141716Sstefanf			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
397132687Stjr				*src = s;
398132687Stjr				return ((size_t)-1);
399132687Stjr			}
400132687Stjr		} else {
401132687Stjr			/*
402132687Stjr			 * May not be enough space; use temp. buffer.
403132687Stjr			 */
404141716Sstefanf			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
405132687Stjr				*src = s;
406132687Stjr				return ((size_t)-1);
407132687Stjr			}
408132687Stjr			if (nb > (int)len)
409132687Stjr				/* MB sequence for character won't fit. */
410132687Stjr				break;
411132687Stjr			memcpy(dst, buf, nb);
412132687Stjr		}
413132687Stjr		if (*s == L'\0') {
414132687Stjr			*src = NULL;
415132687Stjr			return (nbytes + nb - 1);
416132687Stjr		}
417132687Stjr		s++;
418132687Stjr		dst += nb;
419132687Stjr		len -= nb;
420132687Stjr		nbytes += nb;
421132687Stjr	}
422132687Stjr	*src = s;
423132687Stjr	return (nbytes);
424132687Stjr}
425