1104828Stjr/*-
2268571Spfg * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
3128004Stjr * Copyright (c) 2002-2004 Tim J. Robbins
4104828Stjr * All rights reserved.
5104828Stjr *
6227753Stheraven * Copyright (c) 2011 The FreeBSD Foundation
7227753Stheraven * All rights reserved.
8227753Stheraven * Portions of this software were developed by David Chisnall
9227753Stheraven * under sponsorship from the FreeBSD Foundation.
10227753Stheraven *
11104828Stjr * Redistribution and use in source and binary forms, with or without
12104828Stjr * modification, are permitted provided that the following conditions
13104828Stjr * are met:
14104828Stjr * 1. Redistributions of source code must retain the above copyright
15104828Stjr *    notice, this list of conditions and the following disclaimer.
16104828Stjr * 2. Redistributions in binary form must reproduce the above copyright
17104828Stjr *    notice, this list of conditions and the following disclaimer in the
18104828Stjr *    documentation and/or other materials provided with the distribution.
19104828Stjr *
20104828Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21104828Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22104828Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23104828Stjr * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24104828Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25104828Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26104828Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27104828Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28104828Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29104828Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30104828Stjr * SUCH DAMAGE.
31104828Stjr */
32104828Stjr
33128004Stjr#include <sys/param.h>
34104828Stjr__FBSDID("$FreeBSD$");
35104828Stjr
36121893Stjr#include <errno.h>
37132687Stjr#include <limits.h>
38121893Stjr#include <runetype.h>
39104828Stjr#include <stdlib.h>
40128004Stjr#include <string.h>
41121893Stjr#include <wchar.h>
42129153Stjr#include "mblocal.h"
43104828Stjr
/*
 * Declared for this file but not referenced by the code visible in it;
 * _UTF8_init() records the single-byte limit through l->__mb_sb_limit.
 */
extern int __mb_sb_limit;

/* Multibyte -> wide conversion routines installed by _UTF8_init(). */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static int	_UTF8_mbsinit(const mbstate_t *ps);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);

/* Wide -> multibyte conversion routines installed by _UTF8_init(). */
static size_t	_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
56121893Stjr
/*
 * Decoder progress carried from one call to the next.  The conversion
 * routines in this file overlay it on the caller-supplied mbstate_t.
 */
typedef struct {
	wchar_t	ch;	/* value bits accumulated from the octets seen so far */
	int	want;	/* continuation octets still expected; 0 when idle */
	wchar_t	lbound;	/* smallest value allowed for this sequence length */
} _UTF8State;
62128004Stjr
63104828Stjrint
64227753Stheraven_UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
65104828Stjr{
66104828Stjr
67227753Stheraven	l->__mbrtowc = _UTF8_mbrtowc;
68227753Stheraven	l->__wcrtomb = _UTF8_wcrtomb;
69227753Stheraven	l->__mbsinit = _UTF8_mbsinit;
70227753Stheraven	l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
71227753Stheraven	l->__wcsnrtombs = _UTF8_wcsnrtombs;
72227753Stheraven	l->runes = rl;
73227753Stheraven	l->__mb_cur_max = 6;
74172661Sache	/*
75172661Sache	 * UCS-4 encoding used as the internal representation, so
76172661Sache	 * slots 0x0080-0x00FF are occuped and must be excluded
77172661Sache	 * from the single byte ctype by setting the limit.
78172661Sache	 */
79227753Stheraven	l->__mb_sb_limit = 128;
80104828Stjr
81104828Stjr	return (0);
82104828Stjr}
83104828Stjr
84142654Sphantomstatic int
85128004Stjr_UTF8_mbsinit(const mbstate_t *ps)
86128004Stjr{
87128004Stjr
88129336Stjr	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
89128004Stjr}
90128004Stjr
91142654Sphantomstatic size_t
92121893Stjr_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
93128004Stjr    mbstate_t * __restrict ps)
94104828Stjr{
95128004Stjr	_UTF8State *us;
96129336Stjr	int ch, i, mask, want;
97121893Stjr	wchar_t lbound, wch;
98104828Stjr
99128004Stjr	us = (_UTF8State *)ps;
100128004Stjr
101129336Stjr	if (us->want < 0 || us->want > 6) {
102128155Stjr		errno = EINVAL;
103128155Stjr		return ((size_t)-1);
104128155Stjr	}
105128155Stjr
106128004Stjr	if (s == NULL) {
107128004Stjr		s = "";
108128004Stjr		n = 1;
109128004Stjr		pwc = NULL;
110128004Stjr	}
111128004Stjr
112121893Stjr	if (n == 0)
113121893Stjr		/* Incomplete multibyte sequence */
114121893Stjr		return ((size_t)-2);
115104828Stjr
116129336Stjr	if (us->want == 0) {
117104828Stjr		/*
118129336Stjr		 * Determine the number of octets that make up this character
119129336Stjr		 * from the first octet, and a mask that extracts the
120129336Stjr		 * interesting bits of the first octet. We already know
121129336Stjr		 * the character is at least two bytes long.
122129336Stjr		 *
123129336Stjr		 * We also specify a lower bound for the character code to
124129336Stjr		 * detect redundant, non-"shortest form" encodings. For
125129336Stjr		 * example, the sequence C0 80 is _not_ a legal representation
126129336Stjr		 * of the null character. This enforces a 1-to-1 mapping
127129336Stjr		 * between character codes and their multibyte representations.
128104828Stjr		 */
129129336Stjr		ch = (unsigned char)*s;
130129336Stjr		if ((ch & 0x80) == 0) {
131268571Spfg			/* Fast path for plain ASCII characters. */
132268571Spfg			if (pwc != NULL)
133268571Spfg				*pwc = ch;
134268571Spfg			return (ch != '\0' ? 1 : 0);
135268571Spfg		}
136268571Spfg		if ((ch & 0xe0) == 0xc0) {
137129336Stjr			mask = 0x1f;
138129336Stjr			want = 2;
139129336Stjr			lbound = 0x80;
140129336Stjr		} else if ((ch & 0xf0) == 0xe0) {
141129336Stjr			mask = 0x0f;
142129336Stjr			want = 3;
143129336Stjr			lbound = 0x800;
144129336Stjr		} else if ((ch & 0xf8) == 0xf0) {
145129336Stjr			mask = 0x07;
146129336Stjr			want = 4;
147129336Stjr			lbound = 0x10000;
148129336Stjr		} else {
149129336Stjr			/*
150129336Stjr			 * Malformed input; input is not UTF-8.
151129336Stjr			 */
152129336Stjr			errno = EILSEQ;
153129336Stjr			return ((size_t)-1);
154129336Stjr		}
155129336Stjr	} else {
156129336Stjr		want = us->want;
157129336Stjr		lbound = us->lbound;
158104828Stjr	}
159104828Stjr
160104828Stjr	/*
161104828Stjr	 * Decode the octet sequence representing the character in chunks
162104828Stjr	 * of 6 bits, most significant first.
163104828Stjr	 */
164129336Stjr	if (us->want == 0)
165129336Stjr		wch = (unsigned char)*s++ & mask;
166129336Stjr	else
167129336Stjr		wch = us->ch;
168129336Stjr	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
169121893Stjr		if ((*s & 0xc0) != 0x80) {
170104828Stjr			/*
171104828Stjr			 * Malformed input; bad characters in the middle
172104828Stjr			 * of a character.
173104828Stjr			 */
174121893Stjr			errno = EILSEQ;
175121893Stjr			return ((size_t)-1);
176104828Stjr		}
177104828Stjr		wch <<= 6;
178121893Stjr		wch |= *s++ & 0x3f;
179104828Stjr	}
180129336Stjr	if (i < want) {
181129336Stjr		/* Incomplete multibyte sequence. */
182129336Stjr		us->want = want - i;
183129336Stjr		us->lbound = lbound;
184129336Stjr		us->ch = wch;
185129336Stjr		return ((size_t)-2);
186129336Stjr	}
187121893Stjr	if (wch < lbound) {
188104828Stjr		/*
189104828Stjr		 * Malformed input; redundant encoding.
190104828Stjr		 */
191121893Stjr		errno = EILSEQ;
192121893Stjr		return ((size_t)-1);
193121893Stjr	}
194287393Sbapt	if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) {
195265361Spfg		/*
196265361Spfg		 * Malformed input; invalid code points.
197265361Spfg		 */
198265361Spfg		errno = EILSEQ;
199265361Spfg		return ((size_t)-1);
200265361Spfg	}
201121893Stjr	if (pwc != NULL)
202121893Stjr		*pwc = wch;
203129336Stjr	us->want = 0;
204129336Stjr	return (wch == L'\0' ? 0 : want);
205104828Stjr}
206104828Stjr
207142654Sphantomstatic size_t
208132687Stjr_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
209132687Stjr    size_t nms, size_t len, mbstate_t * __restrict ps)
210132687Stjr{
211132687Stjr	_UTF8State *us;
212132687Stjr	const char *s;
213132687Stjr	size_t nchr;
214132687Stjr	wchar_t wc;
215132687Stjr	size_t nb;
216132687Stjr
217132687Stjr	us = (_UTF8State *)ps;
218132687Stjr
219132687Stjr	s = *src;
220132687Stjr	nchr = 0;
221132687Stjr
222132687Stjr	if (dst == NULL) {
223132687Stjr		/*
224132687Stjr		 * The fast path in the loop below is not safe if an ASCII
225132687Stjr		 * character appears as anything but the first byte of a
226132687Stjr		 * multibyte sequence. Check now to avoid doing it in the loop.
227132687Stjr		 */
228132687Stjr		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
229132687Stjr			errno = EILSEQ;
230132687Stjr			return ((size_t)-1);
231132687Stjr		}
232132687Stjr		for (;;) {
233132687Stjr			if (nms > 0 && (signed char)*s > 0)
234132687Stjr				/*
235132687Stjr				 * Fast path for plain ASCII characters
236132687Stjr				 * excluding NUL.
237132687Stjr				 */
238132687Stjr				nb = 1;
239132687Stjr			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
240132687Stjr			    (size_t)-1)
241132687Stjr				/* Invalid sequence - mbrtowc() sets errno. */
242132687Stjr				return ((size_t)-1);
243132687Stjr			else if (nb == 0 || nb == (size_t)-2)
244132687Stjr				return (nchr);
245132687Stjr			s += nb;
246132687Stjr			nms -= nb;
247132687Stjr			nchr++;
248132687Stjr		}
249132687Stjr		/*NOTREACHED*/
250132687Stjr	}
251132687Stjr
252132687Stjr	/*
253132687Stjr	 * The fast path in the loop below is not safe if an ASCII
254132687Stjr	 * character appears as anything but the first byte of a
255132687Stjr	 * multibyte sequence. Check now to avoid doing it in the loop.
256132687Stjr	 */
257132687Stjr	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
258132687Stjr		errno = EILSEQ;
259132687Stjr		return ((size_t)-1);
260132687Stjr	}
261132687Stjr	while (len-- > 0) {
262132687Stjr		if (nms > 0 && (signed char)*s > 0) {
263132687Stjr			/*
264132687Stjr			 * Fast path for plain ASCII characters
265132687Stjr			 * excluding NUL.
266132687Stjr			 */
267132687Stjr			*dst = (wchar_t)*s;
268132687Stjr			nb = 1;
269132687Stjr		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
270132687Stjr		    (size_t)-1) {
271132687Stjr			*src = s;
272132687Stjr			return ((size_t)-1);
273132687Stjr		} else if (nb == (size_t)-2) {
274132687Stjr			*src = s + nms;
275132687Stjr			return (nchr);
276132687Stjr		} else if (nb == 0) {
277132687Stjr			*src = NULL;
278132687Stjr			return (nchr);
279132687Stjr		}
280132687Stjr		s += nb;
281132687Stjr		nms -= nb;
282132687Stjr		nchr++;
283132687Stjr		dst++;
284132687Stjr	}
285132687Stjr	*src = s;
286132687Stjr	return (nchr);
287132687Stjr}
288132687Stjr
289142654Sphantomstatic size_t
290128155Stjr_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
291104828Stjr{
292128155Stjr	_UTF8State *us;
293104828Stjr	unsigned char lead;
294104828Stjr	int i, len;
295104828Stjr
296128155Stjr	us = (_UTF8State *)ps;
297128155Stjr
298129336Stjr	if (us->want != 0) {
299128155Stjr		errno = EINVAL;
300128155Stjr		return ((size_t)-1);
301128155Stjr	}
302128155Stjr
303121893Stjr	if (s == NULL)
304121893Stjr		/* Reset to initial shift state (no-op) */
305121893Stjr		return (1);
306121893Stjr
307104828Stjr	/*
308104828Stjr	 * Determine the number of octets needed to represent this character.
309104828Stjr	 * We always output the shortest sequence possible. Also specify the
310104828Stjr	 * first few bits of the first octet, which contains the information
311104828Stjr	 * about the sequence length.
312104828Stjr	 */
313121893Stjr	if ((wc & ~0x7f) == 0) {
314268571Spfg		/* Fast path for plain ASCII characters. */
315268571Spfg		*s = (char)wc;
316268571Spfg		return (1);
317121893Stjr	} else if ((wc & ~0x7ff) == 0) {
318104828Stjr		lead = 0xc0;
319104828Stjr		len = 2;
320121893Stjr	} else if ((wc & ~0xffff) == 0) {
321287393Sbapt		if (wc >= 0xd800 && wc <= 0xdfff) {
322287393Sbapt			errno = EILSEQ;
323287393Sbapt			return ((size_t)-1);
324287393Sbapt		}
325104828Stjr		lead = 0xe0;
326104828Stjr		len = 3;
327287393Sbapt	} else if (wc >= 0 && wc <= 0x10ffff) {
328104828Stjr		lead = 0xf0;
329104828Stjr		len = 4;
330104828Stjr	} else {
331121893Stjr		errno = EILSEQ;
332121893Stjr		return ((size_t)-1);
333104828Stjr	}
334104828Stjr
335121893Stjr	/*
336121893Stjr	 * Output the octets representing the character in chunks
337121893Stjr	 * of 6 bits, least significant last. The first octet is
338121893Stjr	 * a special case because it contains the sequence length
339121893Stjr	 * information.
340121893Stjr	 */
341121893Stjr	for (i = len - 1; i > 0; i--) {
342121893Stjr		s[i] = (wc & 0x3f) | 0x80;
343121893Stjr		wc >>= 6;
344104828Stjr	}
345121893Stjr	*s = (wc & 0xff) | lead;
346104828Stjr
347104828Stjr	return (len);
348104828Stjr}
349132687Stjr
350142654Sphantomstatic size_t
351132687Stjr_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
352132687Stjr    size_t nwc, size_t len, mbstate_t * __restrict ps)
353132687Stjr{
354132687Stjr	_UTF8State *us;
355132687Stjr	char buf[MB_LEN_MAX];
356132687Stjr	const wchar_t *s;
357132687Stjr	size_t nbytes;
358132687Stjr	size_t nb;
359132687Stjr
360132687Stjr	us = (_UTF8State *)ps;
361132687Stjr
362132687Stjr	if (us->want != 0) {
363132687Stjr		errno = EINVAL;
364132687Stjr		return ((size_t)-1);
365132687Stjr	}
366132687Stjr
367132687Stjr	s = *src;
368132687Stjr	nbytes = 0;
369132687Stjr
370132687Stjr	if (dst == NULL) {
371132687Stjr		while (nwc-- > 0) {
372132687Stjr			if (0 <= *s && *s < 0x80)
373132687Stjr				/* Fast path for plain ASCII characters. */
374132687Stjr				nb = 1;
375132687Stjr			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
376132687Stjr			    (size_t)-1)
377132687Stjr				/* Invalid character - wcrtomb() sets errno. */
378132687Stjr				return ((size_t)-1);
379132687Stjr			if (*s == L'\0')
380132687Stjr				return (nbytes + nb - 1);
381132687Stjr			s++;
382132687Stjr			nbytes += nb;
383132687Stjr		}
384132687Stjr		return (nbytes);
385132687Stjr	}
386132687Stjr
387132687Stjr	while (len > 0 && nwc-- > 0) {
388132687Stjr		if (0 <= *s && *s < 0x80) {
389132687Stjr			/* Fast path for plain ASCII characters. */
390132687Stjr			nb = 1;
391132687Stjr			*dst = *s;
392132687Stjr		} else if (len > (size_t)MB_CUR_MAX) {
393132687Stjr			/* Enough space to translate in-place. */
394141716Sstefanf			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
395132687Stjr				*src = s;
396132687Stjr				return ((size_t)-1);
397132687Stjr			}
398132687Stjr		} else {
399132687Stjr			/*
400132687Stjr			 * May not be enough space; use temp. buffer.
401132687Stjr			 */
402141716Sstefanf			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
403132687Stjr				*src = s;
404132687Stjr				return ((size_t)-1);
405132687Stjr			}
406132687Stjr			if (nb > (int)len)
407132687Stjr				/* MB sequence for character won't fit. */
408132687Stjr				break;
409132687Stjr			memcpy(dst, buf, nb);
410132687Stjr		}
411132687Stjr		if (*s == L'\0') {
412132687Stjr			*src = NULL;
413132687Stjr			return (nbytes + nb - 1);
414132687Stjr		}
415132687Stjr		s++;
416132687Stjr		dst += nb;
417132687Stjr		len -= nb;
418132687Stjr		nbytes += nb;
419132687Stjr	}
420132687Stjr	*src = s;
421132687Stjr	return (nbytes);
422132687Stjr}
423