1104828Stjr/*-
2128004Stjr * Copyright (c) 2002-2004 Tim J. Robbins
3104828Stjr * All rights reserved.
4104828Stjr *
5235785Stheraven * Copyright (c) 2011 The FreeBSD Foundation
6235785Stheraven * All rights reserved.
7235785Stheraven * Portions of this software were developed by David Chisnall
8235785Stheraven * under sponsorship from the FreeBSD Foundation.
9235785Stheraven *
10104828Stjr * Redistribution and use in source and binary forms, with or without
11104828Stjr * modification, are permitted provided that the following conditions
12104828Stjr * are met:
13104828Stjr * 1. Redistributions of source code must retain the above copyright
14104828Stjr *    notice, this list of conditions and the following disclaimer.
15104828Stjr * 2. Redistributions in binary form must reproduce the above copyright
16104828Stjr *    notice, this list of conditions and the following disclaimer in the
17104828Stjr *    documentation and/or other materials provided with the distribution.
18104828Stjr *
19104828Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20104828Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21104828Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22104828Stjr * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23104828Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24104828Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25104828Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26104828Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27104828Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28104828Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29104828Stjr * SUCH DAMAGE.
30104828Stjr */
31104828Stjr
32128004Stjr#include <sys/param.h>
33104828Stjr__FBSDID("$FreeBSD$");
34104828Stjr
35121893Stjr#include <errno.h>
36132687Stjr#include <limits.h>
37121893Stjr#include <runetype.h>
38104828Stjr#include <stdlib.h>
39128004Stjr#include <string.h>
40121893Stjr#include <wchar.h>
41129153Stjr#include "mblocal.h"
42104828Stjr
/* External single-byte limit variable; not defined in the code visible here. */
extern int __mb_sb_limit;

/*
 * Forward declarations of the UTF-8 conversion methods.  They are private
 * to this file and are handed out through the function pointers that
 * _UTF8_init() stores in the locale structure.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static int	_UTF8_mbsinit(const mbstate_t *ps);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
55121893Stjr
/*
 * Private view of an mbstate_t used by the UTF-8 converters in this file.
 * It records a multibyte character whose octets have only partly been
 * seen, so that decoding can resume on a later call.
 */
typedef struct {
	wchar_t	ch;		/* bits of the code point gathered so far */
	int	want;		/* octets still missing; 0 = initial state */
	wchar_t	lbound;		/* smallest value legal for this length */
} _UTF8State;
61128004Stjr
62104828Stjrint
63235785Stheraven_UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
64104828Stjr{
65104828Stjr
66235785Stheraven	l->__mbrtowc = _UTF8_mbrtowc;
67235785Stheraven	l->__wcrtomb = _UTF8_wcrtomb;
68235785Stheraven	l->__mbsinit = _UTF8_mbsinit;
69235785Stheraven	l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
70235785Stheraven	l->__wcsnrtombs = _UTF8_wcsnrtombs;
71235785Stheraven	l->runes = rl;
72235785Stheraven	l->__mb_cur_max = 6;
73172661Sache	/*
74172661Sache	 * UCS-4 encoding used as the internal representation, so
75172661Sache	 * slots 0x0080-0x00FF are occuped and must be excluded
76172661Sache	 * from the single byte ctype by setting the limit.
77172661Sache	 */
78235785Stheraven	l->__mb_sb_limit = 128;
79104828Stjr
80104828Stjr	return (0);
81104828Stjr}
82104828Stjr
83142654Sphantomstatic int
84128004Stjr_UTF8_mbsinit(const mbstate_t *ps)
85128004Stjr{
86128004Stjr
87129336Stjr	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
88128004Stjr}
89128004Stjr
90142654Sphantomstatic size_t
91121893Stjr_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
92128004Stjr    mbstate_t * __restrict ps)
93104828Stjr{
94128004Stjr	_UTF8State *us;
95129336Stjr	int ch, i, mask, want;
96121893Stjr	wchar_t lbound, wch;
97104828Stjr
98128004Stjr	us = (_UTF8State *)ps;
99128004Stjr
100129336Stjr	if (us->want < 0 || us->want > 6) {
101128155Stjr		errno = EINVAL;
102128155Stjr		return ((size_t)-1);
103128155Stjr	}
104128155Stjr
105128004Stjr	if (s == NULL) {
106128004Stjr		s = "";
107128004Stjr		n = 1;
108128004Stjr		pwc = NULL;
109128004Stjr	}
110128004Stjr
111121893Stjr	if (n == 0)
112121893Stjr		/* Incomplete multibyte sequence */
113121893Stjr		return ((size_t)-2);
114104828Stjr
115131881Stjr	if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
116131881Stjr		/* Fast path for plain ASCII characters. */
117131881Stjr		if (pwc != NULL)
118131881Stjr			*pwc = ch;
119131881Stjr		return (ch != '\0' ? 1 : 0);
120131881Stjr	}
121131881Stjr
122129336Stjr	if (us->want == 0) {
123104828Stjr		/*
124129336Stjr		 * Determine the number of octets that make up this character
125129336Stjr		 * from the first octet, and a mask that extracts the
126129336Stjr		 * interesting bits of the first octet. We already know
127129336Stjr		 * the character is at least two bytes long.
128129336Stjr		 *
129129336Stjr		 * We also specify a lower bound for the character code to
130129336Stjr		 * detect redundant, non-"shortest form" encodings. For
131129336Stjr		 * example, the sequence C0 80 is _not_ a legal representation
132129336Stjr		 * of the null character. This enforces a 1-to-1 mapping
133129336Stjr		 * between character codes and their multibyte representations.
134104828Stjr		 */
135129336Stjr		ch = (unsigned char)*s;
136129336Stjr		if ((ch & 0x80) == 0) {
137129336Stjr			mask = 0x7f;
138129336Stjr			want = 1;
139129336Stjr			lbound = 0;
140129336Stjr		} else if ((ch & 0xe0) == 0xc0) {
141129336Stjr			mask = 0x1f;
142129336Stjr			want = 2;
143129336Stjr			lbound = 0x80;
144129336Stjr		} else if ((ch & 0xf0) == 0xe0) {
145129336Stjr			mask = 0x0f;
146129336Stjr			want = 3;
147129336Stjr			lbound = 0x800;
148129336Stjr		} else if ((ch & 0xf8) == 0xf0) {
149129336Stjr			mask = 0x07;
150129336Stjr			want = 4;
151129336Stjr			lbound = 0x10000;
152129336Stjr		} else if ((ch & 0xfc) == 0xf8) {
153129336Stjr			mask = 0x03;
154129336Stjr			want = 5;
155129336Stjr			lbound = 0x200000;
156157289Strhodes		} else if ((ch & 0xfe) == 0xfc) {
157129336Stjr			mask = 0x01;
158129336Stjr			want = 6;
159129336Stjr			lbound = 0x4000000;
160129336Stjr		} else {
161129336Stjr			/*
162129336Stjr			 * Malformed input; input is not UTF-8.
163129336Stjr			 */
164129336Stjr			errno = EILSEQ;
165129336Stjr			return ((size_t)-1);
166129336Stjr		}
167129336Stjr	} else {
168129336Stjr		want = us->want;
169129336Stjr		lbound = us->lbound;
170104828Stjr	}
171104828Stjr
172104828Stjr	/*
173104828Stjr	 * Decode the octet sequence representing the character in chunks
174104828Stjr	 * of 6 bits, most significant first.
175104828Stjr	 */
176129336Stjr	if (us->want == 0)
177129336Stjr		wch = (unsigned char)*s++ & mask;
178129336Stjr	else
179129336Stjr		wch = us->ch;
180129336Stjr	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
181121893Stjr		if ((*s & 0xc0) != 0x80) {
182104828Stjr			/*
183104828Stjr			 * Malformed input; bad characters in the middle
184104828Stjr			 * of a character.
185104828Stjr			 */
186121893Stjr			errno = EILSEQ;
187121893Stjr			return ((size_t)-1);
188104828Stjr		}
189104828Stjr		wch <<= 6;
190121893Stjr		wch |= *s++ & 0x3f;
191104828Stjr	}
192129336Stjr	if (i < want) {
193129336Stjr		/* Incomplete multibyte sequence. */
194129336Stjr		us->want = want - i;
195129336Stjr		us->lbound = lbound;
196129336Stjr		us->ch = wch;
197129336Stjr		return ((size_t)-2);
198129336Stjr	}
199121893Stjr	if (wch < lbound) {
200104828Stjr		/*
201104828Stjr		 * Malformed input; redundant encoding.
202104828Stjr		 */
203121893Stjr		errno = EILSEQ;
204121893Stjr		return ((size_t)-1);
205121893Stjr	}
206121893Stjr	if (pwc != NULL)
207121893Stjr		*pwc = wch;
208129336Stjr	us->want = 0;
209129336Stjr	return (wch == L'\0' ? 0 : want);
210104828Stjr}
211104828Stjr
212142654Sphantomstatic size_t
213132687Stjr_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
214132687Stjr    size_t nms, size_t len, mbstate_t * __restrict ps)
215132687Stjr{
216132687Stjr	_UTF8State *us;
217132687Stjr	const char *s;
218132687Stjr	size_t nchr;
219132687Stjr	wchar_t wc;
220132687Stjr	size_t nb;
221132687Stjr
222132687Stjr	us = (_UTF8State *)ps;
223132687Stjr
224132687Stjr	s = *src;
225132687Stjr	nchr = 0;
226132687Stjr
227132687Stjr	if (dst == NULL) {
228132687Stjr		/*
229132687Stjr		 * The fast path in the loop below is not safe if an ASCII
230132687Stjr		 * character appears as anything but the first byte of a
231132687Stjr		 * multibyte sequence. Check now to avoid doing it in the loop.
232132687Stjr		 */
233132687Stjr		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
234132687Stjr			errno = EILSEQ;
235132687Stjr			return ((size_t)-1);
236132687Stjr		}
237132687Stjr		for (;;) {
238132687Stjr			if (nms > 0 && (signed char)*s > 0)
239132687Stjr				/*
240132687Stjr				 * Fast path for plain ASCII characters
241132687Stjr				 * excluding NUL.
242132687Stjr				 */
243132687Stjr				nb = 1;
244132687Stjr			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
245132687Stjr			    (size_t)-1)
246132687Stjr				/* Invalid sequence - mbrtowc() sets errno. */
247132687Stjr				return ((size_t)-1);
248132687Stjr			else if (nb == 0 || nb == (size_t)-2)
249132687Stjr				return (nchr);
250132687Stjr			s += nb;
251132687Stjr			nms -= nb;
252132687Stjr			nchr++;
253132687Stjr		}
254132687Stjr		/*NOTREACHED*/
255132687Stjr	}
256132687Stjr
257132687Stjr	/*
258132687Stjr	 * The fast path in the loop below is not safe if an ASCII
259132687Stjr	 * character appears as anything but the first byte of a
260132687Stjr	 * multibyte sequence. Check now to avoid doing it in the loop.
261132687Stjr	 */
262132687Stjr	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
263132687Stjr		errno = EILSEQ;
264132687Stjr		return ((size_t)-1);
265132687Stjr	}
266132687Stjr	while (len-- > 0) {
267132687Stjr		if (nms > 0 && (signed char)*s > 0) {
268132687Stjr			/*
269132687Stjr			 * Fast path for plain ASCII characters
270132687Stjr			 * excluding NUL.
271132687Stjr			 */
272132687Stjr			*dst = (wchar_t)*s;
273132687Stjr			nb = 1;
274132687Stjr		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
275132687Stjr		    (size_t)-1) {
276132687Stjr			*src = s;
277132687Stjr			return ((size_t)-1);
278132687Stjr		} else if (nb == (size_t)-2) {
279132687Stjr			*src = s + nms;
280132687Stjr			return (nchr);
281132687Stjr		} else if (nb == 0) {
282132687Stjr			*src = NULL;
283132687Stjr			return (nchr);
284132687Stjr		}
285132687Stjr		s += nb;
286132687Stjr		nms -= nb;
287132687Stjr		nchr++;
288132687Stjr		dst++;
289132687Stjr	}
290132687Stjr	*src = s;
291132687Stjr	return (nchr);
292132687Stjr}
293132687Stjr
294142654Sphantomstatic size_t
295128155Stjr_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
296104828Stjr{
297128155Stjr	_UTF8State *us;
298104828Stjr	unsigned char lead;
299104828Stjr	int i, len;
300104828Stjr
301128155Stjr	us = (_UTF8State *)ps;
302128155Stjr
303129336Stjr	if (us->want != 0) {
304128155Stjr		errno = EINVAL;
305128155Stjr		return ((size_t)-1);
306128155Stjr	}
307128155Stjr
308121893Stjr	if (s == NULL)
309121893Stjr		/* Reset to initial shift state (no-op) */
310121893Stjr		return (1);
311121893Stjr
312131881Stjr	if ((wc & ~0x7f) == 0) {
313131881Stjr		/* Fast path for plain ASCII characters. */
314131881Stjr		*s = (char)wc;
315131881Stjr		return (1);
316131881Stjr	}
317131881Stjr
318104828Stjr	/*
319104828Stjr	 * Determine the number of octets needed to represent this character.
320104828Stjr	 * We always output the shortest sequence possible. Also specify the
321104828Stjr	 * first few bits of the first octet, which contains the information
322104828Stjr	 * about the sequence length.
323104828Stjr	 */
324121893Stjr	if ((wc & ~0x7f) == 0) {
325104828Stjr		lead = 0;
326104828Stjr		len = 1;
327121893Stjr	} else if ((wc & ~0x7ff) == 0) {
328104828Stjr		lead = 0xc0;
329104828Stjr		len = 2;
330121893Stjr	} else if ((wc & ~0xffff) == 0) {
331104828Stjr		lead = 0xe0;
332104828Stjr		len = 3;
333121893Stjr	} else if ((wc & ~0x1fffff) == 0) {
334104828Stjr		lead = 0xf0;
335104828Stjr		len = 4;
336121893Stjr	} else if ((wc & ~0x3ffffff) == 0) {
337104828Stjr		lead = 0xf8;
338104828Stjr		len = 5;
339121893Stjr	} else if ((wc & ~0x7fffffff) == 0) {
340104828Stjr		lead = 0xfc;
341104828Stjr		len = 6;
342104828Stjr	} else {
343121893Stjr		errno = EILSEQ;
344121893Stjr		return ((size_t)-1);
345104828Stjr	}
346104828Stjr
347121893Stjr	/*
348121893Stjr	 * Output the octets representing the character in chunks
349121893Stjr	 * of 6 bits, least significant last. The first octet is
350121893Stjr	 * a special case because it contains the sequence length
351121893Stjr	 * information.
352121893Stjr	 */
353121893Stjr	for (i = len - 1; i > 0; i--) {
354121893Stjr		s[i] = (wc & 0x3f) | 0x80;
355121893Stjr		wc >>= 6;
356104828Stjr	}
357121893Stjr	*s = (wc & 0xff) | lead;
358104828Stjr
359104828Stjr	return (len);
360104828Stjr}
361132687Stjr
362142654Sphantomstatic size_t
363132687Stjr_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
364132687Stjr    size_t nwc, size_t len, mbstate_t * __restrict ps)
365132687Stjr{
366132687Stjr	_UTF8State *us;
367132687Stjr	char buf[MB_LEN_MAX];
368132687Stjr	const wchar_t *s;
369132687Stjr	size_t nbytes;
370132687Stjr	size_t nb;
371132687Stjr
372132687Stjr	us = (_UTF8State *)ps;
373132687Stjr
374132687Stjr	if (us->want != 0) {
375132687Stjr		errno = EINVAL;
376132687Stjr		return ((size_t)-1);
377132687Stjr	}
378132687Stjr
379132687Stjr	s = *src;
380132687Stjr	nbytes = 0;
381132687Stjr
382132687Stjr	if (dst == NULL) {
383132687Stjr		while (nwc-- > 0) {
384132687Stjr			if (0 <= *s && *s < 0x80)
385132687Stjr				/* Fast path for plain ASCII characters. */
386132687Stjr				nb = 1;
387132687Stjr			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
388132687Stjr			    (size_t)-1)
389132687Stjr				/* Invalid character - wcrtomb() sets errno. */
390132687Stjr				return ((size_t)-1);
391132687Stjr			if (*s == L'\0')
392132687Stjr				return (nbytes + nb - 1);
393132687Stjr			s++;
394132687Stjr			nbytes += nb;
395132687Stjr		}
396132687Stjr		return (nbytes);
397132687Stjr	}
398132687Stjr
399132687Stjr	while (len > 0 && nwc-- > 0) {
400132687Stjr		if (0 <= *s && *s < 0x80) {
401132687Stjr			/* Fast path for plain ASCII characters. */
402132687Stjr			nb = 1;
403132687Stjr			*dst = *s;
404132687Stjr		} else if (len > (size_t)MB_CUR_MAX) {
405132687Stjr			/* Enough space to translate in-place. */
406141716Sstefanf			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
407132687Stjr				*src = s;
408132687Stjr				return ((size_t)-1);
409132687Stjr			}
410132687Stjr		} else {
411132687Stjr			/*
412132687Stjr			 * May not be enough space; use temp. buffer.
413132687Stjr			 */
414141716Sstefanf			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
415132687Stjr				*src = s;
416132687Stjr				return ((size_t)-1);
417132687Stjr			}
418132687Stjr			if (nb > (int)len)
419132687Stjr				/* MB sequence for character won't fit. */
420132687Stjr				break;
421132687Stjr			memcpy(dst, buf, nb);
422132687Stjr		}
423132687Stjr		if (*s == L'\0') {
424132687Stjr			*src = NULL;
425132687Stjr			return (nbytes + nb - 1);
426132687Stjr		}
427132687Stjr		s++;
428132687Stjr		dst += nb;
429132687Stjr		len -= nb;
430132687Stjr		nbytes += nb;
431132687Stjr	}
432132687Stjr	*src = s;
433132687Stjr	return (nbytes);
434132687Stjr}
435