1/*-
2 * Copyright (c) 2003, 2005 Ryuichiro Imura
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD$");
29
30#include <sys/param.h>
31#include <sys/kernel.h>
32#include <sys/systm.h>
33#include <sys/malloc.h>
34#include <sys/iconv.h>
35
36#include "iconv_converter_if.h"
37
38/*
39 * "UCS" converter
40 */
41
/*
 * Bits kept in iconv_ucs.convtype.  The FROM_ and TO_ bits are looked up in
 * unicode_family[] from the names of the source and destination encodings.
 */
#define	KICONV_UCS_COMBINE	0x1	/* a second charset pair was supplied */
#define	KICONV_UCS_FROM_UTF8	0x2	/* source is UTF-8 */
#define	KICONV_UCS_TO_UTF8	0x4	/* destination is UTF-8 */
#define	KICONV_UCS_FROM_LE	0x8	/* source 16-bit units are little-endian */
#define	KICONV_UCS_TO_LE	0x10	/* destination 16-bit units are little-endian */
#define	KICONV_UCS_FROM_UTF16	0x20	/* source may contain surrogate pairs */
#define	KICONV_UCS_TO_UTF16	0x40	/* destination may contain surrogate pairs */
#define	KICONV_UCS_UCS4		0x80	/* code points above 0xffff are supported */

#define	ENCODING_UTF16	"UTF-16BE"
#define	ENCODING_UTF8	"UTF-8"

/*
 * Encodings this converter handles natively, with the convtype bit to set
 * when the encoding is the source (from_flag) or the destination (to_flag).
 * The table is read-only and terminated by an entry whose name is NULL.
 */
static const struct {
	const char *name;
	int from_flag, to_flag;
} unicode_family[] = {
	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
	    KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
	{ NULL,		0,	0 }
};
65
/*
 * File-local helpers for UTF-8 and UTF-16 surrogate handling.
 */
static uint32_t	 utf8_to_ucs4(const char *src, size_t *utf8width,
		    size_t srclen);
static u_char	*ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
		    size_t dstlen);
static uint32_t	 encode_surrogate(uint32_t code);
static uint32_t	 decode_surrogate(const u_char *ucs);
70
71#ifdef MODULE_DEPEND
72MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
73#endif
74
75/*
76 * UCS converter instance
77 */
78struct iconv_ucs {
79	KOBJ_FIELDS;
80	int			convtype;
81	struct iconv_cspair *	d_csp;
82	struct iconv_cspair *	d_cspf;
83	void *			f_ctp;
84	void *			t_ctp;
85	void *			ctype;
86};
87
88static int
89iconv_ucs_open(struct iconv_converter_class *dcp,
90	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
91{
92	struct iconv_ucs *dp;
93	int i;
94	const char *from, *to;
95
96	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
97	to = csp->cp_to;
98	from = cspf ? cspf->cp_from : csp->cp_from;
99
100	dp->convtype = 0;
101
102	if (cspf)
103		dp->convtype |= KICONV_UCS_COMBINE;
104	for (i = 0; unicode_family[i].name; i++) {
105		if (strcmp(from, unicode_family[i].name) == 0)
106			dp->convtype |= unicode_family[i].from_flag;
107		if (strcmp(to, unicode_family[i].name) == 0)
108			dp->convtype |= unicode_family[i].to_flag;
109	}
110	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
111		dp->convtype |= KICONV_UCS_UCS4;
112	else
113		dp->convtype &= ~KICONV_UCS_UCS4;
114
115	dp->f_ctp = dp->t_ctp = NULL;
116	if (dp->convtype & KICONV_UCS_COMBINE) {
117		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
118		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
119			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
120		}
121		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
122		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
123			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
124		}
125	}
126
127	dp->ctype = NULL;
128	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
129		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
130
131	dp->d_csp = csp;
132	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
133		if (cspf) {
134			dp->d_cspf = cspf;
135			cspf->cp_refcount++;
136		} else
137			csp->cp_refcount++;
138	}
139	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
140		csp->cp_refcount++;
141	*dpp = (void*)dp;
142	return 0;
143}
144
145static int
146iconv_ucs_close(void *data)
147{
148	struct iconv_ucs *dp = data;
149
150	if (dp->f_ctp)
151		iconv_close(dp->f_ctp);
152	if (dp->t_ctp)
153		iconv_close(dp->t_ctp);
154	if (dp->ctype)
155		iconv_close(dp->ctype);
156	if (dp->d_cspf)
157		dp->d_cspf->cp_refcount--;
158	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
159		dp->d_csp->cp_refcount--;
160	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
161		dp->d_csp->cp_refcount--;
162	kobj_delete((struct kobj*)data, M_ICONV);
163	return 0;
164}
165
166static int
167iconv_ucs_conv(void *d2p, const char **inbuf,
168	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
169	int convchar, int casetype)
170{
171	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
172	int ret = 0, i;
173	size_t in, on, ir, or, inlen, outlen, ucslen;
174	const char *src, *p;
175	char *dst;
176	u_char ucs[4], *q;
177	uint32_t code;
178
179	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
180		return 0;
181	ir = in = *inbytesleft;
182	or = on = *outbytesleft;
183	src = *inbuf;
184	dst = *outbuf;
185
186	while (ir > 0 && or > 0) {
187
188		/*
189		 * The first half of conversion.
190		 * (convert any code into ENCODING_UNICODE)
191		 */
192		code = 0;
193		p = src;
194		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
195			/* convert UTF-8 to ENCODING_UNICODE */
196			inlen = 0;
197			code = utf8_to_ucs4(p, &inlen, ir);
198			if (code == 0) {
199				ret = -1;
200				break;
201			}
202
203			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
204				code = towlower(code, dp->ctype);
205			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
206				code = towupper(code, dp->ctype);
207			}
208
209			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
210				/* reserved for utf-16 surrogate pair */
211				/* invalid unicode */
212				ret = -1;
213				break;
214			}
215
216			if (inlen == 4) {
217				if (dp->convtype & KICONV_UCS_UCS4) {
218					ucslen = 4;
219					code = encode_surrogate(code);
220				} else {
221					/* can't handle with ucs-2 */
222					ret = -1;
223					break;
224				}
225			} else {
226				ucslen = 2;
227			}
228
229			/* save UCS-4 into ucs[] */
230			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
231				*q++ = (code >> (i << 3)) & 0xff;
232
233		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
234			/* convert local code to ENCODING_UNICODE */
235			ucslen = 4;
236			inlen = ir;
237			q = ucs;
238			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
239			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
240			if (ret)
241				break;
242			inlen = ir - inlen;
243			ucslen = 4 - ucslen;
244
245		} else {
246			/* src code is a proper subset of ENCODING_UNICODE */
247			q = ucs;
248			if (dp->convtype & KICONV_UCS_FROM_LE) {
249				*q = *(p + 1);
250				*(q + 1) = *p;
251				p += 2;
252			} else {
253				*q = *p++;
254				*(q + 1) = *p++;
255			}
256			if ((*q & 0xfc) == 0xd8) {
257				if (dp->convtype & KICONV_UCS_UCS4 &&
258				    dp->convtype & KICONV_UCS_FROM_UTF16) {
259					inlen = ucslen = 4;
260				} else {
261					/* invalid unicode */
262					ret = -1;
263					break;
264				}
265			} else {
266				inlen = ucslen = 2;
267			}
268			if (ir < inlen) {
269				ret = -1;
270				break;
271			}
272			if (ucslen == 4) {
273				q += 2;
274				if (dp->convtype & KICONV_UCS_FROM_LE) {
275					*q = *(p + 1);
276					*(q + 1) = *p;
277				} else {
278					*q = *p++;
279					*(q + 1) = *p;
280				}
281				if ((*q & 0xfc) != 0xdc) {
282					/* invalid unicode */
283					ret = -1;
284					break;
285				}
286			}
287		}
288
289		/*
290		 * The second half of conversion.
291		 * (convert ENCODING_UNICODE into any code)
292		 */
293		p = ucs;
294		if (dp->convtype & KICONV_UCS_TO_UTF8) {
295			q = (u_char *)dst;
296			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
297				/* decode surrogate pair */
298				code = decode_surrogate(p);
299			} else {
300				code = (ucs[0] << 8) | ucs[1];
301			}
302
303			if (casetype == KICONV_LOWER && dp->ctype) {
304				code = towlower(code, dp->ctype);
305			} else if (casetype == KICONV_UPPER && dp->ctype) {
306				code = towupper(code, dp->ctype);
307			}
308
309			outlen = 0;
310			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
311				ret = -1;
312				break;
313			}
314
315			src += inlen;
316			ir -= inlen;
317			dst += outlen;
318			or -= outlen;
319
320		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
321			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
322			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
323			if (ret)
324				break;
325
326			src += inlen;
327			ir -= inlen;
328
329		} else {
330			/* dst code is a proper subset of ENCODING_UNICODE */
331			if (or < ucslen) {
332				ret = -1;
333				break;
334			}
335			src += inlen;
336			ir -= inlen;
337			or -= ucslen;
338			if (dp->convtype & KICONV_UCS_TO_LE) {
339				*dst++ = *(p + 1);
340				*dst++ = *p;
341				p += 2;
342			} else {
343				*dst++ = *p++;
344				*dst++ = *p++;
345			}
346			if (ucslen == 4) {
347				if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
348				    (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
349					ret = -1;
350					break;
351				}
352				if (dp->convtype & KICONV_UCS_TO_LE) {
353					*dst++ = *(p + 1);
354					*dst++ = *p;
355				} else {
356					*dst++ = *p++;
357					*dst++ = *p;
358				}
359			}
360		}
361
362		if (convchar == 1)
363			break;
364	}
365
366	*inbuf += in - ir;
367	*outbuf += on - or;
368	*inbytesleft -= in - ir;
369	*outbytesleft -= on - or;
370	return (ret);
371}
372
373static int
374iconv_ucs_init(struct iconv_converter_class *dcp)
375{
376	int error;
377
378	error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
379	if (error)
380		return (error);
381	error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
382	if (error)
383		return (error);
384	return (0);
385}
386
/*
 * Class teardown hook.  There is nothing to release; always returns 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;

	return (0);
}
392
393static const char *
394iconv_ucs_name(struct iconv_converter_class *dcp)
395{
396	return (ENCODING_UNICODE);
397}
398
399static kobj_method_t iconv_ucs_methods[] = {
400	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
401	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
402	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
403	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
404	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
405	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
406	{0, 0}
407};
408
409KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
410
/*
 * Decode one UTF-8 sequence of one to four bytes.
 *
 * src       - first byte of the sequence.
 * utf8width - set to the number of bytes in the sequence on success; left
 *             untouched when the sequence is rejected.
 * srclen    - number of bytes available at src.
 *
 * Returns the decoded value, or 0 when the input is empty, truncated,
 * malformed, or an overlong encoding (a value stored in more bytes than it
 * needs).  A NUL byte also decodes to 0, so callers cannot tell it apart
 * from an error.  Surrogates and values above 0x10ffff are not filtered
 * here.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest value that legitimately needs a sequence of w bytes */
	static const uint32_t min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
	size_t i, w;
	uint32_t ucs4;
	unsigned char c;

	if (srclen == 0)
		return (0);

	/*
	 * The lead byte gives the sequence length and the top payload bits.
	 */
	c = (unsigned char)*src;
	if ((c & 0x80) == 0) {
		/* 0xxxxxxx */
		w = 1;
		ucs4 = c & 0x7f;
	} else if ((c & 0xe0) == 0xc0) {
		/* 110xxxxx 10yyyyyy */
		w = 2;
		ucs4 = c & 0x1f;
	} else if ((c & 0xf0) == 0xe0) {
		/* 1110xxxx 10yyyyyy 10zzzzzz */
		w = 3;
		ucs4 = c & 0x0f;
	} else if ((c & 0xf8) == 0xf0) {
		/* 11110www 10xxxxxx 10yyyyyy 10zzzzzz */
		w = 4;
		ucs4 = c & 0x07;
	} else {
		/* continuation byte used as lead, or a 5+ byte form */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * Each continuation byte must be 10xxxxxx and adds six payload bits.
	 */
	for (i = 1 ; i < w ; i++) {
		c = (unsigned char)src[i];
		if ((c & 0xc0) != 0x80)
			return (0);
		ucs4 = (ucs4 << 6) | (c & 0x3f);
	}

	/* overlong forms are not valid UTF-8 */
	if (ucs4 < min_code[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}
482
/*
 * Encode one code point as UTF-8.
 *
 * ucs4      - value to encode.
 * dst       - output buffer.
 * utf8width - set to the number of bytes written on success.
 * dstlen    - room available at dst.
 *
 * Returns dst (as u_char *) on success.  Returns NULL, writing nothing, when
 * the value is a UTF-16 surrogate (0xd800-0xdfff) or lies above 0x10ffff,
 * neither of which is valid in UTF-8, or when the encoding does not fit in
 * dstlen bytes.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	if (ucs4 >= 0x110000 || (ucs4 >= 0xd800 && ucs4 < 0xe000))
		return (NULL);

	/*
	 * determine utf-8 width and the marker bits of the lead byte
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "110" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "1110" */
	} else {
		w = 4;
		lead = 0xf0;	/* "11110" */
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * Fill the continuation bytes from the last one backwards, six bits
	 * at a time; what is left over goes into the lead byte.
	 */
	p = (u_char *)dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		p[i] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	p[0] = ucs4 | lead;

	*utf8width = w;

	return (p);
}
526
/*
 * Pack a code point in the range 0x10000-0x10ffff into a UTF-16 surrogate
 * pair: the high surrogate ends up in the upper 16 bits of the result and
 * the low surrogate in the lower 16 bits.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	const uint32_t offset = code - 0x10000;
	/* upper ten bits of the offset, moved to bits 16-25 */
	const uint32_t high = (offset << 6) & 0x3ff0000;
	/* lower ten bits of the offset */
	const uint32_t low = offset & 0x3ff;

	return (high | low | 0xd800dc00);
}
533
/*
 * Rebuild a code point from a surrogate pair stored as four bytes, high
 * surrogate first and each 16-bit unit most significant byte first.  Only
 * the ten payload bits of each unit are used; the marker bits are ignored.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t high, low;

	high = ((ucs[0] & 0x3) << 8) | ucs[1];
	low = ((ucs[2] & 0x3) << 8) | ucs[3];

	return (((high << 10) | low) + 0x10000);
}
540
541