1/* $NetBSD: unicode.c,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
2
3/*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Dieter Baron.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD$");
34
35#include <sys/null.h>
36
37#include "unicode.h"
38
39size_t
40utf8_to_utf16(uint16_t *dst, size_t dst_len,
41	      const char *src, size_t src_len,
42	      int flags, int *errp)
43{
44    const unsigned char *s;
45    size_t spos, dpos;
46    int error;
47    uint16_t c;
48
49#define IS_CONT(c)	(((c)&0xc0) == 0x80)
50
51    error = 0;
52    s = (const unsigned char *)src;
53    spos = dpos = 0;
54    while (spos<src_len) {
55	if (s[spos] < 0x80)
56	    c = s[spos++];
57	else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
58		 && (spos >= src_len || !IS_CONT(s[spos+1]))
59		 && s[spos]>=0xa0) {
60	    /* not valid UTF-8, assume ISO 8859-1 */
61	    c = s[spos++];
62	}
63	else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
64	    /* continuation byte without lead byte
65	       or lead byte for codepoint above 0x10ffff */
66	    error++;
67	    spos++;
68	    continue;
69	}
70	else if (s[spos] < 0xe0) {
71	    if (spos >= src_len || !IS_CONT(s[spos+1])) {
72		spos++;
73		error++;
74		continue;
75	    }
76	    c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f);
77	    spos += 2;
78	    if (c < 0x80) {
79		/* overlong encoding */
80		error++;
81		continue;
82	    }
83	}
84	else if (s[spos] < 0xf0) {
85	    if (spos >= src_len-2
86		|| !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
87		spos++;
88		error++;
89		continue;
90	    }
91	    c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
92		| (s[spos+2] & 0x3f);
93	    spos += 3;
94	    if (c < 0x800 || (c & 0xdf00) == 0xd800 ) {
95		/* overlong encoding or encoded surrogate */
96		error++;
97		continue;
98	    }
99	}
100	else {
101	    uint32_t cc;
102	    /* UTF-16 surrogate pair */
103
104	    if (spos >= src_len-3 || !IS_CONT(s[spos+1])
105		|| !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
106		spos++;
107		error++;
108
109		continue;
110	    }
111	    cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12)
112		 | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
113	    spos += 4;
114	    if (cc < 0x10000) {
115		/* overlong encoding */
116		error++;
117		continue;
118	    }
119	    if (dst && dpos < dst_len)
120		dst[dpos] = (0xd800 | ((cc-0x10000)>>10));
121	    dpos++;
122	    c = 0xdc00 | ((cc-0x10000) & 0x3ffff);
123	}
124
125	if (dst && dpos < dst_len)
126	    dst[dpos] = c;
127	dpos++;
128    }
129
130    if (errp)
131	*errp = error;
132
133    return dpos;
134
135#undef IS_CONT
136}
137
138
139size_t
140utf16_to_utf8(char *dst, size_t dst_len,
141	      const uint16_t *src, size_t src_len,
142	      int flags, int *errp)
143{
144    uint8_t spos, dpos;
145    int error;
146
147#define CHECK_LENGTH(l)	(dpos > dst_len-(l) ? dst=NULL : NULL)
148#define ADD_BYTE(b)	(dst ? dst[dpos] = (b) : 0, dpos++)
149
150    error = 0;
151    dpos = 0;
152    for (spos=0; spos<src_len; spos++) {
153	if (src[spos] < 0x80) {
154	    CHECK_LENGTH(1);
155	    ADD_BYTE(src[spos]);
156	}
157	else if (src[spos] < 0x800) {
158	    CHECK_LENGTH(2);
159	    ADD_BYTE(0xc0 | (src[spos]>>6));
160	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
161	}
162	else if ((src[spos] & 0xdc00) == 0xd800) {
163	    uint32_t c;
164	    /* first surrogate */
165	    if (spos == src_len - 1 || (src[spos] & 0xdc00) != 0xdc00) {
166		/* no second surrogate present */
167		error++;
168		continue;
169	    }
170	    spos++;
171	    CHECK_LENGTH(4);
172	    c = (((src[spos]&0x3ff) << 10) | (src[spos+1]&0x3ff)) + 0x10000;
173	    ADD_BYTE(0xf0 | (c>>18));
174	    ADD_BYTE(0x80 | ((c>>12) & 0x3f));
175	    ADD_BYTE(0x80 | ((c>>6) & 0x3f));
176	    ADD_BYTE(0x80 | (c & 0x3f));
177	}
178	else if ((src[spos] & 0xdc00) == 0xdc00) {
179	    /* second surrogate without preceding first surrogate */
180	    error++;
181	}
182	else {
183	    CHECK_LENGTH(3);
184	    ADD_BYTE(0xe0 | src[spos]>>12);
185	    ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f));
186	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
187	}
188    }
189
190    if (errp)
191	*errp = error;
192
193    return dpos;
194
195#undef ADD_BYTE
196#undef CHECK_LENGTH
197}
198