1271493Sdelphij/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
2271493Sdelphij
3271493Sdelphij/*-
4271493Sdelphij * Copyright (c) 2007 The NetBSD Foundation, Inc.
5271493Sdelphij * All rights reserved.
6271493Sdelphij *
7271493Sdelphij * This code is derived from software contributed to The NetBSD Foundation
8271493Sdelphij * by Dieter Baron.
9271493Sdelphij *
10271493Sdelphij * Redistribution and use in source and binary forms, with or without
11271493Sdelphij * modification, are permitted provided that the following conditions
12271493Sdelphij * are met:
13271493Sdelphij * 1. Redistributions of source code must retain the above copyright
14271493Sdelphij *    notice, this list of conditions and the following disclaimer.
15271493Sdelphij * 2. Redistributions in binary form must reproduce the above copyright
16271493Sdelphij *    notice, this list of conditions and the following disclaimer in the
17271493Sdelphij *    documentation and/or other materials provided with the distribution.
18271493Sdelphij *
19271493Sdelphij * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20271493Sdelphij * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21271493Sdelphij * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22271493Sdelphij * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23271493Sdelphij * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24271493Sdelphij * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25271493Sdelphij * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26271493Sdelphij * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27271493Sdelphij * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28271493Sdelphij * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29271493Sdelphij * POSSIBILITY OF SUCH DAMAGE.
30271493Sdelphij *
31271493Sdelphij * $FreeBSD: releng/11.0/sys/dev/hyperv/utilities/unicode.h 271493 2014-09-13 02:15:31Z delphij $
32271493Sdelphij */
33271493Sdelphij
34271493Sdelphij#include <sys/types.h>
35271493Sdelphij
/*
 * Bits for the `flags' argument of the converters declared below.
 *
 * utf8_to_utf16() consults only UNICODE_UTF8_LATIN1_FALLBACK.
 *
 * NOTE(review): UNICODE_UTF8_LATIN1_FALLBACK is 0x03, i.e. it shares its
 * bits with UNICODE_DECOMPOSE and UNICODE_PRECOMPOSE, so the test
 * `flags & UNICODE_UTF8_LATIN1_FALLBACK' is also true when only one of
 * those two is passed.  Confirm whether 0x04 was intended before changing
 * the value, since callers may depend on it.
 */
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *dst, size_t dst_len,
    const char *src, size_t src_len, int flags, int *errp);
size_t utf16_to_utf8(char *dst, size_t dst_len,
    const uint16_t *src, size_t src_len, int flags, int *errp);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst      output buffer, or NULL to only count the units required
 * dst_len  capacity of dst in 16-bit units; units that do not fit are
 *          counted but not stored
 * src      input bytes (need not be NUL terminated)
 * src_len  number of input bytes
 * flags    if UNICODE_UTF8_LATIN1_FALLBACK is set, a byte >= 0xa0 that is
 *          not followed by a continuation byte is taken as ISO 8859-1
 * errp     if not NULL, receives the number of malformed sequences skipped
 *
 * Returns the number of UTF-16 units the complete conversion produces,
 * which may be larger than dst_len.  Malformed input (stray continuation
 * bytes, truncated sequences, overlong forms, encoded surrogates, values
 * above U+10FFFF) is skipped and counted in *errp.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
    const unsigned char *s;
    size_t spos, dpos, avail;
    int error;
    uint16_t c;

#define IS_CONT(c)	(((c)&0xc0) == 0x80)

    error = 0;
    s = (const unsigned char *)src;
    spos = dpos = 0;
    while (spos < src_len) {
        /*
         * Bytes remaining including s[spos]; always >= 1 here.  All
         * look-ahead checks compare against this instead of computing
         * src_len - k, which would wrap for short inputs.
         */
        avail = src_len - spos;

        if (s[spos] < 0x80)
            c = s[spos++];
        else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
                 && (avail < 2 || !IS_CONT(s[spos+1]))
                 && s[spos] >= 0xa0) {
            /* not valid UTF-8, assume ISO 8859-1 */
            c = s[spos++];
        }
        else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
            /* continuation byte without lead byte
               or lead byte for codepoint above 0x10ffff */
            error++;
            spos++;
            continue;
        }
        else if (s[spos] < 0xe0) {
            /* two-byte sequence: 110xxxxx 10xxxxxx */
            if (avail < 2 || !IS_CONT(s[spos+1])) {
                spos++;
                error++;
                continue;
            }
            c = (uint16_t)(((s[spos] & 0x1f) << 6) | (s[spos+1] & 0x3f));
            spos += 2;
            if (c < 0x80) {
                /* overlong encoding */
                error++;
                continue;
            }
        }
        else if (s[spos] < 0xf0) {
            /* three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
            if (avail < 3
                || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
                spos++;
                error++;
                continue;
            }
            c = (uint16_t)(((s[spos] & 0x0f) << 12)
                | ((s[spos+1] & 0x3f) << 6)
                | (s[spos+2] & 0x3f));
            spos += 3;
            /* 0xf800 selects exactly the surrogate range d800..dfff */
            if (c < 0x800 || (c & 0xf800) == 0xd800) {
                /* overlong encoding or encoded surrogate */
                error++;
                continue;
            }
        }
        else {
            uint32_t cc;
            /* four-byte sequence, emitted as a UTF-16 surrogate pair */

            if (avail < 4 || !IS_CONT(s[spos+1])
                || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
                spos++;
                error++;
                continue;
            }
            /* the lead byte 11110xxx carries three payload bits */
            cc = ((uint32_t)(s[spos] & 0x07) << 18)
                 | ((uint32_t)(s[spos+1] & 0x3f) << 12)
                 | ((uint32_t)(s[spos+2] & 0x3f) << 6)
                 | (uint32_t)(s[spos+3] & 0x3f);
            spos += 4;
            if (cc < 0x10000 || cc > 0x10ffff) {
                /* overlong encoding or beyond the Unicode range */
                error++;
                continue;
            }
            cc -= 0x10000;
            /* high surrogate: upper ten bits */
            if (dst && dpos < dst_len)
                dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
            dpos++;
            /* low surrogate: lower ten bits, stored by the common tail */
            c = (uint16_t)(0xdc00 | (cc & 0x3ff));
        }

        if (dst && dpos < dst_len)
            dst[dpos] = c;
        dpos++;
    }

    if (errp)
        *errp = error;

    return dpos;

#undef IS_CONT
}
141271493Sdelphij
142271493Sdelphij
/*
 * Convert src_len UTF-16 code units at src into UTF-8.
 *
 * dst      output buffer, or NULL to only count the bytes required
 * dst_len  capacity of dst in bytes
 * src      input UTF-16 code units
 * src_len  number of input units
 * flags    not examined by this function
 * errp     if not NULL, receives the number of unpaired surrogates skipped
 *
 * Returns the number of bytes the complete conversion produces, which may
 * be larger than dst_len.  Output stops at the first character whose whole
 * encoding does not fit in dst; later characters are counted but not
 * stored, so a multi-byte sequence is never written partially.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
    size_t spos, dpos;
    int error;

    (void)flags;

/*
 * Stop storing once `l' more bytes would not fit.  dst_len < l is tested
 * first so that dst_len - l cannot wrap around.
 */
#define CHECK_LENGTH(l)	do {						\
	if (dst_len < (size_t)(l) || dpos > dst_len - (size_t)(l))	\
	    dst = NULL;							\
    } while (0)
/* Store one byte if still writing; always advance the byte count. */
#define ADD_BYTE(b)	do {						\
	if (dst != NULL)						\
	    dst[dpos] = (char)(b);					\
	dpos++;								\
    } while (0)

    error = 0;
    dpos = 0;
    for (spos = 0; spos < src_len; spos++) {
        const uint16_t u = src[spos];

        if (u < 0x80) {
            CHECK_LENGTH(1);
            ADD_BYTE(u);
        }
        else if (u < 0x800) {
            CHECK_LENGTH(2);
            ADD_BYTE(0xc0 | (u >> 6));
            ADD_BYTE(0x80 | (u & 0x3f));
        }
        else if ((u & 0xfc00) == 0xd800) {
            uint32_t c;
            /* first (high) surrogate: the next unit must be a low one */
            if (spos + 1 >= src_len
                || (src[spos+1] & 0xfc00) != 0xdc00) {
                /* no second surrogate present */
                error++;
                continue;
            }
            c = ((((uint32_t)u & 0x3ff) << 10)
                 | ((uint32_t)src[spos+1] & 0x3ff)) + 0x10000;
            /* consume the low surrogate as well */
            spos++;
            CHECK_LENGTH(4);
            ADD_BYTE(0xf0 | (c >> 18));
            ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
            ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
            ADD_BYTE(0x80 | (c & 0x3f));
        }
        else if ((u & 0xfc00) == 0xdc00) {
            /* second surrogate without preceding first surrogate */
            error++;
        }
        else {
            CHECK_LENGTH(3);
            ADD_BYTE(0xe0 | (u >> 12));
            ADD_BYTE(0x80 | ((u >> 6) & 0x3f));
            ADD_BYTE(0x80 | (u & 0x3f));
        }
    }

    if (errp)
        *errp = error;

    return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
202