/* unicode.h revision 272322 */
/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Dieter Baron.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/dev/hyperv/utilities/unicode.h 272322 2014-09-30 17:54:57Z delphij $
 */
3396485Sjake
#include <sys/types.h>

/*
 * Conversion flags accepted by utf8_to_utf16() and utf16_to_utf8().
 *
 * Only UNICODE_UTF8_LATIN1_FALLBACK is examined by the converters in this
 * file, and only by utf8_to_utf16().
 *
 * NOTE(review): UNICODE_UTF8_LATIN1_FALLBACK (0x03) is not a distinct bit;
 * it equals UNICODE_DECOMPOSE | UNICODE_PRECOMPOSE and is tested with a
 * bitwise AND, so passing either of the other two flags also enables the
 * Latin-1 fallback.  The value is kept as is because existing callers may
 * depend on it.
 */
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *dst, size_t dst_len,
		     const char *src, size_t src_len, int flags, int *errp);
size_t utf16_to_utf8(char *dst, size_t dst_len,
		     const uint16_t *src, size_t src_len, int flags, int *errp);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst/dst_len: output buffer and its capacity in 16-bit units.  At most
 *	dst_len units are stored; dst may be NULL to only count.
 * flags: if any bit of UNICODE_UTF8_LATIN1_FALLBACK is set, a byte >= 0xa0
 *	that is not followed by a continuation byte is taken as an
 *	ISO 8859-1 character instead of being counted as an error.
 * errp: if not NULL, receives the number of malformed sequences skipped.
 *
 * Returns the number of code units the complete conversion produces, which
 * may be larger than dst_len.  Code points above U+FFFF produce two units;
 * if only one unit of room remains, just the high surrogate is stored.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
    const unsigned char *s;
    size_t spos, dpos, left;
    int error;
    uint16_t c;

#define IS_CONT(c)	(((c)&0xc0) == 0x80)

    error = 0;
    s = (const unsigned char *)src;
    spos = dpos = 0;
    while (spos < src_len) {
	/*
	 * Bytes still unread, including s[spos]; always >= 1 here.
	 * Comparing against this avoids the unsigned wrap-around of
	 * "src_len - k" and keeps every s[spos+k] access in bounds.
	 */
	left = src_len - spos;

	if (s[spos] < 0x80)
	    c = s[spos++];
	else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
		 && (left < 2 || !IS_CONT(s[spos+1]))
		 && s[spos] >= 0xa0) {
	    /* not valid UTF-8, assume ISO 8859-1 */
	    c = s[spos++];
	}
	else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
	    /* continuation byte without lead byte
	       or lead byte for codepoint above 0x10ffff */
	    error++;
	    spos++;
	    continue;
	}
	else if (s[spos] < 0xe0) {
	    /* two-byte sequence */
	    if (left < 2 || !IS_CONT(s[spos+1])) {
		/* skip the lead byte only and resynchronize */
		spos++;
		error++;
		continue;
	    }
	    c = ((s[spos] & 0x1f) << 6) | (s[spos+1] & 0x3f);
	    spos += 2;
	    if (c < 0x80) {
		/* overlong encoding */
		error++;
		continue;
	    }
	}
	else if (s[spos] < 0xf0) {
	    /* three-byte sequence */
	    if (left < 3
		|| !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
		spos++;
		error++;
		continue;
	    }
	    c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
		| (s[spos+2] & 0x3f);
	    spos += 3;
	    /* surrogates are exactly U+D800..U+DFFF: top five bits 11011 */
	    if (c < 0x800 || (c & 0xf800) == 0xd800) {
		/* overlong encoding or encoded surrogate */
		error++;
		continue;
	    }
	}
	else {
	    uint32_t cc;
	    /* four-byte sequence, emitted as a UTF-16 surrogate pair */

	    if (left < 4 || !IS_CONT(s[spos+1])
		|| !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
		spos++;
		error++;
		continue;
	    }
	    /* the lead byte 11110xxx carries three payload bits */
	    cc = ((uint32_t)(s[spos] & 0x07) << 18)
		 | ((uint32_t)(s[spos+1] & 0x3f) << 12)
		 | ((uint32_t)(s[spos+2] & 0x3f) << 6)
		 | (uint32_t)(s[spos+3] & 0x3f);
	    spos += 4;
	    if (cc < 0x10000 || cc > 0x10ffff) {
		/* overlong encoding or beyond the Unicode range */
		error++;
		continue;
	    }
	    cc -= 0x10000;
	    if (dst && dpos < dst_len)
		dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
	    dpos++;
	    /* the low surrogate carries only the low ten bits */
	    c = (uint16_t)(0xdc00 | (cc & 0x3ff));
	}

	if (dst && dpos < dst_len)
	    dst[dpos] = c;
	dpos++;
    }

    if (errp)
	*errp = error;

    return dpos;

#undef IS_CONT
}
141276772Smarkj
142276772Smarkj
/*
 * Convert src_len UTF-16 code units at src into UTF-8.
 *
 * dst/dst_len: output buffer and its capacity in bytes.  dst may be NULL
 *	to only count.  Once an encoded character does not fit in the
 *	remaining space, nothing further is stored, but counting continues.
 * flags: not examined.
 * errp: if not NULL, receives the number of unpaired surrogates skipped.
 *
 * Returns the number of bytes the complete conversion produces, which may
 * be larger than dst_len.  No terminating NUL is written.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
    /* size_t, not uint16_t: lengths above 65535 must not wrap */
    size_t spos, dpos;
    int error;

    /*
     * Stop storing once fewer than (l) bytes remain.  Written as a
     * comparison of remaining space so that dst_len < (l) cannot wrap.
     */
#define CHECK_LENGTH(l)	do {						\
	if (dst != NULL && (dpos > dst_len || dst_len - dpos < (l)))	\
	    dst = NULL;							\
    } while (0)
#define ADD_BYTE(b)	do {						\
	if (dst != NULL)						\
	    dst[dpos] = (char)(b);					\
	dpos++;								\
    } while (0)

    (void)flags;

    error = 0;
    dpos = 0;
    for (spos = 0; spos < src_len; spos++) {
	if (src[spos] < 0x80) {
	    CHECK_LENGTH(1);
	    ADD_BYTE(src[spos]);
	}
	else if (src[spos] < 0x800) {
	    CHECK_LENGTH(2);
	    ADD_BYTE(0xc0 | (src[spos]>>6));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xd800) {
	    uint32_t c;
	    /* first (high) surrogate, U+D800..U+DBFF */
	    if (spos + 1 >= src_len || (src[spos+1] & 0xfc00) != 0xdc00) {
		/* no second surrogate present */
		error++;
		continue;
	    }
	    /* combine both halves before stepping over the second one */
	    c = ((((uint32_t)src[spos] & 0x3ff) << 10)
		 | ((uint32_t)src[spos+1] & 0x3ff)) + 0x10000;
	    spos++;
	    CHECK_LENGTH(4);
	    ADD_BYTE(0xf0 | (c>>18));
	    ADD_BYTE(0x80 | ((c>>12) & 0x3f));
	    ADD_BYTE(0x80 | ((c>>6) & 0x3f));
	    ADD_BYTE(0x80 | (c & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xdc00) {
	    /* second surrogate without preceding first surrogate */
	    error++;
	}
	else {
	    CHECK_LENGTH(3);
	    ADD_BYTE(0xe0 | (src[spos]>>12));
	    ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
    }

    if (errp)
	*errp = error;

    return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
202