/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Dieter Baron.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: releng/11.0/sys/dev/hyperv/utilities/unicode.h 271493 2014-09-13 02:15:31Z delphij $
 */

#include <sys/types.h>

/*
 * Conversion flags.
 *
 * Only UNICODE_UTF8_LATIN1_FALLBACK is examined by the routines in this
 * file.  Its value (0x03) shares bits with the two flags above it, so the
 * test `flags & UNICODE_UTF8_LATIN1_FALLBACK' is also true when either of
 * them is passed.  The values are kept as they are for compatibility with
 * existing callers.
 */
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst      output buffer; may be NULL to only compute the required length
 * dst_len  capacity of dst in 16-bit units; units beyond it are counted
 *          but not stored
 * src      input bytes (need not be NUL-terminated)
 * src_len  number of input bytes
 * flags    if any bit of UNICODE_UTF8_LATIN1_FALLBACK is set, a byte
 *          >= 0xa0 that is not followed by a continuation byte is taken
 *          as an ISO 8859-1 character instead of being rejected
 * errp     if non-NULL, receives the number of malformed sequences that
 *          were skipped
 *
 * Returns the number of UTF-16 units the complete conversion produces,
 * which may exceed dst_len.  Malformed input (stray continuation bytes,
 * truncated or overlong sequences, encoded surrogates, code points above
 * U+10FFFF) is skipped one byte or one sequence at a time and counted.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
	const unsigned char *s;
	size_t spos, dpos, avail;
	int error;
	uint16_t c;

#define IS_CONT(c)	(((c) & 0xc0) == 0x80)

	error = 0;
	s = (const unsigned char *)src;
	spos = dpos = 0;
	while (spos < src_len) {
		/*
		 * Bytes left including the current one.  Comparing against
		 * this instead of `src_len - k' avoids unsigned wrap-around
		 * on short inputs and keeps every s[spos + k] read in bounds.
		 */
		avail = src_len - spos;

		if (s[spos] < 0x80) {
			c = s[spos++];
		} else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
		    && (avail < 2 || !IS_CONT(s[spos + 1]))
		    && s[spos] >= 0xa0) {
			/* not valid UTF-8, assume ISO 8859-1 */
			c = s[spos++];
		} else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
			/*
			 * continuation byte without lead byte
			 * or lead byte for codepoint above 0x10ffff
			 */
			error++;
			spos++;
			continue;
		} else if (s[spos] < 0xe0) {
			/* two-byte sequence: 110xxxxx 10xxxxxx */
			if (avail < 2 || !IS_CONT(s[spos + 1])) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x1f) << 6) | (s[spos + 1] & 0x3f);
			spos += 2;
			if (c < 0x80) {
				/* overlong encoding */
				error++;
				continue;
			}
		} else if (s[spos] < 0xf0) {
			/* three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
			if (avail < 3
			    || !IS_CONT(s[spos + 1]) || !IS_CONT(s[spos + 2])) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x0f) << 12)
			    | ((s[spos + 1] & 0x3f) << 6)
			    | (s[spos + 2] & 0x3f);
			spos += 3;
			if (c < 0x800 || (c & 0xf800) == 0xd800) {
				/* overlong encoding or encoded surrogate */
				error++;
				continue;
			}
		} else {
			uint32_t cc;

			/* four-byte sequence, becomes a UTF-16 surrogate pair */
			if (avail < 4 || !IS_CONT(s[spos + 1])
			    || !IS_CONT(s[spos + 2]) || !IS_CONT(s[spos + 3])) {
				spos++;
				error++;
				continue;
			}
			cc = ((uint32_t)(s[spos] & 0x07) << 18)
			    | ((uint32_t)(s[spos + 1] & 0x3f) << 12)
			    | ((uint32_t)(s[spos + 2] & 0x3f) << 6)
			    | (uint32_t)(s[spos + 3] & 0x3f);
			spos += 4;
			if (cc < 0x10000 || cc > 0x10ffff) {
				/* overlong encoding or beyond U+10FFFF */
				error++;
				continue;
			}
			cc -= 0x10000;
			if (dst && dpos < dst_len)
				dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
			dpos++;
			/* low ten bits go into the trailing surrogate */
			c = (uint16_t)(0xdc00 | (cc & 0x3ff));
		}

		if (dst && dpos < dst_len)
			dst[dpos] = c;
		dpos++;
	}

	if (errp)
		*errp = error;

	return dpos;

#undef IS_CONT
}


/*
 * Convert src_len UTF-16 code units at src into UTF-8 bytes.
 *
 * dst      output buffer; may be NULL to only compute the required length
 * dst_len  capacity of dst in bytes
 * src      input code units
 * src_len  number of input code units
 * flags    accepted for symmetry with utf8_to_utf16(); not examined
 * errp     if non-NULL, receives the number of unpaired surrogates that
 *          were skipped
 *
 * Returns the number of bytes the complete conversion produces, which may
 * exceed dst_len.  Output stops being stored at the first character that
 * does not fit entirely (no partial sequence is written), but counting
 * continues so the caller learns the size needed.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	size_t spos, dpos;
	int error;

	/* Stop storing once fewer than (l) bytes remain; written so that
	 * dst_len - (l) cannot wrap around. */
#define CHECK_LENGTH(l)							\
	do {								\
		if (dst_len < (size_t)(l) ||				\
		    dpos > dst_len - (size_t)(l))			\
			dst = NULL;					\
	} while (0)
#define ADD_BYTE(b)							\
	do {								\
		if (dst)						\
			dst[dpos] = (char)(b);				\
		dpos++;							\
	} while (0)

	(void)flags;

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		} else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos] >> 6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		} else if ((src[spos] & 0xfc00) == 0xd800) {
			uint32_t c;

			/* first surrogate: the next unit must be the second */
			if (spos + 1 >= src_len
			    || (src[spos + 1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			c = ((((uint32_t)src[spos] & 0x3ff) << 10)
			    | ((uint32_t)src[spos + 1] & 0x3ff)) + 0x10000;
			spos++;		/* consume the second surrogate too */
			CHECK_LENGTH(4);
			ADD_BYTE(0xf0 | (c >> 18));
			ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
			ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		} else if ((src[spos] & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		} else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos] >> 12));
			ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;

	return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}