/* unicode.h revision 272322 */
/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Dieter Baron.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/dev/hyperv/utilities/unicode.h 272322 2014-09-30 17:54:57Z delphij $
 */

#include <sys/types.h>

#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
/*
 * NOTE(review): 0x03 shares its bits with the two flags above, so passing
 * either of them also enables the Latin-1 fallback in utf8_to_utf16().
 * The value is left untouched because existing callers may depend on it.
 */
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst      output buffer, may be NULL to only compute the required length
 * dst_len  capacity of dst in 16-bit units; units that do not fit are
 *          counted but not stored
 * flags    only the UNICODE_UTF8_LATIN1_FALLBACK bits are examined here:
 *          a byte >= 0xa0 that is not followed by a continuation byte is
 *          then taken as an ISO 8859-1 character
 * errp     if not NULL, receives the number of invalid sequences skipped
 *
 * Returns the number of UTF-16 units the complete conversion produces,
 * which may exceed dst_len.  No terminator is appended.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
	const unsigned char *s;
	size_t spos, dpos, avail;
	int error;
	uint16_t c;

#define IS_CONT(c)	(((c)&0xc0) == 0x80)

	error = 0;
	s = (const unsigned char *)src;
	spos = dpos = 0;
	while (spos < src_len) {
		/*
		 * Bytes left including the lead byte.  Comparing against
		 * this instead of "src_len - k" avoids unsigned wrap-around
		 * on short inputs and keeps every s[spos+i] read in bounds.
		 */
		avail = src_len - spos;

		if (s[spos] < 0x80)
			c = s[spos++];
		else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
			 && (avail < 2 || !IS_CONT(s[spos+1]))
			 && s[spos] >= 0xa0) {
			/* not valid UTF-8, assume ISO 8859-1 */
			c = s[spos++];
		}
		else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
			/* continuation byte without lead byte
			   or lead byte for codepoint above 0x10ffff */
			error++;
			spos++;
			continue;
		}
		else if (s[spos] < 0xe0) {
			/* two-byte sequence */
			if (avail < 2 || !IS_CONT(s[spos+1])) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x1f) << 6) | (s[spos+1] & 0x3f);
			spos += 2;
			if (c < 0x80) {
				/* overlong encoding */
				error++;
				continue;
			}
		}
		else if (s[spos] < 0xf0) {
			/* three-byte sequence */
			if (avail < 3
			    || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
			    | (s[spos+2] & 0x3f);
			spos += 3;
			/* 0xf800 isolates the five bits shared by D800..DFFF */
			if (c < 0x800 || (c & 0xf800) == 0xd800) {
				/* overlong encoding or encoded surrogate */
				error++;
				continue;
			}
		}
		else {
			uint32_t cc;
			/* four-byte sequence, becomes a UTF-16 surrogate pair */

			if (avail < 4 || !IS_CONT(s[spos+1])
			    || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
				spos++;
				error++;
				continue;
			}
			/* a four-byte lead carries three payload bits */
			cc = ((uint32_t)(s[spos] & 0x07) << 18)
			    | ((uint32_t)(s[spos+1] & 0x3f) << 12)
			    | ((uint32_t)(s[spos+2] & 0x3f) << 6)
			    | (s[spos+3] & 0x3f);
			spos += 4;
			if (cc < 0x10000 || cc > 0x10ffff) {
				/* overlong encoding or beyond U+10FFFF */
				error++;
				continue;
			}
			cc -= 0x10000;
			if (dst && dpos < dst_len)
				dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
			dpos++;
			/* low surrogate holds exactly the low ten bits */
			c = (uint16_t)(0xdc00 | (cc & 0x3ff));
		}

		if (dst && dpos < dst_len)
			dst[dpos] = c;
		dpos++;
	}

	if (errp)
		*errp = error;

	return dpos;

#undef IS_CONT
}


/*
 * Convert src_len UTF-16 code units at src into UTF-8 bytes.
 *
 * dst      output buffer, may be NULL to only compute the required length
 * dst_len  capacity of dst in bytes; as soon as one encoded character
 *          does not fit completely, nothing further is stored but the
 *          length keeps being counted
 * flags    not examined by this function
 * errp     if not NULL, receives the number of unpaired surrogates skipped
 *
 * Returns the number of bytes the complete conversion produces, which
 * may exceed dst_len.  No terminator is appended.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	/* size_t so that inputs/outputs beyond 65535 units do not wrap */
	size_t spos, dpos;
	int error;

	(void)flags;

/* Stop storing once the next l bytes would not fit (no unsigned wrap). */
#define CHECK_LENGTH(l)	do {						\
	if (dst_len < (size_t)(l) || dpos > dst_len - (size_t)(l))	\
		dst = NULL;						\
} while (0)
#define ADD_BYTE(b)	do {						\
	if (dst)							\
		dst[dpos] = (char)(b);					\
	dpos++;								\
} while (0)

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		}
		else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos]>>6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xd800) {
			uint32_t c;
			/* first surrogate: the next unit must be a second one */
			if (spos + 1 >= src_len
			    || (src[spos+1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			/* combine before stepping over the second surrogate */
			c = ((((uint32_t)src[spos] & 0x3ff) << 10)
			    | ((uint32_t)src[spos+1] & 0x3ff)) + 0x10000;
			spos++;
			CHECK_LENGTH(4);
			ADD_BYTE(0xf0 | (c>>18));
			ADD_BYTE(0x80 | ((c>>12) & 0x3f));
			ADD_BYTE(0x80 | ((c>>6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		}
		else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos]>>12));
			ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;

	return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}