1/* $NetBSD$ */ 2 3/* OpenLDAP: pkg/ldap/libraries/libldap/utf-8-conv.c,v 1.16.2.6 2010/04/13 20:23:01 kurt Exp */ 4/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2010 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in the file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17/* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved. 18 * 19 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND 20 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT 21 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS 22 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" 23 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION 24 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP 25 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT 26 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. 27 *--- 28 * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License 29 * can be found in the file "build/LICENSE-2.0.1" in this distribution 30 * of OpenLDAP Software. 31 */ 32 33/* 34 * UTF-8 Conversion Routines 35 * 36 * These routines convert between Wide Character and UTF-8, 37 * or between MultiByte and UTF-8 encodings. 38 * 39 * Both single character and string versions of the functions are provided. 40 * All functions return -1 if the character or string cannot be converted. 41 */ 42 43#include "portable.h" 44 45#if SIZEOF_WCHAR_T >= 4 46/* These routines assume ( sizeof(wchar_t) >= 4 ) */ 47 48#include <stdio.h> 49#include <ac/stdlib.h> /* For wctomb, wcstombs, mbtowc, mbstowcs */ 50#include <ac/string.h> 51#include <ac/time.h> /* for time_t */ 52 53#include "ldap-int.h" 54 55#include <ldap_utf8.h> 56 57static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 58 59 60/*----------------------------------------------------------------------------- 61 UTF-8 Format Summary 62 63ASCII chars 7 bits 64 0xxxxxxx 65 662-character UTF-8 sequence: 11 bits 67 110xxxxx 10xxxxxx 68 693-character UTF-8 16 bits 70 1110xxxx 10xxxxxx 10xxxxxx 71 724-char UTF-8 21 bits 73 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 74 755-char UTF-8 26 bits 76 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 77 786-char UTF-8 31 bits 79 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 80 81Unicode address space (0 - 0x10FFFF) 21 bits 82ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits 83 84Note: This code does not prevent UTF-8 sequences which are longer than 85 necessary from being decoded. 86*/ 87 88/*----------------------------------------------------------------------------- 89 Convert a UTF-8 character to a wide char. 90 Return the length of the UTF-8 input character in bytes. 91*/ 92int 93ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char ) 94{ 95 int utflen, i; 96 wchar_t ch; 97 98 if (utf8char == NULL) return -1; 99 100 /* Get UTF-8 sequence length from 1st byte */ 101 utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen); 102 103 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1; 104 105 /* First byte minus length tag */ 106 ch = (wchar_t)(utf8char[0] & mask[utflen]); 107 108 for(i=1; i < utflen; i++) { 109 /* Subsequent bytes must start with 10 */ 110 if ((utf8char[i] & 0xc0) != 0x80) return -1; 111 112 ch <<= 6; /* 6 bits of data in each subsequent byte */ 113 ch |= (wchar_t)(utf8char[i] & 0x3f); 114 } 115 116 if (wchar) *wchar = ch; 117 118 return utflen; 119} 120 121/*----------------------------------------------------------------------------- 122 Convert a UTF-8 string to a wide char string. 123 No more than 'count' wide chars will be written to the output buffer. 124 Return the size of the converted string in wide chars, excl null terminator. 125*/ 126int 127ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count ) 128{ 129 size_t wclen = 0; 130 int utflen, i; 131 wchar_t ch; 132 133 134 /* If input ptr is NULL or empty... */ 135 if (utf8str == NULL || !*utf8str) { 136 if ( wcstr ) 137 *wcstr = 0; 138 return 0; 139 } 140 141 /* Examine next UTF-8 character. If output buffer is NULL, ignore count */ 142 while ( *utf8str && (wcstr==NULL || wclen<count) ) { 143 /* Get UTF-8 sequence length from 1st byte */ 144 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen); 145 146 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1; 147 148 /* First byte minus length tag */ 149 ch = (wchar_t)(utf8str[0] & mask[utflen]); 150 151 for(i=1; i < utflen; i++) { 152 /* Subsequent bytes must start with 10 */ 153 if ((utf8str[i] & 0xc0) != 0x80) return -1; 154 155 ch <<= 6; /* 6 bits of data in each subsequent byte */ 156 ch |= (wchar_t)(utf8str[i] & 0x3f); 157 } 158 159 if (wcstr) wcstr[wclen] = ch; 160 161 utf8str += utflen; /* Move to next UTF-8 character */ 162 wclen++; /* Count number of wide chars stored/required */ 163 } 164 165 /* Add null terminator if there's room in the buffer. */ 166 if (wcstr && wclen < count) wcstr[wclen] = 0; 167 168 return wclen; 169} 170 171 172/*----------------------------------------------------------------------------- 173 Convert one wide char to a UTF-8 character. 174 Return the length of the converted UTF-8 character in bytes. 175 No more than 'count' bytes will be written to the output buffer. 176*/ 177int 178ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count ) 179{ 180 int len=0; 181 182 if (utf8char == NULL) /* Just determine the required UTF-8 char length. */ 183 { /* Ignore count */ 184 if( wchar < 0 ) 185 return -1; 186 if( wchar < 0x80 ) 187 return 1; 188 if( wchar < 0x800 ) 189 return 2; 190 if( wchar < 0x10000 ) 191 return 3; 192 if( wchar < 0x200000 ) 193 return 4; 194 if( wchar < 0x4000000 ) 195 return 5; 196#if SIZEOF_WCHAR_T > 4 197 /* UL is not strictly needed by ANSI C */ 198 if( wchar < (wchar_t)0x80000000UL ) 199#endif /* SIZEOF_WCHAR_T > 4 */ 200 return 6; 201 return -1; 202 } 203 204 205 if ( wchar < 0 ) { /* Invalid wide character */ 206 len = -1; 207 208 } else if( wchar < 0x80 ) { 209 if (count >= 1) { 210 utf8char[len++] = (char)wchar; 211 } 212 213 } else if( wchar < 0x800 ) { 214 if (count >=2) { 215 utf8char[len++] = 0xc0 | ( wchar >> 6 ); 216 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 217 } 218 219 } else if( wchar < 0x10000 ) { 220 if (count >= 3) { 221 utf8char[len++] = 0xe0 | ( wchar >> 12 ); 222 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 223 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 224 } 225 226 } else if( wchar < 0x200000 ) { 227 if (count >= 4) { 228 utf8char[len++] = 0xf0 | ( wchar >> 18 ); 229 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); 230 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 231 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 232 } 233 234 } else if( wchar < 0x4000000 ) { 235 if (count >= 5) { 236 utf8char[len++] = 0xf8 | ( wchar >> 24 ); 237 utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f ); 238 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); 239 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 240 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 241 } 242 243 } else 244#if SIZEOF_WCHAR_T > 4 245 /* UL is not strictly needed by ANSI C */ 246 if( wchar < (wchar_t)0x80000000UL ) 247#endif /* SIZEOF_WCHAR_T > 4 */ 248 { 249 if (count >= 6) { 250 utf8char[len++] = 0xfc | ( wchar >> 30 ); 251 utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f ); 252 utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f ); 253 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); 254 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 255 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 256 } 257 258#if SIZEOF_WCHAR_T > 4 259 } else { 260 len = -1; 261#endif /* SIZEOF_WCHAR_T > 4 */ 262 } 263 264 return len; 265 266} 267 268 269/*----------------------------------------------------------------------------- 270 Convert a wide char string to a UTF-8 string. 271 No more than 'count' bytes will be written to the output buffer. 272 Return the # of bytes written to the output buffer, excl null terminator. 273*/ 274int 275ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count ) 276{ 277 int len = 0; 278 int n; 279 char *p = utf8str; 280 wchar_t empty = 0; /* To avoid use of L"" construct */ 281 282 if (wcstr == NULL) /* Treat input ptr NULL as an empty string */ 283 wcstr = ∅ 284 285 if (utf8str == NULL) /* Just compute size of output, excl null */ 286 { 287 while (*wcstr) 288 { 289 /* Get UTF-8 size of next wide char */ 290 n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN); 291 if (n == -1) 292 return -1; 293 len += n; 294 } 295 296 return len; 297 } 298 299 300 /* Do the actual conversion. */ 301 302 n = 1; /* In case of empty wcstr */ 303 while (*wcstr) 304 { 305 n = ldap_x_wc_to_utf8( p, *wcstr++, count); 306 307 if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */ 308 break; 309 310 p += n; 311 count -= n; /* Space left in output buffer */ 312 } 313 314 /* If not enough room for last character, pad remainder with null 315 so that return value = original count, indicating buffer full. */ 316 if (n == 0) 317 { 318 while (count--) 319 *p++ = 0; 320 } 321 322 /* Add a null terminator if there's room. */ 323 else if (count) 324 *p = 0; 325 326 if (n == -1) /* Conversion encountered invalid wide char. */ 327 return -1; 328 329 /* Return the number of bytes written to output buffer, excl null. */ 330 return (p - utf8str); 331} 332 333 334/*----------------------------------------------------------------------------- 335 Convert a UTF-8 character to a MultiByte character. 336 Return the size of the converted character in bytes. 337*/ 338int 339ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char, 340 int (*f_wctomb)(char *mbchar, wchar_t wchar) ) 341{ 342 wchar_t wchar; 343 int n; 344 char tmp[6]; /* Large enough for biggest multibyte char */ 345 346 if (f_wctomb == NULL) /* If no conversion function was given... */ 347 f_wctomb = wctomb; /* use the local ANSI C function */ 348 349 /* First convert UTF-8 char to a wide char */ 350 n = ldap_x_utf8_to_wc( &wchar, utf8char); 351 352 if (n == -1) 353 return -1; /* Invalid UTF-8 character */ 354 355 if (mbchar == NULL) 356 n = f_wctomb( tmp, wchar ); 357 else 358 n = f_wctomb( mbchar, wchar); 359 360 return n; 361} 362 363/*----------------------------------------------------------------------------- 364 Convert a UTF-8 string to a MultiByte string. 365 No more than 'count' bytes will be written to the output buffer. 366 Return the size of the converted string in bytes, excl null terminator. 367*/ 368int 369ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count, 370 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) ) 371{ 372 wchar_t *wcs; 373 size_t wcsize; 374 int n; 375 376 if (f_wcstombs == NULL) /* If no conversion function was given... */ 377 f_wcstombs = wcstombs; /* use the local ANSI C function */ 378 379 if (utf8str == NULL || *utf8str == 0) /* NULL or empty input string */ 380 { 381 if (mbstr) 382 *mbstr = 0; 383 return 0; 384 } 385 386/* Allocate memory for the maximum size wchar string that we could get. */ 387 wcsize = strlen(utf8str) + 1; 388 wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t)); 389 if (wcs == NULL) 390 return -1; /* Memory allocation failure. */ 391 392 /* First convert the UTF-8 string to a wide char string */ 393 n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize); 394 395 /* Then convert wide char string to multi-byte string */ 396 if (n != -1) 397 { 398 n = f_wcstombs(mbstr, wcs, count); 399 } 400 401 LDAP_FREE(wcs); 402 403 return n; 404} 405 406/*----------------------------------------------------------------------------- 407 Convert a MultiByte character to a UTF-8 character. 408 'mbsize' indicates the number of bytes of 'mbchar' to check. 409 Returns the number of bytes written to the output character. 410*/ 411int 412ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize, 413 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) ) 414{ 415 wchar_t wchar; 416 int n; 417 418 if (f_mbtowc == NULL) /* If no conversion function was given... */ 419 f_mbtowc = mbtowc; /* use the local ANSI C function */ 420 421 if (mbsize == 0) /* 0 is not valid. */ 422 return -1; 423 424 if (mbchar == NULL || *mbchar == 0) 425 { 426 if (utf8char) 427 *utf8char = 0; 428 return 1; 429 } 430 431 /* First convert the MB char to a Wide Char */ 432 n = f_mbtowc( &wchar, mbchar, mbsize); 433 434 if (n == -1) 435 return -1; 436 437 /* Convert the Wide Char to a UTF-8 character. */ 438 n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN); 439 440 return n; 441} 442 443 444/*----------------------------------------------------------------------------- 445 Convert a MultiByte string to a UTF-8 string. 446 No more than 'count' bytes will be written to the output buffer. 447 Return the size of the converted string in bytes, excl null terminator. 448*/ 449int 450ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count, 451 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) ) 452{ 453 wchar_t *wcs; 454 int n; 455 size_t wcsize; 456 457 if (mbstr == NULL) /* Treat NULL input string as an empty string */ 458 mbstr = ""; 459 460 if (f_mbstowcs == NULL) /* If no conversion function was given... */ 461 f_mbstowcs = mbstowcs; /* use the local ANSI C function */ 462 463 /* Allocate memory for the maximum size wchar string that we could get. */ 464 wcsize = strlen(mbstr) + 1; 465 wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) ); 466 if (wcs == NULL) 467 return -1; 468 469 /* First convert multi-byte string to a wide char string */ 470 n = f_mbstowcs(wcs, mbstr, wcsize); 471 472 /* Convert wide char string to UTF-8 string */ 473 if (n != -1) 474 { 475 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count); 476 } 477 478 LDAP_FREE(wcs); 479 480 return n; 481} 482 483#endif /* SIZEOF_WCHAR_T >= 4 */ 484