1/* utf-8.c -- Basic UTF-8 routines */ 2/* $OpenLDAP$ */ 3/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 4 * 5 * Copyright 1998-2011 The OpenLDAP Foundation. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted only as authorized by the OpenLDAP 10 * Public License. 11 * 12 * A copy of this license is available in the file LICENSE in the 13 * top-level directory of the distribution or, alternatively, at 14 * <http://www.OpenLDAP.org/license.html>. 15 */ 16/* Basic UTF-8 routines 17 * 18 * These routines are "dumb". Though they understand UTF-8, 19 * they don't grok Unicode. That is, they can push bits, 20 * but don't have a clue what the bits represent. That's 21 * good enough for use with the LDAP Client SDK. 22 * 23 * These routines are not optimized. 24 */ 25 26#include "portable.h" 27 28#include <stdio.h> 29 30#include <ac/stdlib.h> 31 32#include <ac/socket.h> 33#include <ac/string.h> 34#include <ac/time.h> 35 36#include "ldap_utf8.h" 37 38#include "ldap-int.h" 39#include "ldap_defaults.h" 40 41/* 42 * return the number of bytes required to hold the 43 * NULL-terminated UTF-8 string NOT INCLUDING the 44 * termination. 45 */ 46ber_len_t ldap_utf8_bytes( const char * p ) 47{ 48 ber_len_t bytes; 49 50 for( bytes=0; p[bytes]; bytes++ ) { 51 /* EMPTY */ ; 52 } 53 54 return bytes; 55} 56 57ber_len_t ldap_utf8_chars( const char * p ) 58{ 59 /* could be optimized and could check for invalid sequences */ 60 ber_len_t chars=0; 61 62 for( ; *p ; LDAP_UTF8_INCR(p) ) { 63 chars++; 64 } 65 66 return chars; 67} 68 69/* return offset to next character */ 70int ldap_utf8_offset( const char * p ) 71{ 72 return LDAP_UTF8_NEXT(p) - p; 73} 74 75/* 76 * Returns length indicated by first byte. 77 */ 78const char ldap_utf8_lentab[] = { 79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 82 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 84 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 85 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 86 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }; 87 88int ldap_utf8_charlen( const char * p ) 89{ 90 if (!(*p & 0x80)) 91 return 1; 92 93 return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80]; 94} 95 96/* 97 * Make sure the UTF-8 char used the shortest possible encoding 98 * returns charlen if valid, 0 if not. 99 * 100 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4. 101 * The table is slightly modified from that of the RFC. 102 * 103 * UCS-4 range (hex) UTF-8 sequence (binary) 104 * 0000 0000-0000 007F 0....... 105 * 0000 0080-0000 07FF 110++++. 10...... 106 * 0000 0800-0000 FFFF 1110++++ 10+..... 10...... 107 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10...... 108 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10...... 109 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10...... 110 * 111 * The '.' bits are "don't cares". When validating a UTF-8 sequence, 112 * at least one of the '+' bits must be set, otherwise the character 113 * should have been encoded in fewer octets. Note that in the two-octet 114 * case, only the first octet needs to be validated, and this is done 115 * in the ldap_utf8_lentab[] above. 116 */ 117 118/* mask of required bits in second octet */ 119#undef c 120#define c const char 121c ldap_utf8_mintab[] = { 122 (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 123 (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 124 (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 125 (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 }; 126#undef c 127 128int ldap_utf8_charlen2( const char * p ) 129{ 130 int i = LDAP_UTF8_CHARLEN( p ); 131 132 if ( i > 2 ) { 133 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) ) 134 i = 0; 135 } 136 return i; 137} 138 139/* conv UTF-8 to UCS-4, useful for comparisons */ 140ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p ) 141{ 142 const unsigned char *c = (const unsigned char *) p; 143 ldap_ucs4_t ch; 144 int len, i; 145 static unsigned char mask[] = { 146 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 147 148 len = LDAP_UTF8_CHARLEN2(p, len); 149 150 if( len == 0 ) return LDAP_UCS4_INVALID; 151 152 ch = c[0] & mask[len]; 153 154 for(i=1; i < len; i++) { 155 if ((c[i] & 0xc0) != 0x80) { 156 return LDAP_UCS4_INVALID; 157 } 158 159 ch <<= 6; 160 ch |= c[i] & 0x3f; 161 } 162 163 return ch; 164} 165 166/* conv UCS-4 to UTF-8, not used */ 167int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf ) 168{ 169 int len=0; 170 unsigned char* p = (unsigned char *) buf; 171 172 /* not a valid Unicode character */ 173 if ( c < 0 ) return 0; 174 175 /* Just return length, don't convert */ 176 if(buf == NULL) { 177 if( c < 0x80 ) return 1; 178 else if( c < 0x800 ) return 2; 179 else if( c < 0x10000 ) return 3; 180 else if( c < 0x200000 ) return 4; 181 else if( c < 0x4000000 ) return 5; 182 else return 6; 183 } 184 185 if( c < 0x80 ) { 186 p[len++] = c; 187 188 } else if( c < 0x800 ) { 189 p[len++] = 0xc0 | ( c >> 6 ); 190 p[len++] = 0x80 | ( c & 0x3f ); 191 192 } else if( c < 0x10000 ) { 193 p[len++] = 0xe0 | ( c >> 12 ); 194 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 195 p[len++] = 0x80 | ( c & 0x3f ); 196 197 } else if( c < 0x200000 ) { 198 p[len++] = 0xf0 | ( c >> 18 ); 199 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 200 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 201 p[len++] = 0x80 | ( c & 0x3f ); 202 203 } else if( c < 0x4000000 ) { 204 p[len++] = 0xf8 | ( c >> 24 ); 205 p[len++] = 0x80 | ( (c >> 18) & 0x3f ); 206 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 207 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 208 p[len++] = 0x80 | ( c & 0x3f ); 209 210 } else /* if( c < 0x80000000 ) */ { 211 p[len++] = 0xfc | ( c >> 30 ); 212 p[len++] = 0x80 | ( (c >> 24) & 0x3f ); 213 p[len++] = 0x80 | ( (c >> 18) & 0x3f ); 214 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 215 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 216 p[len++] = 0x80 | ( c & 0x3f ); 217 } 218 219 return len; 220} 221 222#define LDAP_UCS_UTF8LEN(c) \ 223 c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \ 224 (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6))))) 225 226/* Convert a string to UTF-8 format. The input string is expected to 227 * have characters of 1, 2, or 4 octets (in network byte order) 228 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING 229 * types respectively. (Here T61STRING just means that there is one 230 * octet per character and characters may use the high bit of the octet. 231 * The characters are assumed to use ISO mappings, no provision is made 232 * for converting from T.61 coding rules to Unicode.) 233 */ 234 235int 236ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s ) 237{ 238 unsigned char *in, *end; 239 char *ptr; 240 ldap_ucs4_t u; 241 int i, l = 0; 242 243 utf8s->bv_val = NULL; 244 utf8s->bv_len = 0; 245 246 in = (unsigned char *)ucs->bv_val; 247 248 /* Make sure we stop at an even multiple of csize */ 249 end = in + ( ucs->bv_len & ~(csize-1) ); 250 251 for (; in < end; ) { 252 u = *in++; 253 if (csize > 1) { 254 u <<= 8; 255 u |= *in++; 256 } 257 if (csize > 2) { 258 u <<= 8; 259 u |= *in++; 260 u <<= 8; 261 u |= *in++; 262 } 263 i = LDAP_UCS_UTF8LEN(u); 264 if (i == 0) 265 return LDAP_INVALID_SYNTAX; 266 l += i; 267 } 268 269 utf8s->bv_val = LDAP_MALLOC( l+1 ); 270 if (utf8s->bv_val == NULL) 271 return LDAP_NO_MEMORY; 272 utf8s->bv_len = l; 273 274 ptr = utf8s->bv_val; 275 for (in = (unsigned char *)ucs->bv_val; in < end; ) { 276 u = *in++; 277 if (csize > 1) { 278 u <<= 8; 279 u |= *in++; 280 } 281 if (csize > 2) { 282 u <<= 8; 283 u |= *in++; 284 u <<= 8; 285 u |= *in++; 286 } 287 ptr += ldap_x_ucs4_to_utf8(u, ptr); 288 } 289 *ptr = '\0'; 290 return LDAP_SUCCESS; 291} 292 293/* 294 * Advance to the next UTF-8 character 295 * 296 * Ignores length of multibyte character, instead rely on 297 * continuation markers to find start of next character. 298 * This allows for "resyncing" of when invalid characters 299 * are provided provided the start of the next character 300 * is appears within the 6 bytes examined. 301 */ 302char* ldap_utf8_next( const char * p ) 303{ 304 int i; 305 const unsigned char *u = (const unsigned char *) p; 306 307 if( LDAP_UTF8_ISASCII(u) ) { 308 return (char *) &p[1]; 309 } 310 311 for( i=1; i<6; i++ ) { 312 if ( ( u[i] & 0xc0 ) != 0x80 ) { 313 return (char *) &p[i]; 314 } 315 } 316 317 return (char *) &p[i]; 318} 319 320/* 321 * Advance to the previous UTF-8 character 322 * 323 * Ignores length of multibyte character, instead rely on 324 * continuation markers to find start of next character. 325 * This allows for "resyncing" of when invalid characters 326 * are provided provided the start of the next character 327 * is appears within the 6 bytes examined. 328 */ 329char* ldap_utf8_prev( const char * p ) 330{ 331 int i; 332 const unsigned char *u = (const unsigned char *) p; 333 334 for( i=-1; i>-6 ; i-- ) { 335 if ( ( u[i] & 0xc0 ) != 0x80 ) { 336 return (char *) &p[i]; 337 } 338 } 339 340 return (char *) &p[i]; 341} 342 343/* 344 * Copy one UTF-8 character from src to dst returning 345 * number of bytes copied. 346 * 347 * Ignores length of multibyte character, instead rely on 348 * continuation markers to find start of next character. 349 * This allows for "resyncing" of when invalid characters 350 * are provided provided the start of the next character 351 * is appears within the 6 bytes examined. 352 */ 353int ldap_utf8_copy( char* dst, const char *src ) 354{ 355 int i; 356 const unsigned char *u = (const unsigned char *) src; 357 358 dst[0] = src[0]; 359 360 if( LDAP_UTF8_ISASCII(u) ) { 361 return 1; 362 } 363 364 for( i=1; i<6; i++ ) { 365 if ( ( u[i] & 0xc0 ) != 0x80 ) { 366 return i; 367 } 368 dst[i] = src[i]; 369 } 370 371 return i; 372} 373 374#ifndef UTF8_ALPHA_CTYPE 375/* 376 * UTF-8 ctype routines 377 * Only deals with characters < 0x80 (ie: US-ASCII) 378 */ 379 380int ldap_utf8_isascii( const char * p ) 381{ 382 unsigned c = * (const unsigned char *) p; 383 return LDAP_ASCII(c); 384} 385 386int ldap_utf8_isdigit( const char * p ) 387{ 388 unsigned c = * (const unsigned char *) p; 389 390 if(!LDAP_ASCII(c)) return 0; 391 392 return LDAP_DIGIT( c ); 393} 394 395int ldap_utf8_isxdigit( const char * p ) 396{ 397 unsigned c = * (const unsigned char *) p; 398 399 if(!LDAP_ASCII(c)) return 0; 400 401 return LDAP_HEX(c); 402} 403 404int ldap_utf8_isspace( const char * p ) 405{ 406 unsigned c = * (const unsigned char *) p; 407 408 if(!LDAP_ASCII(c)) return 0; 409 410 switch(c) { 411 case ' ': 412 case '\t': 413 case '\n': 414 case '\r': 415 case '\v': 416 case '\f': 417 return 1; 418 } 419 420 return 0; 421} 422 423/* 424 * These are not needed by the C SDK and are 425 * not "good enough" for general use. 426 */ 427int ldap_utf8_isalpha( const char * p ) 428{ 429 unsigned c = * (const unsigned char *) p; 430 431 if(!LDAP_ASCII(c)) return 0; 432 433 return LDAP_ALPHA(c); 434} 435 436int ldap_utf8_isalnum( const char * p ) 437{ 438 unsigned c = * (const unsigned char *) p; 439 440 if(!LDAP_ASCII(c)) return 0; 441 442 return LDAP_ALNUM(c); 443} 444 445int ldap_utf8_islower( const char * p ) 446{ 447 unsigned c = * (const unsigned char *) p; 448 449 if(!LDAP_ASCII(c)) return 0; 450 451 return LDAP_LOWER(c); 452} 453 454int ldap_utf8_isupper( const char * p ) 455{ 456 unsigned c = * (const unsigned char *) p; 457 458 if(!LDAP_ASCII(c)) return 0; 459 460 return LDAP_UPPER(c); 461} 462#endif 463 464 465/* 466 * UTF-8 string routines 467 */ 468 469/* like strchr() */ 470char * (ldap_utf8_strchr)( const char *str, const char *chr ) 471{ 472 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) { 473 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) { 474 return (char *) str; 475 } 476 } 477 478 return NULL; 479} 480 481/* like strcspn() but returns number of bytes, not characters */ 482ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set ) 483{ 484 const char *cstr; 485 const char *cset; 486 487 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) { 488 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) { 489 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) { 490 return cstr - str; 491 } 492 } 493 } 494 495 return cstr - str; 496} 497 498/* like strspn() but returns number of bytes, not characters */ 499ber_len_t (ldap_utf8_strspn)( const char *str, const char *set ) 500{ 501 const char *cstr; 502 const char *cset; 503 504 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) { 505 for( cset = set; ; LDAP_UTF8_INCR(cset) ) { 506 if( *cset == '\0' ) { 507 return cstr - str; 508 } 509 510 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) { 511 break; 512 } 513 } 514 } 515 516 return cstr - str; 517} 518 519/* like strpbrk(), replaces strchr() as well */ 520char *(ldap_utf8_strpbrk)( const char *str, const char *set ) 521{ 522 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) { 523 const char *cset; 524 525 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) { 526 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) { 527 return (char *) str; 528 } 529 } 530 } 531 532 return NULL; 533} 534 535/* like strtok_r(), not strtok() */ 536char *(ldap_utf8_strtok)(char *str, const char *sep, char **last) 537{ 538 char *begin; 539 char *end; 540 541 if( last == NULL ) return NULL; 542 543 begin = str ? str : *last; 544 545 begin += ldap_utf8_strspn( begin, sep ); 546 547 if( *begin == '\0' ) { 548 *last = NULL; 549 return NULL; 550 } 551 552 end = &begin[ ldap_utf8_strcspn( begin, sep ) ]; 553 554 if( *end != '\0' ) { 555 char *next = LDAP_UTF8_NEXT( end ); 556 *end = '\0'; 557 end = next; 558 } 559 560 *last = end; 561 return begin; 562} 563