1/* 2 Unix SMB/CIFS implementation. 3 Character set conversion Extensions 4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001 5 Copyright (C) Andrew Tridgell 2001 6 Copyright (C) Simo Sorce 2001 7 Copyright (C) Martin Pool 2003 8 9 This program is free software; you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation; either version 2 of the License, or 12 (at your option) any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with this program; if not, write to the Free Software 21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 23*/ 24#ifdef HAVE_CONFIG_H 25#include "config.h" 26#endif /* HAVE_CONFIG_H */ 27 28#include <stdio.h> 29#include <stdlib.h> 30#include <unistd.h> 31#include <string.h> 32#include <ctype.h> 33#include <errno.h> 34#include <sys/stat.h> 35#include <sys/param.h> 36#ifdef HAVE_USABLE_ICONV 37#include <iconv.h> 38#endif 39#if HAVE_LOCALE_H 40#include <locale.h> 41#endif 42#if HAVE_LANGINFO_H 43#include <langinfo.h> 44#endif 45 46#include <netatalk/endian.h> 47#include <atalk/logger.h> 48#include <atalk/unicode.h> 49#include <atalk/util.h> 50#include "byteorder.h" 51 52 53/** 54 * @file 55 * 56 * @brief Character-set conversion routines built on our iconv. 57 * 58 * @note Samba's internal character set (at least in the 3.0 series) 59 * is always the same as the one for the Unix filesystem. It is 60 * <b>not</b> necessarily UTF-8 and may be different on machines that 61 * need i18n filenames to be compatible with Unix software. It does 62 * have to be a superset of ASCII. All multibyte sequences must start 63 * with a byte with the high bit set. 64 * 65 * @sa lib/iconv.c 66 */ 67 68 69#define MAX_CHARSETS 20 70 71#define CHECK_FLAGS(a,b) (((a)!=NULL) ? (*(a) & (b)) : 0 ) 72 73static atalk_iconv_t conv_handles[MAX_CHARSETS][MAX_CHARSETS]; 74static char* charset_names[MAX_CHARSETS]; 75static struct charset_functions* charsets[MAX_CHARSETS]; 76static char hexdig[] = "0123456789abcdef"; 77#define hextoint( c ) ( isdigit( c ) ? c - '0' : c + 10 - 'a' ) 78 79static char* read_charsets_from_env(charset_t ch) 80{ 81 char *name; 82 83 switch (ch) { 84 case CH_MAC: 85 if (( name = getenv( "ATALK_MAC_CHARSET" )) != NULL ) 86 return name; 87 else 88 return "MAC_ROMAN"; 89 break; 90 case CH_UNIX: 91 if (( name = getenv( "ATALK_UNIX_CHARSET" )) != NULL ) 92 return name; 93 else 94 return "LOCALE"; 95 break; 96 default: 97 break; 98 } 99 return "ASCII"; 100} 101 102 103/** 104 * Return the name of a charset to give to iconv(). 105 **/ 106static const char *charset_name(charset_t ch) 107{ 108 const char *ret = NULL; 109 static int first = 1; 110 static char macname[128]; 111 static char unixname[128]; 112 113 if (first) { 114 memset(macname, 0, sizeof(macname)); 115 memset(unixname, 0, sizeof(unixname)); 116 first = 0; 117 } 118 119 if (ch == CH_UCS2) ret = "UCS-2"; 120 else if (ch == CH_UTF8) ret = "UTF8"; 121 else if (ch == CH_UTF8_MAC) ret = "UTF8-MAC"; 122 else if (ch == CH_UNIX) { 123 if (unixname[0] == '\0') { 124 ret = read_charsets_from_env(CH_UNIX); 125 strlcpy(unixname, ret, sizeof(unixname)); 126 } 127 else 128 ret = unixname; 129 } 130 else if (ch == CH_MAC) { 131 if (macname[0] == '\0') { 132 ret = read_charsets_from_env(CH_MAC); 133 strlcpy(macname, ret, sizeof(macname)); 134 } 135 else 136 ret = macname; 137 } 138 139 if (!ret) 140 ret = charset_names[ch]; 141 142#if defined(HAVE_NL_LANGINFO) && defined(CODESET) 143 if (ret && strcasecmp(ret, "LOCALE") == 0) { 144 const char *ln = NULL; 145 146#ifdef HAVE_SETLOCALE 147 setlocale(LC_ALL, ""); 148#endif 149 ln = nl_langinfo(CODESET); 150 if (ln) { 151 /* Check whether the charset name is supported 152 by iconv */ 153 atalk_iconv_t handle = atalk_iconv_open(ln, "UCS-2"); 154 if (handle == (atalk_iconv_t) -1) { 155 LOG(log_debug, logtype_default, "Locale charset '%s' unsupported, using ASCII instead", ln); 156 ln = "ASCII"; 157 } else { 158 atalk_iconv_close(handle); 159 } 160 if (ch==CH_UNIX) 161 strlcpy(unixname, ln, sizeof(unixname)); 162 } 163 ret = ln; 164 } 165#else /* system doesn't have LOCALE support */ 166 if (ch == CH_UNIX) ret = NULL; 167#endif 168 169 if (!ret || !*ret) ret = "ASCII"; 170 return ret; 171} 172 173static struct charset_functions* get_charset_functions (charset_t ch) 174{ 175 if (charsets[ch] != NULL) 176 return charsets[ch]; 177 178 charsets[ch] = find_charset_functions(charset_name(ch)); 179 180 return charsets[ch]; 181} 182 183 184static void lazy_initialize_conv(void) 185{ 186 static int initialized = 0; 187 188 if (!initialized) { 189 initialized = 1; 190 init_iconv(); 191 } 192} 193 194charset_t add_charset(const char* name) 195{ 196 static charset_t max_charset_t = NUM_CHARSETS-1; 197 charset_t cur_charset_t = max_charset_t+1; 198 unsigned int c1; 199 200 lazy_initialize_conv(); 201 202 for (c1=0; c1<=max_charset_t;c1++) { 203 if ( strcasecmp(name, charset_name(c1)) == 0) 204 return (c1); 205 } 206 207 if ( cur_charset_t >= MAX_CHARSETS ) { 208 LOG (log_debug, logtype_default, "Adding charset %s failed, too many charsets (max. %u allowed)", 209 name, MAX_CHARSETS); 210 return (charset_t) -1; 211 } 212 213 /* First try to setup the required conversions */ 214 215 conv_handles[cur_charset_t][CH_UCS2] = atalk_iconv_open( charset_name(CH_UCS2), name); 216 if (conv_handles[cur_charset_t][CH_UCS2] == (atalk_iconv_t)-1) { 217 LOG(log_error, logtype_default, "Required conversion from %s to %s not supported", 218 name, charset_name(CH_UCS2)); 219 conv_handles[cur_charset_t][CH_UCS2] = NULL; 220 return (charset_t) -1; 221 } 222 223 conv_handles[CH_UCS2][cur_charset_t] = atalk_iconv_open( name, charset_name(CH_UCS2)); 224 if (conv_handles[CH_UCS2][cur_charset_t] == (atalk_iconv_t)-1) { 225 LOG(log_error, logtype_default, "Required conversion from %s to %s not supported", 226 charset_name(CH_UCS2), name); 227 conv_handles[CH_UCS2][cur_charset_t] = NULL; 228 return (charset_t) -1; 229 } 230 231 /* register the new charset_t name */ 232 charset_names[cur_charset_t] = strdup(name); 233 234 charsets[cur_charset_t] = get_charset_functions (cur_charset_t); 235 max_charset_t++; 236 237#ifdef DEBUG 238 LOG(log_debug9, logtype_default, "Added charset %s with handle %u", name, cur_charset_t); 239#endif 240 return (cur_charset_t); 241} 242 243/** 244 * Initialize iconv conversion descriptors. 245 * 246 * This is called the first time it is needed, and also called again 247 * every time the configuration is reloaded, because the charset or 248 * codepage might have changed. 249 **/ 250void init_iconv(void) 251{ 252 int c1; 253 254 for (c1=0;c1<NUM_CHARSETS;c1++) { 255 const char *name = charset_name((charset_t)c1); 256 257 conv_handles[c1][CH_UCS2] = atalk_iconv_open( charset_name(CH_UCS2), name); 258 if (conv_handles[c1][CH_UCS2] == (atalk_iconv_t)-1) { 259 LOG(log_error, logtype_default, "Required conversion from %s to %s not supported", 260 name, charset_name(CH_UCS2)); 261 conv_handles[c1][CH_UCS2] = NULL; 262 } 263 264 if (c1 != CH_UCS2) { /* avoid lost memory, make valgrind happy */ 265 conv_handles[CH_UCS2][c1] = atalk_iconv_open( name, charset_name(CH_UCS2)); 266 if (conv_handles[CH_UCS2][c1] == (atalk_iconv_t)-1) { 267 LOG(log_error, logtype_default, "Required conversion from %s to %s not supported", 268 charset_name(CH_UCS2), name); 269 conv_handles[CH_UCS2][c1] = NULL; 270 } 271 } 272 273 charsets[c1] = get_charset_functions (c1); 274 } 275} 276 277/** 278 * 279 **/ 280static size_t add_null(charset_t to, char *buf, size_t bytesleft, size_t len) 281{ 282 /* Terminate the string */ 283 if (to == CH_UCS2 && bytesleft >= 2) { 284 buf[len] = 0; 285 buf[len+1] = 0; 286 287 } 288 else if ( to != CH_UCS2 && bytesleft > 0 ) 289 buf[len] = 0; 290 else { 291 errno = E2BIG; 292 return (size_t)(-1); 293 } 294 295 return len; 296} 297 298 299/** 300 * Convert string from one encoding to another, making error checking etc 301 * 302 * @param src pointer to source string (multibyte or singlebyte) 303 * @param srclen length of the source string in bytes 304 * @param dest pointer to destination string (multibyte or singlebyte) 305 * @param destlen maximal length allowed for string 306 * @returns the number of bytes occupied in the destination 307 **/ 308static size_t convert_string_internal(charset_t from, charset_t to, 309 void const *src, size_t srclen, 310 void *dest, size_t destlen) 311{ 312 size_t i_len, o_len; 313 size_t retval; 314 const char* inbuf = (const char*)src; 315 char* outbuf = (char*)dest; 316 char* o_save = outbuf; 317 atalk_iconv_t descriptor; 318 319 /* Fixed based on Samba 3.0.6 */ 320 if (srclen == (size_t)-1) { 321 if (from == CH_UCS2) { 322 srclen = (strlen_w((const ucs2_t *)src)) * 2; 323 } else { 324 srclen = strlen((const char *)src); 325 } 326 } 327 328 329 lazy_initialize_conv(); 330 331 descriptor = conv_handles[from][to]; 332 333 if (descriptor == (atalk_iconv_t)-1 || descriptor == (atalk_iconv_t)0) { 334 return (size_t) -1; 335 } 336 337 i_len=srclen; 338 o_len=destlen; 339 retval = atalk_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len); 340 if(retval==(size_t)-1) { 341 const char *reason="unknown error"; 342 switch(errno) { 343 case EINVAL: 344 reason="Incomplete multibyte sequence"; 345 break; 346 case E2BIG: 347 reason="No more room"; 348 break; 349 case EILSEQ: 350 reason="Illegal multibyte sequence"; 351 break; 352 } 353 LOG(log_debug, logtype_default,"Conversion error: %s",reason); 354 return (size_t)-1; 355 } 356 357 /* Terminate the string */ 358 return add_null( to, o_save, o_len, destlen -o_len); 359} 360 361 362size_t convert_string(charset_t from, charset_t to, 363 void const *src, size_t srclen, 364 void *dest, size_t destlen) 365{ 366 size_t i_len, o_len; 367 ucs2_t *u; 368 ucs2_t buffer[MAXPATHLEN]; 369 ucs2_t buffer2[MAXPATHLEN]; 370 371 /* convert from_set to UCS2 */ 372 if ((size_t)-1 == ( o_len = convert_string_internal( from, CH_UCS2, src, srclen, 373 (char*) buffer, sizeof(buffer))) ) { 374 LOG(log_error, logtype_default, "Conversion failed ( %s to CH_UCS2 )", charset_name(from)); 375 return (size_t) -1; 376 } 377 378 /* Do pre/decomposition */ 379 i_len = sizeof(buffer2); 380 u = buffer2; 381 if (charsets[to] && (charsets[to]->flags & CHARSET_DECOMPOSED) ) { 382 if ( (size_t)-1 == (i_len = decompose_w(buffer, o_len, u, &i_len)) ) 383 return (size_t)-1; 384 } 385 else if (!charsets[from] || (charsets[from]->flags & CHARSET_DECOMPOSED)) { 386 if ( (size_t)-1 == (i_len = precompose_w(buffer, o_len, u, &i_len)) ) 387 return (size_t)-1; 388 } 389 else { 390 u = buffer; 391 i_len = o_len; 392 } 393 /* Convert UCS2 to to_set */ 394 if ((size_t)(-1) == ( o_len = convert_string_internal( CH_UCS2, to, (char*) u, i_len, dest, destlen)) ) { 395 LOG(log_error, logtype_default, "Conversion failed (CH_UCS2 to %s):%s", charset_name(to), strerror(errno)); 396 return (size_t) -1; 397 } 398 399 return o_len; 400} 401 402 403 404/** 405 * Convert between character sets, allocating a new buffer for the result. 406 * 407 * @param srclen length of source buffer. 408 * @param dest always set at least to NULL 409 * @note -1 is not accepted for srclen. 410 * 411 * @returns Size in bytes of the converted string; or -1 in case of error. 412 **/ 413 414static size_t convert_string_allocate_internal(charset_t from, charset_t to, 415 void const *src, size_t srclen, char **dest) 416{ 417 size_t i_len, o_len, destlen; 418 size_t retval; 419 const char *inbuf = (const char *)src; 420 char *outbuf = NULL, *ob = NULL; 421 atalk_iconv_t descriptor; 422 423 *dest = NULL; 424 425 if (src == NULL || srclen == (size_t)-1) 426 return (size_t)-1; 427 428 lazy_initialize_conv(); 429 430 descriptor = conv_handles[from][to]; 431 432 if (descriptor == (atalk_iconv_t)-1 || descriptor == (atalk_iconv_t)0) { 433 /* conversion not supported, return -1*/ 434 LOG(log_debug, logtype_default, "convert_string_allocate: conversion not supported!"); 435 return -1; 436 } 437 438 destlen = MAX(srclen, 512); 439convert: 440 destlen = destlen * 2; 441 outbuf = (char *)realloc(ob, destlen); 442 if (!outbuf) { 443 LOG(log_debug, logtype_default,"convert_string_allocate: realloc failed!"); 444 SAFE_FREE(ob); 445 return (size_t)-1; 446 } else { 447 ob = outbuf; 448 } 449 inbuf = src; /* this restarts the whole conversion if buffer needed to be increased */ 450 i_len = srclen; 451 o_len = destlen; 452 retval = atalk_iconv(descriptor, 453 &inbuf, &i_len, 454 &outbuf, &o_len); 455 if(retval == (size_t)-1) { 456 const char *reason="unknown error"; 457 switch(errno) { 458 case EINVAL: 459 reason="Incomplete multibyte sequence"; 460 break; 461 case E2BIG: 462 goto convert; 463 case EILSEQ: 464 reason="Illegal multibyte sequence"; 465 break; 466 } 467 LOG(log_debug, logtype_default,"Conversion error: %s(%s)",reason,inbuf); 468 SAFE_FREE(ob); 469 return (size_t)-1; 470 } 471 472 473 destlen = destlen - o_len; 474 475 /* Terminate the string */ 476 if (to == CH_UCS2 && o_len >= 2) { 477 ob[destlen] = 0; 478 ob[destlen+1] = 0; 479 *dest = (char *)realloc(ob,destlen+2); 480 } 481 else if ( to != CH_UCS2 && o_len > 0 ) { 482 ob[destlen] = 0; 483 *dest = (char *)realloc(ob,destlen+1); 484 } 485 else { 486 goto convert; /* realloc */ 487 } 488 489 if (destlen && !*dest) { 490 LOG(log_debug, logtype_default, "convert_string_allocate: out of memory!"); 491 SAFE_FREE(ob); 492 return (size_t)-1; 493 } 494 495 return destlen; 496} 497 498 499size_t convert_string_allocate(charset_t from, charset_t to, 500 void const *src, size_t srclen, 501 char ** dest) 502{ 503 size_t i_len, o_len; 504 ucs2_t *u; 505 ucs2_t buffer[MAXPATHLEN]; 506 ucs2_t buffer2[MAXPATHLEN]; 507 508 *dest = NULL; 509 510 /* convert from_set to UCS2 */ 511 if ((size_t)(-1) == ( o_len = convert_string_internal( from, CH_UCS2, src, srclen, 512 buffer, sizeof(buffer))) ) { 513 LOG(log_error, logtype_default, "Conversion failed ( %s to CH_UCS2 )", charset_name(from)); 514 return (size_t) -1; 515 } 516 517 /* Do pre/decomposition */ 518 i_len = sizeof(buffer2); 519 u = buffer2; 520 if (charsets[to] && (charsets[to]->flags & CHARSET_DECOMPOSED) ) { 521 if ( (size_t)-1 == (i_len = decompose_w(buffer, o_len, u, &i_len)) ) 522 return (size_t)-1; 523 } 524 else if ( !charsets[from] || (charsets[from]->flags & CHARSET_DECOMPOSED) ) { 525 if ( (size_t)-1 == (i_len = precompose_w(buffer, o_len, u, &i_len)) ) 526 return (size_t)-1; 527 } 528 else { 529 u = buffer; 530 i_len = o_len; 531 } 532 533 /* Convert UCS2 to to_set */ 534 if ((size_t)-1 == ( o_len = convert_string_allocate_internal( CH_UCS2, to, (char*)u, i_len, dest)) ) 535 LOG(log_error, logtype_default, "Conversion failed (CH_UCS2 to %s):%s", charset_name(to), strerror(errno)); 536 537 return o_len; 538 539} 540 541size_t charset_strupper(charset_t ch, const char *src, size_t srclen, char *dest, size_t destlen) 542{ 543 size_t size; 544 char *buffer; 545 546 size = convert_string_allocate_internal(ch, CH_UCS2, src, srclen, 547 (char**) &buffer); 548 if (size == (size_t)-1) { 549 SAFE_FREE(buffer); 550 return size; 551 } 552 if (!strupper_w((ucs2_t *)buffer) && (dest == src)) { 553 free(buffer); 554 return srclen; 555 } 556 557 size = convert_string_internal(CH_UCS2, ch, buffer, size, dest, destlen); 558 free(buffer); 559 return size; 560} 561 562size_t charset_strlower(charset_t ch, const char *src, size_t srclen, char *dest, size_t destlen) 563{ 564 size_t size; 565 char *buffer; 566 567 size = convert_string_allocate_internal(ch, CH_UCS2, src, srclen, 568 (char **) &buffer); 569 if (size == (size_t)-1) { 570 SAFE_FREE(buffer); 571 return size; 572 } 573 if (!strlower_w((ucs2_t *)buffer) && (dest == src)) { 574 free(buffer); 575 return srclen; 576 } 577 578 size = convert_string_internal(CH_UCS2, ch, buffer, size, dest, destlen); 579 free(buffer); 580 return size; 581} 582 583 584size_t unix_strupper(const char *src, size_t srclen, char *dest, size_t destlen) 585{ 586 return charset_strupper( CH_UNIX, src, srclen, dest, destlen); 587} 588 589size_t unix_strlower(const char *src, size_t srclen, char *dest, size_t destlen) 590{ 591 return charset_strlower( CH_UNIX, src, srclen, dest, destlen); 592} 593 594size_t utf8_strupper(const char *src, size_t srclen, char *dest, size_t destlen) 595{ 596 return charset_strupper( CH_UTF8, src, srclen, dest, destlen); 597} 598 599size_t utf8_strlower(const char *src, size_t srclen, char *dest, size_t destlen) 600{ 601 return charset_strlower( CH_UTF8, src, srclen, dest, destlen); 602} 603 604/** 605 * Copy a string from a charset_t char* src to a UCS2 destination, allocating a buffer 606 * 607 * @param dest always set at least to NULL 608 * 609 * @returns The number of bytes occupied by the string in the destination 610 * or -1 in case of error. 611 **/ 612 613size_t charset_to_ucs2_allocate(charset_t ch, ucs2_t **dest, const char *src) 614{ 615 size_t src_len = strlen(src); 616 617 *dest = NULL; 618 return convert_string_allocate(ch, CH_UCS2, src, src_len, (char**) dest); 619} 620 621/** ----------------------------------- 622 * Copy a string from a charset_t char* src to a UTF-8 destination, allocating a buffer 623 * 624 * @param dest always set at least to NULL 625 * 626 * @returns The number of bytes occupied by the string in the destination 627 **/ 628 629size_t charset_to_utf8_allocate(charset_t ch, char **dest, const char *src) 630{ 631 size_t src_len = strlen(src); 632 633 *dest = NULL; 634 return convert_string_allocate(ch, CH_UTF8, src, src_len, dest); 635} 636 637/** ----------------------------------- 638 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer 639 * 640 * @param dest always set at least to NULL 641 * 642 * @returns The number of bytes occupied by the string in the destination 643 **/ 644 645size_t ucs2_to_charset(charset_t ch, const ucs2_t *src, char *dest, size_t destlen) 646{ 647 size_t src_len = (strlen_w(src)) * sizeof(ucs2_t); 648 return convert_string(CH_UCS2, ch, src, src_len, dest, destlen); 649} 650 651/* --------------------------------- */ 652size_t ucs2_to_charset_allocate(charset_t ch, char **dest, const ucs2_t *src) 653{ 654 size_t src_len = (strlen_w(src)) * sizeof(ucs2_t); 655 *dest = NULL; 656 return convert_string_allocate(CH_UCS2, ch, src, src_len, dest); 657} 658 659/** --------------------------------- 660 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer 661 * 662 * @param dest always set at least to NULL 663 * 664 * @returns The number of bytes occupied by the string in the destination 665 **/ 666 667size_t utf8_to_charset_allocate(charset_t ch, char **dest, const char *src) 668{ 669 size_t src_len = strlen(src); 670 *dest = NULL; 671 return convert_string_allocate(CH_UTF8, ch, src, src_len, dest); 672} 673 674size_t charset_precompose ( charset_t ch, char * src, size_t inlen, char * dst, size_t outlen) 675{ 676 char *buffer; 677 ucs2_t u[MAXPATHLEN]; 678 size_t len; 679 size_t ilen; 680 681 if ((size_t)(-1) == (len = convert_string_allocate_internal(ch, CH_UCS2, src, inlen, &buffer)) ) 682 return len; 683 684 ilen=sizeof(u); 685 686 if ( (size_t)-1 == (ilen = precompose_w((ucs2_t *)buffer, len, u, &ilen)) ) { 687 free (buffer); 688 return (size_t)(-1); 689 } 690 691 if ((size_t)(-1) == (len = convert_string_internal( CH_UCS2, ch, (char*)u, ilen, dst, outlen)) ) { 692 free (buffer); 693 return (size_t)(-1); 694 } 695 696 free(buffer); 697 return (len); 698} 699 700size_t charset_decompose ( charset_t ch, char * src, size_t inlen, char * dst, size_t outlen) 701{ 702 char *buffer; 703 ucs2_t u[MAXPATHLEN]; 704 size_t len; 705 size_t ilen; 706 707 if ((size_t)(-1) == (len = convert_string_allocate_internal(ch, CH_UCS2, src, inlen, &buffer)) ) 708 return len; 709 710 ilen=sizeof(u); 711 712 if ( (size_t)-1 == (ilen = decompose_w((ucs2_t *)buffer, len, u, &ilen)) ) { 713 free (buffer); 714 return (size_t)(-1); 715 } 716 717 if ((size_t)(-1) == (len = convert_string_internal( CH_UCS2, ch, (char*)u, ilen, dst, outlen)) ) { 718 free (buffer); 719 return (size_t)(-1); 720 } 721 722 free(buffer); 723 return (len); 724} 725 726size_t utf8_precompose ( char * src, size_t inlen, char * dst, size_t outlen) 727{ 728 return charset_precompose ( CH_UTF8, src, inlen, dst, outlen); 729} 730 731size_t utf8_decompose ( char * src, size_t inlen, char * dst, size_t outlen) 732{ 733 return charset_decompose ( CH_UTF8, src, inlen, dst, outlen); 734} 735 736#if 0 737static char debugbuf[ MAXPATHLEN +1 ]; 738char * debug_out ( char * seq, size_t len) 739{ 740 size_t i = 0; 741 unsigned char *p; 742 char *q; 743 744 p = (unsigned char*) seq; 745 q = debugbuf; 746 747 for ( i = 0; i<=(len-1); i++) 748 { 749 sprintf(q, "%2.2x.", *p); 750 q += 3; 751 p++; 752 } 753 *q=0; 754 q = debugbuf; 755 return q; 756} 757#endif 758 759/* 760 * Convert from MB to UCS2 charset 761 * Flags: 762 * CONV_UNESCAPEHEX: ':XX' will be converted to an UCS2 character 763 * CONV_IGNORE: return the first convertable characters. 764 * CONV_FORCE: force convertion 765 * FIXME: 766 * This will *not* work if the destination charset is not multibyte, i.e. UCS2->UCS2 will fail 767 * The (un)escape scheme is not compatible to the old cap style escape. This is bad, we need it 768 * for e.g. HFS cdroms. 769 */ 770 771static size_t pull_charset_flags (charset_t from_set, charset_t cap_set, const char *src, size_t srclen, char* dest, size_t destlen, u_int16_t *flags) 772{ 773 const u_int16_t option = (flags ? *flags : 0); 774 size_t i_len, o_len; 775 size_t j = 0; 776 const char* inbuf = (const char*)src; 777 char* outbuf = dest; 778 atalk_iconv_t descriptor; 779 atalk_iconv_t descriptor_cap; 780 781 if (srclen == (size_t)-1) 782 srclen = strlen(src) + 1; 783 784 descriptor = conv_handles[from_set][CH_UCS2]; 785 descriptor_cap = conv_handles[cap_set][CH_UCS2]; 786 787 if (descriptor == (atalk_iconv_t)-1 || descriptor == (atalk_iconv_t)0) { 788 errno = EINVAL; 789 return (size_t)-1; 790 } 791 792 i_len=srclen; 793 o_len=destlen; 794 795 while (i_len > 0) { 796 if ((option & CONV_UNESCAPEHEX)) { 797 for (j = 0; j < i_len; ++j) { 798 if (inbuf[j] == ':') break; 799 } 800 j = i_len - j; 801 i_len -= j; 802 } 803 804 if (i_len > 0 && 805 atalk_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len) == (size_t)-1) { 806 if (errno == EILSEQ || errno == EINVAL) { 807 errno = EILSEQ; 808 if ((option & CONV_IGNORE)) { 809 *flags |= CONV_REQMANGLE; 810 return destlen - o_len; 811 } 812 if ((option & CONV__EILSEQ)) { 813 if (o_len < 2) { 814 errno = E2BIG; 815 goto end; 816 } 817 *((ucs2_t *)outbuf) = (ucs2_t) IGNORE_CHAR; /**inbuf */ 818 inbuf++; 819 i_len--; 820 outbuf += 2; 821 o_len -= 2; 822 /* FIXME reset stat ? */ 823 continue; 824 } 825 } 826 goto end; 827 } 828 829 if (j) { 830 /* we're at the start on an hex encoded ucs2 char */ 831 char h[MAXPATHLEN]; 832 size_t hlen = 0; 833 834 i_len = j, j = 0; 835 while (i_len >= 3 && inbuf[0] == ':' && 836 isxdigit(inbuf[1]) && isxdigit(inbuf[2])) { 837 h[hlen++] = (hextoint(inbuf[1]) << 4) | hextoint(inbuf[2]); 838 inbuf += 3; 839 i_len -= 3; 840 } 841 if (hlen) { 842 const char *h_buf = h; 843 if (atalk_iconv(descriptor_cap, &h_buf, &hlen, &outbuf, &o_len) == (size_t)-1) { 844 i_len += hlen * 3; 845 inbuf -= hlen * 3; 846 if (errno == EILSEQ && (option & CONV_IGNORE)) { 847 *flags |= CONV_REQMANGLE; 848 return destlen - o_len; 849 } 850 goto end; 851 } 852 } else { 853 /* We have an invalid :xx sequence */ 854 errno = EILSEQ; 855 if ((option & CONV_IGNORE)) { 856 *flags |= CONV_REQMANGLE; 857 return destlen - o_len; 858 } 859 goto end; 860 } 861 } 862 } 863end: 864 return (i_len + j == 0 || (option & CONV_FORCE)) ? destlen - o_len : (size_t)-1; 865} 866 867/* 868 * Convert from UCS2 to MB charset 869 * Flags: 870 * CONV_ESCAPEDOTS: escape leading dots 871 * CONV_ESCAPEHEX: unconvertable characters and '/' will be escaped to :XX 872 * CONV_IGNORE: return the first convertable characters. 873 * CONV__EILSEQ: unconvertable characters will be replaced with '_' 874 * CONV_FORCE: force convertion 875 * FIXME: 876 * CONV_IGNORE and CONV_ESCAPEHEX can't work together. Should we check this ? 877 * This will *not* work if the destination charset is not multibyte, i.e. UCS2->UCS2 will fail 878 * The escape scheme is not compatible to the old cap style escape. This is bad, we need it 879 * for e.g. HFS cdroms. 880 */ 881 882 883static size_t push_charset_flags (charset_t to_set, charset_t cap_set, char* src, size_t srclen, char* dest, size_t destlen, u_int16_t *flags) 884{ 885 const u_int16_t option = (flags ? *flags : 0); 886 size_t i_len, o_len, i; 887 size_t j = 0; 888 const char* inbuf = (const char*)src; 889 char* outbuf = (char*)dest; 890 atalk_iconv_t descriptor; 891 atalk_iconv_t descriptor_cap; 892 char escch; /* 150210: uninitialized OK, depends on j */ 893 894 descriptor = conv_handles[CH_UCS2][to_set]; 895 descriptor_cap = conv_handles[CH_UCS2][cap_set]; 896 897 if (descriptor == (atalk_iconv_t)-1 || descriptor == (atalk_iconv_t)0) { 898 errno = EINVAL; 899 return (size_t) -1; 900 } 901 902 i_len=srclen; 903 o_len=destlen; 904 905 if ((option & CONV_ESCAPEDOTS) && 906 i_len >= 2 && SVAL(inbuf, 0) == 0x002e) { /* 0x002e = . */ 907 if (o_len < 3) { 908 errno = E2BIG; 909 goto end; 910 } 911 *outbuf++ = ':'; 912 *outbuf++ = '2'; 913 *outbuf++ = 'e'; 914 o_len -= 3; 915 inbuf += 2; 916 i_len -= 2; 917 *flags |= CONV_REQESCAPE; 918 } 919 920 while (i_len >= 2) { 921 if ((option & CONV_ESCAPEHEX)) { 922 for (i = 0; i < i_len; i += 2) { 923 ucs2_t c = SVAL(inbuf, i); 924 switch (c) { 925 case 0x003a: /* 0x003a = ':' */ 926 if ( ! (option & CONV_ALLOW_COLON)) { 927 errno = EILSEQ; 928 goto end; 929 } 930 escch = c; 931 j = i_len - i; 932 i_len = i; 933 break; 934 case 0x002f: /* 0x002f = '/' */ 935 escch = c; 936 j = i_len - i; 937 i_len = i; 938 break; 939 } 940 } 941 } 942 while (i_len > 0 && 943 atalk_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len) == (size_t)-1) { 944 if (errno == EILSEQ) { 945 if ((option & CONV_IGNORE)) { 946 *flags |= CONV_REQMANGLE; 947 return destlen - o_len; 948 } 949 if ((option & CONV_ESCAPEHEX)) { 950 const size_t bufsiz = o_len / 3 + 1; 951 char *buf = malloc(bufsiz); 952 size_t buflen; 953 954 if (!buf) 955 goto end; 956 i = i_len; 957 for (buflen = 1; buflen <= bufsiz; ++buflen) { 958 char *b = buf; 959 size_t o = buflen; 960 if (atalk_iconv(descriptor_cap, &inbuf, &i, &b, &o) != (size_t)-1) { 961 buflen -= o; 962 break; 963 } else if (errno != E2BIG) { 964 SAFE_FREE(buf); 965 goto end; 966 } else if (o < buflen) { 967 buflen -= o; 968 break; 969 } 970 } 971 if (o_len < buflen * 3) { 972 SAFE_FREE(buf); 973 errno = E2BIG; 974 goto end; 975 } 976 o_len -= buflen * 3; 977 i_len = i; 978 for (i = 0; i < buflen; ++i) { 979 *outbuf++ = ':'; 980 *outbuf++ = hexdig[(buf[i] >> 4) & 0x0f]; 981 *outbuf++ = hexdig[buf[i] & 0x0f]; 982 } 983 SAFE_FREE(buf); 984 *flags |= CONV_REQESCAPE; 985 continue; 986 } 987 } 988 goto end; 989 } 990 991 if (j) { 992 i_len = j, j = 0; 993 if (o_len < 3) { 994 errno = E2BIG; 995 goto end; 996 } 997 switch (escch) { 998 case '/': 999 *outbuf++ = ':'; 1000 *outbuf++ = '2'; 1001 *outbuf++ = 'f'; 1002 break; 1003 case ':': 1004 *outbuf++ = ':'; 1005 *outbuf++ = '3'; 1006 *outbuf++ = 'a'; 1007 break; 1008 default: 1009 /* 1010 * THIS SHOULD NEVER BE REACHED !!! 1011 * As a safety net I put in a ' ' here 1012 */ 1013 *outbuf++ = ':'; 1014 *outbuf++ = '2'; 1015 *outbuf++ = '0'; 1016 break; 1017 } 1018 o_len -= 3; 1019 inbuf += 2; 1020 i_len -= 2; 1021 } 1022 } 1023 if (i_len > 0) errno = EINVAL; 1024end: 1025 return (i_len + j == 0 || (option & CONV_FORCE)) ? destlen - o_len : (size_t)-1; 1026} 1027 1028/* 1029 * FIXME the size is a mess we really need a malloc/free logic 1030 *`dest size must be dest_len +2 1031 */ 1032size_t convert_charset ( charset_t from_set, charset_t to_set, charset_t cap_charset, const char *src, size_t src_len, char *dest, size_t dest_len, u_int16_t *flags) 1033{ 1034 size_t i_len, o_len; 1035 ucs2_t *u; 1036 ucs2_t buffer[MAXPATHLEN +2]; 1037 ucs2_t buffer2[MAXPATHLEN +2]; 1038 1039 lazy_initialize_conv(); 1040 1041 /* convert from_set to UCS2 */ 1042 if ((size_t)(-1) == ( o_len = pull_charset_flags( from_set, cap_charset, src, src_len, 1043 (char *) buffer, sizeof(buffer) -2, flags)) ) { 1044 LOG(log_error, logtype_default, "Conversion failed ( %s to CH_UCS2 )", charset_name(from_set)); 1045 return (size_t) -1; 1046 } 1047 1048 if ( o_len == 0) 1049 return o_len; 1050 1051 /* Do pre/decomposition */ 1052 i_len = sizeof(buffer2) -2; 1053 u = buffer2; 1054 if (CHECK_FLAGS(flags, CONV_DECOMPOSE) || (charsets[to_set] && (charsets[to_set]->flags & CHARSET_DECOMPOSED)) ) { 1055 if ( (size_t)-1 == (i_len = decompose_w(buffer, o_len, u, &i_len)) ) 1056 return (size_t)(-1); 1057 } 1058 else if (CHECK_FLAGS(flags, CONV_PRECOMPOSE) || !charsets[from_set] || (charsets[from_set]->flags & CHARSET_DECOMPOSED)) { 1059 if ( (size_t)-1 == (i_len = precompose_w(buffer, o_len, u, &i_len)) ) 1060 return (size_t)(-1); 1061 } 1062 else { 1063 u = buffer; 1064 i_len = o_len; 1065 } 1066 /* null terminate */ 1067 u[i_len] = 0; 1068 u[i_len +1] = 0; 1069 1070 /* Do case conversions */ 1071 if (CHECK_FLAGS(flags, CONV_TOUPPER)) { 1072 strupper_w(u); 1073 } 1074 else if (CHECK_FLAGS(flags, CONV_TOLOWER)) { 1075 strlower_w(u); 1076 } 1077 1078 /* Convert UCS2 to to_set */ 1079 if ((size_t)(-1) == ( o_len = push_charset_flags( to_set, cap_charset, (char *)u, i_len, dest, dest_len, flags )) ) { 1080 LOG(log_error, logtype_default, 1081 "Conversion failed (CH_UCS2 to %s):%s", charset_name(to_set), strerror(errno)); 1082 return (size_t) -1; 1083 } 1084 /* null terminate */ 1085 dest[o_len] = 0; 1086 dest[o_len +1] = 0; 1087 1088 return o_len; 1089} 1090