1/* 2 Unix SMB/CIFS implementation. 3 minimal iconv implementation 4 Copyright (C) Andrew Tridgell 2001 5 Copyright (C) Jelmer Vernooij 2002,2003 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 2 of the License, or 10 (at your option) any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with this program; if not, write to the Free Software 19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 20*/ 21 22#include "includes.h" 23 24/* 25 * We have to use strcasecmp here as the character conversions 26 * haven't been initialised yet. JRA. 27 */ 28 29#undef strcasecmp 30 31/** 32 * @file 33 * 34 * @brief Samba wrapper/stub for iconv character set conversion. 35 * 36 * iconv is the XPG2 interface for converting between character 37 * encodings. This file provides a Samba wrapper around it, and also 38 * a simple reimplementation that is used if the system does not 39 * implement iconv. 40 * 41 * Samba only works with encodings that are supersets of ASCII: ascii 42 * characters like whitespace can be tested for directly, multibyte 43 * sequences start with a byte with the high bit set, and strings are 44 * terminated by a nul byte. 45 * 46 * Note that the only function provided by iconv is conversion between 47 * characters. It doesn't directly support operations like 48 * uppercasing or comparison. We have to convert to UCS-2 and compare 49 * there. 50 * 51 * @sa Samba Developers Guide 52 **/ 53 54static size_t ascii_pull(void *,const char **, size_t *, char **, size_t *); 55static size_t ascii_push(void *,const char **, size_t *, char **, size_t *); 56static size_t latin1_push(void *,const char **, size_t *, char **, size_t *); 57static size_t utf8_pull(void *,const char **, size_t *, char **, size_t *); 58static size_t utf8_push(void *,const char **, size_t *, char **, size_t *); 59static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *); 60static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *); 61static size_t iconv_copy(void *,const char **, size_t *, char **, size_t *); 62static size_t iconv_swab (void *,const char **, size_t *, char **, size_t *); 63 64static struct charset_functions builtin_functions[] = { 65 /* windows is really neither UCS-2 not UTF-16 */ 66 {"UCS-2LE", iconv_copy, iconv_copy}, 67 {"UTF-16LE", iconv_copy, iconv_copy}, 68 {"UCS-2BE", iconv_swab, iconv_swab}, 69 {"UTF-16BE", iconv_swab, iconv_swab}, 70 71 /* we include the UTF-8 alias to cope with differing locale settings */ 72 {"UTF8", utf8_pull, utf8_push}, 73 {"UTF-8", utf8_pull, utf8_push}, 74 {"ASCII", ascii_pull, ascii_push}, 75 {"646", ascii_pull, ascii_push}, 76 {"ISO-8859-1", ascii_pull, latin1_push}, 77 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}, 78 {NULL, NULL, NULL} 79}; 80 81static struct charset_functions *charsets = NULL; 82 83static struct charset_functions *find_charset_functions(const char *name) 84{ 85 struct charset_functions *c = charsets; 86 87 while(c) { 88 if (strcasecmp(name, c->name) == 0) { 89 return c; 90 } 91 c = c->next; 92 } 93 94 return NULL; 95} 96 97NTSTATUS smb_register_charset(struct charset_functions *funcs) 98{ 99 if (!funcs) { 100 return NT_STATUS_INVALID_PARAMETER; 101 } 102 103 DEBUG(5, ("Attempting to register new charset %s\n", funcs->name)); 104 /* Check whether we already have this charset... */ 105 if (find_charset_functions(funcs->name)) { 106 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs->name)); 107 return NT_STATUS_OBJECT_NAME_COLLISION; 108 } 109 110 funcs->next = funcs->prev = NULL; 111 DEBUG(5, ("Registered charset %s\n", funcs->name)); 112 DLIST_ADD(charsets, funcs); 113 return NT_STATUS_OK; 114} 115 116static void lazy_initialize_iconv(void) 117{ 118 static BOOL initialized; 119 int i; 120 121 if (!initialized) { 122 initialized = True; 123 for(i = 0; builtin_functions[i].name; i++) 124 smb_register_charset(&builtin_functions[i]); 125 static_init_charset; 126 } 127} 128 129/* if there was an error then reset the internal state, 130 this ensures that we don't have a shift state remaining for 131 character sets like SJIS */ 132static size_t sys_iconv(void *cd, 133 const char **inbuf, size_t *inbytesleft, 134 char **outbuf, size_t *outbytesleft) 135{ 136#ifdef HAVE_NATIVE_ICONV 137 size_t ret = iconv((iconv_t)cd, 138 (char **)inbuf, inbytesleft, 139 outbuf, outbytesleft); 140 if (ret == (size_t)-1) { 141 int saved_errno = errno; 142 iconv(cd, NULL, NULL, NULL, NULL); 143 errno = saved_errno; 144 } 145 return ret; 146#else 147 errno = EINVAL; 148 return -1; 149#endif 150} 151 152/** 153 * This is a simple portable iconv() implementaion. 154 * 155 * It only knows about a very small number of character sets - just 156 * enough that Samba works on systems that don't have iconv. 157 **/ 158size_t smb_iconv(smb_iconv_t cd, 159 const char **inbuf, size_t *inbytesleft, 160 char **outbuf, size_t *outbytesleft) 161{ 162 char cvtbuf[2048]; 163 char *bufp = cvtbuf; 164 size_t bufsize; 165 166 /* in many cases we can go direct */ 167 if (cd->direct) { 168 return cd->direct(cd->cd_direct, 169 inbuf, inbytesleft, outbuf, outbytesleft); 170 } 171 172 173 /* otherwise we have to do it chunks at a time */ 174 while (*inbytesleft > 0) { 175 bufp = cvtbuf; 176 bufsize = sizeof(cvtbuf); 177 178 if (cd->pull(cd->cd_pull, 179 inbuf, inbytesleft, &bufp, &bufsize) == -1 180 && errno != E2BIG) return -1; 181 182 bufp = cvtbuf; 183 bufsize = sizeof(cvtbuf) - bufsize; 184 185 if (cd->push(cd->cd_push, 186 (const char **)&bufp, &bufsize, 187 outbuf, outbytesleft) == -1) return -1; 188 } 189 190 return 0; 191} 192 193 194static BOOL is_utf16(const char *name) 195{ 196 return strcasecmp(name, "UCS-2LE") == 0 || 197 strcasecmp(name, "UTF-16LE") == 0; 198} 199 200/* 201 simple iconv_open() wrapper 202 */ 203smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode) 204{ 205 smb_iconv_t ret; 206 struct charset_functions *from, *to; 207 208 lazy_initialize_iconv(); 209 from = charsets; 210 to = charsets; 211 212 ret = SMB_MALLOC_P(struct _smb_iconv_t); 213 if (!ret) { 214 errno = ENOMEM; 215 return (smb_iconv_t)-1; 216 } 217 memset(ret, 0, sizeof(struct _smb_iconv_t)); 218 219 ret->from_name = SMB_STRDUP(fromcode); 220 ret->to_name = SMB_STRDUP(tocode); 221 222 /* check for the simplest null conversion */ 223 if (strcasecmp(fromcode, tocode) == 0) { 224 ret->direct = iconv_copy; 225 return ret; 226 } 227 228 /* check if we have a builtin function for this conversion */ 229 from = find_charset_functions(fromcode); 230 if(from)ret->pull = from->pull; 231 232 to = find_charset_functions(tocode); 233 if(to)ret->push = to->push; 234 235 /* check if we can use iconv for this conversion */ 236#ifdef HAVE_NATIVE_ICONV 237 if (!ret->pull) { 238 ret->cd_pull = iconv_open("UTF-16LE", fromcode); 239 if (ret->cd_pull == (iconv_t)-1) 240 ret->cd_pull = iconv_open("UCS-2LE", fromcode); 241 if (ret->cd_pull != (iconv_t)-1) 242 ret->pull = sys_iconv; 243 } 244 245 if (!ret->push) { 246 ret->cd_push = iconv_open(tocode, "UTF-16LE"); 247 if (ret->cd_push == (iconv_t)-1) 248 ret->cd_push = iconv_open(tocode, "UCS-2LE"); 249 if (ret->cd_push != (iconv_t)-1) 250 ret->push = sys_iconv; 251 } 252#endif 253 254 /* check if there is a module available that can do this conversion */ 255 if (!ret->pull && NT_STATUS_IS_OK(smb_probe_module("charset", fromcode))) { 256 if(!(from = find_charset_functions(fromcode))) 257 DEBUG(0, ("Module %s doesn't provide charset %s!\n", fromcode, fromcode)); 258 else 259 ret->pull = from->pull; 260 } 261 262 if (!ret->push && NT_STATUS_IS_OK(smb_probe_module("charset", tocode))) { 263 if(!(to = find_charset_functions(tocode))) 264 DEBUG(0, ("Module %s doesn't provide charset %s!\n", tocode, tocode)); 265 else 266 ret->push = to->push; 267 } 268 269 if (!ret->push || !ret->pull) { 270 SAFE_FREE(ret->from_name); 271 SAFE_FREE(ret->to_name); 272 SAFE_FREE(ret); 273 errno = EINVAL; 274 return (smb_iconv_t)-1; 275 } 276 277 /* check for conversion to/from ucs2 */ 278 if (is_utf16(fromcode) && to) { 279 ret->direct = to->push; 280 ret->push = ret->pull = NULL; 281 return ret; 282 } 283 284 if (is_utf16(tocode) && from) { 285 ret->direct = from->pull; 286 ret->push = ret->pull = NULL; 287 return ret; 288 } 289 290 /* Check if we can do the conversion direct */ 291#ifdef HAVE_NATIVE_ICONV 292 if (is_utf16(fromcode)) { 293 ret->direct = sys_iconv; 294 ret->cd_direct = ret->cd_push; 295 ret->cd_push = NULL; 296 return ret; 297 } 298 if (is_utf16(tocode)) { 299 ret->direct = sys_iconv; 300 ret->cd_direct = ret->cd_pull; 301 ret->cd_pull = NULL; 302 return ret; 303 } 304#endif 305 306 return ret; 307} 308 309/* 310 simple iconv_close() wrapper 311*/ 312int smb_iconv_close (smb_iconv_t cd) 313{ 314#ifdef HAVE_NATIVE_ICONV 315 if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct); 316 if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull); 317 if (cd->cd_push) iconv_close((iconv_t)cd->cd_push); 318#endif 319 320 SAFE_FREE(cd->from_name); 321 SAFE_FREE(cd->to_name); 322 323 memset(cd, 0, sizeof(*cd)); 324 SAFE_FREE(cd); 325 return 0; 326} 327 328 329/********************************************************************** 330 the following functions implement the builtin character sets in Samba 331 and also the "test" character sets that are designed to test 332 multi-byte character set support for english users 333***********************************************************************/ 334 335static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft, 336 char **outbuf, size_t *outbytesleft) 337{ 338 while (*inbytesleft >= 1 && *outbytesleft >= 2) { 339 (*outbuf)[0] = (*inbuf)[0]; 340 (*outbuf)[1] = 0; 341 (*inbytesleft) -= 1; 342 (*outbytesleft) -= 2; 343 (*inbuf) += 1; 344 (*outbuf) += 2; 345 } 346 347 if (*inbytesleft > 0) { 348 errno = E2BIG; 349 return -1; 350 } 351 352 return 0; 353} 354 355static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft, 356 char **outbuf, size_t *outbytesleft) 357{ 358 int ir_count=0; 359 360 while (*inbytesleft >= 2 && *outbytesleft >= 1) { 361 (*outbuf)[0] = (*inbuf)[0] & 0x7F; 362 if ((*inbuf)[1]) ir_count++; 363 (*inbytesleft) -= 2; 364 (*outbytesleft) -= 1; 365 (*inbuf) += 2; 366 (*outbuf) += 1; 367 } 368 369 if (*inbytesleft == 1) { 370 errno = EINVAL; 371 return -1; 372 } 373 374 if (*inbytesleft > 1) { 375 errno = E2BIG; 376 return -1; 377 } 378 379 return ir_count; 380} 381 382static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft, 383 char **outbuf, size_t *outbytesleft) 384{ 385 int ir_count=0; 386 387 while (*inbytesleft >= 2 && *outbytesleft >= 1) { 388 (*outbuf)[0] = (*inbuf)[0]; 389 if ((*inbuf)[1]) ir_count++; 390 (*inbytesleft) -= 2; 391 (*outbytesleft) -= 1; 392 (*inbuf) += 2; 393 (*outbuf) += 1; 394 } 395 396 if (*inbytesleft == 1) { 397 errno = EINVAL; 398 return -1; 399 } 400 401 if (*inbytesleft > 1) { 402 errno = E2BIG; 403 return -1; 404 } 405 406 return ir_count; 407} 408 409static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft, 410 char **outbuf, size_t *outbytesleft) 411{ 412 while (*inbytesleft >= 1 && *outbytesleft >= 2) { 413 unsigned v; 414 415 if ((*inbuf)[0] != '@') { 416 /* seven bit ascii case */ 417 (*outbuf)[0] = (*inbuf)[0]; 418 (*outbuf)[1] = 0; 419 (*inbytesleft) -= 1; 420 (*outbytesleft) -= 2; 421 (*inbuf) += 1; 422 (*outbuf) += 2; 423 continue; 424 } 425 /* it's a hex character */ 426 if (*inbytesleft < 5) { 427 errno = EINVAL; 428 return -1; 429 } 430 431 if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) { 432 errno = EILSEQ; 433 return -1; 434 } 435 436 (*outbuf)[0] = v&0xff; 437 (*outbuf)[1] = v>>8; 438 (*inbytesleft) -= 5; 439 (*outbytesleft) -= 2; 440 (*inbuf) += 5; 441 (*outbuf) += 2; 442 } 443 444 if (*inbytesleft > 0) { 445 errno = E2BIG; 446 return -1; 447 } 448 449 return 0; 450} 451 452static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft, 453 char **outbuf, size_t *outbytesleft) 454{ 455 while (*inbytesleft >= 2 && *outbytesleft >= 1) { 456 char buf[6]; 457 458 if ((*inbuf)[1] == 0 && 459 ((*inbuf)[0] & 0x80) == 0 && 460 (*inbuf)[0] != '@') { 461 (*outbuf)[0] = (*inbuf)[0]; 462 (*inbytesleft) -= 2; 463 (*outbytesleft) -= 1; 464 (*inbuf) += 2; 465 (*outbuf) += 1; 466 continue; 467 } 468 if (*outbytesleft < 5) { 469 errno = E2BIG; 470 return -1; 471 } 472 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0)); 473 memcpy(*outbuf, buf, 5); 474 (*inbytesleft) -= 2; 475 (*outbytesleft) -= 5; 476 (*inbuf) += 2; 477 (*outbuf) += 5; 478 } 479 480 if (*inbytesleft == 1) { 481 errno = EINVAL; 482 return -1; 483 } 484 485 if (*inbytesleft > 1) { 486 errno = E2BIG; 487 return -1; 488 } 489 490 return 0; 491} 492 493static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft, 494 char **outbuf, size_t *outbytesleft) 495{ 496 int n; 497 498 n = MIN(*inbytesleft, *outbytesleft); 499 500 swab(*inbuf, *outbuf, (n&~1)); 501 if (n&1) { 502 (*outbuf)[n-1] = 0; 503 } 504 505 (*inbytesleft) -= n; 506 (*outbytesleft) -= n; 507 (*inbuf) += n; 508 (*outbuf) += n; 509 510 if (*inbytesleft > 0) { 511 errno = E2BIG; 512 return -1; 513 } 514 515 return 0; 516} 517 518static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft, 519 char **outbuf, size_t *outbytesleft) 520{ 521 int n; 522 523 n = MIN(*inbytesleft, *outbytesleft); 524 525 memmove(*outbuf, *inbuf, n); 526 527 (*inbytesleft) -= n; 528 (*outbytesleft) -= n; 529 (*inbuf) += n; 530 (*outbuf) += n; 531 532 if (*inbytesleft > 0) { 533 errno = E2BIG; 534 return -1; 535 } 536 537 return 0; 538} 539 540static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft, 541 char **outbuf, size_t *outbytesleft) 542{ 543 size_t in_left=*inbytesleft, out_left=*outbytesleft; 544 const uint8 *c = (const uint8 *)*inbuf; 545 uint8 *uc = (uint8 *)*outbuf; 546 547 while (in_left >= 1 && out_left >= 2) { 548 if ((c[0] & 0x80) == 0) { 549 uc[0] = c[0]; 550 uc[1] = 0; 551 c += 1; 552 in_left -= 1; 553 out_left -= 2; 554 uc += 2; 555 continue; 556 } 557 558 if ((c[0] & 0xe0) == 0xc0) { 559 if (in_left < 2 || 560 (c[1] & 0xc0) != 0x80) { 561 errno = EILSEQ; 562 goto error; 563 } 564 uc[1] = (c[0]>>2) & 0x7; 565 uc[0] = (c[0]<<6) | (c[1]&0x3f); 566 c += 2; 567 in_left -= 2; 568 out_left -= 2; 569 uc += 2; 570 continue; 571 } 572 573 if ((c[0] & 0xf0) == 0xe0) { 574 if (in_left < 3 || 575 (c[1] & 0xc0) != 0x80 || 576 (c[2] & 0xc0) != 0x80) { 577 errno = EILSEQ; 578 goto error; 579 } 580 uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF); 581 uc[0] = (c[1]<<6) | (c[2]&0x3f); 582 c += 3; 583 in_left -= 3; 584 out_left -= 2; 585 uc += 2; 586 continue; 587 } 588 589 if ((c[0] & 0xf8) == 0xf0) { 590 unsigned int codepoint; 591 if (in_left < 4 || 592 (c[1] & 0xc0) != 0x80 || 593 (c[2] & 0xc0) != 0x80 || 594 (c[3] & 0xc0) != 0x80) { 595 errno = EILSEQ; 596 goto error; 597 } 598 codepoint = 599 (c[3]&0x3f) | 600 ((c[2]&0x3f)<<6) | 601 ((c[1]&0x3f)<<12) | 602 ((c[0]&0x7)<<18); 603 if (codepoint < 0x10000) { 604 /* accept UTF-8 characters that are not 605 minimally packed, but pack the result */ 606 uc[0] = (codepoint & 0xFF); 607 uc[1] = (codepoint >> 8); 608 c += 4; 609 in_left -= 4; 610 out_left -= 2; 611 uc += 2; 612 continue; 613 } 614 615 codepoint -= 0x10000; 616 617 if (out_left < 4) { 618 errno = E2BIG; 619 goto error; 620 } 621 622 uc[0] = (codepoint>>10) & 0xFF; 623 uc[1] = (codepoint>>18) | 0xd8; 624 uc[2] = codepoint & 0xFF; 625 uc[3] = ((codepoint>>8) & 0x3) | 0xdc; 626 c += 4; 627 in_left -= 4; 628 out_left -= 4; 629 uc += 4; 630 continue; 631 } 632 633 /* we don't handle 5 byte sequences */ 634 errno = EINVAL; 635 goto error; 636 } 637 638 if (in_left > 0) { 639 errno = E2BIG; 640 goto error; 641 } 642 643 *inbytesleft = in_left; 644 *outbytesleft = out_left; 645 *inbuf = c; 646 *outbuf = uc; 647 return 0; 648 649error: 650 *inbytesleft = in_left; 651 *outbytesleft = out_left; 652 *inbuf = c; 653 *outbuf = uc; 654 return -1; 655} 656 657static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft, 658 char **outbuf, size_t *outbytesleft) 659{ 660 size_t in_left=*inbytesleft, out_left=*outbytesleft; 661 uint8 *c = (uint8 *)*outbuf; 662 const uint8 *uc = (const uint8 *)*inbuf; 663 664 while (in_left >= 2 && out_left >= 1) { 665 unsigned int codepoint; 666 667 if (uc[1] == 0 && !(uc[0] & 0x80)) { 668 /* simplest case */ 669 c[0] = uc[0]; 670 in_left -= 2; 671 out_left -= 1; 672 uc += 2; 673 c += 1; 674 continue; 675 } 676 677 if ((uc[1]&0xf8) == 0) { 678 /* next simplest case */ 679 if (out_left < 2) { 680 errno = E2BIG; 681 goto error; 682 } 683 c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2); 684 c[1] = 0x80 | (uc[0] & 0x3f); 685 in_left -= 2; 686 out_left -= 2; 687 uc += 2; 688 c += 2; 689 continue; 690 } 691 692 if ((uc[1] & 0xfc) == 0xdc) { 693 /* its the second part of a 4 byte sequence. Illegal */ 694 if (in_left < 4) { 695 errno = EINVAL; 696 } else { 697 errno = EILSEQ; 698 } 699 goto error; 700 } 701 702 if ((uc[1] & 0xfc) != 0xd8) { 703 codepoint = uc[0] | (uc[1]<<8); 704 if (out_left < 3) { 705 errno = E2BIG; 706 goto error; 707 } 708 c[0] = 0xe0 | (codepoint >> 12); 709 c[1] = 0x80 | ((codepoint >> 6) & 0x3f); 710 c[2] = 0x80 | (codepoint & 0x3f); 711 712 in_left -= 2; 713 out_left -= 3; 714 uc += 2; 715 c += 3; 716 continue; 717 } 718 719 /* its the first part of a 4 byte sequence */ 720 if (in_left < 4) { 721 errno = EINVAL; 722 goto error; 723 } 724 if ((uc[3] & 0xfc) != 0xdc) { 725 errno = EILSEQ; 726 goto error; 727 } 728 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) | 729 (uc[0]<<10) | ((uc[1] & 0x3)<<18)); 730 731 if (out_left < 4) { 732 errno = E2BIG; 733 goto error; 734 } 735 c[0] = 0xf0 | (codepoint >> 18); 736 c[1] = 0x80 | ((codepoint >> 12) & 0x3f); 737 c[2] = 0x80 | ((codepoint >> 6) & 0x3f); 738 c[3] = 0x80 | (codepoint & 0x3f); 739 740 in_left -= 4; 741 out_left -= 4; 742 uc += 4; 743 c += 4; 744 } 745 746 if (in_left == 1) { 747 errno = EINVAL; 748 goto error; 749 } 750 751 if (in_left > 1) { 752 errno = E2BIG; 753 goto error; 754 } 755 756 *inbytesleft = in_left; 757 *outbytesleft = out_left; 758 *inbuf = uc; 759 *outbuf = c; 760 761 return 0; 762 763error: 764 *inbytesleft = in_left; 765 *outbytesleft = out_left; 766 *inbuf = uc; 767 *outbuf = c; 768 return -1; 769} 770 771