1/* 2 Unix SMB/CIFS implementation. 3 minimal iconv implementation 4 Copyright (C) Andrew Tridgell 2001 5 Copyright (C) Jelmer Vernooij 2002,2003 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 3 of the License, or 10 (at your option) any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with this program. If not, see <http://www.gnu.org/licenses/>. 19*/ 20 21#include "includes.h" 22 23/* 24 * We have to use strcasecmp here as the character conversions 25 * haven't been initialised yet. JRA. 26 */ 27 28#undef strcasecmp 29 30/** 31 * @file 32 * 33 * @brief Samba wrapper/stub for iconv character set conversion. 34 * 35 * iconv is the XPG2 interface for converting between character 36 * encodings. This file provides a Samba wrapper around it, and also 37 * a simple reimplementation that is used if the system does not 38 * implement iconv. 39 * 40 * Samba only works with encodings that are supersets of ASCII: ascii 41 * characters like whitespace can be tested for directly, multibyte 42 * sequences start with a byte with the high bit set, and strings are 43 * terminated by a nul byte. 44 * 45 * Note that the only function provided by iconv is conversion between 46 * characters. It doesn't directly support operations like 47 * uppercasing or comparison. We have to convert to UCS-2 and compare 48 * there. 49 * 50 * @sa Samba Developers Guide 51 **/ 52 53static_decl_charset; 54 55static size_t ascii_pull(void *,const char **, size_t *, char **, size_t *); 56static size_t ascii_push(void *,const char **, size_t *, char **, size_t *); 57static size_t latin1_push(void *,const char **, size_t *, char **, size_t *); 58static size_t utf8_pull(void *,const char **, size_t *, char **, size_t *); 59static size_t utf8_push(void *,const char **, size_t *, char **, size_t *); 60static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *); 61static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *); 62static size_t iconv_copy(void *,const char **, size_t *, char **, size_t *); 63static size_t iconv_swab (void *,const char **, size_t *, char **, size_t *); 64 65static struct charset_functions builtin_functions[] = { 66 /* windows is really neither UCS-2 not UTF-16 */ 67 {"UCS-2LE", iconv_copy, iconv_copy}, 68 {"UTF-16LE", iconv_copy, iconv_copy}, 69 {"UCS-2BE", iconv_swab, iconv_swab}, 70 {"UTF-16BE", iconv_swab, iconv_swab}, 71 72 /* we include the UTF-8 alias to cope with differing locale settings */ 73 {"UTF8", utf8_pull, utf8_push}, 74 {"UTF-8", utf8_pull, utf8_push}, 75 {"ASCII", ascii_pull, ascii_push}, 76 {"646", ascii_pull, ascii_push}, 77 {"ISO-8859-1", ascii_pull, latin1_push}, 78 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}, 79 {NULL, NULL, NULL} 80}; 81 82static struct charset_functions *charsets = NULL; 83 84static struct charset_functions *find_charset_functions(const char *name) 85{ 86 struct charset_functions *c = charsets; 87 88 while(c) { 89 if (strcasecmp(name, c->name) == 0) { 90 return c; 91 } 92 c = c->next; 93 } 94 95 return NULL; 96} 97 98NTSTATUS smb_register_charset(struct charset_functions *funcs) 99{ 100 if (!funcs) { 101 return NT_STATUS_INVALID_PARAMETER; 102 } 103 104 DEBUG(5, ("Attempting to register new charset %s\n", funcs->name)); 105 /* Check whether we already have this charset... */ 106 if (find_charset_functions(funcs->name)) { 107 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs->name)); 108 return NT_STATUS_OBJECT_NAME_COLLISION; 109 } 110 111 funcs->next = funcs->prev = NULL; 112 DEBUG(5, ("Registered charset %s\n", funcs->name)); 113 DLIST_ADD(charsets, funcs); 114 return NT_STATUS_OK; 115} 116 117static void lazy_initialize_iconv(void) 118{ 119 static bool initialized; 120 int i; 121 122 if (!initialized) { 123 initialized = True; 124 for(i = 0; builtin_functions[i].name; i++) 125 smb_register_charset(&builtin_functions[i]); 126 static_init_charset; 127 } 128} 129 130#ifdef HAVE_NATIVE_ICONV 131/* if there was an error then reset the internal state, 132 this ensures that we don't have a shift state remaining for 133 character sets like SJIS */ 134static size_t sys_iconv(void *cd, 135 const char **inbuf, size_t *inbytesleft, 136 char **outbuf, size_t *outbytesleft) 137{ 138 size_t ret = iconv((iconv_t)cd, 139 (void *)inbuf, inbytesleft, 140 outbuf, outbytesleft); 141 if (ret == (size_t)-1) { 142 int saved_errno = errno; 143 iconv(cd, NULL, NULL, NULL, NULL); 144 errno = saved_errno; 145 } 146 return ret; 147} 148#endif 149 150/** 151 * This is a simple portable iconv() implementaion. 152 * 153 * It only knows about a very small number of character sets - just 154 * enough that Samba works on systems that don't have iconv. 155 **/ 156size_t smb_iconv(smb_iconv_t cd, 157 const char **inbuf, size_t *inbytesleft, 158 char **outbuf, size_t *outbytesleft) 159{ 160 char cvtbuf[2048]; 161 char *bufp = cvtbuf; 162 size_t bufsize; 163 164 /* in many cases we can go direct */ 165 if (cd->direct) { 166 return cd->direct(cd->cd_direct, 167 inbuf, inbytesleft, outbuf, outbytesleft); 168 } 169 170 171 /* otherwise we have to do it chunks at a time */ 172 while (*inbytesleft > 0) { 173 bufp = cvtbuf; 174 bufsize = sizeof(cvtbuf); 175 176 if (cd->pull(cd->cd_pull, 177 inbuf, inbytesleft, &bufp, &bufsize) == -1 178 && errno != E2BIG) return -1; 179 180 bufp = cvtbuf; 181 bufsize = sizeof(cvtbuf) - bufsize; 182 183 if (cd->push(cd->cd_push, 184 (const char **)&bufp, &bufsize, 185 outbuf, outbytesleft) == -1) return -1; 186 } 187 188 return 0; 189} 190 191 192static bool is_utf16(const char *name) 193{ 194 return strcasecmp(name, "UCS-2LE") == 0 || 195 strcasecmp(name, "UTF-16LE") == 0; 196} 197 198/* 199 simple iconv_open() wrapper 200 */ 201smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode) 202{ 203 smb_iconv_t ret; 204 struct charset_functions *from, *to; 205 206 lazy_initialize_iconv(); 207 from = charsets; 208 to = charsets; 209 210 ret = SMB_MALLOC_P(struct smb_iconv_s); 211 if (!ret) { 212 errno = ENOMEM; 213 return (smb_iconv_t)-1; 214 } 215 memset(ret, 0, sizeof(struct smb_iconv_s)); 216 217 ret->from_name = SMB_STRDUP(fromcode); 218 ret->to_name = SMB_STRDUP(tocode); 219 220 /* check for the simplest null conversion */ 221 if (strcasecmp(fromcode, tocode) == 0) { 222 ret->direct = iconv_copy; 223 return ret; 224 } 225 226 /* check if we have a builtin function for this conversion */ 227 from = find_charset_functions(fromcode); 228 if(from)ret->pull = from->pull; 229 230 to = find_charset_functions(tocode); 231 if(to)ret->push = to->push; 232 233 /* check if we can use iconv for this conversion */ 234#ifdef HAVE_NATIVE_ICONV 235 if (!ret->pull) { 236 ret->cd_pull = iconv_open("UTF-16LE", fromcode); 237 if (ret->cd_pull == (iconv_t)-1) 238 ret->cd_pull = iconv_open("UCS-2LE", fromcode); 239 if (ret->cd_pull != (iconv_t)-1) 240 ret->pull = sys_iconv; 241 } 242 243 if (!ret->push) { 244 ret->cd_push = iconv_open(tocode, "UTF-16LE"); 245 if (ret->cd_push == (iconv_t)-1) 246 ret->cd_push = iconv_open(tocode, "UCS-2LE"); 247 if (ret->cd_push != (iconv_t)-1) 248 ret->push = sys_iconv; 249 } 250#endif 251 252 /* check if there is a module available that can do this conversion */ 253 if (!ret->pull && NT_STATUS_IS_OK(smb_probe_module("charset", fromcode))) { 254 if(!(from = find_charset_functions(fromcode))) 255 DEBUG(0, ("Module %s doesn't provide charset %s!\n", fromcode, fromcode)); 256 else 257 ret->pull = from->pull; 258 } 259 260 if (!ret->push && NT_STATUS_IS_OK(smb_probe_module("charset", tocode))) { 261 if(!(to = find_charset_functions(tocode))) 262 DEBUG(0, ("Module %s doesn't provide charset %s!\n", tocode, tocode)); 263 else 264 ret->push = to->push; 265 } 266 267 if (!ret->push || !ret->pull) { 268 SAFE_FREE(ret->from_name); 269 SAFE_FREE(ret->to_name); 270 SAFE_FREE(ret); 271 errno = EINVAL; 272 return (smb_iconv_t)-1; 273 } 274 275 /* check for conversion to/from ucs2 */ 276 if (is_utf16(fromcode) && to) { 277 ret->direct = to->push; 278 ret->push = ret->pull = NULL; 279 return ret; 280 } 281 282 if (is_utf16(tocode) && from) { 283 ret->direct = from->pull; 284 ret->push = ret->pull = NULL; 285 return ret; 286 } 287 288 /* Check if we can do the conversion direct */ 289#ifdef HAVE_NATIVE_ICONV 290 if (is_utf16(fromcode)) { 291 ret->direct = sys_iconv; 292 ret->cd_direct = ret->cd_push; 293 ret->cd_push = NULL; 294 return ret; 295 } 296 if (is_utf16(tocode)) { 297 ret->direct = sys_iconv; 298 ret->cd_direct = ret->cd_pull; 299 ret->cd_pull = NULL; 300 return ret; 301 } 302#endif 303 304 return ret; 305} 306 307/* 308 simple iconv_close() wrapper 309*/ 310int smb_iconv_close (smb_iconv_t cd) 311{ 312#ifdef HAVE_NATIVE_ICONV 313 if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct); 314 if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull); 315 if (cd->cd_push) iconv_close((iconv_t)cd->cd_push); 316#endif 317 318 SAFE_FREE(cd->from_name); 319 SAFE_FREE(cd->to_name); 320 321 memset(cd, 0, sizeof(*cd)); 322 SAFE_FREE(cd); 323 return 0; 324} 325 326 327/********************************************************************** 328 the following functions implement the builtin character sets in Samba 329 and also the "test" character sets that are designed to test 330 multi-byte character set support for english users 331***********************************************************************/ 332 333static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft, 334 char **outbuf, size_t *outbytesleft) 335{ 336 while (*inbytesleft >= 1 && *outbytesleft >= 2) { 337 (*outbuf)[0] = (*inbuf)[0]; 338 (*outbuf)[1] = 0; 339 (*inbytesleft) -= 1; 340 (*outbytesleft) -= 2; 341 (*inbuf) += 1; 342 (*outbuf) += 2; 343 } 344 345 if (*inbytesleft > 0) { 346 errno = E2BIG; 347 return -1; 348 } 349 350 return 0; 351} 352 353static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft, 354 char **outbuf, size_t *outbytesleft) 355{ 356 int ir_count=0; 357 358 while (*inbytesleft >= 2 && *outbytesleft >= 1) { 359 (*outbuf)[0] = (*inbuf)[0] & 0x7F; 360 if ((*inbuf)[1]) ir_count++; 361 (*inbytesleft) -= 2; 362 (*outbytesleft) -= 1; 363 (*inbuf) += 2; 364 (*outbuf) += 1; 365 } 366 367 if (*inbytesleft == 1) { 368 errno = EINVAL; 369 return -1; 370 } 371 372 if (*inbytesleft > 1) { 373 errno = E2BIG; 374 return -1; 375 } 376 377 return ir_count; 378} 379 380static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft, 381 char **outbuf, size_t *outbytesleft) 382{ 383 int ir_count=0; 384 385 while (*inbytesleft >= 2 && *outbytesleft >= 1) { 386 (*outbuf)[0] = (*inbuf)[0]; 387 if ((*inbuf)[1]) ir_count++; 388 (*inbytesleft) -= 2; 389 (*outbytesleft) -= 1; 390 (*inbuf) += 2; 391 (*outbuf) += 1; 392 } 393 394 if (*inbytesleft == 1) { 395 errno = EINVAL; 396 return -1; 397 } 398 399 if (*inbytesleft > 1) { 400 errno = E2BIG; 401 return -1; 402 } 403 404 return ir_count; 405} 406 407static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft, 408 char **outbuf, size_t *outbytesleft) 409{ 410 while (*inbytesleft >= 1 && *outbytesleft >= 2) { 411 unsigned v; 412 413 if ((*inbuf)[0] != '@') { 414 /* seven bit ascii case */ 415 (*outbuf)[0] = (*inbuf)[0]; 416 (*outbuf)[1] = 0; 417 (*inbytesleft) -= 1; 418 (*outbytesleft) -= 2; 419 (*inbuf) += 1; 420 (*outbuf) += 2; 421 continue; 422 } 423 /* it's a hex character */ 424 if (*inbytesleft < 5) { 425 errno = EINVAL; 426 return -1; 427 } 428 429 if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) { 430 errno = EILSEQ; 431 return -1; 432 } 433 434 (*outbuf)[0] = v&0xff; 435 (*outbuf)[1] = v>>8; 436 (*inbytesleft) -= 5; 437 (*outbytesleft) -= 2; 438 (*inbuf) += 5; 439 (*outbuf) += 2; 440 } 441 442 if (*inbytesleft > 0) { 443 errno = E2BIG; 444 return -1; 445 } 446 447 return 0; 448} 449 450static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft, 451 char **outbuf, size_t *outbytesleft) 452{ 453 while (*inbytesleft >= 2 && *outbytesleft >= 1) { 454 char buf[6]; 455 456 if ((*inbuf)[1] == 0 && 457 ((*inbuf)[0] & 0x80) == 0 && 458 (*inbuf)[0] != '@') { 459 (*outbuf)[0] = (*inbuf)[0]; 460 (*inbytesleft) -= 2; 461 (*outbytesleft) -= 1; 462 (*inbuf) += 2; 463 (*outbuf) += 1; 464 continue; 465 } 466 if (*outbytesleft < 5) { 467 errno = E2BIG; 468 return -1; 469 } 470 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0)); 471 memcpy(*outbuf, buf, 5); 472 (*inbytesleft) -= 2; 473 (*outbytesleft) -= 5; 474 (*inbuf) += 2; 475 (*outbuf) += 5; 476 } 477 478 if (*inbytesleft == 1) { 479 errno = EINVAL; 480 return -1; 481 } 482 483 if (*inbytesleft > 1) { 484 errno = E2BIG; 485 return -1; 486 } 487 488 return 0; 489} 490 491static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft, 492 char **outbuf, size_t *outbytesleft) 493{ 494 int n; 495 496 n = MIN(*inbytesleft, *outbytesleft); 497 498 swab(*inbuf, *outbuf, (n&~1)); 499 if (n&1) { 500 (*outbuf)[n-1] = 0; 501 } 502 503 (*inbytesleft) -= n; 504 (*outbytesleft) -= n; 505 (*inbuf) += n; 506 (*outbuf) += n; 507 508 if (*inbytesleft > 0) { 509 errno = E2BIG; 510 return -1; 511 } 512 513 return 0; 514} 515 516static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft, 517 char **outbuf, size_t *outbytesleft) 518{ 519 int n; 520 521 n = MIN(*inbytesleft, *outbytesleft); 522 523 memmove(*outbuf, *inbuf, n); 524 525 (*inbytesleft) -= n; 526 (*outbytesleft) -= n; 527 (*inbuf) += n; 528 (*outbuf) += n; 529 530 if (*inbytesleft > 0) { 531 errno = E2BIG; 532 return -1; 533 } 534 535 return 0; 536} 537 538static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft, 539 char **outbuf, size_t *outbytesleft) 540{ 541 size_t in_left=*inbytesleft, out_left=*outbytesleft; 542 const uint8 *c = (const uint8 *)*inbuf; 543 uint8 *uc = (uint8 *)*outbuf; 544 545 while (in_left >= 1 && out_left >= 2) { 546 unsigned int codepoint; 547 548 if ((c[0] & 0x80) == 0) { 549 uc[0] = c[0]; 550 uc[1] = 0; 551 c += 1; 552 in_left -= 1; 553 out_left -= 2; 554 uc += 2; 555 continue; 556 } 557 558 if ((c[0] & 0xe0) == 0xc0) { 559 if (in_left < 2 || 560 (c[1] & 0xc0) != 0x80) { 561 errno = EILSEQ; 562 goto error; 563 } 564 codepoint = (c[1]&0x3f) | ((c[0]&0x1f)<<6); 565 if (codepoint < 0x80) { 566 /* don't accept UTF-8 characters that are not minimally packed */ 567 errno = EILSEQ; 568 goto error; 569 } 570 uc[1] = codepoint >> 8; 571 uc[0] = codepoint & 0xff; 572 c += 2; 573 in_left -= 2; 574 out_left -= 2; 575 uc += 2; 576 continue; 577 } 578 579 if ((c[0] & 0xf0) == 0xe0) { 580 if (in_left < 3 || 581 (c[1] & 0xc0) != 0x80 || 582 (c[2] & 0xc0) != 0x80) { 583 errno = EILSEQ; 584 goto error; 585 } 586 codepoint = (c[2]&0x3f) | ((c[1]&0x3f)<<6) | ((c[0]&0xf)<<12); 587 if (codepoint < 0x800) { 588 /* don't accept UTF-8 characters that are not minimally packed */ 589 errno = EILSEQ; 590 goto error; 591 } 592 uc[1] = codepoint >> 8; 593 uc[0] = codepoint & 0xff; 594 c += 3; 595 in_left -= 3; 596 out_left -= 2; 597 uc += 2; 598 continue; 599 } 600 601 if ((c[0] & 0xf8) == 0xf0) { 602 if (in_left < 4 || 603 (c[1] & 0xc0) != 0x80 || 604 (c[2] & 0xc0) != 0x80 || 605 (c[3] & 0xc0) != 0x80) { 606 errno = EILSEQ; 607 goto error; 608 } 609 codepoint = 610 (c[3]&0x3f) | 611 ((c[2]&0x3f)<<6) | 612 ((c[1]&0x3f)<<12) | 613 ((c[0]&0x7)<<18); 614 if (codepoint < 0x10000 || codepoint > 0x10ffff) { 615 /* don't accept UTF-8 characters that are not minimally packed */ 616 errno = EILSEQ; 617 goto error; 618 } 619 620 codepoint -= 0x10000; 621 622 if (out_left < 4) { 623 errno = E2BIG; 624 goto error; 625 } 626 627 uc[0] = (codepoint>>10) & 0xFF; 628 uc[1] = (codepoint>>18) | 0xd8; 629 uc[2] = codepoint & 0xFF; 630 uc[3] = ((codepoint>>8) & 0x3) | 0xdc; 631 c += 4; 632 in_left -= 4; 633 out_left -= 4; 634 uc += 4; 635 continue; 636 } 637 638 /* we don't handle 5 byte sequences */ 639 errno = EINVAL; 640 goto error; 641 } 642 643 if (in_left > 0) { 644 errno = E2BIG; 645 goto error; 646 } 647 648 *inbytesleft = in_left; 649 *outbytesleft = out_left; 650 *inbuf = (char *)c; 651 *outbuf = (char *)uc; 652 return 0; 653 654error: 655 *inbytesleft = in_left; 656 *outbytesleft = out_left; 657 *inbuf = (char *)c; 658 *outbuf = (char *)uc; 659 return -1; 660} 661 662static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft, 663 char **outbuf, size_t *outbytesleft) 664{ 665 size_t in_left=*inbytesleft, out_left=*outbytesleft; 666 uint8 *c = (uint8 *)*outbuf; 667 const uint8 *uc = (const uint8 *)*inbuf; 668 669 while (in_left >= 2 && out_left >= 1) { 670 unsigned int codepoint; 671 672 if (uc[1] == 0 && !(uc[0] & 0x80)) { 673 /* simplest case */ 674 c[0] = uc[0]; 675 in_left -= 2; 676 out_left -= 1; 677 uc += 2; 678 c += 1; 679 continue; 680 } 681 682 if ((uc[1]&0xf8) == 0) { 683 /* next simplest case */ 684 if (out_left < 2) { 685 errno = E2BIG; 686 goto error; 687 } 688 c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2); 689 c[1] = 0x80 | (uc[0] & 0x3f); 690 in_left -= 2; 691 out_left -= 2; 692 uc += 2; 693 c += 2; 694 continue; 695 } 696 697 if ((uc[1] & 0xfc) == 0xdc) { 698 /* its the second part of a 4 byte sequence. Illegal */ 699 if (in_left < 4) { 700 errno = EINVAL; 701 } else { 702 errno = EILSEQ; 703 } 704 goto error; 705 } 706 707 if ((uc[1] & 0xfc) != 0xd8) { 708 codepoint = uc[0] | (uc[1]<<8); 709 if (out_left < 3) { 710 errno = E2BIG; 711 goto error; 712 } 713 c[0] = 0xe0 | (codepoint >> 12); 714 c[1] = 0x80 | ((codepoint >> 6) & 0x3f); 715 c[2] = 0x80 | (codepoint & 0x3f); 716 717 in_left -= 2; 718 out_left -= 3; 719 uc += 2; 720 c += 3; 721 continue; 722 } 723 724 /* its the first part of a 4 byte sequence */ 725 if (in_left < 4) { 726 errno = EINVAL; 727 goto error; 728 } 729 if ((uc[3] & 0xfc) != 0xdc) { 730 errno = EILSEQ; 731 goto error; 732 } 733 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) | 734 (uc[0]<<10) | ((uc[1] & 0x3)<<18)); 735 736 if (out_left < 4) { 737 errno = E2BIG; 738 goto error; 739 } 740 c[0] = 0xf0 | (codepoint >> 18); 741 c[1] = 0x80 | ((codepoint >> 12) & 0x3f); 742 c[2] = 0x80 | ((codepoint >> 6) & 0x3f); 743 c[3] = 0x80 | (codepoint & 0x3f); 744 745 in_left -= 4; 746 out_left -= 4; 747 uc += 4; 748 c += 4; 749 } 750 751 if (in_left == 1) { 752 errno = EINVAL; 753 goto error; 754 } 755 756 if (in_left > 1) { 757 errno = E2BIG; 758 goto error; 759 } 760 761 *inbytesleft = in_left; 762 *outbytesleft = out_left; 763 *inbuf = (char *)uc; 764 *outbuf = (char *)c; 765 766 return 0; 767 768error: 769 *inbytesleft = in_left; 770 *outbytesleft = out_left; 771 *inbuf = (char *)uc; 772 *outbuf = (char *)c; 773 return -1; 774} 775 776