1///////////////////////////////////////////////////////////////////////////// 2// Name: src/common/strconv.cpp 3// Purpose: Unicode conversion classes 4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik, 5// Ryan Norton, Fredrik Roubert (UTF7) 6// Modified by: 7// Created: 29/01/98 8// RCS-ID: $Id: strconv.cpp 64156 2010-04-27 08:52:30Z VZ $ 9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik 10// (c) 2000-2003 Vadim Zeitlin 11// (c) 2004 Ryan Norton, Fredrik Roubert 12// Licence: wxWindows licence 13///////////////////////////////////////////////////////////////////////////// 14 15// For compilers that support precompilation, includes "wx.h". 16#include "wx/wxprec.h" 17 18#ifndef WX_PRECOMP 19 #ifdef __WXMSW__ 20 #include "wx/msw/missing.h" 21 #endif 22 #include "wx/intl.h" 23 #include "wx/log.h" 24 #include "wx/utils.h" 25 #include "wx/hashmap.h" 26#endif 27 28#include "wx/strconv.h" 29 30#if wxUSE_WCHAR_T 31 32#ifdef __WINDOWS__ 33 #include "wx/msw/private.h" 34#endif 35 36#ifndef __WXWINCE__ 37#include <errno.h> 38#endif 39 40#include <ctype.h> 41#include <string.h> 42#include <stdlib.h> 43 44#if defined(__WIN32__) && !defined(__WXMICROWIN__) 45 #define wxHAVE_WIN32_MB2WC 46#endif 47 48#ifdef __SALFORDC__ 49 #include <clib.h> 50#endif 51 52#ifdef HAVE_ICONV 53 #include <iconv.h> 54 #include "wx/thread.h" 55#endif 56 57#include "wx/encconv.h" 58#include "wx/fontmap.h" 59 60#ifdef __WXMAC__ 61#ifndef __DARWIN__ 62#include <ATSUnicode.h> 63#include <TextCommon.h> 64#include <TextEncodingConverter.h> 65#endif 66 67// includes Mac headers 68#include "wx/mac/private.h" 69#include "wx/thread.h" 70 71#endif 72 73 74#define TRACE_STRCONV _T("strconv") 75 76// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to 77// be 4 bytes 78#if SIZEOF_WCHAR_T == 2 79 #define WC_UTF16 80#endif 81 82 83// ============================================================================ 84// implementation 85// 
============================================================================ 86 87// helper function of cMB2WC(): check if n bytes at this location are all NUL 88static bool NotAllNULs(const char *p, size_t n) 89{ 90 while ( n && *p++ == '\0' ) 91 n--; 92 93 return n != 0; 94} 95 96// ---------------------------------------------------------------------------- 97// UTF-16 en/decoding to/from UCS-4 with surrogates handling 98// ---------------------------------------------------------------------------- 99 100static size_t encode_utf16(wxUint32 input, wxUint16 *output) 101{ 102 if (input <= 0xffff) 103 { 104 if (output) 105 *output = (wxUint16) input; 106 107 return 1; 108 } 109 else if (input >= 0x110000) 110 { 111 return wxCONV_FAILED; 112 } 113 else 114 { 115 if (output) 116 { 117 *output++ = (wxUint16) ((input >> 10) + 0xd7c0); 118 *output = (wxUint16) ((input & 0x3ff) + 0xdc00); 119 } 120 121 return 2; 122 } 123} 124 125static size_t decode_utf16(const wxUint16* input, wxUint32& output) 126{ 127 if ((*input < 0xd800) || (*input > 0xdfff)) 128 { 129 output = *input; 130 return 1; 131 } 132 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff)) 133 { 134 output = *input; 135 return wxCONV_FAILED; 136 } 137 else 138 { 139 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00); 140 return 2; 141 } 142} 143 144#ifdef WC_UTF16 145 typedef wchar_t wxDecodeSurrogate_t; 146#else // !WC_UTF16 147 typedef wxUint16 wxDecodeSurrogate_t; 148#endif // WC_UTF16/!WC_UTF16 149 150// returns the next UTF-32 character from the wchar_t buffer and advances the 151// pointer to the character after this one 152// 153// if an invalid character is found, *pSrc is set to NULL, the caller must 154// check for this 155static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc) 156{ 157 wxUint32 out; 158 const size_t 159 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out); 160 if ( n == wxCONV_FAILED ) 161 *pSrc = NULL; 162 else 163 *pSrc += n; 164 165 return out; 
166} 167 168// ---------------------------------------------------------------------------- 169// wxMBConv 170// ---------------------------------------------------------------------------- 171 172size_t 173wxMBConv::ToWChar(wchar_t *dst, size_t dstLen, 174 const char *src, size_t srcLen) const 175{ 176 // although new conversion classes are supposed to implement this function 177 // directly, the existins ones only implement the old MB2WC() and so, to 178 // avoid to have to rewrite all conversion classes at once, we provide a 179 // default (but not efficient) implementation of this one in terms of the 180 // old function by copying the input to ensure that it's NUL-terminated and 181 // then using MB2WC() to convert it 182 183 // the number of chars [which would be] written to dst [if it were not NULL] 184 size_t dstWritten = 0; 185 186 // the number of NULs terminating this string 187 size_t nulLen = 0; // not really needed, but just to avoid warnings 188 189 // if we were not given the input size we just have to assume that the 190 // string is properly terminated as we have no way of knowing how long it 191 // is anyhow, but if we do have the size check whether there are enough 192 // NULs at the end 193 wxCharBuffer bufTmp; 194 const char *srcEnd; 195 if ( srcLen != wxNO_LEN ) 196 { 197 // we need to know how to find the end of this string 198 nulLen = GetMBNulLen(); 199 if ( nulLen == wxCONV_FAILED ) 200 return wxCONV_FAILED; 201 202 // if there are enough NULs we can avoid the copy 203 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) ) 204 { 205 // make a copy in order to properly NUL-terminate the string 206 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */); 207 char * const p = bufTmp.data(); 208 memcpy(p, src, srcLen); 209 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ ) 210 *s = '\0'; 211 212 src = bufTmp; 213 } 214 215 srcEnd = src + srcLen; 216 } 217 else // quit after the first loop iteration 218 { 219 
srcEnd = NULL; 220 } 221 222 for ( ;; ) 223 { 224 // try to convert the current chunk 225 size_t lenChunk = MB2WC(NULL, src, 0); 226 if ( lenChunk == wxCONV_FAILED ) 227 return wxCONV_FAILED; 228 229 lenChunk++; // for the L'\0' at the end of this chunk 230 231 dstWritten += lenChunk; 232 233 if ( lenChunk == 1 ) 234 { 235 // nothing left in the input string, conversion succeeded 236 break; 237 } 238 239 if ( dst ) 240 { 241 if ( dstWritten > dstLen ) 242 return wxCONV_FAILED; 243 244 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED ) 245 return wxCONV_FAILED; 246 247 dst += lenChunk; 248 } 249 250 if ( !srcEnd ) 251 { 252 // we convert just one chunk in this case as this is the entire 253 // string anyhow 254 break; 255 } 256 257 // advance the input pointer past the end of this chunk 258 while ( NotAllNULs(src, nulLen) ) 259 { 260 // notice that we must skip over multiple bytes here as we suppose 261 // that if NUL takes 2 or 4 bytes, then all the other characters do 262 // too and so if advanced by a single byte we might erroneously 263 // detect sequences of NUL bytes in the middle of the input 264 src += nulLen; 265 } 266 267 src += nulLen; // skipping over its terminator as well 268 269 // note that ">=" (and not just "==") is needed here as the terminator 270 // we skipped just above could be inside or just after the buffer 271 // delimited by inEnd 272 if ( src >= srcEnd ) 273 break; 274 } 275 276 return dstWritten; 277} 278 279size_t 280wxMBConv::FromWChar(char *dst, size_t dstLen, 281 const wchar_t *src, size_t srcLen) const 282{ 283 // the number of chars [which would be] written to dst [if it were not NULL] 284 size_t dstWritten = 0; 285 286 // make a copy of the input string unless it is already properly 287 // NUL-terminated 288 // 289 // if we don't know its length we have no choice but to assume that it is, 290 // indeed, properly terminated 291 wxWCharBuffer bufTmp; 292 if ( srcLen == wxNO_LEN ) 293 { 294 srcLen = wxWcslen(src) + 1; 295 } 296 else 
if ( srcLen != 0 && src[srcLen - 1] != L'\0' ) 297 { 298 // make a copy in order to properly NUL-terminate the string 299 bufTmp = wxWCharBuffer(srcLen); 300 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t)); 301 src = bufTmp; 302 } 303 304 const size_t lenNul = GetMBNulLen(); 305 for ( const wchar_t * const srcEnd = src + srcLen; 306 src < srcEnd; 307 src += wxWcslen(src) + 1 /* skip L'\0' too */ ) 308 { 309 // try to convert the current chunk 310 size_t lenChunk = WC2MB(NULL, src, 0); 311 312 if ( lenChunk == wxCONV_FAILED ) 313 return wxCONV_FAILED; 314 315 lenChunk += lenNul; 316 dstWritten += lenChunk; 317 318 if ( dst ) 319 { 320 if ( dstWritten > dstLen ) 321 return wxCONV_FAILED; 322 323 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED ) 324 return wxCONV_FAILED; 325 326 dst += lenChunk; 327 } 328 } 329 330 return dstWritten; 331} 332 333size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const 334{ 335 size_t rc = ToWChar(outBuff, outLen, inBuff); 336 if ( rc != wxCONV_FAILED ) 337 { 338 // ToWChar() returns the buffer length, i.e. 
including the trailing 339 // NUL, while this method doesn't take it into account 340 rc--; 341 } 342 343 return rc; 344} 345 346size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const 347{ 348 size_t rc = FromWChar(outBuff, outLen, inBuff); 349 if ( rc != wxCONV_FAILED ) 350 { 351 rc -= GetMBNulLen(); 352 } 353 354 return rc; 355} 356 357wxMBConv::~wxMBConv() 358{ 359 // nothing to do here (necessary for Darwin linking probably) 360} 361 362const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const 363{ 364 if ( psz ) 365 { 366 // calculate the length of the buffer needed first 367 const size_t nLen = MB2WC(NULL, psz, 0); 368 if ( nLen != wxCONV_FAILED ) 369 { 370 // now do the actual conversion 371 wxWCharBuffer buf(nLen /* +1 added implicitly */); 372 373 // +1 for the trailing NULL 374 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED ) 375 return buf; 376 } 377 } 378 379 return wxWCharBuffer(); 380} 381 382const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const 383{ 384 if ( pwz ) 385 { 386 const size_t nLen = WC2MB(NULL, pwz, 0); 387 if ( nLen != wxCONV_FAILED ) 388 { 389 // extra space for trailing NUL(s) 390 static const size_t extraLen = GetMaxMBNulLen(); 391 392 wxCharBuffer buf(nLen + extraLen - 1); 393 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED ) 394 return buf; 395 } 396 } 397 398 return wxCharBuffer(); 399} 400 401const wxWCharBuffer 402wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const 403{ 404 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen); 405 if ( dstLen != wxCONV_FAILED ) 406 { 407 wxWCharBuffer wbuf(dstLen - 1); 408 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) 409 { 410 if ( outLen ) 411 { 412 *outLen = dstLen; 413 if ( wbuf[dstLen - 1] == L'\0' ) 414 (*outLen)--; 415 } 416 417 return wbuf; 418 } 419 } 420 421 if ( outLen ) 422 *outLen = 0; 423 424 return wxWCharBuffer(); 425} 426 427const wxCharBuffer 428wxMBConv::cWC2MB(const wchar_t 
*inBuff, size_t inLen, size_t *outLen) const 429{ 430 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen); 431 if ( dstLen != wxCONV_FAILED ) 432 { 433 // special case of empty input: can't allocate 0 size buffer below as 434 // wxCharBuffer insists on NUL-terminating it 435 wxCharBuffer buf(dstLen ? dstLen - 1 : 1); 436 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) 437 { 438 if ( outLen ) 439 { 440 *outLen = dstLen; 441 442 const size_t nulLen = GetMBNulLen(); 443 if ( dstLen >= nulLen && 444 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) ) 445 { 446 // in this case the output is NUL-terminated and we're not 447 // supposed to count NUL 448 *outLen -= nulLen; 449 } 450 } 451 452 return buf; 453 } 454 } 455 456 if ( outLen ) 457 *outLen = 0; 458 459 return wxCharBuffer(); 460} 461 462// ---------------------------------------------------------------------------- 463// wxMBConvLibc 464// ---------------------------------------------------------------------------- 465 466size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const 467{ 468 return wxMB2WC(buf, psz, n); 469} 470 471size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const 472{ 473 return wxWC2MB(buf, psz, n); 474} 475 476// ---------------------------------------------------------------------------- 477// wxConvBrokenFileNames 478// ---------------------------------------------------------------------------- 479 480#ifdef __UNIX__ 481 482wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset) 483{ 484 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0 485 || wxStricmp(charset, _T("UTF8")) == 0 ) 486 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA); 487 else 488 m_conv = new wxCSConv(charset); 489} 490 491#endif // __UNIX__ 492 493// ---------------------------------------------------------------------------- 494// UTF-7 495// ---------------------------------------------------------------------------- 496 497// 
Implementation (C) 2004 Fredrik Roubert 498 499// 500// BASE64 decoding table 501// 502static const unsigned char utf7unb64[] = 503{ 504 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 505 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 506 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 507 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 509 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, 510 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 511 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 512 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 513 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 514 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 515 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, 516 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 517 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 518 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 519 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, 520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 533 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 535 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 536}; 537 538size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const 539{ 540 size_t len = 0; 541 542 while ( *psz && (!buf || (len < n)) ) 543 { 544 unsigned char cc = *psz++; 545 if 
(cc != '+') 546 { 547 // plain ASCII char 548 if (buf) 549 *buf++ = cc; 550 len++; 551 } 552 else if (*psz == '-') 553 { 554 // encoded plus sign 555 if (buf) 556 *buf++ = cc; 557 len++; 558 psz++; 559 } 560 else // start of BASE64 encoded string 561 { 562 bool lsb, ok; 563 unsigned int d, l; 564 for ( ok = lsb = false, d = 0, l = 0; 565 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; 566 psz++ ) 567 { 568 d <<= 6; 569 d += cc; 570 for (l += 6; l >= 8; lsb = !lsb) 571 { 572 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256); 573 if (lsb) 574 { 575 if (buf) 576 *buf++ |= c; 577 len ++; 578 } 579 else 580 { 581 if (buf) 582 *buf = (wchar_t)(c << 8); 583 } 584 585 ok = true; 586 } 587 } 588 589 if ( !ok ) 590 { 591 // in valid UTF7 we should have valid characters after '+' 592 return wxCONV_FAILED; 593 } 594 595 if (*psz == '-') 596 psz++; 597 } 598 } 599 600 if ( buf && (len < n) ) 601 *buf = '\0'; 602 603 return len; 604} 605 606// 607// BASE64 encoding table 608// 609static const unsigned char utf7enb64[] = 610{ 611 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 612 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 613 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 614 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 615 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 616 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 617 'w', 'x', 'y', 'z', '0', '1', '2', '3', 618 '4', '5', '6', '7', '8', '9', '+', '/' 619}; 620 621// 622// UTF-7 encoding table 623// 624// 0 - Set D (directly encoded characters) 625// 1 - Set O (optional direct characters) 626// 2 - whitespace characters (optional) 627// 3 - special characters 628// 629static const unsigned char utf7encode[128] = 630{ 631 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 632 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 633 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, 634 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 635 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 636 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 637 1, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3 639}; 640 641size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const 642{ 643 size_t len = 0; 644 645 while (*psz && ((!buf) || (len < n))) 646 { 647 wchar_t cc = *psz++; 648 if (cc < 0x80 && utf7encode[cc] < 1) 649 { 650 // plain ASCII char 651 if (buf) 652 *buf++ = (char)cc; 653 654 len++; 655 } 656#ifndef WC_UTF16 657 else if (((wxUint32)cc) > 0xffff) 658 { 659 // no surrogate pair generation (yet?) 660 return wxCONV_FAILED; 661 } 662#endif 663 else 664 { 665 if (buf) 666 *buf++ = '+'; 667 668 len++; 669 if (cc != '+') 670 { 671 // BASE64 encode string 672 unsigned int lsb, d, l; 673 for (d = 0, l = 0; /*nothing*/; psz++) 674 { 675 for (lsb = 0; lsb < 2; lsb ++) 676 { 677 d <<= 8; 678 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8; 679 680 for (l += 8; l >= 6; ) 681 { 682 l -= 6; 683 if (buf) 684 *buf++ = utf7enb64[(d >> l) % 64]; 685 len++; 686 } 687 } 688 689 cc = *psz; 690 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1)) 691 break; 692 } 693 694 if (l != 0) 695 { 696 if (buf) 697 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64]; 698 699 len++; 700 } 701 } 702 703 if (buf) 704 *buf++ = '-'; 705 len++; 706 } 707 } 708 709 if (buf && (len < n)) 710 *buf = 0; 711 712 return len; 713} 714 715// ---------------------------------------------------------------------------- 716// UTF-8 717// ---------------------------------------------------------------------------- 718 719static wxUint32 utf8_max[]= 720 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff }; 721 722// boundaries of the private use area we use to (temporarily) remap invalid 723// characters invalid in a UTF-8 encoded string 724const wxUint32 wxUnicodePUA = 0x100000; 725const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256; 726 727size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const 728{ 729 size_t len = 0; 730 731 while (*psz && ((!buf) || (len < n))) 732 { 733 const char *opsz = psz; 
734 bool invalid = false; 735 unsigned char cc = *psz++, fc = cc; 736 unsigned cnt; 737 for (cnt = 0; fc & 0x80; cnt++) 738 fc <<= 1; 739 740 if (!cnt) 741 { 742 // plain ASCII char 743 if (buf) 744 *buf++ = cc; 745 len++; 746 747 // escape the escape character for octal escapes 748 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL) 749 && cc == '\\' && (!buf || len < n)) 750 { 751 if (buf) 752 *buf++ = cc; 753 len++; 754 } 755 } 756 else 757 { 758 cnt--; 759 if (!cnt) 760 { 761 // invalid UTF-8 sequence 762 invalid = true; 763 } 764 else 765 { 766 unsigned ocnt = cnt - 1; 767 wxUint32 res = cc & (0x3f >> cnt); 768 while (cnt--) 769 { 770 cc = *psz; 771 if ((cc & 0xC0) != 0x80) 772 { 773 // invalid UTF-8 sequence 774 invalid = true; 775 break; 776 } 777 778 psz++; 779 res = (res << 6) | (cc & 0x3f); 780 } 781 782 if (invalid || res <= utf8_max[ocnt]) 783 { 784 // illegal UTF-8 encoding 785 invalid = true; 786 } 787 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) && 788 res >= wxUnicodePUA && res < wxUnicodePUAEnd) 789 { 790 // if one of our PUA characters turns up externally 791 // it must also be treated as an illegal sequence 792 // (a bit like you have to escape an escape character) 793 invalid = true; 794 } 795 else 796 { 797#ifdef WC_UTF16 798 // cast is ok because wchar_t == wxUuint16 if WC_UTF16 799 size_t pa = encode_utf16(res, (wxUint16 *)buf); 800 if (pa == wxCONV_FAILED) 801 { 802 invalid = true; 803 } 804 else 805 { 806 if (buf) 807 buf += pa; 808 len += pa; 809 } 810#else // !WC_UTF16 811 if (buf) 812 *buf++ = (wchar_t)res; 813 len++; 814#endif // WC_UTF16/!WC_UTF16 815 } 816 } 817 818 if (invalid) 819 { 820 if (m_options & MAP_INVALID_UTF8_TO_PUA) 821 { 822 while (opsz < psz && (!buf || len < n)) 823 { 824#ifdef WC_UTF16 825 // cast is ok because wchar_t == wxUuint16 if WC_UTF16 826 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf); 827 wxASSERT(pa != wxCONV_FAILED); 828 if (buf) 829 buf += pa; 830 opsz++; 831 len += pa; 832#else 
833 if (buf) 834 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz); 835 opsz++; 836 len++; 837#endif 838 } 839 } 840 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL) 841 { 842 while (opsz < psz && (!buf || len < n)) 843 { 844 if ( buf && len + 3 < n ) 845 { 846 unsigned char on = *opsz; 847 *buf++ = L'\\'; 848 *buf++ = (wchar_t)( L'0' + on / 0100 ); 849 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 ); 850 *buf++ = (wchar_t)( L'0' + on % 010 ); 851 } 852 853 opsz++; 854 len += 4; 855 } 856 } 857 else // MAP_INVALID_UTF8_NOT 858 { 859 return wxCONV_FAILED; 860 } 861 } 862 } 863 } 864 865 if (buf && (len < n)) 866 *buf = 0; 867 868 return len; 869} 870 871static inline bool isoctal(wchar_t wch) 872{ 873 return L'0' <= wch && wch <= L'7'; 874} 875 876size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const 877{ 878 size_t len = 0; 879 880 while (*psz && ((!buf) || (len < n))) 881 { 882 wxUint32 cc; 883 884#ifdef WC_UTF16 885 // cast is ok for WC_UTF16 886 size_t pa = decode_utf16((const wxUint16 *)psz, cc); 887 psz += (pa == wxCONV_FAILED) ? 
1 : pa; 888#else 889 cc = (*psz++) & 0x7fffffff; 890#endif 891 892 if ( (m_options & MAP_INVALID_UTF8_TO_PUA) 893 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd ) 894 { 895 if (buf) 896 *buf++ = (char)(cc - wxUnicodePUA); 897 len++; 898 } 899 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) 900 && cc == L'\\' && psz[0] == L'\\' ) 901 { 902 if (buf) 903 *buf++ = (char)cc; 904 psz++; 905 len++; 906 } 907 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) && 908 cc == L'\\' && 909 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) ) 910 { 911 if (buf) 912 { 913 *buf++ = (char) ((psz[0] - L'0') * 0100 + 914 (psz[1] - L'0') * 010 + 915 (psz[2] - L'0')); 916 } 917 918 psz += 3; 919 len++; 920 } 921 else 922 { 923 unsigned cnt; 924 for (cnt = 0; cc > utf8_max[cnt]; cnt++) 925 { 926 } 927 928 if (!cnt) 929 { 930 // plain ASCII char 931 if (buf) 932 *buf++ = (char) cc; 933 len++; 934 } 935 else 936 { 937 len += cnt + 1; 938 if (buf) 939 { 940 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt))); 941 while (cnt--) 942 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f)); 943 } 944 } 945 } 946 } 947 948 if (buf && (len < n)) 949 *buf = 0; 950 951 return len; 952} 953 954// ============================================================================ 955// UTF-16 956// ============================================================================ 957 958#ifdef WORDS_BIGENDIAN 959 #define wxMBConvUTF16straight wxMBConvUTF16BE 960 #define wxMBConvUTF16swap wxMBConvUTF16LE 961#else 962 #define wxMBConvUTF16swap wxMBConvUTF16BE 963 #define wxMBConvUTF16straight wxMBConvUTF16LE 964#endif 965 966/* static */ 967size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen) 968{ 969 if ( srcLen == wxNO_LEN ) 970 { 971 // count the number of bytes in input, including the trailing NULs 972 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); 973 for ( srcLen = 1; *inBuff++; srcLen++ ) 974 ; 975 976 srcLen *= BYTES_PER_CHAR; 977 } 978 else // we 
already have the length 979 { 980 // we can only convert an entire number of UTF-16 characters 981 if ( srcLen % BYTES_PER_CHAR ) 982 return wxCONV_FAILED; 983 } 984 985 return srcLen; 986} 987 988// case when in-memory representation is UTF-16 too 989#ifdef WC_UTF16 990 991// ---------------------------------------------------------------------------- 992// conversions without endianness change 993// ---------------------------------------------------------------------------- 994 995size_t 996wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen, 997 const char *src, size_t srcLen) const 998{ 999 // set up the scene for using memcpy() (which is presumably more efficient 1000 // than copying the bytes one by one) 1001 srcLen = GetLength(src, srcLen); 1002 if ( srcLen == wxNO_LEN ) 1003 return wxCONV_FAILED; 1004 1005 const size_t inLen = srcLen / BYTES_PER_CHAR; 1006 if ( dst ) 1007 { 1008 if ( dstLen < inLen ) 1009 return wxCONV_FAILED; 1010 1011 memcpy(dst, src, srcLen); 1012 } 1013 1014 return inLen; 1015} 1016 1017size_t 1018wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen, 1019 const wchar_t *src, size_t srcLen) const 1020{ 1021 if ( srcLen == wxNO_LEN ) 1022 srcLen = wxWcslen(src) + 1; 1023 1024 srcLen *= BYTES_PER_CHAR; 1025 1026 if ( dst ) 1027 { 1028 if ( dstLen < srcLen ) 1029 return wxCONV_FAILED; 1030 1031 memcpy(dst, src, srcLen); 1032 } 1033 1034 return srcLen; 1035} 1036 1037// ---------------------------------------------------------------------------- 1038// endian-reversing conversions 1039// ---------------------------------------------------------------------------- 1040 1041size_t 1042wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, 1043 const char *src, size_t srcLen) const 1044{ 1045 srcLen = GetLength(src, srcLen); 1046 if ( srcLen == wxNO_LEN ) 1047 return wxCONV_FAILED; 1048 1049 srcLen /= BYTES_PER_CHAR; 1050 1051 if ( dst ) 1052 { 1053 if ( dstLen < srcLen ) 1054 return wxCONV_FAILED; 1055 1056 const wxUint16 *inBuff 
= wx_reinterpret_cast(const wxUint16 *, src); 1057 for ( size_t n = 0; n < srcLen; n++, inBuff++ ) 1058 { 1059 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff); 1060 } 1061 } 1062 1063 return srcLen; 1064} 1065 1066size_t 1067wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen, 1068 const wchar_t *src, size_t srcLen) const 1069{ 1070 if ( srcLen == wxNO_LEN ) 1071 srcLen = wxWcslen(src) + 1; 1072 1073 srcLen *= BYTES_PER_CHAR; 1074 1075 if ( dst ) 1076 { 1077 if ( dstLen < srcLen ) 1078 return wxCONV_FAILED; 1079 1080 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst); 1081 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ ) 1082 { 1083 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src); 1084 } 1085 } 1086 1087 return srcLen; 1088} 1089 1090#else // !WC_UTF16: wchar_t is UTF-32 1091 1092// ---------------------------------------------------------------------------- 1093// conversions without endianness change 1094// ---------------------------------------------------------------------------- 1095 1096size_t 1097wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen, 1098 const char *src, size_t srcLen) const 1099{ 1100 srcLen = GetLength(src, srcLen); 1101 if ( srcLen == wxNO_LEN ) 1102 return wxCONV_FAILED; 1103 1104 const size_t inLen = srcLen / BYTES_PER_CHAR; 1105 if ( !dst ) 1106 { 1107 // optimization: return maximal space which could be needed for this 1108 // string even if the real size could be smaller if the buffer contains 1109 // any surrogates 1110 return inLen; 1111 } 1112 1113 size_t outLen = 0; 1114 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); 1115 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) 1116 { 1117 const wxUint32 ch = wxDecodeSurrogate(&inBuff); 1118 if ( !inBuff ) 1119 return wxCONV_FAILED; 1120 1121 if ( ++outLen > dstLen ) 1122 return wxCONV_FAILED; 1123 1124 *dst++ = ch; 1125 } 1126 1127 1128 return outLen; 1129} 1130 1131size_t 1132wxMBConvUTF16straight::FromWChar(char *dst, size_t 
dstLen, 1133 const wchar_t *src, size_t srcLen) const 1134{ 1135 if ( srcLen == wxNO_LEN ) 1136 srcLen = wxWcslen(src) + 1; 1137 1138 size_t outLen = 0; 1139 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst); 1140 for ( size_t n = 0; n < srcLen; n++ ) 1141 { 1142 wxUint16 cc[2]; 1143 const size_t numChars = encode_utf16(*src++, cc); 1144 if ( numChars == wxCONV_FAILED ) 1145 return wxCONV_FAILED; 1146 1147 outLen += numChars * BYTES_PER_CHAR; 1148 if ( outBuff ) 1149 { 1150 if ( outLen > dstLen ) 1151 return wxCONV_FAILED; 1152 1153 *outBuff++ = cc[0]; 1154 if ( numChars == 2 ) 1155 { 1156 // second character of a surrogate 1157 *outBuff++ = cc[1]; 1158 } 1159 } 1160 } 1161 1162 return outLen; 1163} 1164 1165// ---------------------------------------------------------------------------- 1166// endian-reversing conversions 1167// ---------------------------------------------------------------------------- 1168 1169size_t 1170wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, 1171 const char *src, size_t srcLen) const 1172{ 1173 srcLen = GetLength(src, srcLen); 1174 if ( srcLen == wxNO_LEN ) 1175 return wxCONV_FAILED; 1176 1177 const size_t inLen = srcLen / BYTES_PER_CHAR; 1178 if ( !dst ) 1179 { 1180 // optimization: return maximal space which could be needed for this 1181 // string even if the real size could be smaller if the buffer contains 1182 // any surrogates 1183 return inLen; 1184 } 1185 1186 size_t outLen = 0; 1187 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); 1188 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) 1189 { 1190 wxUint32 ch; 1191 wxUint16 tmp[2]; 1192 1193 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff); 1194 inBuff++; 1195 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff); 1196 1197 const size_t numChars = decode_utf16(tmp, ch); 1198 if ( numChars == wxCONV_FAILED ) 1199 return wxCONV_FAILED; 1200 1201 if ( numChars == 2 ) 1202 inBuff++; 1203 1204 if ( ++outLen > dstLen ) 1205 return wxCONV_FAILED; 1206 
// NOTE: the statements up to the first closing brace at column 0 are the tail
// of the preceding ToWChar() implementation whose beginning lies above this
// span; they are reproduced unchanged.
        *dst++ = ch;
    }


    return outLen;
}

// Convert wide characters to byte-swapped UTF-16 (this is the variant used
// when wchar_t is not UTF-16, see the #endif below).
//
// dst/dstLen: output buffer and its size in bytes; dst may be NULL, in which
//             case nothing is written and only the needed size is computed
// src/srcLen: input and its length in wchar_t units; wxNO_LEN means that src
//             is NUL-terminated and the terminator is converted as well
//
// Returns the number of bytes (needed or written) or wxCONV_FAILED if a
// character can't be encoded or if the output buffer is too small.
size_t
wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen) const
{
    if ( srcLen == wxNO_LEN )
        srcLen = wxWcslen(src) + 1;

    size_t outLen = 0;
    wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
    for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
    {
        // encode_utf16() yields 1 unit for BMP characters and 2 (a surrogate
        // pair) for the others, it fails for values >= 0x110000
        wxUint16 cc[2];
        const size_t numChars = encode_utf16(*src, cc);
        if ( numChars == wxCONV_FAILED )
            return wxCONV_FAILED;

        // NOTE(review): BYTES_PER_CHAR is defined outside of this chunk,
        // presumably it is 2 in the UTF-16 section -- confirm
        outLen += numChars * BYTES_PER_CHAR;
        if ( outBuff )
        {
            // check for overflow before writing anything
            if ( outLen > dstLen )
                return wxCONV_FAILED;

            *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
            if ( numChars == 2 )
            {
                // second character of a surrogate
                *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
            }
        }
    }

    return outLen;
}

#endif // WC_UTF16/!WC_UTF16


// ============================================================================
// UTF-32
// ============================================================================

// "straight" is the converter for the byte order of this machine and "swap"
// the one for the opposite byte order
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;

// Return the length in bytes of the UTF-32 input.
//
// If srcLen is wxNO_LEN the input is scanned, as an array of wxUint32, up to
// the first zero element and the returned length includes this terminator.
// Otherwise srcLen is returned unchanged provided it is a multiple of
// BYTES_PER_CHAR and wxCONV_FAILED is returned if it isn't.
/* static */
size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
{
    if ( srcLen == wxNO_LEN )
    {
        // count the number of bytes in input, including the trailing NULs
        const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
        for ( srcLen = 1; *inBuff++; srcLen++ )
            ;

        srcLen *= BYTES_PER_CHAR;
    }
    else // we already have the length
    {
        // we can only convert an entire number of UTF-32 characters
        if ( srcLen % BYTES_PER_CHAR )
            return wxCONV_FAILED;
    }

    return srcLen;
}

// case when in-memory representation is UTF-16
#ifdef WC_UTF16

// ----------------------------------------------------------------------------
// conversions without endianness change
// ----------------------------------------------------------------------------

// Convert native byte order UTF-32 to UTF-16 wchar_t.
//
// Returns the number of wchar_t (needed if dst is NULL, written otherwise) or
// wxCONV_FAILED if the input length is invalid, a character can't be encoded
// in UTF-16 or dst is too small (dstLen is counted in wchar_t).
size_t
wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
                               const char *src, size_t srcLen) const
{
    // GetLength() signals failure with wxCONV_FAILED and the result is tested
    // against wxNO_LEN here
    // NOTE(review): this relies on both constants having the same value --
    // confirm against their definitions
    srcLen = GetLength(src, srcLen);
    if ( srcLen == wxNO_LEN )
        return wxCONV_FAILED;

    const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
    const size_t inLen = srcLen / BYTES_PER_CHAR;
    size_t outLen = 0;
    for ( size_t n = 0; n < inLen; n++ )
    {
        wxUint16 cc[2];
        const size_t numChars = encode_utf16(*inBuff++, cc);
        if ( numChars == wxCONV_FAILED )
            return wxCONV_FAILED;

        outLen += numChars;
        if ( dst )
        {
            if ( outLen > dstLen )
                return wxCONV_FAILED;

            *dst++ = cc[0];
            if ( numChars == 2 )
            {
                // second character of a surrogate
                *dst++ = cc[1];
            }
        }
    }

    return outLen;
}

// Convert UTF-16 wchar_t to native byte order UTF-32.
//
// Returns the number of bytes written to dst or, if dst is NULL, an upper
// bound of the size needed (see the comment below); wxCONV_FAILED is returned
// for invalid surrogates in the input or if dst is too small (dstLen is
// counted in bytes).
size_t
wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
                                 const wchar_t *src, size_t srcLen) const
{
    if ( srcLen == wxNO_LEN )
        srcLen = wxWcslen(src) + 1;

    if ( !dst )
    {
        // optimization: return maximal space which could be needed for this
        // string instead of the exact amount which could be less if there are
        // any surrogates in the input
        //
        // we consider that surrogates are rare enough to make it worthwhile to
        // avoid running the loop below at the cost of slightly extra memory
        // consumption
        return srcLen * BYTES_PER_CHAR;
    }

    wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
    size_t outLen = 0;
    for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
    {
        // wxDecodeSurrogate() advances src by 1 or 2 units itself and sets it
        // to NULL if the input is invalid
        const wxUint32 ch = wxDecodeSurrogate(&src);
        if ( !src )
            return wxCONV_FAILED;

        outLen += BYTES_PER_CHAR;

        if ( outLen > dstLen )
            return wxCONV_FAILED;

        *outBuff++ = ch;
    }

    return outLen;
}

// ----------------------------------------------------------------------------
// endian-reversing conversions
// ----------------------------------------------------------------------------

// Same as wxMBConvUTF32straight::ToWChar() but every UTF-32 input unit is
// byte-swapped before being encoded.
size_t
wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen) const
{
    srcLen = GetLength(src, srcLen);
    if ( srcLen == wxNO_LEN )
        return wxCONV_FAILED;

    const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
    const size_t inLen = srcLen / BYTES_PER_CHAR;
    size_t outLen = 0;
    for ( size_t n = 0; n < inLen; n++, inBuff++ )
    {
        wxUint16 cc[2];
        const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
        if ( numChars == wxCONV_FAILED )
            return wxCONV_FAILED;

        outLen += numChars;
        if ( dst )
        {
            if ( outLen > dstLen )
                return wxCONV_FAILED;

            *dst++ = cc[0];
            if ( numChars == 2 )
            {
                // second character of a surrogate
                *dst++ = cc[1];
            }
        }
    }

    return outLen;
}

// Same as wxMBConvUTF32straight::FromWChar() but every UTF-32 output unit is
// byte-swapped before being stored.
size_t
wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen) const
{
    if ( srcLen == wxNO_LEN )
        srcLen = wxWcslen(src) + 1;

    if ( !dst )
    {
        // optimization: return maximal space which could be needed for this
        // string instead of the exact amount which could be less if there are
        // any surrogates in the input
        //
        // we consider that surrogates are rare enough to make it worthwhile to
        // avoid running the loop below at the cost of slightly extra memory
        // consumption
        return srcLen*BYTES_PER_CHAR;
    }

    wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
    size_t outLen = 0;
    for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
    {
        const wxUint32 ch = wxDecodeSurrogate(&src);
        if ( !src )
            return wxCONV_FAILED;

        outLen += BYTES_PER_CHAR;

        if ( outLen > dstLen )
            return wxCONV_FAILED;

        *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
    }

    return outLen;
}

#else // !WC_UTF16: wchar_t is UTF-32

// ----------------------------------------------------------------------------
// conversions without endianness change
// ----------------------------------------------------------------------------

// wchar_t already is native byte order UTF-32 here, so the conversion is just
// a copy.
//
// Returns the number of wchar_t in the input or wxCONV_FAILED if the input
// length is invalid or dst is too small (dstLen is counted in wchar_t).
size_t
wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
                               const char *src, size_t srcLen) const
{
    // use memcpy() as it should be much faster than hand-written loop
    srcLen = GetLength(src, srcLen);
    if ( srcLen == wxNO_LEN )
        return wxCONV_FAILED;

    const size_t inLen = srcLen/BYTES_PER_CHAR;
    if ( dst )
    {
        if ( dstLen < inLen )
            return wxCONV_FAILED;

        memcpy(dst, src, srcLen);
    }

    return inLen;
}

// The converse of the function above.
//
// Returns the size of the output in bytes or wxCONV_FAILED if dst is too
// small (dstLen is counted in bytes).
size_t
wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
                                 const wchar_t *src, size_t srcLen) const
{
    if ( srcLen == wxNO_LEN )
        srcLen = wxWcslen(src) + 1;

    // from now on srcLen is the length in bytes, not in characters
    srcLen *= BYTES_PER_CHAR;

    if ( dst )
    {
        if ( dstLen < srcLen )
            return wxCONV_FAILED;

        memcpy(dst, src, srcLen);
    }

    return srcLen;
}

// ----------------------------------------------------------------------------
// endian-reversing conversions
// ----------------------------------------------------------------------------

// Convert UTF-32 in non-native byte order to wchar_t by swapping the bytes of
// each character.
//
// Returns the number of wchar_t in the input or wxCONV_FAILED if the input
// length is invalid or dst is too small (dstLen is counted in wchar_t).
size_t
wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen) const
{
    srcLen = GetLength(src, srcLen);
    if ( srcLen == wxNO_LEN )
        return wxCONV_FAILED;

    // from now on srcLen is the length in characters, not in bytes
    srcLen /= BYTES_PER_CHAR;

    if ( dst )
    {
        if ( dstLen < srcLen )
            return wxCONV_FAILED;

        const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
        for ( size_t n = 0; n < srcLen; n++, inBuff++ )
        {
            *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
        }
    }

    return srcLen;
}

// Convert wchar_t to UTF-32 in non-native byte order by swapping the bytes of
// each character.
//
// Returns the size of the output in bytes or wxCONV_FAILED if dst is too
// small (dstLen is counted in bytes).
size_t
wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen) const
{
    if ( srcLen == wxNO_LEN )
        srcLen = wxWcslen(src) + 1;

    // from now on srcLen is the length in bytes, not in characters
    srcLen *= BYTES_PER_CHAR;

    if ( dst )
    {
        if ( dstLen < srcLen )
            return wxCONV_FAILED;

        wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
        // n counts bytes, hence the BYTES_PER_CHAR step
        for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
        {
            *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
        }
    }

    return srcLen;
}

#endif // WC_UTF16/!WC_UTF16


// ============================================================================
// The classes doing conversion using the iconv_xxx() functions
// ============================================================================

#ifdef HAVE_ICONV

// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
1565#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1 1566#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \ 1567 (errno != E2BIG || bufLeft != 0)) 1568#else 1569#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1) 1570#endif 1571 1572#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x)) 1573 1574#define ICONV_T_INVALID ((iconv_t)-1) 1575 1576#if SIZEOF_WCHAR_T == 4 1577 #define WC_BSWAP wxUINT32_SWAP_ALWAYS 1578 #define WC_ENC wxFONTENCODING_UTF32 1579#elif SIZEOF_WCHAR_T == 2 1580 #define WC_BSWAP wxUINT16_SWAP_ALWAYS 1581 #define WC_ENC wxFONTENCODING_UTF16 1582#else // sizeof(wchar_t) != 2 nor 4 1583 // does this ever happen? 1584 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org" 1585#endif 1586 1587// ---------------------------------------------------------------------------- 1588// wxMBConv_iconv: encapsulates an iconv character set 1589// ---------------------------------------------------------------------------- 1590 1591class wxMBConv_iconv : public wxMBConv 1592{ 1593public: 1594 wxMBConv_iconv(const wxChar *name); 1595 virtual ~wxMBConv_iconv(); 1596 1597 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const; 1598 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const; 1599 1600 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment 1601 virtual size_t GetMBNulLen() const; 1602 1603 virtual wxMBConv *Clone() const 1604 { 1605 wxMBConv_iconv *p = new wxMBConv_iconv(m_name); 1606 p->m_minMBCharWidth = m_minMBCharWidth; 1607 return p; 1608 } 1609 1610 bool IsOk() const 1611 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); } 1612 1613protected: 1614 // the iconv handlers used to translate from multibyte 1615 // to wide char and in the other direction 1616 iconv_t m2w, 1617 w2m; 1618 1619#if wxUSE_THREADS 1620 // guards access to m2w and w2m objects 1621 wxMutex m_iconvMutex; 1622#endif 1623 1624private: 1625 // the name (for 
iconv_open()) of a wide char charset -- if none is 1626 // available on this machine, it will remain NULL 1627 static wxString ms_wcCharsetName; 1628 1629 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has 1630 // different endian-ness than the native one 1631 static bool ms_wcNeedsSwap; 1632 1633 1634 // name of the encoding handled by this conversion 1635 wxString m_name; 1636 1637 // cached result of GetMBNulLen(); set to 0 meaning "unknown" 1638 // initially 1639 size_t m_minMBCharWidth; 1640}; 1641 1642// make the constructor available for unit testing 1643WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name ) 1644{ 1645 wxMBConv_iconv* result = new wxMBConv_iconv( name ); 1646 if ( !result->IsOk() ) 1647 { 1648 delete result; 1649 return 0; 1650 } 1651 1652 return result; 1653} 1654 1655wxString wxMBConv_iconv::ms_wcCharsetName; 1656bool wxMBConv_iconv::ms_wcNeedsSwap = false; 1657 1658wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) 1659 : m_name(name) 1660{ 1661 m_minMBCharWidth = 0; 1662 1663 // iconv operates with chars, not wxChars, but luckily it uses only ASCII 1664 // names for the charsets 1665 const wxCharBuffer cname(wxString(name).ToAscii()); 1666 1667 // check for charset that represents wchar_t: 1668 if ( ms_wcCharsetName.empty() ) 1669 { 1670 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:")); 1671 1672#if wxUSE_FONTMAP 1673 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC); 1674#else // !wxUSE_FONTMAP 1675 static const wxChar *names_static[] = 1676 { 1677#if SIZEOF_WCHAR_T == 4 1678 _T("UCS-4"), 1679#elif SIZEOF_WCHAR_T == 2 1680 _T("UCS-2"), 1681#endif 1682 NULL 1683 }; 1684 const wxChar **names = names_static; 1685#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP 1686 1687 for ( ; *names && ms_wcCharsetName.empty(); ++names ) 1688 { 1689 const wxString nameCS(*names); 1690 1691 // first try charset with explicit bytesex info (e.g. 
"UCS-4LE"): 1692 wxString nameXE(nameCS); 1693 1694#ifdef WORDS_BIGENDIAN 1695 nameXE += _T("BE"); 1696#else // little endian 1697 nameXE += _T("LE"); 1698#endif 1699 1700 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""), 1701 nameXE.c_str()); 1702 1703 m2w = iconv_open(nameXE.ToAscii(), cname); 1704 if ( m2w == ICONV_T_INVALID ) 1705 { 1706 // try charset w/o bytesex info (e.g. "UCS4") 1707 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""), 1708 nameCS.c_str()); 1709 m2w = iconv_open(nameCS.ToAscii(), cname); 1710 1711 // and check for bytesex ourselves: 1712 if ( m2w != ICONV_T_INVALID ) 1713 { 1714 char buf[2], *bufPtr; 1715 wchar_t wbuf[2], *wbufPtr; 1716 size_t insz, outsz; 1717 size_t res; 1718 1719 buf[0] = 'A'; 1720 buf[1] = 0; 1721 wbuf[0] = 0; 1722 insz = 2; 1723 outsz = SIZEOF_WCHAR_T * 2; 1724 wbufPtr = wbuf; 1725 bufPtr = buf; 1726 1727 res = iconv( 1728 m2w, ICONV_CHAR_CAST(&bufPtr), &insz, 1729 (char**)&wbufPtr, &outsz); 1730 1731 if (ICONV_FAILED(res, insz)) 1732 { 1733 wxLogLastError(wxT("iconv")); 1734 wxLogError(_("Conversion to charset '%s' doesn't work."), 1735 nameCS.c_str()); 1736 } 1737 else // ok, can convert to this encoding, remember it 1738 { 1739 ms_wcCharsetName = nameCS; 1740 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0]; 1741 } 1742 } 1743 } 1744 else // use charset not requiring byte swapping 1745 { 1746 ms_wcCharsetName = nameXE; 1747 } 1748 } 1749 1750 wxLogTrace(TRACE_STRCONV, 1751 wxT("iconv wchar_t charset is \"%s\"%s"), 1752 ms_wcCharsetName.empty() ? _T("<none>") 1753 : ms_wcCharsetName.c_str(), 1754 ms_wcNeedsSwap ? 
_T(" (needs swap)") 1755 : _T("")); 1756 } 1757 else // we already have ms_wcCharsetName 1758 { 1759 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname); 1760 } 1761 1762 if ( ms_wcCharsetName.empty() ) 1763 { 1764 w2m = ICONV_T_INVALID; 1765 } 1766 else 1767 { 1768 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii()); 1769 if ( w2m == ICONV_T_INVALID ) 1770 { 1771 wxLogTrace(TRACE_STRCONV, 1772 wxT("\"%s\" -> \"%s\" works but not the converse!?"), 1773 ms_wcCharsetName.c_str(), cname.data()); 1774 } 1775 } 1776} 1777 1778wxMBConv_iconv::~wxMBConv_iconv() 1779{ 1780 if ( m2w != ICONV_T_INVALID ) 1781 iconv_close(m2w); 1782 if ( w2m != ICONV_T_INVALID ) 1783 iconv_close(w2m); 1784} 1785 1786size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const 1787{ 1788 // find the string length: notice that must be done differently for 1789 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs 1790 size_t inbuf; 1791 const size_t nulLen = GetMBNulLen(); 1792 switch ( nulLen ) 1793 { 1794 default: 1795 return wxCONV_FAILED; 1796 1797 case 1: 1798 inbuf = strlen(psz); // arguably more optimized than our version 1799 break; 1800 1801 case 2: 1802 case 4: 1803 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but 1804 // they also have to start at character boundary and not span two 1805 // adjacent characters 1806 const char *p; 1807 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen ) 1808 ; 1809 inbuf = p - psz; 1810 break; 1811 } 1812 1813#if wxUSE_THREADS 1814 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle. 1815 // Unfortunately there are a couple of global wxCSConv objects such as 1816 // wxConvLocal that are used all over wx code, so we have to make sure 1817 // the handle is used by at most one thread at the time. Otherwise 1818 // only a few wx classes would be safe to use from non-main threads 1819 // as MB<->WC conversion would fail "randomly". 
1820 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); 1821#endif // wxUSE_THREADS 1822 1823 size_t outbuf = n * SIZEOF_WCHAR_T; 1824 size_t res, cres; 1825 // VS: Use these instead of psz, buf because iconv() modifies its arguments: 1826 wchar_t *bufPtr = buf; 1827 const char *pszPtr = psz; 1828 1829 if (buf) 1830 { 1831 // have destination buffer, convert there 1832 cres = iconv(m2w, 1833 ICONV_CHAR_CAST(&pszPtr), &inbuf, 1834 (char**)&bufPtr, &outbuf); 1835 res = n - (outbuf / SIZEOF_WCHAR_T); 1836 1837 if (ms_wcNeedsSwap) 1838 { 1839 // convert to native endianness 1840 for ( unsigned i = 0; i < res; i++ ) 1841 buf[n] = WC_BSWAP(buf[i]); 1842 } 1843 1844 // NUL-terminate the string if there is any space left 1845 if (res < n) 1846 buf[res] = 0; 1847 } 1848 else 1849 { 1850 // no destination buffer... convert using temp buffer 1851 // to calculate destination buffer requirement 1852 wchar_t tbuf[8]; 1853 res = 0; 1854 1855 do 1856 { 1857 bufPtr = tbuf; 1858 outbuf = 8 * SIZEOF_WCHAR_T; 1859 1860 cres = iconv(m2w, 1861 ICONV_CHAR_CAST(&pszPtr), &inbuf, 1862 (char**)&bufPtr, &outbuf ); 1863 1864 res += 8 - (outbuf / SIZEOF_WCHAR_T); 1865 } 1866 while ((cres == (size_t)-1) && (errno == E2BIG)); 1867 } 1868 1869 if (ICONV_FAILED(cres, inbuf)) 1870 { 1871 //VS: it is ok if iconv fails, hence trace only 1872 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); 1873 return wxCONV_FAILED; 1874 } 1875 1876 return res; 1877} 1878 1879size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const 1880{ 1881#if wxUSE_THREADS 1882 // NB: explained in MB2WC 1883 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); 1884#endif 1885 1886 size_t inlen = wxWcslen(psz); 1887 size_t inbuf = inlen * SIZEOF_WCHAR_T; 1888 size_t outbuf = n; 1889 size_t res, cres; 1890 1891 wchar_t *tmpbuf = 0; 1892 1893 if (ms_wcNeedsSwap) 1894 { 1895 // need to copy to temp buffer to switch endianness 1896 // (doing 
WC_BSWAP twice on the original buffer won't help, as it 1897 // could be in read-only memory, or be accessed in some other thread) 1898 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T); 1899 for ( size_t i = 0; i < inlen; i++ ) 1900 tmpbuf[n] = WC_BSWAP(psz[i]); 1901 1902 tmpbuf[inlen] = L'\0'; 1903 psz = tmpbuf; 1904 } 1905 1906 if (buf) 1907 { 1908 // have destination buffer, convert there 1909 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf ); 1910 1911 res = n - outbuf; 1912 1913 // NB: iconv was given only wcslen(psz) characters on input, and so 1914 // it couldn't convert the trailing zero. Let's do it ourselves 1915 // if there's some room left for it in the output buffer. 1916 if (res < n) 1917 buf[0] = 0; 1918 } 1919 else 1920 { 1921 // no destination buffer: convert using temp buffer 1922 // to calculate destination buffer requirement 1923 char tbuf[16]; 1924 res = 0; 1925 do 1926 { 1927 buf = tbuf; 1928 outbuf = 16; 1929 1930 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf ); 1931 1932 res += 16 - outbuf; 1933 } 1934 while ((cres == (size_t)-1) && (errno == E2BIG)); 1935 } 1936 1937 if (ms_wcNeedsSwap) 1938 { 1939 free(tmpbuf); 1940 } 1941 1942 if (ICONV_FAILED(cres, inbuf)) 1943 { 1944 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); 1945 return wxCONV_FAILED; 1946 } 1947 1948 return res; 1949} 1950 1951size_t wxMBConv_iconv::GetMBNulLen() const 1952{ 1953 if ( m_minMBCharWidth == 0 ) 1954 { 1955 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv); 1956 1957#if wxUSE_THREADS 1958 // NB: explained in MB2WC 1959 wxMutexLocker lock(self->m_iconvMutex); 1960#endif 1961 1962 const wchar_t *wnul = L""; 1963 char buf[8]; // should be enough for NUL in any encoding 1964 size_t inLen = sizeof(wchar_t), 1965 outLen = WXSIZEOF(buf); 1966 char *inBuff = (char *)wnul; 1967 char *outBuff = buf; 1968 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 ) 
1969 { 1970 self->m_minMBCharWidth = (size_t)-1; 1971 } 1972 else // ok 1973 { 1974 self->m_minMBCharWidth = outBuff - buf; 1975 } 1976 } 1977 1978 return m_minMBCharWidth; 1979} 1980 1981#endif // HAVE_ICONV 1982 1983 1984// ============================================================================ 1985// Win32 conversion classes 1986// ============================================================================ 1987 1988#ifdef wxHAVE_WIN32_MB2WC 1989 1990// from utils.cpp 1991#if wxUSE_FONTMAP 1992extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset); 1993extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding); 1994#endif 1995 1996class wxMBConv_win32 : public wxMBConv 1997{ 1998public: 1999 wxMBConv_win32() 2000 { 2001 m_CodePage = CP_ACP; 2002 m_minMBCharWidth = 0; 2003 } 2004 2005 wxMBConv_win32(const wxMBConv_win32& conv) 2006 : wxMBConv() 2007 { 2008 m_CodePage = conv.m_CodePage; 2009 m_minMBCharWidth = conv.m_minMBCharWidth; 2010 } 2011 2012#if wxUSE_FONTMAP 2013 wxMBConv_win32(const wxChar* name) 2014 { 2015 m_CodePage = wxCharsetToCodepage(name); 2016 m_minMBCharWidth = 0; 2017 } 2018 2019 wxMBConv_win32(wxFontEncoding encoding) 2020 { 2021 m_CodePage = wxEncodingToCodepage(encoding); 2022 m_minMBCharWidth = 0; 2023 } 2024#endif // wxUSE_FONTMAP 2025 2026 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const 2027 { 2028 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it 2029 // the behaviour is not compatible with the Unix version (using iconv) 2030 // and break the library itself, e.g. wxTextInputStream::NextChar() 2031 // wouldn't work if reading an incomplete MB char didn't result in an 2032 // error 2033 // 2034 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or 2035 // Win XP or newer and it is not supported for UTF-[78] so we always 2036 // use our own conversions in this case. 
See 2037 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx 2038 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp 2039 if ( m_CodePage == CP_UTF8 ) 2040 { 2041 return wxConvUTF8.MB2WC(buf, psz, n); 2042 } 2043 2044 if ( m_CodePage == CP_UTF7 ) 2045 { 2046 return wxConvUTF7.MB2WC(buf, psz, n); 2047 } 2048 2049 int flags = 0; 2050 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) && 2051 IsAtLeastWin2kSP4() ) 2052 { 2053 flags = MB_ERR_INVALID_CHARS; 2054 } 2055 2056 const size_t len = ::MultiByteToWideChar 2057 ( 2058 m_CodePage, // code page 2059 flags, // flags: fall on error 2060 psz, // input string 2061 -1, // its length (NUL-terminated) 2062 buf, // output string 2063 buf ? n : 0 // size of output buffer 2064 ); 2065 if ( !len ) 2066 { 2067 // function totally failed 2068 return wxCONV_FAILED; 2069 } 2070 2071 // if we were really converting and didn't use MB_ERR_INVALID_CHARS, 2072 // check if we succeeded, by doing a double trip: 2073 if ( !flags && buf ) 2074 { 2075 const size_t mbLen = strlen(psz); 2076 wxCharBuffer mbBuf(mbLen); 2077 if ( ::WideCharToMultiByte 2078 ( 2079 m_CodePage, 2080 0, 2081 buf, 2082 -1, 2083 mbBuf.data(), 2084 mbLen + 1, // size in bytes, not length 2085 NULL, 2086 NULL 2087 ) == 0 || 2088 strcmp(mbBuf, psz) != 0 ) 2089 { 2090 // we didn't obtain the same thing we started from, hence 2091 // the conversion was lossy and we consider that it failed 2092 return wxCONV_FAILED; 2093 } 2094 } 2095 2096 // note that it returns count of written chars for buf != NULL and size 2097 // of the needed buffer for buf == NULL so in either case the length of 2098 // the string (which never includes the terminating NUL) is one less 2099 return len - 1; 2100 } 2101 2102 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const 2103 { 2104 /* 2105 we have a problem here: by default, WideCharToMultiByte() may 2106 replace characters unrepresentable in the target code page with bad 2107 quality approximations 
such as turning "1/2" symbol (U+00BD) into 2108 "1" for the code pages which don't have it and we, obviously, want 2109 to avoid this at any price 2110 2111 the trouble is that this function does it _silently_, i.e. it won't 2112 even tell us whether it did or not... Win98/2000 and higher provide 2113 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and 2114 we have to resort to a round trip, i.e. check that converting back 2115 results in the same string -- this is, of course, expensive but 2116 otherwise we simply can't be sure to not garble the data. 2117 */ 2118 2119 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN 2120 // it doesn't work with CJK encodings (which we test for rather roughly 2121 // here...) nor with UTF-7/8 nor, of course, with Windows versions not 2122 // supporting it 2123 BOOL usedDef wxDUMMY_INITIALIZE(false); 2124 BOOL *pUsedDef; 2125 int flags; 2126 if ( CanUseNoBestFit() && m_CodePage < 50000 ) 2127 { 2128 // it's our lucky day 2129 flags = WC_NO_BEST_FIT_CHARS; 2130 pUsedDef = &usedDef; 2131 } 2132 else // old system or unsupported encoding 2133 { 2134 flags = 0; 2135 pUsedDef = NULL; 2136 } 2137 2138 const size_t len = ::WideCharToMultiByte 2139 ( 2140 m_CodePage, // code page 2141 flags, // either none or no best fit 2142 pwz, // input string 2143 -1, // it is (wide) NUL-terminated 2144 buf, // output buffer 2145 buf ? n : 0, // and its size 2146 NULL, // default "replacement" char 2147 pUsedDef // [out] was it used? 2148 ); 2149 2150 if ( !len ) 2151 { 2152 // function totally failed 2153 return wxCONV_FAILED; 2154 } 2155 2156 // if we were really converting, check if we succeeded 2157 if ( buf ) 2158 { 2159 if ( flags ) 2160 { 2161 // check if the conversion failed, i.e. if any replacements 2162 // were done 2163 if ( usedDef ) 2164 return wxCONV_FAILED; 2165 } 2166 else // we must resort to double tripping... 
2167 { 2168 wxWCharBuffer wcBuf(n); 2169 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || 2170 wcscmp(wcBuf, pwz) != 0 ) 2171 { 2172 // we didn't obtain the same thing we started from, hence 2173 // the conversion was lossy and we consider that it failed 2174 return wxCONV_FAILED; 2175 } 2176 } 2177 } 2178 2179 // see the comment above for the reason of "len - 1" 2180 return len - 1; 2181 } 2182 2183 virtual size_t GetMBNulLen() const 2184 { 2185 if ( m_minMBCharWidth == 0 ) 2186 { 2187 int len = ::WideCharToMultiByte 2188 ( 2189 m_CodePage, // code page 2190 0, // no flags 2191 L"", // input string 2192 1, // translate just the NUL 2193 NULL, // output buffer 2194 0, // and its size 2195 NULL, // no replacement char 2196 NULL // [out] don't care if it was used 2197 ); 2198 2199 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32); 2200 switch ( len ) 2201 { 2202 default: 2203 wxLogDebug(_T("Unexpected NUL length %d"), len); 2204 self->m_minMBCharWidth = (size_t)-1; 2205 break; 2206 2207 case 0: 2208 self->m_minMBCharWidth = (size_t)-1; 2209 break; 2210 2211 case 1: 2212 case 2: 2213 case 4: 2214 self->m_minMBCharWidth = len; 2215 break; 2216 } 2217 } 2218 2219 return m_minMBCharWidth; 2220 } 2221 2222 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); } 2223 2224 bool IsOk() const { return m_CodePage != -1; } 2225 2226private: 2227 static bool CanUseNoBestFit() 2228 { 2229 static int s_isWin98Or2k = -1; 2230 2231 if ( s_isWin98Or2k == -1 ) 2232 { 2233 int verMaj, verMin; 2234 switch ( wxGetOsVersion(&verMaj, &verMin) ) 2235 { 2236 case wxOS_WINDOWS_9X: 2237 s_isWin98Or2k = verMaj >= 4 && verMin >= 10; 2238 break; 2239 2240 case wxOS_WINDOWS_NT: 2241 s_isWin98Or2k = verMaj >= 5; 2242 break; 2243 2244 default: 2245 // unknown: be conservative by default 2246 s_isWin98Or2k = 0; 2247 break; 2248 } 2249 2250 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") ); 2251 } 2252 2253 return s_isWin98Or2k == 1; 2254 } 2255 2256 static 
bool IsAtLeastWin2kSP4() 2257 { 2258#ifdef __WXWINCE__ 2259 return false; 2260#else 2261 static int s_isAtLeastWin2kSP4 = -1; 2262 2263 if ( s_isAtLeastWin2kSP4 == -1 ) 2264 { 2265 OSVERSIONINFOEX ver; 2266 2267 memset(&ver, 0, sizeof(ver)); 2268 ver.dwOSVersionInfoSize = sizeof(ver); 2269 GetVersionEx((OSVERSIONINFO*)&ver); 2270 2271 s_isAtLeastWin2kSP4 = 2272 ((ver.dwMajorVersion > 5) || // Vista+ 2273 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003 2274 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 && 2275 ver.wServicePackMajor >= 4)) // 2000 SP4+ 2276 ? 1 : 0; 2277 } 2278 2279 return s_isAtLeastWin2kSP4 == 1; 2280#endif 2281 } 2282 2283 2284 // the code page we're working with 2285 long m_CodePage; 2286 2287 // cached result of GetMBNulLen(), set to 0 initially meaning 2288 // "unknown" 2289 size_t m_minMBCharWidth; 2290}; 2291 2292#endif // wxHAVE_WIN32_MB2WC 2293 2294// ============================================================================ 2295// Cocoa conversion classes 2296// ============================================================================ 2297 2298#if defined(__WXCOCOA__) 2299 2300// RN: There is no UTF-32 support in either Core Foundation or Cocoa. 2301// Strangely enough, internally Core Foundation uses 2302// UTF-32 internally quite a bit - its just not public (yet). 
#include <CoreFoundation/CFString.h>
#include <CoreFoundation/CFStringEncodingExt.h>

// Map a wxFontEncoding value to the corresponding CFStringEncoding.
//
// wxFONTENCODING_DEFAULT maps to the encoding returned by
// CFStringGetSystemEncoding(); any encoding without an entry in the switch
// below yields kCFStringEncodingInvalidId.
CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
{
    // returned unchanged for the encodings not handled below
    CFStringEncoding enc = kCFStringEncodingInvalidId ;

    switch (encoding)
    {
        case wxFONTENCODING_DEFAULT :
            enc = CFStringGetSystemEncoding();
            break ;

        // ISO 8859 family
        case wxFONTENCODING_ISO8859_1 :
            enc = kCFStringEncodingISOLatin1 ;
            break ;
        case wxFONTENCODING_ISO8859_2 :
            enc = kCFStringEncodingISOLatin2;
            break ;
        case wxFONTENCODING_ISO8859_3 :
            enc = kCFStringEncodingISOLatin3 ;
            break ;
        case wxFONTENCODING_ISO8859_4 :
            enc = kCFStringEncodingISOLatin4;
            break ;
        case wxFONTENCODING_ISO8859_5 :
            enc = kCFStringEncodingISOLatinCyrillic;
            break ;
        case wxFONTENCODING_ISO8859_6 :
            enc = kCFStringEncodingISOLatinArabic;
            break ;
        case wxFONTENCODING_ISO8859_7 :
            enc = kCFStringEncodingISOLatinGreek;
            break ;
        case wxFONTENCODING_ISO8859_8 :
            enc = kCFStringEncodingISOLatinHebrew;
            break ;
        case wxFONTENCODING_ISO8859_9 :
            enc = kCFStringEncodingISOLatin5;
            break ;
        case wxFONTENCODING_ISO8859_10 :
            enc = kCFStringEncodingISOLatin6;
            break ;
        case wxFONTENCODING_ISO8859_11 :
            enc = kCFStringEncodingISOLatinThai;
            break ;
        case wxFONTENCODING_ISO8859_13 :
            enc = kCFStringEncodingISOLatin7;
            break ;
        case wxFONTENCODING_ISO8859_14 :
            enc = kCFStringEncodingISOLatin8;
            break ;
        case wxFONTENCODING_ISO8859_15 :
            enc = kCFStringEncodingISOLatin9;
            break ;

        case wxFONTENCODING_KOI8 :
            enc = kCFStringEncodingKOI8_R;
            break ;
        case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
            enc = kCFStringEncodingDOSRussian;
            break ;

        // no mapping is provided for this encoding
//      case wxFONTENCODING_BULGARIAN :
//          enc = ;
//          break ;

        // DOS and Windows code pages
        case wxFONTENCODING_CP437 :
            enc = kCFStringEncodingDOSLatinUS ;
            break ;
        case wxFONTENCODING_CP850 :
            enc = kCFStringEncodingDOSLatin1;
            break ;
        case wxFONTENCODING_CP852 :
            enc = kCFStringEncodingDOSLatin2;
            break ;
        case wxFONTENCODING_CP855 :
            enc = kCFStringEncodingDOSCyrillic;
            break ;
        case wxFONTENCODING_CP866 :
            enc = kCFStringEncodingDOSRussian ;
            break ;
        case wxFONTENCODING_CP874 :
            enc = kCFStringEncodingDOSThai;
            break ;
        case wxFONTENCODING_CP932 :
            enc = kCFStringEncodingDOSJapanese;
            break ;
        case wxFONTENCODING_CP936 :
            enc = kCFStringEncodingDOSChineseSimplif ;
            break ;
        case wxFONTENCODING_CP949 :
            enc = kCFStringEncodingDOSKorean;
            break ;
        case wxFONTENCODING_CP950 :
            enc = kCFStringEncodingDOSChineseTrad;
            break ;
        case wxFONTENCODING_CP1250 :
            enc = kCFStringEncodingWindowsLatin2;
            break ;
        case wxFONTENCODING_CP1251 :
            enc = kCFStringEncodingWindowsCyrillic ;
            break ;
        case wxFONTENCODING_CP1252 :
            enc = kCFStringEncodingWindowsLatin1 ;
            break ;
        case wxFONTENCODING_CP1253 :
            enc = kCFStringEncodingWindowsGreek;
            break ;
        case wxFONTENCODING_CP1254 :
            enc = kCFStringEncodingWindowsLatin5;
            break ;
        case wxFONTENCODING_CP1255 :
            enc = kCFStringEncodingWindowsHebrew ;
            break ;
        case wxFONTENCODING_CP1256 :
            enc = kCFStringEncodingWindowsArabic ;
            break ;
        case wxFONTENCODING_CP1257 :
            enc = kCFStringEncodingWindowsBalticRim;
            break ;

        // Unicode and other multibyte encodings
//      This only really encodes to UTF7 (if that) evidently
//      case wxFONTENCODING_UTF7 :
//          enc = kCFStringEncodingNonLossyASCII ;
//          break ;
        case wxFONTENCODING_UTF8 :
            enc = kCFStringEncodingUTF8 ;
            break ;
        case wxFONTENCODING_EUC_JP :
            enc = kCFStringEncodingEUC_JP;
            break ;
        case wxFONTENCODING_UTF16 :
            enc = kCFStringEncodingUnicode ;
            break ;

        // Mac encodings
        case wxFONTENCODING_MACROMAN :
            enc = kCFStringEncodingMacRoman ;
            break ;
        case wxFONTENCODING_MACJAPANESE :
            enc = kCFStringEncodingMacJapanese ;
            break ;
        case wxFONTENCODING_MACCHINESETRAD :
            enc = kCFStringEncodingMacChineseTrad ;
            break ;
        case wxFONTENCODING_MACKOREAN :
            enc = kCFStringEncodingMacKorean ;
            break ;
        case wxFONTENCODING_MACARABIC :
            enc = kCFStringEncodingMacArabic ;
            break ;
        case wxFONTENCODING_MACHEBREW :
            enc = kCFStringEncodingMacHebrew ;
            break ;
        case wxFONTENCODING_MACGREEK :
            enc = kCFStringEncodingMacGreek ;
            break ;
        case wxFONTENCODING_MACCYRILLIC :
            enc = kCFStringEncodingMacCyrillic ;
            break ;
        case wxFONTENCODING_MACDEVANAGARI :
            enc = kCFStringEncodingMacDevanagari ;
            break ;
        case wxFONTENCODING_MACGURMUKHI :
            enc = kCFStringEncodingMacGurmukhi ;
            break ;
        case wxFONTENCODING_MACGUJARATI :
            enc = kCFStringEncodingMacGujarati ;
            break ;
        case wxFONTENCODING_MACORIYA :
            enc = kCFStringEncodingMacOriya ;
            break ;
        case wxFONTENCODING_MACBENGALI :
            enc = kCFStringEncodingMacBengali ;
            break ;
        case wxFONTENCODING_MACTAMIL :
            enc = kCFStringEncodingMacTamil ;
            break ;
        case wxFONTENCODING_MACTELUGU :
            enc = kCFStringEncodingMacTelugu ;
            break ;
        case wxFONTENCODING_MACKANNADA :
            enc = kCFStringEncodingMacKannada ;
            break ;
        case wxFONTENCODING_MACMALAJALAM :
            enc = kCFStringEncodingMacMalayalam ;
            break ;
        case wxFONTENCODING_MACSINHALESE :
            enc = kCFStringEncodingMacSinhalese ;
            break ;
        case wxFONTENCODING_MACBURMESE :
            enc = kCFStringEncodingMacBurmese ;
            break ;
        case wxFONTENCODING_MACKHMER :
            enc = kCFStringEncodingMacKhmer ;
            break ;
        case wxFONTENCODING_MACTHAI :
            enc = kCFStringEncodingMacThai ;
            break ;
        case wxFONTENCODING_MACLAOTIAN :
            enc = kCFStringEncodingMacLaotian ;
            break ;
        case wxFONTENCODING_MACGEORGIAN :
            enc = kCFStringEncodingMacGeorgian ;
            break ;
        case wxFONTENCODING_MACARMENIAN :
            enc = kCFStringEncodingMacArmenian ;
            break ;
        case wxFONTENCODING_MACCHINESESIMP :
            enc = kCFStringEncodingMacChineseSimp ;
            break ;
        case wxFONTENCODING_MACTIBETAN :
            enc = kCFStringEncodingMacTibetan ;
            break ;
        case wxFONTENCODING_MACMONGOLIAN :
            enc = kCFStringEncodingMacMongolian ;
            break ;
        case wxFONTENCODING_MACETHIOPIC :
            enc = kCFStringEncodingMacEthiopic ;
            break ;
        case wxFONTENCODING_MACCENTRALEUR :
            enc = kCFStringEncodingMacCentralEurRoman ;
            break ;
        case wxFONTENCODING_MACVIATNAMESE :
            enc = kCFStringEncodingMacVietnamese ;
            break ;
        case wxFONTENCODING_MACARABICEXT :
            enc = kCFStringEncodingMacExtArabic ;
            break ;
        case wxFONTENCODING_MACSYMBOL :
            enc = kCFStringEncodingMacSymbol ;
            break ;
        case wxFONTENCODING_MACDINGBATS :
            enc = kCFStringEncodingMacDingbats ;
            break ;
        case wxFONTENCODING_MACTURKISH :
            enc = kCFStringEncodingMacTurkish ;
            break ;
        case wxFONTENCODING_MACCROATIAN :
            enc = kCFStringEncodingMacCroatian ;
            break ;
        case wxFONTENCODING_MACICELANDIC :
            enc = kCFStringEncodingMacIcelandic ;
            break ;
        case wxFONTENCODING_MACROMANIAN :
            enc = kCFStringEncodingMacRomanian ;
            break ;
        case wxFONTENCODING_MACCELTIC :
            enc = kCFStringEncodingMacCeltic ;
            break ;
        case wxFONTENCODING_MACGAELIC :
            enc = kCFStringEncodingMacGaelic ;
            break ;
//      case wxFONTENCODING_MACKEYBOARD :
//          enc = kCFStringEncodingMacKeyboardGlyphs ;
//          break ;

        default :
            // because gcc is picky
            break ;
    }

    return enc ;
}

// NOTE: the class below continues beyond the end of this span, its visible
// beginning is reproduced unchanged.
class wxMBConv_cocoa : public wxMBConv
{
public:
    wxMBConv_cocoa()
    {
        Init(CFStringGetSystemEncoding()) ;
    }

    wxMBConv_cocoa(const wxMBConv_cocoa& conv)
    {
        m_encoding = conv.m_encoding;
2578 } 2579 2580#if wxUSE_FONTMAP 2581 wxMBConv_cocoa(const wxChar* name) 2582 { 2583 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ; 2584 } 2585#endif 2586 2587 wxMBConv_cocoa(wxFontEncoding encoding) 2588 { 2589 Init( wxCFStringEncFromFontEnc(encoding) ); 2590 } 2591 2592 virtual ~wxMBConv_cocoa() 2593 { 2594 } 2595 2596 void Init( CFStringEncoding encoding) 2597 { 2598 m_encoding = encoding ; 2599 } 2600 2601 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const 2602 { 2603 wxASSERT(szUnConv); 2604 2605 CFStringRef theString = CFStringCreateWithBytes ( 2606 NULL, //the allocator 2607 (const UInt8*)szUnConv, 2608 strlen(szUnConv), 2609 m_encoding, 2610 false //no BOM/external representation 2611 ); 2612 2613 wxASSERT(theString); 2614 2615 size_t nOutLength = CFStringGetLength(theString); 2616 2617 if (szOut == NULL) 2618 { 2619 CFRelease(theString); 2620 return nOutLength; 2621 } 2622 2623 CFRange theRange = { 0, nOutSize }; 2624 2625#if SIZEOF_WCHAR_T == 4 2626 UniChar* szUniCharBuffer = new UniChar[nOutSize]; 2627#endif 2628 2629 CFStringGetCharacters(theString, theRange, szUniCharBuffer); 2630 2631 CFRelease(theString); 2632 2633 szUniCharBuffer[nOutLength] = '\0'; 2634 2635#if SIZEOF_WCHAR_T == 4 2636 wxMBConvUTF16 converter; 2637 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize ); 2638 delete [] szUniCharBuffer; 2639#endif 2640 2641 return nOutLength; 2642 } 2643 2644 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const 2645 { 2646 wxASSERT(szUnConv); 2647 2648 size_t nRealOutSize; 2649 size_t nBufSize = wxWcslen(szUnConv); 2650 UniChar* szUniBuffer = (UniChar*) szUnConv; 2651 2652#if SIZEOF_WCHAR_T == 4 2653 wxMBConvUTF16 converter ; 2654 nBufSize = converter.WC2MB( NULL, szUnConv, 0 ); 2655 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1]; 2656 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar)); 2657 nBufSize /= 
sizeof(UniChar); 2658#endif 2659 2660 CFStringRef theString = CFStringCreateWithCharactersNoCopy( 2661 NULL, //allocator 2662 szUniBuffer, 2663 nBufSize, 2664 kCFAllocatorNull //deallocator - we want to deallocate it ourselves 2665 ); 2666 2667 wxASSERT(theString); 2668 2669 //Note that CER puts a BOM when converting to unicode 2670 //so we check and use getchars instead in that case 2671 if (m_encoding == kCFStringEncodingUnicode) 2672 { 2673 if (szOut != NULL) 2674 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut); 2675 2676 nRealOutSize = CFStringGetLength(theString) + 1; 2677 } 2678 else 2679 { 2680 CFStringGetBytes( 2681 theString, 2682 CFRangeMake(0, CFStringGetLength(theString)), 2683 m_encoding, 2684 0, //what to put in characters that can't be converted - 2685 //0 tells CFString to return NULL if it meets such a character 2686 false, //not an external representation 2687 (UInt8*) szOut, 2688 nOutSize, 2689 (CFIndex*) &nRealOutSize 2690 ); 2691 } 2692 2693 CFRelease(theString); 2694 2695#if SIZEOF_WCHAR_T == 4 2696 delete[] szUniBuffer; 2697#endif 2698 2699 return nRealOutSize - 1; 2700 } 2701 2702 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); } 2703 2704 bool IsOk() const 2705 { 2706 return m_encoding != kCFStringEncodingInvalidId && 2707 CFStringIsEncodingAvailable(m_encoding); 2708 } 2709 2710private: 2711 CFStringEncoding m_encoding ; 2712}; 2713 2714#endif // defined(__WXCOCOA__) 2715 2716// ============================================================================ 2717// Mac conversion classes 2718// ============================================================================ 2719 2720#if defined(__WXMAC__) && defined(TARGET_CARBON) 2721 2722class wxMBConv_mac : public wxMBConv 2723{ 2724public: 2725 wxMBConv_mac() 2726 { 2727 Init(CFStringGetSystemEncoding()) ; 2728 } 2729 2730 wxMBConv_mac(const wxMBConv_mac& conv) 2731 { 2732 Init(conv.m_char_encoding); 2733 } 2734 2735#if wxUSE_FONTMAP 2736 
wxMBConv_mac(const wxChar* name) 2737 { 2738 wxFontEncoding enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false); 2739 Init( (enc != wxFONTENCODING_SYSTEM) ? wxMacGetSystemEncFromFontEnc( enc ) : kTextEncodingUnknown); 2740 } 2741#endif 2742 2743 wxMBConv_mac(wxFontEncoding encoding) 2744 { 2745 Init( wxMacGetSystemEncFromFontEnc(encoding) ); 2746 } 2747 2748 virtual ~wxMBConv_mac() 2749 { 2750 OSStatus status = noErr ; 2751 if (m_MB2WC_converter) 2752 status = TECDisposeConverter(m_MB2WC_converter); 2753 if (m_WC2MB_converter) 2754 status = TECDisposeConverter(m_WC2MB_converter); 2755 } 2756 2757 void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant , 2758 TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat) 2759 { 2760 m_MB2WC_converter = NULL ; 2761 m_WC2MB_converter = NULL ; 2762 if ( encoding != kTextEncodingUnknown ) 2763 { 2764 m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ; 2765 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ; 2766 } 2767 else 2768 { 2769 m_char_encoding = kTextEncodingUnknown; 2770 m_unicode_encoding = kTextEncodingUnknown; 2771 } 2772 } 2773 2774 virtual void CreateIfNeeded() const 2775 { 2776 if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL && 2777 m_char_encoding != kTextEncodingUnknown && m_unicode_encoding != kTextEncodingUnknown ) 2778 { 2779 OSStatus status = noErr ; 2780 status = TECCreateConverter(&m_MB2WC_converter, 2781 m_char_encoding, 2782 m_unicode_encoding); 2783 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ; 2784 status = TECCreateConverter(&m_WC2MB_converter, 2785 m_unicode_encoding, 2786 m_char_encoding); 2787 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ; 2788 } 2789 } 2790 2791 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const 2792 { 2793 CreateIfNeeded() ; 2794 OSStatus status = noErr ; 
2795 ByteCount byteOutLen ; 2796 ByteCount byteInLen = strlen(psz) + 1; 2797 wchar_t *tbuf = NULL ; 2798 UniChar* ubuf = NULL ; 2799 size_t res = 0 ; 2800 2801 if (buf == NULL) 2802 { 2803 // Apple specs say at least 32 2804 n = wxMax( 32, byteInLen ) ; 2805 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ; 2806 } 2807 2808 ByteCount byteBufferLen = n * sizeof( UniChar ) ; 2809 2810#if SIZEOF_WCHAR_T == 4 2811 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; 2812#else 2813 ubuf = (UniChar*) (buf ? buf : tbuf) ; 2814#endif 2815 { 2816#if wxUSE_THREADS 2817 wxMutexLocker lock( m_MB2WC_guard ); 2818#endif 2819 status = TECConvertText( 2820 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen, 2821 (TextPtr) ubuf, byteBufferLen, &byteOutLen); 2822 } 2823 2824#if SIZEOF_WCHAR_T == 4 2825 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar 2826 // is not properly terminated we get random characters at the end 2827 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; 2828 wxMBConvUTF16 converter ; 2829 res = converter.MB2WC( (buf ? 
buf : tbuf), (const char*)ubuf, n ) ; 2830 free( ubuf ) ; 2831#else 2832 res = byteOutLen / sizeof( UniChar ) ; 2833#endif 2834 2835 if ( buf == NULL ) 2836 free(tbuf) ; 2837 2838 if ( buf && res < n) 2839 buf[res] = 0; 2840 2841 return res ; 2842 } 2843 2844 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const 2845 { 2846 CreateIfNeeded() ; 2847 OSStatus status = noErr ; 2848 ByteCount byteOutLen ; 2849 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; 2850 2851 char *tbuf = NULL ; 2852 2853 if (buf == NULL) 2854 { 2855 // Apple specs say at least 32 2856 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); 2857 tbuf = (char*) malloc( n ) ; 2858 } 2859 2860 ByteCount byteBufferLen = n ; 2861 UniChar* ubuf = NULL ; 2862 2863#if SIZEOF_WCHAR_T == 4 2864 wxMBConvUTF16 converter ; 2865 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ; 2866 byteInLen = unicharlen ; 2867 ubuf = (UniChar*) malloc( byteInLen + 2 ) ; 2868 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ; 2869#else 2870 ubuf = (UniChar*) psz ; 2871#endif 2872 2873 { 2874#if wxUSE_THREADS 2875 wxMutexLocker lock( m_WC2MB_guard ); 2876#endif 2877 status = TECConvertText( 2878 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen, 2879 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen); 2880 } 2881 2882#if SIZEOF_WCHAR_T == 4 2883 free( ubuf ) ; 2884#endif 2885 2886 if ( buf == NULL ) 2887 free(tbuf) ; 2888 2889 size_t res = byteOutLen ; 2890 if ( buf && res < n) 2891 { 2892 buf[res] = 0; 2893 2894 //we need to double-trip to verify it didn't insert any ? 
in place 2895 //of bogus characters 2896 wxWCharBuffer wcBuf(n); 2897 size_t pszlen = wxWcslen(psz); 2898 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || 2899 wxWcslen(wcBuf) != pszlen || 2900 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 ) 2901 { 2902 // we didn't obtain the same thing we started from, hence 2903 // the conversion was lossy and we consider that it failed 2904 return wxCONV_FAILED; 2905 } 2906 } 2907 2908 return res ; 2909 } 2910 2911 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); } 2912 2913 bool IsOk() const 2914 { 2915 CreateIfNeeded() ; 2916 return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; 2917 } 2918 2919protected : 2920 mutable TECObjectRef m_MB2WC_converter; 2921 mutable TECObjectRef m_WC2MB_converter; 2922#if wxUSE_THREADS 2923 mutable wxMutex m_MB2WC_guard; 2924 mutable wxMutex m_WC2MB_guard; 2925#endif 2926 2927 TextEncodingBase m_char_encoding; 2928 TextEncodingBase m_unicode_encoding; 2929}; 2930 2931// MB is decomposed (D) normalized UTF8 2932 2933class wxMBConv_macUTF8D : public wxMBConv_mac 2934{ 2935public : 2936 wxMBConv_macUTF8D() 2937 { 2938 Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ; 2939 m_uni = NULL; 2940 m_uniBack = NULL ; 2941 } 2942 2943 virtual ~wxMBConv_macUTF8D() 2944 { 2945 if (m_uni!=NULL) 2946 DisposeUnicodeToTextInfo(&m_uni); 2947 if (m_uniBack!=NULL) 2948 DisposeUnicodeToTextInfo(&m_uniBack); 2949 } 2950 2951 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const 2952 { 2953 CreateIfNeeded() ; 2954 OSStatus status = noErr ; 2955 ByteCount byteOutLen ; 2956 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; 2957 2958 char *tbuf = NULL ; 2959 2960 if (buf == NULL) 2961 { 2962 // Apple specs say at least 32 2963 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); 2964 tbuf = (char*) malloc( n ) ; 2965 } 2966 2967 ByteCount byteBufferLen = n ; 2968 UniChar* ubuf = NULL ; 2969 2970#if SIZEOF_WCHAR_T == 4 2971 
wxMBConvUTF16 converter ; 2972 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ; 2973 byteInLen = unicharlen ; 2974 ubuf = (UniChar*) malloc( byteInLen + 2 ) ; 2975 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ; 2976#else 2977 ubuf = (UniChar*) psz ; 2978#endif 2979 2980 // ubuf is a non-decomposed UniChar buffer 2981 2982 ByteCount dcubuflen = byteInLen * 2 + 2 ; 2983 ByteCount dcubufread , dcubufwritten ; 2984 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ; 2985 2986 { 2987#if wxUSE_THREADS 2988 wxMutexLocker lock( m_WC2MB_guard ); 2989#endif 2990 ConvertFromUnicodeToText( m_uni , byteInLen , ubuf , 2991 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ; 2992 2993 // we now convert that decomposed buffer into UTF8 2994 2995 status = TECConvertText( 2996 m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread, 2997 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen); 2998 } 2999 3000 free( dcubuf ); 3001 3002#if SIZEOF_WCHAR_T == 4 3003 free( ubuf ) ; 3004#endif 3005 3006 if ( buf == NULL ) 3007 free(tbuf) ; 3008 3009 size_t res = byteOutLen ; 3010 if ( buf && res < n) 3011 { 3012 buf[res] = 0; 3013 // don't test for round-trip fidelity yet, we cannot guarantee it yet 3014 } 3015 3016 return res ; 3017 } 3018 3019 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const 3020 { 3021 CreateIfNeeded() ; 3022 OSStatus status = noErr ; 3023 ByteCount byteOutLen ; 3024 ByteCount byteInLen = strlen(psz) + 1; 3025 wchar_t *tbuf = NULL ; 3026 UniChar* ubuf = NULL ; 3027 size_t res = 0 ; 3028 3029 if (buf == NULL) 3030 { 3031 // Apple specs say at least 32 3032 n = wxMax( 32, byteInLen ) ; 3033 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ; 3034 } 3035 3036 ByteCount byteBufferLen = n * sizeof( UniChar ) ; 3037 3038#if SIZEOF_WCHAR_T == 4 3039 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; 3040#else 3041 ubuf = (UniChar*) (buf ? 
buf : tbuf) ; 3042#endif 3043 3044 ByteCount dcubuflen = byteBufferLen * 2 + 2 ; 3045 ByteCount dcubufread , dcubufwritten ; 3046 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ; 3047 3048 { 3049#if wxUSE_THREADS 3050 wxMutexLocker lock( m_MB2WC_guard ); 3051#endif 3052 status = TECConvertText( 3053 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen, 3054 (TextPtr) dcubuf, dcubuflen, &byteOutLen); 3055 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar 3056 // is not properly terminated we get random characters at the end 3057 dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; 3058 3059 // now from the decomposed UniChar to properly composed uniChar 3060 ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf , 3061 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ; 3062 } 3063 3064 free( dcubuf ); 3065 byteOutLen = dcubufwritten ; 3066 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; 3067 3068 3069#if SIZEOF_WCHAR_T == 4 3070 wxMBConvUTF16 converter ; 3071 res = converter.MB2WC( (buf ? 
buf : tbuf), (const char*)ubuf, n ) ; 3072 free( ubuf ) ; 3073#else 3074 res = byteOutLen / sizeof( UniChar ) ; 3075#endif 3076 3077 if ( buf == NULL ) 3078 free(tbuf) ; 3079 3080 if ( buf && res < n) 3081 buf[res] = 0; 3082 3083 return res ; 3084 } 3085 3086 virtual void CreateIfNeeded() const 3087 { 3088 wxMBConv_mac::CreateIfNeeded() ; 3089 if ( m_uni == NULL ) 3090 { 3091 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 3092 kUnicodeNoSubset, kTextEncodingDefaultFormat); 3093 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 3094 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat); 3095 m_map.mappingVersion = kUnicodeUseLatestMapping; 3096 3097 OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni); 3098 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ; 3099 3100 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 3101 kUnicodeNoSubset, kTextEncodingDefaultFormat); 3102 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 3103 kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat); 3104 m_map.mappingVersion = kUnicodeUseLatestMapping; 3105 err = CreateUnicodeToTextInfo(&m_map, &m_uniBack); 3106 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ; 3107 } 3108 } 3109protected : 3110 mutable UnicodeToTextInfo m_uni; 3111 mutable UnicodeToTextInfo m_uniBack; 3112 mutable UnicodeMapping m_map; 3113}; 3114#endif // defined(__WXMAC__) && defined(TARGET_CARBON) 3115 3116// ============================================================================ 3117// wxEncodingConverter based conversion classes 3118// ============================================================================ 3119 3120#if wxUSE_FONTMAP 3121 3122class wxMBConv_wxwin : public wxMBConv 3123{ 3124private: 3125 void Init() 3126 { 3127 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) && 3128 w2m.Init(wxFONTENCODING_UNICODE, m_enc); 3129 } 3130 3131public: 3132 // 
temporarily just use wxEncodingConverter stuff, 3133 // so that it works while a better implementation is built 3134 wxMBConv_wxwin(const wxChar* name) 3135 { 3136 if (name) 3137 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false); 3138 else 3139 m_enc = wxFONTENCODING_SYSTEM; 3140 3141 Init(); 3142 } 3143 3144 wxMBConv_wxwin(wxFontEncoding enc) 3145 { 3146 m_enc = enc; 3147 3148 Init(); 3149 } 3150 3151 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const 3152 { 3153 size_t inbuf = strlen(psz); 3154 if (buf) 3155 { 3156 if (!m2w.Convert(psz, buf)) 3157 return wxCONV_FAILED; 3158 } 3159 return inbuf; 3160 } 3161 3162 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const 3163 { 3164 const size_t inbuf = wxWcslen(psz); 3165 if (buf) 3166 { 3167 if (!w2m.Convert(psz, buf)) 3168 return wxCONV_FAILED; 3169 } 3170 3171 return inbuf; 3172 } 3173 3174 virtual size_t GetMBNulLen() const 3175 { 3176 switch ( m_enc ) 3177 { 3178 case wxFONTENCODING_UTF16BE: 3179 case wxFONTENCODING_UTF16LE: 3180 return 2; 3181 3182 case wxFONTENCODING_UTF32BE: 3183 case wxFONTENCODING_UTF32LE: 3184 return 4; 3185 3186 default: 3187 return 1; 3188 } 3189 } 3190 3191 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); } 3192 3193 bool IsOk() const { return m_ok; } 3194 3195public: 3196 wxFontEncoding m_enc; 3197 wxEncodingConverter m2w, w2m; 3198 3199private: 3200 // were we initialized successfully? 
3201 bool m_ok; 3202 3203 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin) 3204}; 3205 3206// make the constructors available for unit testing 3207WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name ) 3208{ 3209 wxMBConv_wxwin* result = new wxMBConv_wxwin( name ); 3210 if ( !result->IsOk() ) 3211 { 3212 delete result; 3213 return 0; 3214 } 3215 3216 return result; 3217} 3218 3219#endif // wxUSE_FONTMAP 3220 3221// ============================================================================ 3222// wxCSConv implementation 3223// ============================================================================ 3224 3225void wxCSConv::Init() 3226{ 3227 m_name = NULL; 3228 m_convReal = NULL; 3229 m_deferred = true; 3230} 3231 3232wxCSConv::wxCSConv(const wxChar *charset) 3233{ 3234 Init(); 3235 3236 if ( charset ) 3237 { 3238 SetName(charset); 3239 } 3240 3241#if wxUSE_FONTMAP 3242 m_encoding = wxFontMapperBase::GetEncodingFromName(charset); 3243 if ( m_encoding == wxFONTENCODING_MAX ) 3244 { 3245 // set to unknown/invalid value 3246 m_encoding = wxFONTENCODING_SYSTEM; 3247 } 3248 else if ( m_encoding == wxFONTENCODING_DEFAULT ) 3249 { 3250 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context 3251 m_encoding = wxFONTENCODING_ISO8859_1; 3252 } 3253#else 3254 m_encoding = wxFONTENCODING_SYSTEM; 3255#endif 3256} 3257 3258wxCSConv::wxCSConv(wxFontEncoding encoding) 3259{ 3260 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT ) 3261 { 3262 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") ); 3263 3264 encoding = wxFONTENCODING_SYSTEM; 3265 } 3266 3267 Init(); 3268 3269 m_encoding = encoding; 3270} 3271 3272wxCSConv::~wxCSConv() 3273{ 3274 Clear(); 3275} 3276 3277wxCSConv::wxCSConv(const wxCSConv& conv) 3278 : wxMBConv() 3279{ 3280 Init(); 3281 3282 SetName(conv.m_name); 3283 m_encoding = conv.m_encoding; 3284} 3285 3286wxCSConv& wxCSConv::operator=(const wxCSConv& conv) 3287{ 3288 Clear(); 3289 3290 SetName(conv.m_name); 3291 m_encoding 
= conv.m_encoding; 3292 3293 return *this; 3294} 3295 3296void wxCSConv::Clear() 3297{ 3298 free(m_name); 3299 delete m_convReal; 3300 3301 m_name = NULL; 3302 m_convReal = NULL; 3303} 3304 3305void wxCSConv::SetName(const wxChar *charset) 3306{ 3307 if (charset) 3308 { 3309 m_name = wxStrdup(charset); 3310 m_deferred = true; 3311 } 3312} 3313 3314#if wxUSE_FONTMAP 3315 3316WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual, 3317 wxEncodingNameCache ); 3318 3319static wxEncodingNameCache gs_nameCache; 3320#endif 3321 3322wxMBConv *wxCSConv::DoCreate() const 3323{ 3324#if wxUSE_FONTMAP 3325 wxLogTrace(TRACE_STRCONV, 3326 wxT("creating conversion for %s"), 3327 (m_name ? m_name 3328 : wxFontMapperBase::GetEncodingName(m_encoding).c_str())); 3329#endif // wxUSE_FONTMAP 3330 3331 // check for the special case of ASCII or ISO8859-1 charset: as we have 3332 // special knowledge of it anyhow, we don't need to create a special 3333 // conversion object 3334 if ( m_encoding == wxFONTENCODING_ISO8859_1 || 3335 m_encoding == wxFONTENCODING_DEFAULT ) 3336 { 3337 // don't convert at all 3338 return NULL; 3339 } 3340 3341 // we trust OS to do conversion better than we can so try external 3342 // conversion methods first 3343 // 3344 // the full order is: 3345 // 1. OS conversion (iconv() under Unix or Win32 API) 3346 // 2. hard coded conversions for UTF 3347 // 3. 
wxEncodingConverter as fall back 3348 3349 // step (1) 3350#ifdef HAVE_ICONV 3351#if !wxUSE_FONTMAP 3352 if ( m_name ) 3353#endif // !wxUSE_FONTMAP 3354 { 3355 wxString name(m_name); 3356#if wxUSE_FONTMAP 3357 wxFontEncoding encoding(m_encoding); 3358#endif 3359 3360 if ( !name.empty() ) 3361 { 3362 wxMBConv_iconv *conv = new wxMBConv_iconv(name); 3363 if ( conv->IsOk() ) 3364 return conv; 3365 3366 delete conv; 3367 3368#if wxUSE_FONTMAP 3369 encoding = 3370 wxFontMapperBase::Get()->CharsetToEncoding(name, false); 3371#endif // wxUSE_FONTMAP 3372 } 3373#if wxUSE_FONTMAP 3374 { 3375 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding); 3376 if ( it != gs_nameCache.end() ) 3377 { 3378 if ( it->second.empty() ) 3379 return NULL; 3380 3381 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second); 3382 if ( conv->IsOk() ) 3383 return conv; 3384 3385 delete conv; 3386 } 3387 3388 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding); 3389 // CS : in case this does not return valid names (eg for MacRoman) encoding 3390 // got a 'failure' entry in the cache all the same, although it just has to 3391 // be created using a different method, so only store failed iconv creation 3392 // attempts (or perhaps we shoulnd't do this at all ?) 3393 if ( names[0] != NULL ) 3394 { 3395 for ( ; *names; ++names ) 3396 { 3397 wxMBConv_iconv *conv = new wxMBConv_iconv(*names); 3398 if ( conv->IsOk() ) 3399 { 3400 gs_nameCache[encoding] = *names; 3401 return conv; 3402 } 3403 3404 delete conv; 3405 } 3406 3407 gs_nameCache[encoding] = _T(""); // cache the failure 3408 } 3409 } 3410#endif // wxUSE_FONTMAP 3411 } 3412#endif // HAVE_ICONV 3413 3414#ifdef wxHAVE_WIN32_MB2WC 3415 { 3416#if wxUSE_FONTMAP 3417 wxMBConv_win32 *conv = m_name ? 
new wxMBConv_win32(m_name) 3418 : new wxMBConv_win32(m_encoding); 3419 if ( conv->IsOk() ) 3420 return conv; 3421 3422 delete conv; 3423#else 3424 return NULL; 3425#endif 3426 } 3427#endif // wxHAVE_WIN32_MB2WC 3428 3429#if defined(__WXMAC__) 3430 { 3431 // leave UTF16 and UTF32 to the built-ins of wx 3432 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE || 3433 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) ) 3434 { 3435#if wxUSE_FONTMAP 3436 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name) 3437 : new wxMBConv_mac(m_encoding); 3438#else 3439 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding); 3440#endif 3441 if ( conv->IsOk() ) 3442 return conv; 3443 3444 delete conv; 3445 } 3446 } 3447#endif 3448 3449#if defined(__WXCOCOA__) 3450 { 3451 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) ) 3452 { 3453#if wxUSE_FONTMAP 3454 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name) 3455 : new wxMBConv_cocoa(m_encoding); 3456#else 3457 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding); 3458#endif 3459 3460 if ( conv->IsOk() ) 3461 return conv; 3462 3463 delete conv; 3464 } 3465 } 3466#endif 3467 // step (2) 3468 wxFontEncoding enc = m_encoding; 3469#if wxUSE_FONTMAP 3470 if ( enc == wxFONTENCODING_SYSTEM && m_name ) 3471 { 3472 // use "false" to suppress interactive dialogs -- we can be called from 3473 // anywhere and popping up a dialog from here is the last thing we want to 3474 // do 3475 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false); 3476 } 3477#endif // wxUSE_FONTMAP 3478 3479 switch ( enc ) 3480 { 3481 case wxFONTENCODING_UTF7: 3482 return new wxMBConvUTF7; 3483 3484 case wxFONTENCODING_UTF8: 3485 return new wxMBConvUTF8; 3486 3487 case wxFONTENCODING_UTF16BE: 3488 return new wxMBConvUTF16BE; 3489 3490 case wxFONTENCODING_UTF16LE: 3491 return new wxMBConvUTF16LE; 3492 3493 case wxFONTENCODING_UTF32BE: 3494 return new wxMBConvUTF32BE; 3495 3496 case wxFONTENCODING_UTF32LE: 3497 return new 
wxMBConvUTF32LE; 3498 3499 default: 3500 // nothing to do but put here to suppress gcc warnings 3501 break; 3502 } 3503 3504 // step (3) 3505#if wxUSE_FONTMAP 3506 { 3507 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name) 3508 : new wxMBConv_wxwin(m_encoding); 3509 if ( conv->IsOk() ) 3510 return conv; 3511 3512 delete conv; 3513 } 3514#endif // wxUSE_FONTMAP 3515 3516 // NB: This is a hack to prevent deadlock. What could otherwise happen 3517 // in Unicode build: wxConvLocal creation ends up being here 3518 // because of some failure and logs the error. But wxLog will try to 3519 // attach a timestamp, for which it will need wxConvLocal (to convert 3520 // time to char* and then wchar_t*), but that fails, tries to log the 3521 // error, but wxLog has an (already locked) critical section that 3522 // guards the static buffer. 3523 static bool alreadyLoggingError = false; 3524 if (!alreadyLoggingError) 3525 { 3526 alreadyLoggingError = true; 3527 wxLogError(_("Cannot convert from the charset '%s'!"), 3528 m_name ? 
m_name 3529 : 3530#if wxUSE_FONTMAP 3531 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str() 3532#else // !wxUSE_FONTMAP 3533 wxString::Format(_("encoding %i"), m_encoding).c_str() 3534#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP 3535 ); 3536 3537 alreadyLoggingError = false; 3538 } 3539 3540 return NULL; 3541} 3542 3543void wxCSConv::CreateConvIfNeeded() const 3544{ 3545 if ( m_deferred ) 3546 { 3547 wxCSConv *self = (wxCSConv *)this; // const_cast 3548 3549 // if we don't have neither the name nor the encoding, use the default 3550 // encoding for this system 3551 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM ) 3552 { 3553#if wxUSE_INTL 3554 self->m_encoding = wxLocale::GetSystemEncoding(); 3555#else 3556 // fallback to some reasonable default: 3557 self->m_encoding = wxFONTENCODING_ISO8859_1; 3558#endif // wxUSE_INTL 3559 } 3560 3561 self->m_convReal = DoCreate(); 3562 self->m_deferred = false; 3563 } 3564} 3565 3566bool wxCSConv::IsOk() const 3567{ 3568 CreateConvIfNeeded(); 3569 3570 // special case: no convReal created for wxFONTENCODING_ISO8859_1 3571 if ( m_encoding == wxFONTENCODING_ISO8859_1 ) 3572 return true; // always ok as we do it ourselves 3573 3574 // m_convReal->IsOk() is called at its own creation, so we know it must 3575 // be ok if m_convReal is non-NULL 3576 return m_convReal != NULL; 3577} 3578 3579size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen, 3580 const char *src, size_t srcLen) const 3581{ 3582 CreateConvIfNeeded(); 3583 3584 if (m_convReal) 3585 return m_convReal->ToWChar(dst, dstLen, src, srcLen); 3586 3587 // latin-1 (direct) 3588 if ( srcLen == wxNO_LEN ) 3589 srcLen = strlen(src) + 1; // take trailing NUL too 3590 3591 if ( dst ) 3592 { 3593 if ( dstLen < srcLen ) 3594 return wxCONV_FAILED; 3595 3596 for ( size_t n = 0; n < srcLen; n++ ) 3597 dst[n] = (unsigned char)(src[n]); 3598 } 3599 3600 return srcLen; 3601} 3602 3603size_t wxCSConv::FromWChar(char *dst, size_t dstLen, 3604 const wchar_t *src, size_t srcLen) const 
3605{ 3606 CreateConvIfNeeded(); 3607 3608 if (m_convReal) 3609 return m_convReal->FromWChar(dst, dstLen, src, srcLen); 3610 3611 // latin-1 (direct) 3612 if ( srcLen == wxNO_LEN ) 3613 srcLen = wxWcslen(src) + 1; 3614 3615 if ( dst ) 3616 { 3617 if ( dstLen < srcLen ) 3618 return wxCONV_FAILED; 3619 3620 for ( size_t n = 0; n < srcLen; n++ ) 3621 { 3622 if ( src[n] > 0xFF ) 3623 return wxCONV_FAILED; 3624 3625 dst[n] = (char)src[n]; 3626 } 3627 3628 } 3629 else // still need to check the input validity 3630 { 3631 for ( size_t n = 0; n < srcLen; n++ ) 3632 { 3633 if ( src[n] > 0xFF ) 3634 return wxCONV_FAILED; 3635 } 3636 } 3637 3638 return srcLen; 3639} 3640 3641size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const 3642{ 3643 // this function exists only for ABI-compatibility in 2.8 branch 3644 return wxMBConv::MB2WC(buf, psz, n); 3645} 3646 3647size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const 3648{ 3649 // this function exists only for ABI-compatibility in 2.8 branch 3650 return wxMBConv::WC2MB(buf, psz, n); 3651} 3652 3653size_t wxCSConv::GetMBNulLen() const 3654{ 3655 CreateConvIfNeeded(); 3656 3657 if ( m_convReal ) 3658 { 3659 return m_convReal->GetMBNulLen(); 3660 } 3661 3662 return 1; 3663} 3664 3665// ---------------------------------------------------------------------------- 3666// globals 3667// ---------------------------------------------------------------------------- 3668 3669#ifdef __WINDOWS__ 3670 static wxMBConv_win32 wxConvLibcObj; 3671#elif defined(__WXMAC__) && !defined(__MACH__) 3672 static wxMBConv_mac wxConvLibcObj ; 3673#else 3674 static wxMBConvLibc wxConvLibcObj; 3675#endif 3676 3677static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM); 3678static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1); 3679static wxMBConvUTF7 wxConvUTF7Obj; 3680static wxMBConvUTF8 wxConvUTF8Obj; 3681#if defined(__WXMAC__) && defined(TARGET_CARBON) 3682static wxMBConv_macUTF8D wxConvMacUTF8DObj; 3683#endif 
3684WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj; 3685WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj; 3686WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj; 3687WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj; 3688WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj; 3689WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj; 3690WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal; 3691WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = & 3692#ifdef __WXOSX__ 3693#if defined(__WXMAC__) && defined(TARGET_CARBON) 3694 wxConvMacUTF8DObj; 3695#else 3696 wxConvUTF8Obj; 3697#endif 3698#else // !__WXOSX__ 3699 wxConvLibcObj; 3700#endif // __WXOSX__/!__WXOSX__ 3701 3702#if wxUSE_UNICODE 3703 3704wxWCharBuffer wxSafeConvertMB2WX(const char *s) 3705{ 3706 if ( !s ) 3707 return wxWCharBuffer(); 3708 3709 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s)); 3710 if ( !wbuf ) 3711 wbuf = wxConvUTF8.cMB2WX(s); 3712 if ( !wbuf ) 3713 wbuf = wxConvISO8859_1.cMB2WX(s); 3714 3715 return wbuf; 3716} 3717 3718wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws) 3719{ 3720 if ( !ws ) 3721 return wxCharBuffer(); 3722 3723 wxCharBuffer buf(wxConvLibc.cWX2MB(ws)); 3724 if ( !buf ) 3725 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws); 3726 3727 return buf; 3728} 3729 3730#endif // wxUSE_UNICODE 3731 3732#else // !wxUSE_WCHAR_T 3733 3734// stand-ins in absence of wchar_t 3735WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc, 3736 wxConvISO8859_1, 3737 wxConvLocal, 3738 wxConvUTF8; 3739 3740WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = NULL; 3741 3742#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T 3743