1/* win32tc.c -- Interface to Win32 transcoding routines 2 3 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 $Id$ 7*/ 8 9/* keep these here to keep file non-empty */ 10#include <tidy.h> 11#include "forward.h" 12#include "streamio.h" 13#include "tmbstr.h" 14#include "utf8.h" 15 16#ifdef TIDY_WIN32_MLANG_SUPPORT 17 18#define VC_EXTRALEAN 19#define CINTERFACE 20#define COBJMACROS 21 22#include <windows.h> 23#include <mlang.h> 24 25#undef COBJMACROS 26#undef CINTERFACE 27#undef VC_EXTRALEAN 28 29/* maximum number of bytes for a single character */ 30#define TC_INBUFSIZE 16 31 32/* maximum number of characters per byte sequence */ 33#define TC_OUTBUFSIZE 16 34 35#define CreateMLangObject(p) \ 36 CoCreateInstance( \ 37 &CLSID_CMLangConvertCharset, \ 38 NULL, \ 39 CLSCTX_ALL, \ 40 &IID_IMLangConvertCharset, \ 41 (VOID **)&p); 42 43 44/* Character Set to Microsoft Windows Codepage Identifier map, */ 45/* from <rotor/sscli/clr/src/classlibnative/nls/encodingdata.cpp>. */ 46 47/* note: the 'safe' field indicates whether this encoding can be */ 48/* read/written character-by-character; this does not apply to */ 49/* various stateful encodings such as ISO-2022 or UTF-7, these */ 50/* must be read/written as a complete stream. It is possible that */ 51/* some 'unsafe' encodings are marked as 'save'. */ 52 53/* todo: cleanup; Tidy should use only a single mapping table to */ 54/* circumvent unsupported aliases in other transcoding libraries, */ 55/* enable reverse lookup of encoding names and ease maintenance. */ 56 57static struct _nameWinCPMap 58{ 59 tmbstr name; 60 uint wincp; 61 Bool safe; 62} const NameWinCPMap[] = { 63 { "cp037", 37, yes }, 64 { "csibm037", 37, yes }, 65 { "ebcdic-cp-ca", 37, yes }, 66 { "ebcdic-cp-nl", 37, yes }, 67 { "ebcdic-cp-us", 37, yes }, 68 { "ebcdic-cp-wt", 37, yes }, 69 { "ibm037", 37, yes }, 70 { "cp437", 437, yes }, 71 { "cspc8codepage437", 437, yes }, 72 { "ibm437", 437, yes }, 73 { "cp500", 500, yes }, 74 { "csibm500", 500, yes }, 75 { "ebcdic-cp-be", 500, yes }, 76 { "ebcdic-cp-ch", 500, yes }, 77 { "ibm500", 500, yes }, 78 { "asmo-708", 708, yes }, 79 { "dos-720", 720, yes }, 80 { "ibm737", 737, yes }, 81 { "ibm775", 775, yes }, 82 { "cp850", 850, yes }, 83 { "ibm850", 850, yes }, 84 { "cp852", 852, yes }, 85 { "ibm852", 852, yes }, 86 { "cp855", 855, yes }, 87 { "ibm855", 855, yes }, 88 { "cp857", 857, yes }, 89 { "ibm857", 857, yes }, 90 { "ccsid00858", 858, yes }, 91 { "cp00858", 858, yes }, 92 { "cp858", 858, yes }, 93 { "ibm00858", 858, yes }, 94 { "pc-multilingual-850+euro", 858, yes }, 95 { "cp860", 860, yes }, 96 { "ibm860", 860, yes }, 97 { "cp861", 861, yes }, 98 { "ibm861", 861, yes }, 99 { "cp862", 862, yes }, 100 { "dos-862", 862, yes }, 101 { "ibm862", 862, yes }, 102 { "cp863", 863, yes }, 103 { "ibm863", 863, yes }, 104 { "cp864", 864, yes }, 105 { "ibm864", 864, yes }, 106 { "cp865", 865, yes }, 107 { "ibm865", 865, yes }, 108 { "cp866", 866, yes }, 109 { "ibm866", 866, yes }, 110 { "cp869", 869, yes }, 111 { "ibm869", 869, yes }, 112 { "cp870", 870, yes }, 113 { "csibm870", 870, yes }, 114 { "ebcdic-cp-roece", 870, yes }, 115 { "ebcdic-cp-yu", 870, yes }, 116 { "ibm870", 870, yes }, 117 { "dos-874", 874, yes }, 118 { "iso-8859-11", 874, yes }, 119 { "tis-620", 874, yes }, 120 { "windows-874", 874, yes }, 121 { "cp875", 875, yes }, 122 { "csshiftjis", 932, yes }, 123 { "cswindows31j", 932, yes }, 124 { "ms_kanji", 932, yes }, 125 { "shift-jis", 932, yes }, 126 { "shift_jis", 932, yes }, 127 { "sjis", 932, yes }, 128 { "x-ms-cp932", 932, yes }, 129 { "x-sjis", 932, yes }, 130 { "chinese", 936, yes }, 131 { "cn-gb", 936, yes }, 132 { "csgb2312", 936, yes }, 133 { "csgb231280", 936, yes }, 134 { "csiso58gb231280", 936, yes }, 135 { "gb2312", 936, yes }, 136 { "gb2312-80", 936, yes }, 137 { "gb231280", 936, yes }, 138 { "gb_2312-80", 936, yes }, 139 { "gbk", 936, yes }, 140 { "iso-ir-58", 936, yes }, 141 { "csksc56011987", 949, yes }, 142 { "iso-ir-149", 949, yes }, 143 { "korean", 949, yes }, 144 { "ks-c-5601", 949, yes }, 145 { "ks-c5601", 949, yes }, 146 { "ks_c_5601", 949, yes }, 147 { "ks_c_5601-1987", 949, yes }, 148 { "ks_c_5601-1989", 949, yes }, 149 { "ks_c_5601_1987", 949, yes }, 150 { "ksc5601", 949, yes }, 151 { "ksc_5601", 949, yes }, 152 { "big5", 950, yes }, 153 { "big5-hkscs", 950, yes }, 154 { "cn-big5", 950, yes }, 155 { "csbig5", 950, yes }, 156 { "x-x-big5", 950, yes }, 157 { "cp1026", 1026, yes }, 158 { "csibm1026", 1026, yes }, 159 { "ibm1026", 1026, yes }, 160 { "ibm01047", 1047, yes }, 161 { "ccsid01140", 1140, yes }, 162 { "cp01140", 1140, yes }, 163 { "ebcdic-us-37+euro", 1140, yes }, 164 { "ibm01140", 1140, yes }, 165 { "ccsid01141", 1141, yes }, 166 { "cp01141", 1141, yes }, 167 { "ebcdic-de-273+euro", 1141, yes }, 168 { "ibm01141", 1141, yes }, 169 { "ccsid01142", 1142, yes }, 170 { "cp01142", 1142, yes }, 171 { "ebcdic-dk-277+euro", 1142, yes }, 172 { "ebcdic-no-277+euro", 1142, yes }, 173 { "ibm01142", 1142, yes }, 174 { "ccsid01143", 1143, yes }, 175 { "cp01143", 1143, yes }, 176 { "ebcdic-fi-278+euro", 1143, yes }, 177 { "ebcdic-se-278+euro", 1143, yes }, 178 { "ibm01143", 1143, yes }, 179 { "ccsid01144", 1144, yes }, 180 { "cp01144", 1144, yes }, 181 { "ebcdic-it-280+euro", 1144, yes }, 182 { "ibm01144", 1144, yes }, 183 { "ccsid01145", 1145, yes }, 184 { "cp01145", 1145, yes }, 185 { "ebcdic-es-284+euro", 1145, yes }, 186 { "ibm01145", 1145, yes }, 187 { "ccsid01146", 1146, yes }, 188 { "cp01146", 1146, yes }, 189 { "ebcdic-gb-285+euro", 1146, yes }, 190 { "ibm01146", 1146, yes }, 191 { "ccsid01147", 1147, yes }, 192 { "cp01147", 1147, yes }, 193 { "ebcdic-fr-297+euro", 1147, yes }, 194 { "ibm01147", 1147, yes }, 195 { "ccsid01148", 1148, yes }, 196 { "cp01148", 1148, yes }, 197 { "ebcdic-international-500+euro", 1148, yes }, 198 { "ibm01148", 1148, yes }, 199 { "ccsid01149", 1149, yes }, 200 { "cp01149", 1149, yes }, 201 { "ebcdic-is-871+euro", 1149, yes }, 202 { "ibm01149", 1149, yes }, 203 { "iso-10646-ucs-2", 1200, yes }, 204 { "ucs-2", 1200, yes }, 205 { "unicode", 1200, yes }, 206 { "utf-16", 1200, yes }, 207 { "utf-16le", 1200, yes }, 208 { "unicodefffe", 1201, yes }, 209 { "utf-16be", 1201, yes }, 210 { "windows-1250", 1250, yes }, 211 { "x-cp1250", 1250, yes }, 212 { "windows-1251", 1251, yes }, 213 { "x-cp1251", 1251, yes }, 214 { "windows-1252", 1252, yes }, 215 { "x-ansi", 1252, yes }, 216 { "windows-1253", 1253, yes }, 217 { "windows-1254", 1254, yes }, 218 { "windows-1255", 1255, yes }, 219 { "cp1256", 1256, yes }, 220 { "windows-1256", 1256, yes }, 221 { "windows-1257", 1257, yes }, 222 { "windows-1258", 1258, yes }, 223 { "johab", 1361, yes }, 224 { "macintosh", 10000, yes }, 225 { "x-mac-japanese", 10001, yes }, 226 { "x-mac-chinesetrad", 10002, yes }, 227 { "x-mac-korean", 10003, yes }, 228 { "x-mac-arabic", 10004, yes }, 229 { "x-mac-hebrew", 10005, yes }, 230 { "x-mac-greek", 10006, yes }, 231 { "x-mac-cyrillic", 10007, yes }, 232 { "x-mac-chinesesimp", 10008, yes }, 233 { "x-mac-romanian", 10010, yes }, 234 { "x-mac-ukrainian", 10017, yes }, 235 { "x-mac-thai", 10021, yes }, 236 { "x-mac-ce", 10029, yes }, 237 { "x-mac-icelandic", 10079, yes }, 238 { "x-mac-turkish", 10081, yes }, 239 { "x-mac-croatian", 10082, yes }, 240 { "x-chinese-cns", 20000, yes }, 241 { "x-cp20001", 20001, yes }, 242 { "x-chinese-eten", 20002, yes }, 243 { "x-cp20003", 20003, yes }, 244 { "x-cp20004", 20004, yes }, 245 { "x-cp20005", 20005, yes }, 246 { "irv", 20105, yes }, 247 { "x-ia5", 20105, yes }, 248 { "din_66003", 20106, yes }, 249 { "german", 20106, yes }, 250 { "x-ia5-german", 20106, yes }, 251 { "sen_850200_b", 20107, yes }, 252 { "swedish", 20107, yes }, 253 { "x-ia5-swedish", 20107, yes }, 254 { "norwegian", 20108, yes }, 255 { "ns_4551-1", 20108, yes }, 256 { "x-ia5-norwegian", 20108, yes }, 257 { "ansi_x3.4-1968", 20127, yes }, 258 { "ansi_x3.4-1986", 20127, yes }, 259 { "ascii", 20127, yes }, 260 { "cp367", 20127, yes }, 261 { "csascii", 20127, yes }, 262 { "ibm367", 20127, yes }, 263 { "iso-ir-6", 20127, yes }, 264 { "iso646-us", 20127, yes }, 265 { "iso_646.irv:1991", 20127, yes }, 266 { "us", 20127, yes }, 267 { "us-ascii", 20127, yes }, 268 { "x-cp20261", 20261, yes }, 269 { "x-cp20269", 20269, yes }, 270 { "cp273", 20273, yes }, 271 { "csibm273", 20273, yes }, 272 { "ibm273", 20273, yes }, 273 { "csibm277", 20277, yes }, 274 { "ebcdic-cp-dk", 20277, yes }, 275 { "ebcdic-cp-no", 20277, yes }, 276 { "ibm277", 20277, yes }, 277 { "cp278", 20278, yes }, 278 { "csibm278", 20278, yes }, 279 { "ebcdic-cp-fi", 20278, yes }, 280 { "ebcdic-cp-se", 20278, yes }, 281 { "ibm278", 20278, yes }, 282 { "cp280", 20280, yes }, 283 { "csibm280", 20280, yes }, 284 { "ebcdic-cp-it", 20280, yes }, 285 { "ibm280", 20280, yes }, 286 { "cp284", 20284, yes }, 287 { "csibm284", 20284, yes }, 288 { "ebcdic-cp-es", 20284, yes }, 289 { "ibm284", 20284, yes }, 290 { "cp285", 20285, yes }, 291 { "csibm285", 20285, yes }, 292 { "ebcdic-cp-gb", 20285, yes }, 293 { "ibm285", 20285, yes }, 294 { "cp290", 20290, yes }, 295 { "csibm290", 20290, yes }, 296 { "ebcdic-jp-kana", 20290, yes }, 297 { "ibm290", 20290, yes }, 298 { "cp297", 20297, yes }, 299 { "csibm297", 20297, yes }, 300 { "ebcdic-cp-fr", 20297, yes }, 301 { "ibm297", 20297, yes }, 302 { "cp420", 20420, yes }, 303 { "csibm420", 20420, yes }, 304 { "ebcdic-cp-ar1", 20420, yes }, 305 { "ibm420", 20420, yes }, 306 { "cp423", 20423, yes }, 307 { "csibm423", 20423, yes }, 308 { "ebcdic-cp-gr", 20423, yes }, 309 { "ibm423", 20423, yes }, 310 { "cp424", 20424, yes }, 311 { "csibm424", 20424, yes }, 312 { "ebcdic-cp-he", 20424, yes }, 313 { "ibm424", 20424, yes }, 314 { "x-ebcdic-koreanextended", 20833, yes }, 315 { "csibmthai", 20838, yes }, 316 { "ibm-thai", 20838, yes }, 317 { "cskoi8r", 20866, yes }, 318 { "koi", 20866, yes }, 319 { "koi8", 20866, yes }, 320 { "koi8-r", 20866, yes }, 321 { "koi8r", 20866, yes }, 322 { "cp871", 20871, yes }, 323 { "csibm871", 20871, yes }, 324 { "ebcdic-cp-is", 20871, yes }, 325 { "ibm871", 20871, yes }, 326 { "cp880", 20880, yes }, 327 { "csibm880", 20880, yes }, 328 { "ebcdic-cyrillic", 20880, yes }, 329 { "ibm880", 20880, yes }, 330 { "cp905", 20905, yes }, 331 { "csibm905", 20905, yes }, 332 { "ebcdic-cp-tr", 20905, yes }, 333 { "ibm905", 20905, yes }, 334 { "ccsid00924", 20924, yes }, 335 { "cp00924", 20924, yes }, 336 { "ebcdic-latin9--euro", 20924, yes }, 337 { "ibm00924", 20924, yes }, 338 { "x-cp20936", 20936, yes }, 339 { "x-cp20949", 20949, yes }, 340 { "cp1025", 21025, yes }, 341 { "x-cp21027", 21027, yes }, 342 { "koi8-ru", 21866, yes }, 343 { "koi8-u", 21866, yes }, 344 { "cp819", 28591, yes }, 345 { "csisolatin1", 28591, yes }, 346 { "ibm819", 28591, yes }, 347 { "iso-8859-1", 28591, yes }, 348 { "iso-ir-100", 28591, yes }, 349 { "iso8859-1", 28591, yes }, 350 { "iso_8859-1", 28591, yes }, 351 { "iso_8859-1:1987", 28591, yes }, 352 { "l1", 28591, yes }, 353 { "latin1", 28591, yes }, 354 { "csisolatin2", 28592, yes }, 355 { "iso-8859-2", 28592, yes }, 356 { "iso-ir-101", 28592, yes }, 357 { "iso8859-2", 28592, yes }, 358 { "iso_8859-2", 28592, yes }, 359 { "iso_8859-2:1987", 28592, yes }, 360 { "l2", 28592, yes }, 361 { "latin2", 28592, yes }, 362 { "csisolatin3", 28593, yes }, 363 { "iso-8859-3", 28593, yes }, 364 { "iso-ir-109", 28593, yes }, 365 { "iso_8859-3", 28593, yes }, 366 { "iso_8859-3:1988", 28593, yes }, 367 { "l3", 28593, yes }, 368 { "latin3", 28593, yes }, 369 { "csisolatin4", 28594, yes }, 370 { "iso-8859-4", 28594, yes }, 371 { "iso-ir-110", 28594, yes }, 372 { "iso_8859-4", 28594, yes }, 373 { "iso_8859-4:1988", 28594, yes }, 374 { "l4", 28594, yes }, 375 { "latin4", 28594, yes }, 376 { "csisolatincyrillic", 28595, yes }, 377 { "cyrillic", 28595, yes }, 378 { "iso-8859-5", 28595, yes }, 379 { "iso-ir-144", 28595, yes }, 380 { "iso_8859-5", 28595, yes }, 381 { "iso_8859-5:1988", 28595, yes }, 382 { "arabic", 28596, yes }, 383 { "csisolatinarabic", 28596, yes }, 384 { "ecma-114", 28596, yes }, 385 { "iso-8859-6", 28596, yes }, 386 { "iso-ir-127", 28596, yes }, 387 { "iso_8859-6", 28596, yes }, 388 { "iso_8859-6:1987", 28596, yes }, 389 { "csisolatingreek", 28597, yes }, 390 { "ecma-118", 28597, yes }, 391 { "elot_928", 28597, yes }, 392 { "greek", 28597, yes }, 393 { "greek8", 28597, yes }, 394 { "iso-8859-7", 28597, yes }, 395 { "iso-ir-126", 28597, yes }, 396 { "iso_8859-7", 28597, yes }, 397 { "iso_8859-7:1987", 28597, yes }, 398 { "csisolatinhebrew", 28598, yes }, 399 { "hebrew", 28598, yes }, 400 { "iso-8859-8", 28598, yes }, 401 { "iso-ir-138", 28598, yes }, 402 { "iso_8859-8", 28598, yes }, 403 { "iso_8859-8:1988", 28598, yes }, 404 { "logical", 28598, yes }, 405 { "visual", 28598, yes }, 406 { "csisolatin5", 28599, yes }, 407 { "iso-8859-9", 28599, yes }, 408 { "iso-ir-148", 28599, yes }, 409 { "iso_8859-9", 28599, yes }, 410 { "iso_8859-9:1989", 28599, yes }, 411 { "l5", 28599, yes }, 412 { "latin5", 28599, yes }, 413 { "iso-8859-13", 28603, yes }, 414 { "csisolatin9", 28605, yes }, 415 { "iso-8859-15", 28605, yes }, 416 { "iso_8859-15", 28605, yes }, 417 { "l9", 28605, yes }, 418 { "latin9", 28605, yes }, 419 { "x-europa", 29001, yes }, 420 { "iso-8859-8-i", 38598, yes }, 421 { "iso-2022-jp", 50220, no }, 422 { "csiso2022jp", 50221, no }, 423 { "csiso2022kr", 50225, no }, 424 { "iso-2022-kr", 50225, no }, 425 { "iso-2022-kr-7", 50225, no }, 426 { "iso-2022-kr-7bit", 50225, no }, 427 { "cp50227", 50227, no }, 428 { "x-cp50227", 50227, no }, 429 { "cp930", 50930, yes }, 430 { "x-ebcdic-japaneseanduscanada", 50931, yes }, 431 { "cp933", 50933, yes }, 432 { "cp935", 50935, yes }, 433 { "cp937", 50937, yes }, 434 { "cp939", 50939, yes }, 435 { "cseucpkdfmtjapanese", 51932, yes }, 436 { "euc-jp", 51932, yes }, 437 { "extended_unix_code_packed_format_for_japanese", 51932, yes }, 438 { "iso-2022-jpeuc", 51932, yes }, 439 { "x-euc", 51932, yes }, 440 { "x-euc-jp", 51932, yes }, 441 { "euc-cn", 51936, yes }, 442 { "x-euc-cn", 51936, yes }, 443 { "cseuckr", 51949, yes }, 444 { "euc-kr", 51949, yes }, 445 { "iso-2022-kr-8", 51949, yes }, 446 { "iso-2022-kr-8bit", 51949, yes }, 447 { "hz-gb-2312", 52936, no }, 448 { "gb18030", 54936, yes }, 449 { "x-iscii-de", 57002, yes }, 450 { "x-iscii-be", 57003, yes }, 451 { "x-iscii-ta", 57004, yes }, 452 { "x-iscii-te", 57005, yes }, 453 { "x-iscii-as", 57006, yes }, 454 { "x-iscii-or", 57007, yes }, 455 { "x-iscii-ka", 57008, yes }, 456 { "x-iscii-ma", 57009, yes }, 457 { "x-iscii-gu", 57010, yes }, 458 { "x-iscii-pa", 57011, yes }, 459 { "csunicode11utf7", 65000, no }, 460 { "unicode-1-1-utf-7", 65000, no }, 461 { "unicode-2-0-utf-7", 65000, no }, 462 { "utf-7", 65000, no }, 463 { "x-unicode-1-1-utf-7", 65000, no }, 464 { "x-unicode-2-0-utf-7", 65000, no }, 465 { "unicode-1-1-utf-8", 65001, yes }, 466 { "unicode-2-0-utf-8", 65001, yes }, 467 { "utf-8", 65001, yes }, 468 { "x-unicode-1-1-utf-8", 65001, yes }, 469 { "x-unicode-2-0-utf-8", 65001, yes }, 470 471 /* final entry */ 472 { NULL, 0, no } 473}; 474 475uint TY_(Win32MLangGetCPFromName)(ctmbstr encoding) 476{ 477 uint i; 478 tmbstr enc; 479 480 /* ensure name is in lower case */ 481 enc = TY_(tmbstrdup)(encoding); 482 enc = TY_(tmbstrtolower)(enc); 483 484 for (i = 0; NameWinCPMap[i].name; ++i) 485 { 486 if (TY_(tmbstrcmp)(NameWinCPMap[i].name, enc) == 0) 487 { 488 IMLangConvertCharset * p = NULL; 489 uint wincp = NameWinCPMap[i].wincp; 490 HRESULT hr; 491 492 MemFree(enc); 493 494 /* currently no support for unsafe encodings */ 495 if (!NameWinCPMap[i].safe) 496 return 0; 497 498 /* hack for config.c */ 499 CoInitialize(NULL); 500 hr = CreateMLangObject(p); 501 502 if (hr != S_OK || !p) 503 { 504 wincp = 0; 505 } 506 else 507 { 508 hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0); 509 510 if (hr != S_OK) 511 wincp = 0; 512 513 IMLangConvertCharset_Release(p); 514 p = NULL; 515 } 516 517 CoUninitialize(); 518 519 return wincp; 520 } 521 } 522 523 MemFree(enc); 524 return 0; 525} 526 527Bool TY_(Win32MLangInitInputTranscoder)(StreamIn * in, uint wincp) 528{ 529 IMLangConvertCharset * p = NULL; 530 HRESULT hr; 531 532 assert( in != NULL ); 533 534 CoInitialize(NULL); 535 536 if (wincp == 0) 537 { 538 /* no codepage found for this encoding */ 539 return no; 540 } 541 542 hr = CreateMLangObject(p); 543 544 if (hr != S_OK || !p) 545 { 546 /* MLang not supported */ 547 return no; 548 } 549 550 hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0); 551 552 if (hr != S_OK) 553 { 554 /* encoding not supported, insufficient memory, etc. */ 555 return no; 556 } 557 558 in->mlang = (ulong)p; 559 560 return yes; 561} 562 563void TY_(Win32MLangUninitInputTranscoder)(StreamIn * in) 564{ 565 IMLangConvertCharset * p; 566 567 assert( in != NULL ); 568 569 p = (IMLangConvertCharset *)in->mlang; 570 if (p) 571 { 572 IMLangConvertCharset_Release(p); 573 p = NULL; 574 in->mlang = (ulong)NULL; 575 } 576 577 CoUninitialize(); 578} 579 580Bool Win32MLangInitOutputTranscoder(StreamOut * out, tmbstr encoding) 581{ 582 IMLangConvertCharset * p = NULL; 583 HRESULT hr; 584 uint wincp; 585 586 assert( out != NULL ); 587 588 CoInitialize(NULL); 589 590 wincp = TY_(Win32MLangGetCPFromName)(encoding); 591 if (wincp == 0) 592 { 593 /* no codepage found for this encoding */ 594 return no; 595 } 596 597 hr = CreateMLangObject(p); 598 599 if (hr != S_OK || !p) 600 { 601 /* MLang not supported */ 602 return no; 603 } 604 605 IMLangConvertCharset_Initialize(p, 1200, wincp, MLCONVCHARF_NOBESTFITCHARS); 606 607 if (hr != S_OK) 608 { 609 /* encoding not supported, insufficient memory, etc. */ 610 return no; 611 } 612 613 out->mlang = (ulong)p; 614 615 return yes; 616} 617 618void Win32MLangUninitOutputTranscoder(StreamOut * out) 619{ 620 IMLangConvertCharset * p; 621 622 assert( out != NULL ); 623 624 p = (IMLangConvertCharset *)out->mlang; 625 if (p) 626 { 627 IMLangConvertCharset_Release(p); 628 p = NULL; 629 out->mlang = (ulong)NULL; 630 } 631 632 CoUninitialize(); 633} 634 635int TY_(Win32MLangGetChar)(byte firstByte, StreamIn * in, uint * bytesRead) 636{ 637 IMLangConvertCharset * p; 638 TidyInputSource * source; 639 CHAR inbuf[TC_INBUFSIZE] = { 0 }; 640 WCHAR outbuf[TC_OUTBUFSIZE] = { 0 }; 641 HRESULT hr = S_OK; 642 size_t inbufsize = 0; 643 644 assert( in != NULL ); 645 assert( &in->source != NULL ); 646 assert( bytesRead != NULL ); 647 assert( in->mlang != 0 ); 648 649 p = (IMLangConvertCharset *)in->mlang; 650 source = &in->source; 651 652 inbuf[inbufsize++] = (CHAR)firstByte; 653 654 while(inbufsize < TC_INBUFSIZE) 655 { 656 UINT outbufsize = TC_OUTBUFSIZE; 657 UINT readNow = inbufsize; 658 int nextByte = EndOfStream; 659 660 hr = IMLangConvertCharset_DoConversionToUnicode(p, inbuf, &readNow, outbuf, &outbufsize); 661 662 assert( hr == S_OK ); 663 assert( outbufsize <= 2 ); 664 665 if (outbufsize == 2) 666 { 667 /* U+10000-U+10FFFF are returned as a pair of surrogates */ 668 tchar m = (tchar)outbuf[0]; 669 tchar n = (tchar)outbuf[1]; 670 assert( IsHighSurrogate(n) && IsLowSurrogate(m) ); 671 *bytesRead = readNow; 672 return (int)CombineSurrogatePair(n, m); 673 } 674 675 if (outbufsize == 1) 676 { 677 /* we found the character */ 678 /* set bytesRead and return */ 679 *bytesRead = readNow; 680 return (int)outbuf[0]; 681 } 682 683 /* we need more bytes */ 684 nextByte = source->getByte(source->sourceData); 685 686 if (nextByte == EndOfStream) 687 { 688 /* todo: error message for broken stream? */ 689 690 *bytesRead = readNow; 691 return EndOfStream; 692 } 693 694 inbuf[inbufsize++] = (CHAR)nextByte; 695 } 696 697 /* No full character found after reading TC_INBUFSIZE bytes, */ 698 /* give up to read this stream, it's obviously unreadable. */ 699 700 /* todo: error message for broken stream? */ 701 return EndOfStream; 702} 703 704Bool Win32MLangIsConvertible(tchar c, StreamOut * out) 705{ 706 IMLangConvertCharset * p; 707 UINT i = 1; 708 HRESULT hr; 709 WCHAR inbuf[2] = { 0 }; 710 UINT inbufsize = 0; 711 712 assert( c != 0 ); 713 assert( c <= 0x10FFFF ); 714 assert( out != NULL ); 715 assert( out->mlang != 0 ); 716 717 if (c > 0xFFFF) 718 { 719 tchar high = 0; 720 tchar low = 0; 721 722 SplitSurrogatePair(c, &low, &high); 723 724 inbuf[inbufsize++] = (WCHAR)low; 725 inbuf[inbufsize++] = (WCHAR)high; 726 } 727 else 728 inbuf[inbufsize++] = (WCHAR)c; 729 730 p = (IMLangConvertCharset *)out->mlang; 731 hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, NULL, NULL); 732 733 return hr == S_OK ? yes : no; 734} 735 736void Win32MLangPutChar(tchar c, StreamOut * out, uint * bytesWritten) 737{ 738 IMLangConvertCharset * p; 739 TidyOutputSink * sink; 740 CHAR outbuf[TC_OUTBUFSIZE] = { 0 }; 741 UINT outbufsize = TC_OUTBUFSIZE; 742 HRESULT hr = S_OK; 743 WCHAR inbuf[2] = { 0 }; 744 UINT inbufsize = 0; 745 uint i; 746 747 assert( c != 0 ); 748 assert( c <= 0x10FFFF ); 749 assert( bytesWritten != NULL ); 750 assert( out != NULL ); 751 assert( &out->sink != NULL ); 752 assert( out->mlang != 0 ); 753 754 p = (IMLangConvertCharset *)out->mlang; 755 sink = &out->sink; 756 757 if (c > 0xFFFF) 758 { 759 tchar high = 0; 760 tchar low = 0; 761 762 SplitSurrogatePair(c, &low, &high); 763 764 inbuf[inbufsize++] = (WCHAR)low; 765 inbuf[inbufsize++] = (WCHAR)high; 766 } 767 else 768 inbuf[inbufsize++] = (WCHAR)c; 769 770 hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, outbuf, &outbufsize); 771 772 assert( hr == S_OK ); 773 assert( outbufsize > 0 ); 774 assert( inbufsize == 1 || inbufsize == 2 ); 775 776 for (i = 0; i < outbufsize; ++i) 777 sink->putByte(sink->sinkData, (byte)(outbuf[i])); 778 779 *bytesWritten = outbufsize; 780 781 return; 782} 783 784#endif /* TIDY_WIN32_MLANG_SUPPORT */ 785 786/* 787 * local variables: 788 * mode: c 789 * indent-tabs-mode: nil 790 * c-basic-offset: 4 791 * eval: (c-set-offset 'substatement-open 0) 792 * end: 793 */ 794