1/* win32tc.c -- Interface to Win32 transcoding routines
2
3  (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
4  See tidy.h for the copyright notice.
5
6  $Id$
7*/
8
9/* keep these here to keep file non-empty */
10#include <tidy.h>
11#include "forward.h"
12#include "streamio.h"
13#include "tmbstr.h"
14#include "utf8.h"
15
16#ifdef TIDY_WIN32_MLANG_SUPPORT
17
18#define VC_EXTRALEAN
19#define CINTERFACE
20#define COBJMACROS
21
22#include <windows.h>
23#include <mlang.h>
24
25#undef COBJMACROS
26#undef CINTERFACE
27#undef VC_EXTRALEAN
28
29/* maximum number of bytes for a single character */
30#define TC_INBUFSIZE  16
31
32/* maximum number of characters per byte sequence */
33#define TC_OUTBUFSIZE 16
34
/* Creates an MLang charset converter object via CoCreateInstance and   */
/* stores its IMLangConvertCharset interface pointer in the lvalue p.   */
/* Expands to a single expression yielding the HRESULT of the call, so  */
/* it is used as: hr = CreateMLangObject(p);                            */
/* The expansion carries no trailing semicolon (the caller supplies it) */
/* and parenthesizes its argument, so it is safe in any statement form. */
#define CreateMLangObject(p) \
  CoCreateInstance( \
        &CLSID_CMLangConvertCharset, \
        NULL, \
        CLSCTX_ALL, \
        &IID_IMLangConvertCharset, \
        (VOID **)&(p))
42
43
44/* Character Set to Microsoft Windows Codepage Identifier map,     */
45/* from <rotor/sscli/clr/src/classlibnative/nls/encodingdata.cpp>. */
46
/* note: the 'safe' field indicates whether this encoding can be   */
/* read/written character-by-character; this does not apply to     */
/* various stateful encodings such as ISO-2022 or UTF-7, these     */
/* must be read/written as a complete stream. It is possible that  */
/* some 'unsafe' encodings are marked as 'safe'.                   */
52
53/* todo: cleanup; Tidy should use only a single mapping table to   */
54/* circumvent unsupported aliases in other transcoding libraries,  */
55/* enable reverse lookup of encoding names and ease maintenance.   */
56
57static struct _nameWinCPMap
58{
59    tmbstr name;
60    uint wincp;
61    Bool safe;
62} const NameWinCPMap[] = {
63  { "cp037",                                            37, yes },
64  { "csibm037",                                         37, yes },
65  { "ebcdic-cp-ca",                                     37, yes },
66  { "ebcdic-cp-nl",                                     37, yes },
67  { "ebcdic-cp-us",                                     37, yes },
68  { "ebcdic-cp-wt",                                     37, yes },
69  { "ibm037",                                           37, yes },
70  { "cp437",                                           437, yes },
71  { "cspc8codepage437",                                437, yes },
72  { "ibm437",                                          437, yes },
73  { "cp500",                                           500, yes },
74  { "csibm500",                                        500, yes },
75  { "ebcdic-cp-be",                                    500, yes },
76  { "ebcdic-cp-ch",                                    500, yes },
77  { "ibm500",                                          500, yes },
78  { "asmo-708",                                        708, yes },
79  { "dos-720",                                         720, yes },
80  { "ibm737",                                          737, yes },
81  { "ibm775",                                          775, yes },
82  { "cp850",                                           850, yes },
83  { "ibm850",                                          850, yes },
84  { "cp852",                                           852, yes },
85  { "ibm852",                                          852, yes },
86  { "cp855",                                           855, yes },
87  { "ibm855",                                          855, yes },
88  { "cp857",                                           857, yes },
89  { "ibm857",                                          857, yes },
90  { "ccsid00858",                                      858, yes },
91  { "cp00858",                                         858, yes },
92  { "cp858",                                           858, yes },
93  { "ibm00858",                                        858, yes },
94  { "pc-multilingual-850+euro",                        858, yes },
95  { "cp860",                                           860, yes },
96  { "ibm860",                                          860, yes },
97  { "cp861",                                           861, yes },
98  { "ibm861",                                          861, yes },
99  { "cp862",                                           862, yes },
100  { "dos-862",                                         862, yes },
101  { "ibm862",                                          862, yes },
102  { "cp863",                                           863, yes },
103  { "ibm863",                                          863, yes },
104  { "cp864",                                           864, yes },
105  { "ibm864",                                          864, yes },
106  { "cp865",                                           865, yes },
107  { "ibm865",                                          865, yes },
108  { "cp866",                                           866, yes },
109  { "ibm866",                                          866, yes },
110  { "cp869",                                           869, yes },
111  { "ibm869",                                          869, yes },
112  { "cp870",                                           870, yes },
113  { "csibm870",                                        870, yes },
114  { "ebcdic-cp-roece",                                 870, yes },
115  { "ebcdic-cp-yu",                                    870, yes },
116  { "ibm870",                                          870, yes },
117  { "dos-874",                                         874, yes },
118  { "iso-8859-11",                                     874, yes },
119  { "tis-620",                                         874, yes },
120  { "windows-874",                                     874, yes },
121  { "cp875",                                           875, yes },
122  { "csshiftjis",                                      932, yes },
123  { "cswindows31j",                                    932, yes },
124  { "ms_kanji",                                        932, yes },
125  { "shift-jis",                                       932, yes },
126  { "shift_jis",                                       932, yes },
127  { "sjis",                                            932, yes },
128  { "x-ms-cp932",                                      932, yes },
129  { "x-sjis",                                          932, yes },
130  { "chinese",                                         936, yes },
131  { "cn-gb",                                           936, yes },
132  { "csgb2312",                                        936, yes },
133  { "csgb231280",                                      936, yes },
134  { "csiso58gb231280",                                 936, yes },
135  { "gb2312",                                          936, yes },
136  { "gb2312-80",                                       936, yes },
137  { "gb231280",                                        936, yes },
138  { "gb_2312-80",                                      936, yes },
139  { "gbk",                                             936, yes },
140  { "iso-ir-58",                                       936, yes },
141  { "csksc56011987",                                   949, yes },
142  { "iso-ir-149",                                      949, yes },
143  { "korean",                                          949, yes },
144  { "ks-c-5601",                                       949, yes },
145  { "ks-c5601",                                        949, yes },
146  { "ks_c_5601",                                       949, yes },
147  { "ks_c_5601-1987",                                  949, yes },
148  { "ks_c_5601-1989",                                  949, yes },
149  { "ks_c_5601_1987",                                  949, yes },
150  { "ksc5601",                                         949, yes },
151  { "ksc_5601",                                        949, yes },
152  { "big5",                                            950, yes },
153  { "big5-hkscs",                                      950, yes },
154  { "cn-big5",                                         950, yes },
155  { "csbig5",                                          950, yes },
156  { "x-x-big5",                                        950, yes },
157  { "cp1026",                                         1026, yes },
158  { "csibm1026",                                      1026, yes },
159  { "ibm1026",                                        1026, yes },
160  { "ibm01047",                                       1047, yes },
161  { "ccsid01140",                                     1140, yes },
162  { "cp01140",                                        1140, yes },
163  { "ebcdic-us-37+euro",                              1140, yes },
164  { "ibm01140",                                       1140, yes },
165  { "ccsid01141",                                     1141, yes },
166  { "cp01141",                                        1141, yes },
167  { "ebcdic-de-273+euro",                             1141, yes },
168  { "ibm01141",                                       1141, yes },
169  { "ccsid01142",                                     1142, yes },
170  { "cp01142",                                        1142, yes },
171  { "ebcdic-dk-277+euro",                             1142, yes },
172  { "ebcdic-no-277+euro",                             1142, yes },
173  { "ibm01142",                                       1142, yes },
174  { "ccsid01143",                                     1143, yes },
175  { "cp01143",                                        1143, yes },
176  { "ebcdic-fi-278+euro",                             1143, yes },
177  { "ebcdic-se-278+euro",                             1143, yes },
178  { "ibm01143",                                       1143, yes },
179  { "ccsid01144",                                     1144, yes },
180  { "cp01144",                                        1144, yes },
181  { "ebcdic-it-280+euro",                             1144, yes },
182  { "ibm01144",                                       1144, yes },
183  { "ccsid01145",                                     1145, yes },
184  { "cp01145",                                        1145, yes },
185  { "ebcdic-es-284+euro",                             1145, yes },
186  { "ibm01145",                                       1145, yes },
187  { "ccsid01146",                                     1146, yes },
188  { "cp01146",                                        1146, yes },
189  { "ebcdic-gb-285+euro",                             1146, yes },
190  { "ibm01146",                                       1146, yes },
191  { "ccsid01147",                                     1147, yes },
192  { "cp01147",                                        1147, yes },
193  { "ebcdic-fr-297+euro",                             1147, yes },
194  { "ibm01147",                                       1147, yes },
195  { "ccsid01148",                                     1148, yes },
196  { "cp01148",                                        1148, yes },
197  { "ebcdic-international-500+euro",                  1148, yes },
198  { "ibm01148",                                       1148, yes },
199  { "ccsid01149",                                     1149, yes },
200  { "cp01149",                                        1149, yes },
201  { "ebcdic-is-871+euro",                             1149, yes },
202  { "ibm01149",                                       1149, yes },
203  { "iso-10646-ucs-2",                                1200, yes },
204  { "ucs-2",                                          1200, yes },
205  { "unicode",                                        1200, yes },
206  { "utf-16",                                         1200, yes },
207  { "utf-16le",                                       1200, yes },
208  { "unicodefffe",                                    1201, yes },
209  { "utf-16be",                                       1201, yes },
210  { "windows-1250",                                   1250, yes },
211  { "x-cp1250",                                       1250, yes },
212  { "windows-1251",                                   1251, yes },
213  { "x-cp1251",                                       1251, yes },
214  { "windows-1252",                                   1252, yes },
215  { "x-ansi",                                         1252, yes },
216  { "windows-1253",                                   1253, yes },
217  { "windows-1254",                                   1254, yes },
218  { "windows-1255",                                   1255, yes },
219  { "cp1256",                                         1256, yes },
220  { "windows-1256",                                   1256, yes },
221  { "windows-1257",                                   1257, yes },
222  { "windows-1258",                                   1258, yes },
223  { "johab",                                          1361, yes },
224  { "macintosh",                                     10000, yes },
225  { "x-mac-japanese",                                10001, yes },
226  { "x-mac-chinesetrad",                             10002, yes },
227  { "x-mac-korean",                                  10003, yes },
228  { "x-mac-arabic",                                  10004, yes },
229  { "x-mac-hebrew",                                  10005, yes },
230  { "x-mac-greek",                                   10006, yes },
231  { "x-mac-cyrillic",                                10007, yes },
232  { "x-mac-chinesesimp",                             10008, yes },
233  { "x-mac-romanian",                                10010, yes },
234  { "x-mac-ukrainian",                               10017, yes },
235  { "x-mac-thai",                                    10021, yes },
236  { "x-mac-ce",                                      10029, yes },
237  { "x-mac-icelandic",                               10079, yes },
238  { "x-mac-turkish",                                 10081, yes },
239  { "x-mac-croatian",                                10082, yes },
240  { "x-chinese-cns",                                 20000, yes },
241  { "x-cp20001",                                     20001, yes },
242  { "x-chinese-eten",                                20002, yes },
243  { "x-cp20003",                                     20003, yes },
244  { "x-cp20004",                                     20004, yes },
245  { "x-cp20005",                                     20005, yes },
246  { "irv",                                           20105, yes },
247  { "x-ia5",                                         20105, yes },
248  { "din_66003",                                     20106, yes },
249  { "german",                                        20106, yes },
250  { "x-ia5-german",                                  20106, yes },
251  { "sen_850200_b",                                  20107, yes },
252  { "swedish",                                       20107, yes },
253  { "x-ia5-swedish",                                 20107, yes },
254  { "norwegian",                                     20108, yes },
255  { "ns_4551-1",                                     20108, yes },
256  { "x-ia5-norwegian",                               20108, yes },
257  { "ansi_x3.4-1968",                                20127, yes },
258  { "ansi_x3.4-1986",                                20127, yes },
259  { "ascii",                                         20127, yes },
260  { "cp367",                                         20127, yes },
261  { "csascii",                                       20127, yes },
262  { "ibm367",                                        20127, yes },
263  { "iso-ir-6",                                      20127, yes },
264  { "iso646-us",                                     20127, yes },
265  { "iso_646.irv:1991",                              20127, yes },
266  { "us",                                            20127, yes },
267  { "us-ascii",                                      20127, yes },
268  { "x-cp20261",                                     20261, yes },
269  { "x-cp20269",                                     20269, yes },
270  { "cp273",                                         20273, yes },
271  { "csibm273",                                      20273, yes },
272  { "ibm273",                                        20273, yes },
273  { "csibm277",                                      20277, yes },
274  { "ebcdic-cp-dk",                                  20277, yes },
275  { "ebcdic-cp-no",                                  20277, yes },
276  { "ibm277",                                        20277, yes },
277  { "cp278",                                         20278, yes },
278  { "csibm278",                                      20278, yes },
279  { "ebcdic-cp-fi",                                  20278, yes },
280  { "ebcdic-cp-se",                                  20278, yes },
281  { "ibm278",                                        20278, yes },
282  { "cp280",                                         20280, yes },
283  { "csibm280",                                      20280, yes },
284  { "ebcdic-cp-it",                                  20280, yes },
285  { "ibm280",                                        20280, yes },
286  { "cp284",                                         20284, yes },
287  { "csibm284",                                      20284, yes },
288  { "ebcdic-cp-es",                                  20284, yes },
289  { "ibm284",                                        20284, yes },
290  { "cp285",                                         20285, yes },
291  { "csibm285",                                      20285, yes },
292  { "ebcdic-cp-gb",                                  20285, yes },
293  { "ibm285",                                        20285, yes },
294  { "cp290",                                         20290, yes },
295  { "csibm290",                                      20290, yes },
296  { "ebcdic-jp-kana",                                20290, yes },
297  { "ibm290",                                        20290, yes },
298  { "cp297",                                         20297, yes },
299  { "csibm297",                                      20297, yes },
300  { "ebcdic-cp-fr",                                  20297, yes },
301  { "ibm297",                                        20297, yes },
302  { "cp420",                                         20420, yes },
303  { "csibm420",                                      20420, yes },
304  { "ebcdic-cp-ar1",                                 20420, yes },
305  { "ibm420",                                        20420, yes },
306  { "cp423",                                         20423, yes },
307  { "csibm423",                                      20423, yes },
308  { "ebcdic-cp-gr",                                  20423, yes },
309  { "ibm423",                                        20423, yes },
310  { "cp424",                                         20424, yes },
311  { "csibm424",                                      20424, yes },
312  { "ebcdic-cp-he",                                  20424, yes },
313  { "ibm424",                                        20424, yes },
314  { "x-ebcdic-koreanextended",                       20833, yes },
315  { "csibmthai",                                     20838, yes },
316  { "ibm-thai",                                      20838, yes },
317  { "cskoi8r",                                       20866, yes },
318  { "koi",                                           20866, yes },
319  { "koi8",                                          20866, yes },
320  { "koi8-r",                                        20866, yes },
321  { "koi8r",                                         20866, yes },
322  { "cp871",                                         20871, yes },
323  { "csibm871",                                      20871, yes },
324  { "ebcdic-cp-is",                                  20871, yes },
325  { "ibm871",                                        20871, yes },
326  { "cp880",                                         20880, yes },
327  { "csibm880",                                      20880, yes },
328  { "ebcdic-cyrillic",                               20880, yes },
329  { "ibm880",                                        20880, yes },
330  { "cp905",                                         20905, yes },
331  { "csibm905",                                      20905, yes },
332  { "ebcdic-cp-tr",                                  20905, yes },
333  { "ibm905",                                        20905, yes },
334  { "ccsid00924",                                    20924, yes },
335  { "cp00924",                                       20924, yes },
336  { "ebcdic-latin9--euro",                           20924, yes },
337  { "ibm00924",                                      20924, yes },
338  { "x-cp20936",                                     20936, yes },
339  { "x-cp20949",                                     20949, yes },
340  { "cp1025",                                        21025, yes },
341  { "x-cp21027",                                     21027, yes },
342  { "koi8-ru",                                       21866, yes },
343  { "koi8-u",                                        21866, yes },
344  { "cp819",                                         28591, yes },
345  { "csisolatin1",                                   28591, yes },
346  { "ibm819",                                        28591, yes },
347  { "iso-8859-1",                                    28591, yes },
348  { "iso-ir-100",                                    28591, yes },
349  { "iso8859-1",                                     28591, yes },
350  { "iso_8859-1",                                    28591, yes },
351  { "iso_8859-1:1987",                               28591, yes },
352  { "l1",                                            28591, yes },
353  { "latin1",                                        28591, yes },
354  { "csisolatin2",                                   28592, yes },
355  { "iso-8859-2",                                    28592, yes },
356  { "iso-ir-101",                                    28592, yes },
357  { "iso8859-2",                                     28592, yes },
358  { "iso_8859-2",                                    28592, yes },
359  { "iso_8859-2:1987",                               28592, yes },
360  { "l2",                                            28592, yes },
361  { "latin2",                                        28592, yes },
362  { "csisolatin3",                                   28593, yes },
363  { "iso-8859-3",                                    28593, yes },
364  { "iso-ir-109",                                    28593, yes },
365  { "iso_8859-3",                                    28593, yes },
366  { "iso_8859-3:1988",                               28593, yes },
367  { "l3",                                            28593, yes },
368  { "latin3",                                        28593, yes },
369  { "csisolatin4",                                   28594, yes },
370  { "iso-8859-4",                                    28594, yes },
371  { "iso-ir-110",                                    28594, yes },
372  { "iso_8859-4",                                    28594, yes },
373  { "iso_8859-4:1988",                               28594, yes },
374  { "l4",                                            28594, yes },
375  { "latin4",                                        28594, yes },
376  { "csisolatincyrillic",                            28595, yes },
377  { "cyrillic",                                      28595, yes },
378  { "iso-8859-5",                                    28595, yes },
379  { "iso-ir-144",                                    28595, yes },
380  { "iso_8859-5",                                    28595, yes },
381  { "iso_8859-5:1988",                               28595, yes },
382  { "arabic",                                        28596, yes },
383  { "csisolatinarabic",                              28596, yes },
384  { "ecma-114",                                      28596, yes },
385  { "iso-8859-6",                                    28596, yes },
386  { "iso-ir-127",                                    28596, yes },
387  { "iso_8859-6",                                    28596, yes },
388  { "iso_8859-6:1987",                               28596, yes },
389  { "csisolatingreek",                               28597, yes },
390  { "ecma-118",                                      28597, yes },
391  { "elot_928",                                      28597, yes },
392  { "greek",                                         28597, yes },
393  { "greek8",                                        28597, yes },
394  { "iso-8859-7",                                    28597, yes },
395  { "iso-ir-126",                                    28597, yes },
396  { "iso_8859-7",                                    28597, yes },
397  { "iso_8859-7:1987",                               28597, yes },
398  { "csisolatinhebrew",                              28598, yes },
399  { "hebrew",                                        28598, yes },
400  { "iso-8859-8",                                    28598, yes },
401  { "iso-ir-138",                                    28598, yes },
402  { "iso_8859-8",                                    28598, yes },
403  { "iso_8859-8:1988",                               28598, yes },
404  { "logical",                                       28598, yes },
405  { "visual",                                        28598, yes },
406  { "csisolatin5",                                   28599, yes },
407  { "iso-8859-9",                                    28599, yes },
408  { "iso-ir-148",                                    28599, yes },
409  { "iso_8859-9",                                    28599, yes },
410  { "iso_8859-9:1989",                               28599, yes },
411  { "l5",                                            28599, yes },
412  { "latin5",                                        28599, yes },
413  { "iso-8859-13",                                   28603, yes },
414  { "csisolatin9",                                   28605, yes },
415  { "iso-8859-15",                                   28605, yes },
416  { "iso_8859-15",                                   28605, yes },
417  { "l9",                                            28605, yes },
418  { "latin9",                                        28605, yes },
419  { "x-europa",                                      29001, yes },
420  { "iso-8859-8-i",                                  38598, yes },
421  { "iso-2022-jp",                                   50220,  no },
422  { "csiso2022jp",                                   50221,  no },
423  { "csiso2022kr",                                   50225,  no },
424  { "iso-2022-kr",                                   50225,  no },
425  { "iso-2022-kr-7",                                 50225,  no },
426  { "iso-2022-kr-7bit",                              50225,  no },
427  { "cp50227",                                       50227,  no },
428  { "x-cp50227",                                     50227,  no },
429  { "cp930",                                         50930, yes },
430  { "x-ebcdic-japaneseanduscanada",                  50931, yes },
431  { "cp933",                                         50933, yes },
432  { "cp935",                                         50935, yes },
433  { "cp937",                                         50937, yes },
434  { "cp939",                                         50939, yes },
435  { "cseucpkdfmtjapanese",                           51932, yes },
436  { "euc-jp",                                        51932, yes },
437  { "extended_unix_code_packed_format_for_japanese", 51932, yes },
438  { "iso-2022-jpeuc",                                51932, yes },
439  { "x-euc",                                         51932, yes },
440  { "x-euc-jp",                                      51932, yes },
441  { "euc-cn",                                        51936, yes },
442  { "x-euc-cn",                                      51936, yes },
443  { "cseuckr",                                       51949, yes },
444  { "euc-kr",                                        51949, yes },
445  { "iso-2022-kr-8",                                 51949, yes },
446  { "iso-2022-kr-8bit",                              51949, yes },
447  { "hz-gb-2312",                                    52936,  no },
448  { "gb18030",                                       54936, yes },
449  { "x-iscii-de",                                    57002, yes },
450  { "x-iscii-be",                                    57003, yes },
451  { "x-iscii-ta",                                    57004, yes },
452  { "x-iscii-te",                                    57005, yes },
453  { "x-iscii-as",                                    57006, yes },
454  { "x-iscii-or",                                    57007, yes },
455  { "x-iscii-ka",                                    57008, yes },
456  { "x-iscii-ma",                                    57009, yes },
457  { "x-iscii-gu",                                    57010, yes },
458  { "x-iscii-pa",                                    57011, yes },
459  { "csunicode11utf7",                               65000,  no },
460  { "unicode-1-1-utf-7",                             65000,  no },
461  { "unicode-2-0-utf-7",                             65000,  no },
462  { "utf-7",                                         65000,  no },
463  { "x-unicode-1-1-utf-7",                           65000,  no },
464  { "x-unicode-2-0-utf-7",                           65000,  no },
465  { "unicode-1-1-utf-8",                             65001, yes },
466  { "unicode-2-0-utf-8",                             65001, yes },
467  { "utf-8",                                         65001, yes },
468  { "x-unicode-1-1-utf-8",                           65001, yes },
469  { "x-unicode-2-0-utf-8",                           65001, yes },
470
471  /* final entry */
472  { NULL,                                                0,  no }
473};
474
475uint TY_(Win32MLangGetCPFromName)(ctmbstr encoding)
476{
477    uint i;
478    tmbstr enc;
479
480    /* ensure name is in lower case */
481    enc = TY_(tmbstrdup)(encoding);
482    enc = TY_(tmbstrtolower)(enc);
483
484    for (i = 0; NameWinCPMap[i].name; ++i)
485    {
486        if (TY_(tmbstrcmp)(NameWinCPMap[i].name, enc) == 0)
487        {
488            IMLangConvertCharset * p = NULL;
489            uint wincp = NameWinCPMap[i].wincp;
490            HRESULT hr;
491
492            MemFree(enc);
493
494            /* currently no support for unsafe encodings */
495            if (!NameWinCPMap[i].safe)
496                return 0;
497
498            /* hack for config.c */
499            CoInitialize(NULL);
500            hr = CreateMLangObject(p);
501
502            if (hr != S_OK || !p)
503            {
504                wincp = 0;
505            }
506            else
507            {
508                hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
509
510                if (hr != S_OK)
511                    wincp = 0;
512
513                IMLangConvertCharset_Release(p);
514                p = NULL;
515            }
516
517            CoUninitialize();
518
519            return wincp;
520        }
521    }
522
523    MemFree(enc);
524    return 0;
525}
526
527Bool TY_(Win32MLangInitInputTranscoder)(StreamIn * in, uint wincp)
528{
529    IMLangConvertCharset * p = NULL;
530    HRESULT hr;
531
532    assert( in != NULL );
533
534    CoInitialize(NULL);
535
536    if (wincp == 0)
537    {
538        /* no codepage found for this encoding */
539        return no;
540    }
541
542    hr = CreateMLangObject(p);
543
544    if (hr != S_OK || !p)
545    {
546        /* MLang not supported */
547        return no;
548    }
549
550    hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
551
552    if (hr != S_OK)
553    {
554        /* encoding not supported, insufficient memory, etc. */
555        return no;
556    }
557
558    in->mlang = (ulong)p;
559
560    return yes;
561}
562
563void TY_(Win32MLangUninitInputTranscoder)(StreamIn * in)
564{
565    IMLangConvertCharset * p;
566
567    assert( in != NULL );
568
569    p = (IMLangConvertCharset *)in->mlang;
570    if (p)
571    {
572        IMLangConvertCharset_Release(p);
573        p = NULL;
574        in->mlang = (ulong)NULL;
575    }
576
577    CoUninitialize();
578}
579
580Bool Win32MLangInitOutputTranscoder(StreamOut * out, tmbstr encoding)
581{
582    IMLangConvertCharset * p = NULL;
583    HRESULT hr;
584    uint wincp;
585
586    assert( out != NULL );
587
588    CoInitialize(NULL);
589
590    wincp = TY_(Win32MLangGetCPFromName)(encoding);
591    if (wincp == 0)
592    {
593        /* no codepage found for this encoding */
594        return no;
595    }
596
597    hr = CreateMLangObject(p);
598
599    if (hr != S_OK || !p)
600    {
601        /* MLang not supported */
602        return no;
603    }
604
605    IMLangConvertCharset_Initialize(p, 1200, wincp, MLCONVCHARF_NOBESTFITCHARS);
606
607    if (hr != S_OK)
608    {
609        /* encoding not supported, insufficient memory, etc. */
610        return no;
611    }
612
613    out->mlang = (ulong)p;
614
615    return yes;
616}
617
618void Win32MLangUninitOutputTranscoder(StreamOut * out)
619{
620    IMLangConvertCharset * p;
621
622    assert( out != NULL );
623
624    p = (IMLangConvertCharset *)out->mlang;
625    if (p)
626    {
627        IMLangConvertCharset_Release(p);
628        p = NULL;
629        out->mlang = (ulong)NULL;
630    }
631
632    CoUninitialize();
633}
634
635int TY_(Win32MLangGetChar)(byte firstByte, StreamIn * in, uint * bytesRead)
636{
637    IMLangConvertCharset * p;
638    TidyInputSource * source;
639    CHAR inbuf[TC_INBUFSIZE] = { 0 };
640    WCHAR outbuf[TC_OUTBUFSIZE] = { 0 };
641    HRESULT hr = S_OK;
642    size_t inbufsize = 0;
643
644    assert( in != NULL );
645    assert( &in->source != NULL );
646    assert( bytesRead != NULL );
647    assert( in->mlang != 0 );
648
649    p = (IMLangConvertCharset *)in->mlang;
650    source = &in->source;
651
652    inbuf[inbufsize++] = (CHAR)firstByte;
653
654    while(inbufsize < TC_INBUFSIZE)
655    {
656        UINT outbufsize = TC_OUTBUFSIZE;
657        UINT readNow = inbufsize;
658        int nextByte = EndOfStream;
659
660        hr = IMLangConvertCharset_DoConversionToUnicode(p, inbuf, &readNow, outbuf, &outbufsize);
661
662        assert( hr == S_OK );
663        assert( outbufsize <= 2 );
664
665        if (outbufsize == 2)
666        {
667            /* U+10000-U+10FFFF are returned as a pair of surrogates */
668            tchar m = (tchar)outbuf[0];
669            tchar n = (tchar)outbuf[1];
670            assert( IsHighSurrogate(n) && IsLowSurrogate(m) );
671            *bytesRead = readNow;
672            return (int)CombineSurrogatePair(n, m);
673        }
674
675        if (outbufsize == 1)
676        {
677            /* we found the character   */
678            /* set bytesRead and return */
679            *bytesRead = readNow;
680            return (int)outbuf[0];
681        }
682
683        /* we need more bytes */
684        nextByte = source->getByte(source->sourceData);
685
686        if (nextByte == EndOfStream)
687        {
688            /* todo: error message for broken stream? */
689
690            *bytesRead = readNow;
691            return EndOfStream;
692        }
693
694        inbuf[inbufsize++] = (CHAR)nextByte;
695    }
696
697    /* No full character found after reading TC_INBUFSIZE bytes, */
698    /* give up to read this stream, it's obviously unreadable.   */
699
700    /* todo: error message for broken stream? */
701    return EndOfStream;
702}
703
704Bool Win32MLangIsConvertible(tchar c, StreamOut * out)
705{
706    IMLangConvertCharset * p;
707    UINT i = 1;
708    HRESULT hr;
709    WCHAR inbuf[2] = { 0 };
710    UINT inbufsize = 0;
711
712    assert( c != 0 );
713    assert( c <= 0x10FFFF );
714    assert( out != NULL );
715    assert( out->mlang != 0 );
716
717    if (c > 0xFFFF)
718    {
719        tchar high = 0;
720        tchar low = 0;
721
722        SplitSurrogatePair(c, &low, &high);
723
724        inbuf[inbufsize++] = (WCHAR)low;
725        inbuf[inbufsize++] = (WCHAR)high;
726    }
727    else
728        inbuf[inbufsize++] = (WCHAR)c;
729
730    p = (IMLangConvertCharset *)out->mlang;
731    hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, NULL, NULL);
732
733    return hr == S_OK ? yes : no;
734}
735
736void Win32MLangPutChar(tchar c, StreamOut * out, uint * bytesWritten)
737{
738    IMLangConvertCharset * p;
739    TidyOutputSink * sink;
740    CHAR outbuf[TC_OUTBUFSIZE] = { 0 };
741    UINT outbufsize = TC_OUTBUFSIZE;
742    HRESULT hr = S_OK;
743    WCHAR inbuf[2] = { 0 };
744    UINT inbufsize = 0;
745    uint i;
746
747    assert( c != 0 );
748    assert( c <= 0x10FFFF );
749    assert( bytesWritten != NULL );
750    assert( out != NULL );
751    assert( &out->sink != NULL );
752    assert( out->mlang != 0 );
753
754    p = (IMLangConvertCharset *)out->mlang;
755    sink = &out->sink;
756
757    if (c > 0xFFFF)
758    {
759        tchar high = 0;
760        tchar low = 0;
761
762        SplitSurrogatePair(c, &low, &high);
763
764        inbuf[inbufsize++] = (WCHAR)low;
765        inbuf[inbufsize++] = (WCHAR)high;
766    }
767    else
768        inbuf[inbufsize++] = (WCHAR)c;
769
770    hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, outbuf, &outbufsize);
771
772    assert( hr == S_OK );
773    assert( outbufsize > 0 );
774    assert( inbufsize == 1 || inbufsize == 2 );
775
776    for (i = 0; i < outbufsize; ++i)
777        sink->putByte(sink->sinkData, (byte)(outbuf[i]));
778
779    *bytesWritten = outbufsize;
780
781    return;
782}
783
784#endif /* TIDY_WIN32_MLANG_SUPPORT */
785
786/*
787 * local variables:
788 * mode: c
789 * indent-tabs-mode: nil
790 * c-basic-offset: 4
791 * eval: (c-set-offset 'substatement-open 0)
792 * end:
793 */
794