1#include <string.h>
2#include <Catalog.h>
3#include <Locale.h>
4#include <CharacterSet.h>
5#include <Debug.h>
6#include "character_sets.h"
7
8#undef B_TRANSLATION_CONTEXT
9#define B_TRANSLATION_CONTEXT "textencodings"
10
11namespace BPrivate {
12
13/**
14 * These variables are used in defining the character_sets_by_id array below.
15 * @see http://www.iana.org/assignments/character-sets
16 * @see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
17 * @see http://www.openi18n.org/subgroups/sa/locnameguide/final/CodesetAliasTable.html
18 **/
19
20static const char * unicodeAliases[] = {
21	// IANA aliases
22	// java aliases
23	"UTF8", "unicode-1-1-utf-8",
24	NULL
25};
26static const BCharacterSet unicode(0,106, B_TRANSLATE("Unicode"),
27	"UTF-8", "UTF-8",unicodeAliases);
28
29static const char * isoLatin1aliases[] = {
30	// IANA aliases
31	"iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1", "11", "IBM819", "CP819", "csISOLatin1",
32	// java aliases
33	"819", "IBM-819", "ISO8859_1", "8859_1", "ISO8859-1",
34	NULL
35};
36static const BCharacterSet isoLatin1(1,4, B_TRANSLATE("ISO West European"),
37	"ISO_8859-1:1987","ISO-8859-1",isoLatin1aliases);
38
39static const char * isoLatin2aliases[] = {
40	// IANA aliases
41	"iso-ir-101", "ISO_8859-2", "ISO-8859-2", "latin2", "12", "csISOLatin2",
42	// java aliases
43	"iso8859_2", "8859_2", "ISO8859-2", "ibm912", "ibm-912", "cp912", "912",
44	NULL
45};
46static const BCharacterSet isoLatin2(2,5, B_TRANSLATE("ISO East European"),
47	"ISO_8859-2:1987","ISO-8859-2",isoLatin2aliases);
48
49static const char * isoLatin3aliases[] = {
50	// IANA aliases
51	"iso-ir-109", "ISO_8859-3", "ISO-8859-3", "latin3", "13", "csISOLatin3",
52	// java aliases
53	"iso8859_3", "8859_3", "iso8859-3", "ibm913", "ibm-913", "cp913", "913",
54	NULL
55};
56static const BCharacterSet isoLatin3(3,6, B_TRANSLATE("ISO South European"),
57	"ISO_8859-3:1988","ISO-8859-3",isoLatin3aliases);
58
59static const char * isoLatin4aliases[] = {
60	// IANA aliases
61	"iso-ir-110", "ISO_8859-4", "ISO-8859-4", "latin4", "14", "csISOLatin4",
62	// java aliases
63	"iso8859_4", "iso8859-4", "8859_4", "ibm914", "ibm-914", "cp914", "914",
64	NULL
65};
66static const BCharacterSet isoLatin4(4,7, B_TRANSLATE("ISO North European"),
67	"ISO_8859-4:1988","ISO-8859-4",isoLatin4aliases);
68
69static const char * isoLatin5aliases[] = {
70	// IANA aliases
71	"iso-ir-144", "ISO_8859-5", "ISO-8859-5", "cyrillic", "csISOLatinCyrillic",
72	// java aliases
73	"iso8859_5", "8859_5", "ISO8859-5", "ibm915", "ibm-915", "cp915", "915",
74	NULL
75};
76static const BCharacterSet isoLatin5(5,8, B_TRANSLATE("ISO Cyrillic"),
77	"ISO_8859-5:1988","ISO-8859-5",isoLatin5aliases);
78
79static const char * isoLatin6aliases[] = {
80	// IANA aliases
81	"iso-ir-127", "ISO_8859-6", "ISO-8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic",
82	// java aliases
83	"iso8859_6", "8859_6", "ISO8859-6", "ibm1089", "ibm-1089", "cp1089", "1089",
84	NULL
85};
86static const BCharacterSet isoLatin6(6,9, B_TRANSLATE("ISO Arabic"),
87	"ISO_8859-6:1987","ISO-8859-6",isoLatin6aliases);
88
89static const char * isoLatin7aliases[] = {
90	// IANA aliases
91	"iso-ir-126", "ISO_8859-7", "ISO-8859-7", "ELOT_928", "ECMA-118", "greek", "greek8", "csISOLatinGreek",
92	// java aliases
93	"iso8859_7", "8859_7", "iso8859-7", "sun_eu_greek", "ibm813", "ibm-813", "813", "cp813",
94	NULL
95};
96static const BCharacterSet isoLatin7(7,10, B_TRANSLATE("ISO Greek"),
97	"ISO_8859-7:1987","ISO-8859-7",isoLatin7aliases);
98
99static const char * isoLatin8aliases[] = {
100	// IANA aliases
101	"iso-ir-138", "ISO_8859-8", "ISO-8859-8", "hebrew", "csISOLatinHebrew",
102	// java aliases
103	"iso8859_8", "8859_8", "ISO8859-8", "cp916", "916", "ibm916", "ibm-916",
104	NULL
105};
106static const BCharacterSet isoLatin8(8,11, B_TRANSLATE("ISO Hebrew"),
107	"ISO_8859-8:1988","ISO-8859-8",isoLatin8aliases);
108
109static const char * isoLatin9aliases[] = {
110	// IANA aliases
111	"iso-ir-148", "ISO_8859-9", "ISO-8859-9", "latin5", "15", "csISOLatin5",
112	// java aliases
113	"iso8859_9", "8859_9", "ibm920", "ibm-920", "920", "cp920",
114	NULL
115};
116const BCharacterSet isoLatin9(9,12, B_TRANSLATE("ISO Turkish"),
117	"ISO_8859-9:1989","ISO-8859-9",isoLatin9aliases);
118
119static const char * isoLatin10aliases[] = {
120	// IANA aliases
121	"iso-ir-157", "16", "ISO_8859-10:1992", "csISOLatin6", "latin6",
122	// java aliases
123	NULL
124};
125static const BCharacterSet isoLatin10(10,13, B_TRANSLATE("ISO Nordic"),
126	"ISO-8859-10","ISO-8859-10",isoLatin10aliases);
127
128static const char * macintoshAliases[] = {
129	// IANA aliases
130	"mac", "csMacintosh",
131	// java aliases
132	"MacRoman",
133	// mail kit aliases
134	"x-mac-roman",
135	NULL
136};
137static const BCharacterSet macintosh(11,2027, B_TRANSLATE("Macintosh Roman"),
138	"macintosh",NULL,macintoshAliases);
139
140static const char * shiftJISaliases[] = {
141	// IANA aliases
142	"MS_Kanji", "csShiftJIS",
143	// java aliases
144	"sjis", "shift_jis", "shift-jis", "x-sjis",
145	// mail kit aliases
146	"shift_jisx0213",
147	NULL
148};
149static const BCharacterSet shiftJIS(12,17, B_TRANSLATE("Japanese Shift JIS"),
150	"Shift_JIS","Shift_JIS",shiftJISaliases);
151
152static const char * EUCPackedJapaneseAliases[] = {
153	// IANA aliases
154	"EUC-JP", "csEUCPkdFmtJapanese",
155	// java aliases
156	"eucjis", "eucjp", "x-euc-jp", "x-eucjp",
157	// mail kit aliases
158	"euc-jisx0213",
159	NULL
160};
161static const BCharacterSet packedJapanese(13,18, B_TRANSLATE("Japanese EUC"),
162                                   "Extended_UNIX_Code_Packed_Format_for_Japanese","EUC-JP",
163                                   EUCPackedJapaneseAliases);
164
165static const char * iso2022jpAliases[] = {
166	// IANA aliases
167	"csISO2022JP",
168	// java aliases
169	"iso2022jp", "jis", "jis_encoding", "csjisencoding",
170	NULL
171};
172static const BCharacterSet iso2022jp(14,39, B_TRANSLATE("Japanese JIS"),
173	"ISO-2022-JP","ISO-2022-JP",iso2022jpAliases);
174
175static const char * windows1252aliases[] = {
176	// IANA aliases
177	// java aliases
178	"cp1252", "cp5348",
179	NULL
180};
181static const BCharacterSet windows1252(15,2252, B_TRANSLATE("Windows Latin-1 "
182	"(CP 1252)"),"windows-1252",NULL,windows1252aliases);
183
184static const char * unicode2aliases[] = {
185	// IANA aliases
186	"csUnicode",
187	// java aliases
188	"UTF-16BE", "UTF_16BE", "X-UTF-16BE", "UnicodeBigUnmarked",
189	NULL
190};
191static const BCharacterSet unicode2(16,1000, B_TRANSLATE("Unicode (UCS-2)"),
192	"ISO-10646-UCS-2",NULL,unicode2aliases);
193
194static const char * KOI8Raliases[] = {
195	// IANA aliases
196	"csKOI8R",
197	// java aliases
198	"koi8_r", "koi8", "cskoi8r",
199	NULL
200};
201static const BCharacterSet KOI8R(17,2084, B_TRANSLATE("KOI8-R Cyrillic"),
202	"KOI8-R","KOI8-R",KOI8Raliases);
203
204static const char * windows1251aliases[] = {
205	// IANA aliases
206	// java aliases
207	"cp1251", "cp5347", "ansi-1251",
208	NULL
209};
210static const BCharacterSet windows1251(18,2251, B_TRANSLATE("Windows Cyrillic "
211	"(CP 1251)"), "windows-1251",NULL,windows1251aliases);
212
213static const char * IBM866aliases[] = {
214	// IANA aliases
215	"cp866", "866", "csIBM866",
216	// java aliases
217	"ibm-866",
218	// mail kit aliases
219	"dos-866",
220	NULL
221};
222static const BCharacterSet IBM866(19,2086, B_TRANSLATE("DOS Cyrillic"),
223	"IBM866","IBM866",IBM866aliases);
224
225static const char * IBM437aliases[] = {
226	// IANA aliases
227	"cp437", "437", "csPC8CodePage437",
228	// java aliases
229	"ibm-437", "windows-437",
230	// mail kit aliases
231	"dos-437",
232	NULL
233};
234static const BCharacterSet IBM437(20,2011, B_TRANSLATE("DOS Latin-US"),
235	"IBM437","IBM437",IBM437aliases);
236
237static const char * eucKRaliases[] = {
238	// IANA aliases
239	"csEUCKR",
240	// java aliases
241	"ksc5601", "euckr", "ks_c_5601-1987", "ksc5601-1987",
242	"ksc5601_1987", "ksc_5601", "5601",
243	NULL
244};
245static const BCharacterSet eucKR(21,38, B_TRANSLATE("EUC Korean"),
246	"EUC-KR","EUC-KR",eucKRaliases);
247
248static const char * iso13aliases[] = {
249	// IANA aliases
250	// java aliases
251	"iso8859_13", "8859_13", "iso_8859-13", "ISO8859-13",
252	NULL
253};
254static const BCharacterSet iso13(22,109, B_TRANSLATE("ISO Baltic"),
255	"ISO-8859-13","ISO-8859-13",iso13aliases);
256
257static const char * iso14aliases[] = {
258	// IANA aliases
259	"iso-ir-199", "ISO_8859-14:1998", "ISO_8859-14", "latin8", "iso-celtic", "l8",
260	NULL
261};
262static const BCharacterSet iso14(23,110, B_TRANSLATE("ISO Celtic"),
263	"ISO-8859-14","ISO-8859-14",iso14aliases);
264
265static const char * iso15aliases[] = {
266	// IANA aliases
267	"ISO_8859-15", "Latin-9",
268	// java aliases
269	"8859_15", "ISO8859_15", "ISO8859-15", "IBM923", "IBM-923", "cp923", "923",
270	"LATIN0", "LATIN9", "L9", "csISOlatin0", "csISOlatin9", "ISO8859_15_FDIS",
271	NULL
272};
273static const BCharacterSet iso15(24,111, B_TRANSLATE("ISO Latin 9"),
274	"ISO-8859-15","ISO-8859-15",iso15aliases);
275
276// chinese character set testing
277
278static const char * big5aliases[] = {
279	// IANA aliases
280	"csBig5",
281	NULL
282};
283static const BCharacterSet big5(25,2026, B_TRANSLATE("Chinese Big5"),
284	"Big5","Big5",big5aliases);
285
286static const char * gb18030aliases[] = {
287	// java aliases
288	"gb18030-2000",
289	// mail kit aliases
290	"gb2312",
291	"gbk",
292	NULL
293};
294static const BCharacterSet gb18030(26,114, B_TRANSLATE("Chinese GB18030"),
295	"GB18030",NULL,gb18030aliases);
296
297static const char* kUTF16Aliases[] = {
298	// IANA aliases
299	"UTF-16",
300	// java aliases
301	"UTF-16BE", "X-UTF-16BE", "UnicodeBigUnmarked",
302	NULL
303};
304static const BCharacterSet kUTF16(27, 1000, B_TRANSLATE("Unicode"), "UTF-16", "UTF-16",
305	kUTF16Aliases);
306
307static const char* kWindows1250Aliases[] = {
308	// IANA aliases
309	"cswindows1250",
310	// java aliases
311	"cp1250",
312	"ms-ee",
313	NULL
314};
315static const BCharacterSet kWindows1250(28, 2250, B_TRANSLATE("Windows Central "
316	"European (CP 1250)"), "windows-1250", NULL, kWindows1250Aliases);
317
318/**
319 * The following initializes the global character set array.
320 * It is organized by id for efficient retrieval using predefined constants in UTF8.h and Font.h.
321 * Character sets are stored contiguously and may be efficiently iterated over.
322 * To add a new character set, define the character set above -- remember to increment the id --
323 * and then add &<charSetName> to the _end_ of the following list.  That's all.
324 **/
325
326const BCharacterSet * character_sets_by_id[] = {
327	&unicode,
328	&isoLatin1, &isoLatin2, &isoLatin3,	&isoLatin4,	&isoLatin5,
329	&isoLatin6,	&isoLatin7, &isoLatin8, &isoLatin9, &isoLatin10,
330	&macintosh,
331	// R5 BFont encodings end here
332	&shiftJIS, &packedJapanese, &iso2022jp,
333	&windows1252, &unicode2, &KOI8R, &windows1251,
334	&IBM866, &IBM437, &eucKR, &iso13, &iso14, &iso15,
335	// R5 convert_to/from_utf8 encodings end here
336	&big5,&gb18030,
337	&kUTF16,
338	&kWindows1250,
339};
340const uint32 character_sets_by_id_count = sizeof(character_sets_by_id)/sizeof(const BCharacterSet*);
341
/**
 * The following code initializes the global MIBenum array.
 * This sparsely populated array exists as an efficient way to access character sets by MIBenum.
 * The MIBenum array is automatically allocated and initialized by the following class.
 * The following class should only be instantiated once; this is checked by an assertion.
 * No changes are required to the following code to add a new character set.
 **/
349
350const BCharacterSet ** character_sets_by_MIBenum;
351uint32 maximum_valid_MIBenum;
352
353static class MIBenumArrayInitializer {
354public:
355	MIBenumArrayInitializer() {
356		DEBUG_ONLY(static int onlyOneTime = 0;)
357		ASSERT_WITH_MESSAGE(onlyOneTime++ == 0,"MIBenumArrayInitializer should be instantiated only one time.");
358		// analyzing character_sets_by_id
359		uint32 max_MIBenum = 0;
360		for (uint32 index = 0 ; index < character_sets_by_id_count ; index++ ) {
361			if (max_MIBenum < character_sets_by_id[index]->GetMIBenum()) {
362				max_MIBenum = character_sets_by_id[index]->GetMIBenum();
363			}
364		}
365		// initializing extern variables
366		character_sets_by_MIBenum = new const BCharacterSet*[max_MIBenum+2];
367		maximum_valid_MIBenum = max_MIBenum;
368		// initializing MIBenum array
369		memset(character_sets_by_MIBenum,0,sizeof(BCharacterSet*)*(max_MIBenum+2));
370		for (uint32 index2 = 0 ; index2 < character_sets_by_id_count ; index2++ ) {
371			const BCharacterSet * charset = character_sets_by_id[index2];
372			character_sets_by_MIBenum[charset->GetMIBenum()] = charset;
373		}
374	}
375	~MIBenumArrayInitializer()
376	{
377		delete [] character_sets_by_MIBenum;
378	}
379} runTheInitializer;
380
381}
382