1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "LocaleToScriptMapping.h"
33
34#include <wtf/HashMap.h>
35#include <wtf/HashSet.h>
36#include <wtf/text/StringHash.h>
37
38namespace WebCore {
39
40struct ScriptNameCode {
41    const char* name;
42    UScriptCode code;
43};
44
45// This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
46// treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
47// USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
48// using the same font setting.
49static const ScriptNameCode scriptNameCodeList[] = {
50    { "zyyy", USCRIPT_COMMON },
51    { "qaai", USCRIPT_INHERITED },
52    { "arab", USCRIPT_ARABIC },
53    { "armn", USCRIPT_ARMENIAN },
54    { "beng", USCRIPT_BENGALI },
55    { "bopo", USCRIPT_BOPOMOFO },
56    { "cher", USCRIPT_CHEROKEE },
57    { "copt", USCRIPT_COPTIC },
58    { "cyrl", USCRIPT_CYRILLIC },
59    { "dsrt", USCRIPT_DESERET },
60    { "deva", USCRIPT_DEVANAGARI },
61    { "ethi", USCRIPT_ETHIOPIC },
62    { "geor", USCRIPT_GEORGIAN },
63    { "goth", USCRIPT_GOTHIC },
64    { "grek", USCRIPT_GREEK },
65    { "gujr", USCRIPT_GUJARATI },
66    { "guru", USCRIPT_GURMUKHI },
67    { "hani", USCRIPT_HAN },
68    { "hang", USCRIPT_HANGUL },
69    { "hebr", USCRIPT_HEBREW },
70    { "hira", USCRIPT_KATAKANA_OR_HIRAGANA },
71    { "knda", USCRIPT_KANNADA },
72    { "kana", USCRIPT_KATAKANA_OR_HIRAGANA },
73    { "khmr", USCRIPT_KHMER },
74    { "laoo", USCRIPT_LAO },
75    { "latn", USCRIPT_LATIN },
76    { "mlym", USCRIPT_MALAYALAM },
77    { "mong", USCRIPT_MONGOLIAN },
78    { "mymr", USCRIPT_MYANMAR },
79    { "ogam", USCRIPT_OGHAM },
80    { "ital", USCRIPT_OLD_ITALIC },
81    { "orya", USCRIPT_ORIYA },
82    { "runr", USCRIPT_RUNIC },
83    { "sinh", USCRIPT_SINHALA },
84    { "syrc", USCRIPT_SYRIAC },
85    { "taml", USCRIPT_TAMIL },
86    { "telu", USCRIPT_TELUGU },
87    { "thaa", USCRIPT_THAANA },
88    { "thai", USCRIPT_THAI },
89    { "tibt", USCRIPT_TIBETAN },
90    { "cans", USCRIPT_CANADIAN_ABORIGINAL },
91    { "yiii", USCRIPT_YI },
92    { "tglg", USCRIPT_TAGALOG },
93    { "hano", USCRIPT_HANUNOO },
94    { "buhd", USCRIPT_BUHID },
95    { "tagb", USCRIPT_TAGBANWA },
96    { "brai", USCRIPT_BRAILLE },
97    { "cprt", USCRIPT_CYPRIOT },
98    { "limb", USCRIPT_LIMBU },
99    { "linb", USCRIPT_LINEAR_B },
100    { "osma", USCRIPT_OSMANYA },
101    { "shaw", USCRIPT_SHAVIAN },
102    { "tale", USCRIPT_TAI_LE },
103    { "ugar", USCRIPT_UGARITIC },
104    { "hrkt", USCRIPT_KATAKANA_OR_HIRAGANA },
105    { "bugi", USCRIPT_BUGINESE },
106    { "glag", USCRIPT_GLAGOLITIC },
107    { "khar", USCRIPT_KHAROSHTHI },
108    { "sylo", USCRIPT_SYLOTI_NAGRI },
109    { "talu", USCRIPT_NEW_TAI_LUE },
110    { "tfng", USCRIPT_TIFINAGH },
111    { "xpeo", USCRIPT_OLD_PERSIAN },
112    { "bali", USCRIPT_BALINESE },
113    { "batk", USCRIPT_BATAK },
114    { "blis", USCRIPT_BLISSYMBOLS },
115    { "brah", USCRIPT_BRAHMI },
116    { "cham", USCRIPT_CHAM },
117    { "cirt", USCRIPT_CIRTH },
118    { "cyrs", USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC },
119    { "egyd", USCRIPT_DEMOTIC_EGYPTIAN },
120    { "egyh", USCRIPT_HIERATIC_EGYPTIAN },
121    { "egyp", USCRIPT_EGYPTIAN_HIEROGLYPHS },
122    { "geok", USCRIPT_KHUTSURI },
123    { "hans", USCRIPT_SIMPLIFIED_HAN },
124    { "hant", USCRIPT_TRADITIONAL_HAN },
125    { "hmng", USCRIPT_PAHAWH_HMONG },
126    { "hung", USCRIPT_OLD_HUNGARIAN },
127    { "inds", USCRIPT_HARAPPAN_INDUS },
128    { "java", USCRIPT_JAVANESE },
129    { "kali", USCRIPT_KAYAH_LI },
130    { "latf", USCRIPT_LATIN_FRAKTUR },
131    { "latg", USCRIPT_LATIN_GAELIC },
132    { "lepc", USCRIPT_LEPCHA },
133    { "lina", USCRIPT_LINEAR_A },
134    { "mand", USCRIPT_MANDAEAN },
135    { "maya", USCRIPT_MAYAN_HIEROGLYPHS },
136    { "mero", USCRIPT_MEROITIC },
137    { "nkoo", USCRIPT_NKO },
138    { "orkh", USCRIPT_ORKHON },
139    { "perm", USCRIPT_OLD_PERMIC },
140    { "phag", USCRIPT_PHAGS_PA },
141    { "phnx", USCRIPT_PHOENICIAN },
142    { "plrd", USCRIPT_PHONETIC_POLLARD },
143    { "roro", USCRIPT_RONGORONGO },
144    { "sara", USCRIPT_SARATI },
145    { "syre", USCRIPT_ESTRANGELO_SYRIAC },
146    { "syrj", USCRIPT_WESTERN_SYRIAC },
147    { "syrn", USCRIPT_EASTERN_SYRIAC },
148    { "teng", USCRIPT_TENGWAR },
149    { "vaii", USCRIPT_VAI },
150    { "visp", USCRIPT_VISIBLE_SPEECH },
151    { "xsux", USCRIPT_CUNEIFORM },
152    { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA },
153    { "kore", USCRIPT_HANGUL },
154    { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES },
155    { "zzzz", USCRIPT_UNKNOWN }
156};
157
158struct ScriptNameCodeMapHashTraits : public HashTraits<String> {
159    static const int minimumTableSize = WTF::HashTableCapacityForSize<sizeof(scriptNameCodeList) / sizeof(ScriptNameCode)>::value;
160};
161
162typedef HashMap<String, UScriptCode, DefaultHash<String>::Hash, ScriptNameCodeMapHashTraits> ScriptNameCodeMap;
163
164UScriptCode scriptNameToCode(const String& scriptName)
165{
166    DEPRECATED_DEFINE_STATIC_LOCAL(ScriptNameCodeMap, scriptNameCodeMap, ());
167    if (scriptNameCodeMap.isEmpty()) {
168        for (size_t i = 0; i < sizeof(scriptNameCodeList) / sizeof(ScriptNameCode); ++i)
169            scriptNameCodeMap.set(ASCIILiteral(scriptNameCodeList[i].name), scriptNameCodeList[i].code);
170    }
171
172    ScriptNameCodeMap::iterator it = scriptNameCodeMap.find(scriptName.lower());
173    if (it != scriptNameCodeMap.end())
174        return it->value;
175    return USCRIPT_INVALID_CODE;
176}
177
178struct LocaleScript {
179    const char* locale;
180    UScriptCode script;
181};
182
183static const LocaleScript localeScriptList[] = {
184    { "aa", USCRIPT_LATIN },
185    { "ab", USCRIPT_CYRILLIC },
186    { "ady", USCRIPT_CYRILLIC },
187    { "af", USCRIPT_LATIN },
188    { "ak", USCRIPT_LATIN },
189    { "am", USCRIPT_ETHIOPIC },
190    { "ar", USCRIPT_ARABIC },
191    { "as", USCRIPT_BENGALI },
192    { "ast", USCRIPT_LATIN },
193    { "av", USCRIPT_CYRILLIC },
194    { "ay", USCRIPT_LATIN },
195    { "az", USCRIPT_LATIN },
196    { "ba", USCRIPT_CYRILLIC },
197    { "be", USCRIPT_CYRILLIC },
198    { "bg", USCRIPT_CYRILLIC },
199    { "bi", USCRIPT_LATIN },
200    { "bn", USCRIPT_BENGALI },
201    { "bo", USCRIPT_TIBETAN },
202    { "bs", USCRIPT_LATIN },
203    { "ca", USCRIPT_LATIN },
204    { "ce", USCRIPT_CYRILLIC },
205    { "ceb", USCRIPT_LATIN },
206    { "ch", USCRIPT_LATIN },
207    { "chk", USCRIPT_LATIN },
208    { "cs", USCRIPT_LATIN },
209    { "cy", USCRIPT_LATIN },
210    { "da", USCRIPT_LATIN },
211    { "de", USCRIPT_LATIN },
212    { "dv", USCRIPT_THAANA },
213    { "dz", USCRIPT_TIBETAN },
214    { "ee", USCRIPT_LATIN },
215    { "efi", USCRIPT_LATIN },
216    { "el", USCRIPT_GREEK },
217    { "en", USCRIPT_LATIN },
218    { "es", USCRIPT_LATIN },
219    { "et", USCRIPT_LATIN },
220    { "eu", USCRIPT_LATIN },
221    { "fa", USCRIPT_ARABIC },
222    { "fi", USCRIPT_LATIN },
223    { "fil", USCRIPT_LATIN },
224    { "fj", USCRIPT_LATIN },
225    { "fo", USCRIPT_LATIN },
226    { "fr", USCRIPT_LATIN },
227    { "fur", USCRIPT_LATIN },
228    { "fy", USCRIPT_LATIN },
229    { "ga", USCRIPT_LATIN },
230    { "gaa", USCRIPT_LATIN },
231    { "gd", USCRIPT_LATIN },
232    { "gil", USCRIPT_LATIN },
233    { "gl", USCRIPT_LATIN },
234    { "gn", USCRIPT_LATIN },
235    { "gsw", USCRIPT_LATIN },
236    { "gu", USCRIPT_GUJARATI },
237    { "ha", USCRIPT_LATIN },
238    { "haw", USCRIPT_LATIN },
239    { "he", USCRIPT_HEBREW },
240    { "hi", USCRIPT_DEVANAGARI },
241    { "hil", USCRIPT_LATIN },
242    { "ho", USCRIPT_LATIN },
243    { "hr", USCRIPT_LATIN },
244    { "ht", USCRIPT_LATIN },
245    { "hu", USCRIPT_LATIN },
246    { "hy", USCRIPT_ARMENIAN },
247    { "id", USCRIPT_LATIN },
248    { "ig", USCRIPT_LATIN },
249    { "ii", USCRIPT_YI },
250    { "ilo", USCRIPT_LATIN },
251    { "inh", USCRIPT_CYRILLIC },
252    { "is", USCRIPT_LATIN },
253    { "it", USCRIPT_LATIN },
254    { "iu", USCRIPT_CANADIAN_ABORIGINAL },
255    { "ja", USCRIPT_KATAKANA_OR_HIRAGANA },
256    { "jv", USCRIPT_LATIN },
257    { "ka", USCRIPT_GEORGIAN },
258    { "kaj", USCRIPT_LATIN },
259    { "kam", USCRIPT_LATIN },
260    { "kbd", USCRIPT_CYRILLIC },
261    { "kha", USCRIPT_LATIN },
262    { "kk", USCRIPT_CYRILLIC },
263    { "kl", USCRIPT_LATIN },
264    { "km", USCRIPT_KHMER },
265    { "kn", USCRIPT_KANNADA },
266    { "ko", USCRIPT_HANGUL },
267    { "kok", USCRIPT_DEVANAGARI },
268    { "kos", USCRIPT_LATIN },
269    { "kpe", USCRIPT_LATIN },
270    { "krc", USCRIPT_CYRILLIC },
271    { "ks", USCRIPT_ARABIC },
272    { "ku", USCRIPT_ARABIC },
273    { "kum", USCRIPT_CYRILLIC },
274    { "ky", USCRIPT_CYRILLIC },
275    { "la", USCRIPT_LATIN },
276    { "lah", USCRIPT_ARABIC },
277    { "lb", USCRIPT_LATIN },
278    { "lez", USCRIPT_CYRILLIC },
279    { "ln", USCRIPT_LATIN },
280    { "lo", USCRIPT_LAO },
281    { "lt", USCRIPT_LATIN },
282    { "lv", USCRIPT_LATIN },
283    { "mai", USCRIPT_DEVANAGARI },
284    { "mdf", USCRIPT_CYRILLIC },
285    { "mg", USCRIPT_LATIN },
286    { "mh", USCRIPT_LATIN },
287    { "mi", USCRIPT_LATIN },
288    { "mk", USCRIPT_CYRILLIC },
289    { "ml", USCRIPT_MALAYALAM },
290    { "mn", USCRIPT_CYRILLIC },
291    { "mr", USCRIPT_DEVANAGARI },
292    { "ms", USCRIPT_LATIN },
293    { "mt", USCRIPT_LATIN },
294    { "my", USCRIPT_MYANMAR },
295    { "myv", USCRIPT_CYRILLIC },
296    { "na", USCRIPT_LATIN },
297    { "nb", USCRIPT_LATIN },
298    { "ne", USCRIPT_DEVANAGARI },
299    { "niu", USCRIPT_LATIN },
300    { "nl", USCRIPT_LATIN },
301    { "nn", USCRIPT_LATIN },
302    { "nr", USCRIPT_LATIN },
303    { "nso", USCRIPT_LATIN },
304    { "ny", USCRIPT_LATIN },
305    { "oc", USCRIPT_LATIN },
306    { "om", USCRIPT_LATIN },
307    { "or", USCRIPT_ORIYA },
308    { "os", USCRIPT_CYRILLIC },
309    { "pa", USCRIPT_GURMUKHI },
310    { "pag", USCRIPT_LATIN },
311    { "pap", USCRIPT_LATIN },
312    { "pau", USCRIPT_LATIN },
313    { "pl", USCRIPT_LATIN },
314    { "pon", USCRIPT_LATIN },
315    { "ps", USCRIPT_ARABIC },
316    { "pt", USCRIPT_LATIN },
317    { "qu", USCRIPT_LATIN },
318    { "rm", USCRIPT_LATIN },
319    { "rn", USCRIPT_LATIN },
320    { "ro", USCRIPT_LATIN },
321    { "ru", USCRIPT_CYRILLIC },
322    { "rw", USCRIPT_LATIN },
323    { "sa", USCRIPT_DEVANAGARI },
324    { "sah", USCRIPT_CYRILLIC },
325    { "sat", USCRIPT_LATIN },
326    { "sd", USCRIPT_ARABIC },
327    { "se", USCRIPT_LATIN },
328    { "sg", USCRIPT_LATIN },
329    { "si", USCRIPT_SINHALA },
330    { "sid", USCRIPT_LATIN },
331    { "sk", USCRIPT_LATIN },
332    { "sl", USCRIPT_LATIN },
333    { "sm", USCRIPT_LATIN },
334    { "so", USCRIPT_LATIN },
335    { "sq", USCRIPT_LATIN },
336    { "sr", USCRIPT_CYRILLIC },
337    { "ss", USCRIPT_LATIN },
338    { "st", USCRIPT_LATIN },
339    { "su", USCRIPT_LATIN },
340    { "sv", USCRIPT_LATIN },
341    { "sw", USCRIPT_LATIN },
342    { "ta", USCRIPT_TAMIL },
343    { "te", USCRIPT_TELUGU },
344    { "tet", USCRIPT_LATIN },
345    { "tg", USCRIPT_CYRILLIC },
346    { "th", USCRIPT_THAI },
347    { "ti", USCRIPT_ETHIOPIC },
348    { "tig", USCRIPT_ETHIOPIC },
349    { "tk", USCRIPT_LATIN },
350    { "tkl", USCRIPT_LATIN },
351    { "tl", USCRIPT_LATIN },
352    { "tn", USCRIPT_LATIN },
353    { "to", USCRIPT_LATIN },
354    { "tpi", USCRIPT_LATIN },
355    { "tr", USCRIPT_LATIN },
356    { "trv", USCRIPT_LATIN },
357    { "ts", USCRIPT_LATIN },
358    { "tt", USCRIPT_CYRILLIC },
359    { "tvl", USCRIPT_LATIN },
360    { "tw", USCRIPT_LATIN },
361    { "ty", USCRIPT_LATIN },
362    { "tyv", USCRIPT_CYRILLIC },
363    { "udm", USCRIPT_CYRILLIC },
364    { "ug", USCRIPT_ARABIC },
365    { "uk", USCRIPT_CYRILLIC },
366    { "und", USCRIPT_LATIN },
367    { "ur", USCRIPT_ARABIC },
368    { "uz", USCRIPT_CYRILLIC },
369    { "ve", USCRIPT_LATIN },
370    { "vi", USCRIPT_LATIN },
371    { "wal", USCRIPT_ETHIOPIC },
372    { "war", USCRIPT_LATIN },
373    { "wo", USCRIPT_LATIN },
374    { "xh", USCRIPT_LATIN },
375    { "yap", USCRIPT_LATIN },
376    { "yo", USCRIPT_LATIN },
377    { "za", USCRIPT_LATIN },
378    { "zh", USCRIPT_SIMPLIFIED_HAN },
379    { "zh_hk", USCRIPT_TRADITIONAL_HAN },
380    { "zh_tw", USCRIPT_TRADITIONAL_HAN },
381    { "zu", USCRIPT_LATIN }
382};
383
384struct LocaleScriptMapHashTraits : public HashTraits<String> {
385    static const int minimumTableSize = WTF::HashTableCapacityForSize<sizeof(localeScriptList) / sizeof(LocaleScript)>::value;
386};
387
388typedef HashMap<String, UScriptCode, DefaultHash<String>::Hash, LocaleScriptMapHashTraits> LocaleScriptMap;
389
390UScriptCode localeToScriptCodeForFontSelection(const String& locale)
391{
392    DEPRECATED_DEFINE_STATIC_LOCAL(LocaleScriptMap, localeScriptMap, ());
393    if (localeScriptMap.isEmpty()) {
394        for (size_t i = 0; i < sizeof(localeScriptList) / sizeof(LocaleScript); ++i)
395            localeScriptMap.set(ASCIILiteral(localeScriptList[i].locale), localeScriptList[i].script);
396    }
397
398    String canonicalLocale = locale.lower().replace('-', '_');
399    while (!canonicalLocale.isEmpty()) {
400        LocaleScriptMap::iterator it = localeScriptMap.find(canonicalLocale);
401        if (it != localeScriptMap.end())
402            return it->value;
403        size_t pos = canonicalLocale.reverseFind('_');
404        if (pos == notFound)
405            break;
406        UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1));
407        if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN)
408            return code;
409        canonicalLocale = canonicalLocale.substring(0, pos);
410    }
411    return USCRIPT_COMMON;
412}
413
414} // namespace WebCore
415