1/* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32#include "LocaleToScriptMapping.h" 33 34#include <wtf/HashMap.h> 35#include <wtf/HashSet.h> 36#include <wtf/text/StringHash.h> 37 38namespace WebCore { 39 40struct ScriptNameCode { 41 const char* name; 42 UScriptCode code; 43}; 44 45// This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are 46// treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to 47// USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered 48// using the same font setting. 49static const ScriptNameCode scriptNameCodeList[] = { 50 { "zyyy", USCRIPT_COMMON }, 51 { "qaai", USCRIPT_INHERITED }, 52 { "arab", USCRIPT_ARABIC }, 53 { "armn", USCRIPT_ARMENIAN }, 54 { "beng", USCRIPT_BENGALI }, 55 { "bopo", USCRIPT_BOPOMOFO }, 56 { "cher", USCRIPT_CHEROKEE }, 57 { "copt", USCRIPT_COPTIC }, 58 { "cyrl", USCRIPT_CYRILLIC }, 59 { "dsrt", USCRIPT_DESERET }, 60 { "deva", USCRIPT_DEVANAGARI }, 61 { "ethi", USCRIPT_ETHIOPIC }, 62 { "geor", USCRIPT_GEORGIAN }, 63 { "goth", USCRIPT_GOTHIC }, 64 { "grek", USCRIPT_GREEK }, 65 { "gujr", USCRIPT_GUJARATI }, 66 { "guru", USCRIPT_GURMUKHI }, 67 { "hani", USCRIPT_HAN }, 68 { "hang", USCRIPT_HANGUL }, 69 { "hebr", USCRIPT_HEBREW }, 70 { "hira", USCRIPT_KATAKANA_OR_HIRAGANA }, 71 { "knda", USCRIPT_KANNADA }, 72 { "kana", USCRIPT_KATAKANA_OR_HIRAGANA }, 73 { "khmr", USCRIPT_KHMER }, 74 { "laoo", USCRIPT_LAO }, 75 { "latn", USCRIPT_LATIN }, 76 { "mlym", USCRIPT_MALAYALAM }, 77 { "mong", USCRIPT_MONGOLIAN }, 78 { "mymr", USCRIPT_MYANMAR }, 79 { "ogam", USCRIPT_OGHAM }, 80 { "ital", USCRIPT_OLD_ITALIC }, 81 { "orya", USCRIPT_ORIYA }, 82 { "runr", USCRIPT_RUNIC }, 83 { "sinh", USCRIPT_SINHALA }, 84 { "syrc", USCRIPT_SYRIAC }, 85 { "taml", USCRIPT_TAMIL }, 86 { "telu", USCRIPT_TELUGU }, 87 { "thaa", USCRIPT_THAANA }, 88 { "thai", USCRIPT_THAI }, 89 { "tibt", USCRIPT_TIBETAN }, 90 { "cans", USCRIPT_CANADIAN_ABORIGINAL }, 91 { "yiii", USCRIPT_YI }, 92 { "tglg", USCRIPT_TAGALOG }, 93 { "hano", USCRIPT_HANUNOO }, 94 { "buhd", USCRIPT_BUHID }, 95 { "tagb", USCRIPT_TAGBANWA }, 96 { "brai", USCRIPT_BRAILLE }, 97 { "cprt", USCRIPT_CYPRIOT }, 98 { "limb", USCRIPT_LIMBU }, 99 { "linb", USCRIPT_LINEAR_B }, 100 { "osma", USCRIPT_OSMANYA }, 101 { "shaw", USCRIPT_SHAVIAN }, 102 { "tale", USCRIPT_TAI_LE }, 103 { "ugar", USCRIPT_UGARITIC }, 104 { "hrkt", USCRIPT_KATAKANA_OR_HIRAGANA }, 105 { "bugi", USCRIPT_BUGINESE }, 106 { "glag", USCRIPT_GLAGOLITIC }, 107 { "khar", USCRIPT_KHAROSHTHI }, 108 { "sylo", USCRIPT_SYLOTI_NAGRI }, 109 { "talu", USCRIPT_NEW_TAI_LUE }, 110 { "tfng", USCRIPT_TIFINAGH }, 111 { "xpeo", USCRIPT_OLD_PERSIAN }, 112 { "bali", USCRIPT_BALINESE }, 113 { "batk", USCRIPT_BATAK }, 114 { "blis", USCRIPT_BLISSYMBOLS }, 115 { "brah", USCRIPT_BRAHMI }, 116 { "cham", USCRIPT_CHAM }, 117 { "cirt", USCRIPT_CIRTH }, 118 { "cyrs", USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC }, 119 { "egyd", USCRIPT_DEMOTIC_EGYPTIAN }, 120 { "egyh", USCRIPT_HIERATIC_EGYPTIAN }, 121 { "egyp", USCRIPT_EGYPTIAN_HIEROGLYPHS }, 122 { "geok", USCRIPT_KHUTSURI }, 123 { "hans", USCRIPT_SIMPLIFIED_HAN }, 124 { "hant", USCRIPT_TRADITIONAL_HAN }, 125 { "hmng", USCRIPT_PAHAWH_HMONG }, 126 { "hung", USCRIPT_OLD_HUNGARIAN }, 127 { "inds", USCRIPT_HARAPPAN_INDUS }, 128 { "java", USCRIPT_JAVANESE }, 129 { "kali", USCRIPT_KAYAH_LI }, 130 { "latf", USCRIPT_LATIN_FRAKTUR }, 131 { "latg", USCRIPT_LATIN_GAELIC }, 132 { "lepc", USCRIPT_LEPCHA }, 133 { "lina", USCRIPT_LINEAR_A }, 134 { "mand", USCRIPT_MANDAEAN }, 135 { "maya", USCRIPT_MAYAN_HIEROGLYPHS }, 136 { "mero", USCRIPT_MEROITIC }, 137 { "nkoo", USCRIPT_NKO }, 138 { "orkh", USCRIPT_ORKHON }, 139 { "perm", USCRIPT_OLD_PERMIC }, 140 { "phag", USCRIPT_PHAGS_PA }, 141 { "phnx", USCRIPT_PHOENICIAN }, 142 { "plrd", USCRIPT_PHONETIC_POLLARD }, 143 { "roro", USCRIPT_RONGORONGO }, 144 { "sara", USCRIPT_SARATI }, 145 { "syre", USCRIPT_ESTRANGELO_SYRIAC }, 146 { "syrj", USCRIPT_WESTERN_SYRIAC }, 147 { "syrn", USCRIPT_EASTERN_SYRIAC }, 148 { "teng", USCRIPT_TENGWAR }, 149 { "vaii", USCRIPT_VAI }, 150 { "visp", USCRIPT_VISIBLE_SPEECH }, 151 { "xsux", USCRIPT_CUNEIFORM }, 152 { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA }, 153 { "kore", USCRIPT_HANGUL }, 154 { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES }, 155 { "zzzz", USCRIPT_UNKNOWN } 156}; 157 158struct ScriptNameCodeMapHashTraits : public HashTraits<String> { 159 static const int minimumTableSize = WTF::HashTableCapacityForSize<sizeof(scriptNameCodeList) / sizeof(ScriptNameCode)>::value; 160}; 161 162typedef HashMap<String, UScriptCode, DefaultHash<String>::Hash, ScriptNameCodeMapHashTraits> ScriptNameCodeMap; 163 164UScriptCode scriptNameToCode(const String& scriptName) 165{ 166 DEPRECATED_DEFINE_STATIC_LOCAL(ScriptNameCodeMap, scriptNameCodeMap, ()); 167 if (scriptNameCodeMap.isEmpty()) { 168 for (size_t i = 0; i < sizeof(scriptNameCodeList) / sizeof(ScriptNameCode); ++i) 169 scriptNameCodeMap.set(ASCIILiteral(scriptNameCodeList[i].name), scriptNameCodeList[i].code); 170 } 171 172 ScriptNameCodeMap::iterator it = scriptNameCodeMap.find(scriptName.lower()); 173 if (it != scriptNameCodeMap.end()) 174 return it->value; 175 return USCRIPT_INVALID_CODE; 176} 177 178struct LocaleScript { 179 const char* locale; 180 UScriptCode script; 181}; 182 183static const LocaleScript localeScriptList[] = { 184 { "aa", USCRIPT_LATIN }, 185 { "ab", USCRIPT_CYRILLIC }, 186 { "ady", USCRIPT_CYRILLIC }, 187 { "af", USCRIPT_LATIN }, 188 { "ak", USCRIPT_LATIN }, 189 { "am", USCRIPT_ETHIOPIC }, 190 { "ar", USCRIPT_ARABIC }, 191 { "as", USCRIPT_BENGALI }, 192 { "ast", USCRIPT_LATIN }, 193 { "av", USCRIPT_CYRILLIC }, 194 { "ay", USCRIPT_LATIN }, 195 { "az", USCRIPT_LATIN }, 196 { "ba", USCRIPT_CYRILLIC }, 197 { "be", USCRIPT_CYRILLIC }, 198 { "bg", USCRIPT_CYRILLIC }, 199 { "bi", USCRIPT_LATIN }, 200 { "bn", USCRIPT_BENGALI }, 201 { "bo", USCRIPT_TIBETAN }, 202 { "bs", USCRIPT_LATIN }, 203 { "ca", USCRIPT_LATIN }, 204 { "ce", USCRIPT_CYRILLIC }, 205 { "ceb", USCRIPT_LATIN }, 206 { "ch", USCRIPT_LATIN }, 207 { "chk", USCRIPT_LATIN }, 208 { "cs", USCRIPT_LATIN }, 209 { "cy", USCRIPT_LATIN }, 210 { "da", USCRIPT_LATIN }, 211 { "de", USCRIPT_LATIN }, 212 { "dv", USCRIPT_THAANA }, 213 { "dz", USCRIPT_TIBETAN }, 214 { "ee", USCRIPT_LATIN }, 215 { "efi", USCRIPT_LATIN }, 216 { "el", USCRIPT_GREEK }, 217 { "en", USCRIPT_LATIN }, 218 { "es", USCRIPT_LATIN }, 219 { "et", USCRIPT_LATIN }, 220 { "eu", USCRIPT_LATIN }, 221 { "fa", USCRIPT_ARABIC }, 222 { "fi", USCRIPT_LATIN }, 223 { "fil", USCRIPT_LATIN }, 224 { "fj", USCRIPT_LATIN }, 225 { "fo", USCRIPT_LATIN }, 226 { "fr", USCRIPT_LATIN }, 227 { "fur", USCRIPT_LATIN }, 228 { "fy", USCRIPT_LATIN }, 229 { "ga", USCRIPT_LATIN }, 230 { "gaa", USCRIPT_LATIN }, 231 { "gd", USCRIPT_LATIN }, 232 { "gil", USCRIPT_LATIN }, 233 { "gl", USCRIPT_LATIN }, 234 { "gn", USCRIPT_LATIN }, 235 { "gsw", USCRIPT_LATIN }, 236 { "gu", USCRIPT_GUJARATI }, 237 { "ha", USCRIPT_LATIN }, 238 { "haw", USCRIPT_LATIN }, 239 { "he", USCRIPT_HEBREW }, 240 { "hi", USCRIPT_DEVANAGARI }, 241 { "hil", USCRIPT_LATIN }, 242 { "ho", USCRIPT_LATIN }, 243 { "hr", USCRIPT_LATIN }, 244 { "ht", USCRIPT_LATIN }, 245 { "hu", USCRIPT_LATIN }, 246 { "hy", USCRIPT_ARMENIAN }, 247 { "id", USCRIPT_LATIN }, 248 { "ig", USCRIPT_LATIN }, 249 { "ii", USCRIPT_YI }, 250 { "ilo", USCRIPT_LATIN }, 251 { "inh", USCRIPT_CYRILLIC }, 252 { "is", USCRIPT_LATIN }, 253 { "it", USCRIPT_LATIN }, 254 { "iu", USCRIPT_CANADIAN_ABORIGINAL }, 255 { "ja", USCRIPT_KATAKANA_OR_HIRAGANA }, 256 { "jv", USCRIPT_LATIN }, 257 { "ka", USCRIPT_GEORGIAN }, 258 { "kaj", USCRIPT_LATIN }, 259 { "kam", USCRIPT_LATIN }, 260 { "kbd", USCRIPT_CYRILLIC }, 261 { "kha", USCRIPT_LATIN }, 262 { "kk", USCRIPT_CYRILLIC }, 263 { "kl", USCRIPT_LATIN }, 264 { "km", USCRIPT_KHMER }, 265 { "kn", USCRIPT_KANNADA }, 266 { "ko", USCRIPT_HANGUL }, 267 { "kok", USCRIPT_DEVANAGARI }, 268 { "kos", USCRIPT_LATIN }, 269 { "kpe", USCRIPT_LATIN }, 270 { "krc", USCRIPT_CYRILLIC }, 271 { "ks", USCRIPT_ARABIC }, 272 { "ku", USCRIPT_ARABIC }, 273 { "kum", USCRIPT_CYRILLIC }, 274 { "ky", USCRIPT_CYRILLIC }, 275 { "la", USCRIPT_LATIN }, 276 { "lah", USCRIPT_ARABIC }, 277 { "lb", USCRIPT_LATIN }, 278 { "lez", USCRIPT_CYRILLIC }, 279 { "ln", USCRIPT_LATIN }, 280 { "lo", USCRIPT_LAO }, 281 { "lt", USCRIPT_LATIN }, 282 { "lv", USCRIPT_LATIN }, 283 { "mai", USCRIPT_DEVANAGARI }, 284 { "mdf", USCRIPT_CYRILLIC }, 285 { "mg", USCRIPT_LATIN }, 286 { "mh", USCRIPT_LATIN }, 287 { "mi", USCRIPT_LATIN }, 288 { "mk", USCRIPT_CYRILLIC }, 289 { "ml", USCRIPT_MALAYALAM }, 290 { "mn", USCRIPT_CYRILLIC }, 291 { "mr", USCRIPT_DEVANAGARI }, 292 { "ms", USCRIPT_LATIN }, 293 { "mt", USCRIPT_LATIN }, 294 { "my", USCRIPT_MYANMAR }, 295 { "myv", USCRIPT_CYRILLIC }, 296 { "na", USCRIPT_LATIN }, 297 { "nb", USCRIPT_LATIN }, 298 { "ne", USCRIPT_DEVANAGARI }, 299 { "niu", USCRIPT_LATIN }, 300 { "nl", USCRIPT_LATIN }, 301 { "nn", USCRIPT_LATIN }, 302 { "nr", USCRIPT_LATIN }, 303 { "nso", USCRIPT_LATIN }, 304 { "ny", USCRIPT_LATIN }, 305 { "oc", USCRIPT_LATIN }, 306 { "om", USCRIPT_LATIN }, 307 { "or", USCRIPT_ORIYA }, 308 { "os", USCRIPT_CYRILLIC }, 309 { "pa", USCRIPT_GURMUKHI }, 310 { "pag", USCRIPT_LATIN }, 311 { "pap", USCRIPT_LATIN }, 312 { "pau", USCRIPT_LATIN }, 313 { "pl", USCRIPT_LATIN }, 314 { "pon", USCRIPT_LATIN }, 315 { "ps", USCRIPT_ARABIC }, 316 { "pt", USCRIPT_LATIN }, 317 { "qu", USCRIPT_LATIN }, 318 { "rm", USCRIPT_LATIN }, 319 { "rn", USCRIPT_LATIN }, 320 { "ro", USCRIPT_LATIN }, 321 { "ru", USCRIPT_CYRILLIC }, 322 { "rw", USCRIPT_LATIN }, 323 { "sa", USCRIPT_DEVANAGARI }, 324 { "sah", USCRIPT_CYRILLIC }, 325 { "sat", USCRIPT_LATIN }, 326 { "sd", USCRIPT_ARABIC }, 327 { "se", USCRIPT_LATIN }, 328 { "sg", USCRIPT_LATIN }, 329 { "si", USCRIPT_SINHALA }, 330 { "sid", USCRIPT_LATIN }, 331 { "sk", USCRIPT_LATIN }, 332 { "sl", USCRIPT_LATIN }, 333 { "sm", USCRIPT_LATIN }, 334 { "so", USCRIPT_LATIN }, 335 { "sq", USCRIPT_LATIN }, 336 { "sr", USCRIPT_CYRILLIC }, 337 { "ss", USCRIPT_LATIN }, 338 { "st", USCRIPT_LATIN }, 339 { "su", USCRIPT_LATIN }, 340 { "sv", USCRIPT_LATIN }, 341 { "sw", USCRIPT_LATIN }, 342 { "ta", USCRIPT_TAMIL }, 343 { "te", USCRIPT_TELUGU }, 344 { "tet", USCRIPT_LATIN }, 345 { "tg", USCRIPT_CYRILLIC }, 346 { "th", USCRIPT_THAI }, 347 { "ti", USCRIPT_ETHIOPIC }, 348 { "tig", USCRIPT_ETHIOPIC }, 349 { "tk", USCRIPT_LATIN }, 350 { "tkl", USCRIPT_LATIN }, 351 { "tl", USCRIPT_LATIN }, 352 { "tn", USCRIPT_LATIN }, 353 { "to", USCRIPT_LATIN }, 354 { "tpi", USCRIPT_LATIN }, 355 { "tr", USCRIPT_LATIN }, 356 { "trv", USCRIPT_LATIN }, 357 { "ts", USCRIPT_LATIN }, 358 { "tt", USCRIPT_CYRILLIC }, 359 { "tvl", USCRIPT_LATIN }, 360 { "tw", USCRIPT_LATIN }, 361 { "ty", USCRIPT_LATIN }, 362 { "tyv", USCRIPT_CYRILLIC }, 363 { "udm", USCRIPT_CYRILLIC }, 364 { "ug", USCRIPT_ARABIC }, 365 { "uk", USCRIPT_CYRILLIC }, 366 { "und", USCRIPT_LATIN }, 367 { "ur", USCRIPT_ARABIC }, 368 { "uz", USCRIPT_CYRILLIC }, 369 { "ve", USCRIPT_LATIN }, 370 { "vi", USCRIPT_LATIN }, 371 { "wal", USCRIPT_ETHIOPIC }, 372 { "war", USCRIPT_LATIN }, 373 { "wo", USCRIPT_LATIN }, 374 { "xh", USCRIPT_LATIN }, 375 { "yap", USCRIPT_LATIN }, 376 { "yo", USCRIPT_LATIN }, 377 { "za", USCRIPT_LATIN }, 378 { "zh", USCRIPT_SIMPLIFIED_HAN }, 379 { "zh_hk", USCRIPT_TRADITIONAL_HAN }, 380 { "zh_tw", USCRIPT_TRADITIONAL_HAN }, 381 { "zu", USCRIPT_LATIN } 382}; 383 384struct LocaleScriptMapHashTraits : public HashTraits<String> { 385 static const int minimumTableSize = WTF::HashTableCapacityForSize<sizeof(localeScriptList) / sizeof(LocaleScript)>::value; 386}; 387 388typedef HashMap<String, UScriptCode, DefaultHash<String>::Hash, LocaleScriptMapHashTraits> LocaleScriptMap; 389 390UScriptCode localeToScriptCodeForFontSelection(const String& locale) 391{ 392 DEPRECATED_DEFINE_STATIC_LOCAL(LocaleScriptMap, localeScriptMap, ()); 393 if (localeScriptMap.isEmpty()) { 394 for (size_t i = 0; i < sizeof(localeScriptList) / sizeof(LocaleScript); ++i) 395 localeScriptMap.set(ASCIILiteral(localeScriptList[i].locale), localeScriptList[i].script); 396 } 397 398 String canonicalLocale = locale.lower().replace('-', '_'); 399 while (!canonicalLocale.isEmpty()) { 400 LocaleScriptMap::iterator it = localeScriptMap.find(canonicalLocale); 401 if (it != localeScriptMap.end()) 402 return it->value; 403 size_t pos = canonicalLocale.reverseFind('_'); 404 if (pos == notFound) 405 break; 406 UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1)); 407 if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN) 408 return code; 409 canonicalLocale = canonicalLocale.substring(0, pos); 410 } 411 return USCRIPT_COMMON; 412} 413 414} // namespace WebCore 415