ConditionalSpecialCasing.java revision 12745:f068a4ffddd2
1/* 2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26package java.lang; 27 28import java.text.BreakIterator; 29import java.util.HashSet; 30import java.util.Hashtable; 31import java.util.Iterator; 32import java.util.Locale; 33import sun.text.Normalizer; 34 35 36/** 37 * This is a utility class for <code>String.toLowerCase()</code> and 38 * <code>String.toUpperCase()</code>, that handles special casing with 39 * conditions. In other words, it handles the mappings with conditions 40 * that are defined in 41 * <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special 42 * Casing Properties</a> file. 43 * <p> 44 * Note that the unconditional case mappings (including 1:M mappings) 45 * are handled in <code>Character.toLower/UpperCase()</code>. 46 */ 47final class ConditionalSpecialCasing { 48 49 // context conditions. 50 static final int FINAL_CASED = 1; 51 static final int AFTER_SOFT_DOTTED = 2; 52 static final int MORE_ABOVE = 3; 53 static final int AFTER_I = 4; 54 static final int NOT_BEFORE_DOT = 5; 55 56 // combining class definitions 57 static final int COMBINING_CLASS_ABOVE = 230; 58 59 // Special case mapping entries 60 static Entry[] entry = { 61 //# ================================================================================ 62 //# Conditional mappings 63 //# ================================================================================ 64 new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA 65 new Entry(0x0130, new char[]{0x0069, 0x0307}, new char[]{0x0130}, null, 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE 66 67 //# ================================================================================ 68 //# Locale-sensitive mappings 69 //# ================================================================================ 70 //# Lithuanian 71 new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE 72 new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I 73 new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J 74 new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK 75 new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE 76 new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE 77 new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE 78 79 //# ================================================================================ 80 //# Turkish and Azeri 81 new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE 82 new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE 83 new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE 84 new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE 85 new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I 86 new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I 87 new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I 88 new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I 89 }; 90 91 // A hash table that contains the above entries 92 static Hashtable<Integer, HashSet<Entry>> entryTable = new Hashtable<>(); 93 static { 94 // create hashtable from the entry 95 for (Entry cur : entry) { 96 Integer cp = cur.getCodePoint(); 97 HashSet<Entry> set = entryTable.get(cp); 98 if (set == null) { 99 set = new HashSet<>(); 100 entryTable.put(cp, set); 101 } 102 set.add(cur); 103 } 104 } 105 106 static int toLowerCaseEx(String src, int index, Locale locale) { 107 char[] result = lookUpTable(src, index, locale, true); 108 109 if (result != null) { 110 if (result.length == 1) { 111 return result[0]; 112 } else { 113 return Character.ERROR; 114 } 115 } else { 116 // default to Character class' one 117 return Character.toLowerCase(src.codePointAt(index)); 118 } 119 } 120 121 static int toUpperCaseEx(String src, int index, Locale locale) { 122 char[] result = lookUpTable(src, index, locale, false); 123 124 if (result != null) { 125 if (result.length == 1) { 126 return result[0]; 127 } else { 128 return Character.ERROR; 129 } 130 } else { 131 // default to Character class' one 132 return Character.toUpperCaseEx(src.codePointAt(index)); 133 } 134 } 135 136 static char[] toLowerCaseCharArray(String src, int index, Locale locale) { 137 return lookUpTable(src, index, locale, true); 138 } 139 140 static char[] toUpperCaseCharArray(String src, int index, Locale locale) { 141 char[] result = lookUpTable(src, index, locale, false); 142 if (result != null) { 143 return result; 144 } else { 145 return Character.toUpperCaseCharArray(src.codePointAt(index)); 146 } 147 } 148 149 private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) { 150 HashSet<Entry> set = entryTable.get(src.codePointAt(index)); 151 char[] ret = null; 152 153 if (set != null) { 154 Iterator<Entry> iter = set.iterator(); 155 String currentLang = locale.getLanguage(); 156 while (iter.hasNext()) { 157 Entry entry = iter.next(); 158 String conditionLang = entry.getLanguage(); 159 if (((conditionLang == null) || (conditionLang.equals(currentLang))) && 160 isConditionMet(src, index, locale, entry.getCondition())) { 161 ret = bLowerCasing ? entry.getLowerCase() : entry.getUpperCase(); 162 if (conditionLang != null) { 163 break; 164 } 165 } 166 } 167 } 168 169 return ret; 170 } 171 172 private static boolean isConditionMet(String src, int index, Locale locale, int condition) { 173 switch (condition) { 174 case FINAL_CASED: 175 return isFinalCased(src, index, locale); 176 177 case AFTER_SOFT_DOTTED: 178 return isAfterSoftDotted(src, index); 179 180 case MORE_ABOVE: 181 return isMoreAbove(src, index); 182 183 case AFTER_I: 184 return isAfterI(src, index); 185 186 case NOT_BEFORE_DOT: 187 return !isBeforeDot(src, index); 188 189 default: 190 return true; 191 } 192 } 193 194 /** 195 * Implements the "Final_Cased" condition 196 * 197 * Specification: Within the closest word boundaries containing C, there is a cased 198 * letter before C, and there is no cased letter after C. 199 * 200 * Regular Expression: 201 * Before C: [{cased==true}][{wordBoundary!=true}]* 202 * After C: !([{wordBoundary!=true}]*[{cased}]) 203 */ 204 private static boolean isFinalCased(String src, int index, Locale locale) { 205 BreakIterator wordBoundary = BreakIterator.getWordInstance(locale); 206 wordBoundary.setText(src); 207 int ch; 208 209 // Look for a preceding 'cased' letter 210 for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i); 211 i -= Character.charCount(ch)) { 212 213 ch = src.codePointBefore(i); 214 if (isCased(ch)) { 215 216 int len = src.length(); 217 // Check that there is no 'cased' letter after the index 218 for (i = index + Character.charCount(src.codePointAt(index)); 219 (i < len) && !wordBoundary.isBoundary(i); 220 i += Character.charCount(ch)) { 221 222 ch = src.codePointAt(i); 223 if (isCased(ch)) { 224 return false; 225 } 226 } 227 228 return true; 229 } 230 } 231 232 return false; 233 } 234 235 /** 236 * Implements the "After_I" condition 237 * 238 * Specification: The last preceding base character was an uppercase I, 239 * and there is no intervening combining character class 230 (ABOVE). 240 * 241 * Regular Expression: 242 * Before C: [I]([{cc!=230}&{cc!=0}])* 243 */ 244 private static boolean isAfterI(String src, int index) { 245 int ch; 246 int cc; 247 248 // Look for the last preceding base character 249 for (int i = index; i > 0; i -= Character.charCount(ch)) { 250 251 ch = src.codePointBefore(i); 252 253 if (ch == 'I') { 254 return true; 255 } else { 256 cc = Normalizer.getCombiningClass(ch); 257 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) { 258 return false; 259 } 260 } 261 } 262 263 return false; 264 } 265 266 /** 267 * Implements the "After_Soft_Dotted" condition 268 * 269 * Specification: The last preceding character with combining class 270 * of zero before C was Soft_Dotted, and there is no intervening 271 * combining character class 230 (ABOVE). 272 * 273 * Regular Expression: 274 * Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])* 275 */ 276 private static boolean isAfterSoftDotted(String src, int index) { 277 int ch; 278 int cc; 279 280 // Look for the last preceding character 281 for (int i = index; i > 0; i -= Character.charCount(ch)) { 282 283 ch = src.codePointBefore(i); 284 285 if (isSoftDotted(ch)) { 286 return true; 287 } else { 288 cc = Normalizer.getCombiningClass(ch); 289 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) { 290 return false; 291 } 292 } 293 } 294 295 return false; 296 } 297 298 /** 299 * Implements the "More_Above" condition 300 * 301 * Specification: C is followed by one or more characters of combining 302 * class 230 (ABOVE) in the combining character sequence. 303 * 304 * Regular Expression: 305 * After C: [{cc!=0}]*[{cc==230}] 306 */ 307 private static boolean isMoreAbove(String src, int index) { 308 int ch; 309 int cc; 310 int len = src.length(); 311 312 // Look for a following ABOVE combining class character 313 for (int i = index + Character.charCount(src.codePointAt(index)); 314 i < len; i += Character.charCount(ch)) { 315 316 ch = src.codePointAt(i); 317 cc = Normalizer.getCombiningClass(ch); 318 319 if (cc == COMBINING_CLASS_ABOVE) { 320 return true; 321 } else if (cc == 0) { 322 return false; 323 } 324 } 325 326 return false; 327 } 328 329 /** 330 * Implements the "Before_Dot" condition 331 * 332 * Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>. 333 * Any sequence of characters with a combining class that is 334 * neither 0 nor 230 may intervene between the current character 335 * and the combining dot above. 336 * 337 * Regular Expression: 338 * After C: ([{cc!=230}&{cc!=0}])*[\u0307] 339 */ 340 private static boolean isBeforeDot(String src, int index) { 341 int ch; 342 int cc; 343 int len = src.length(); 344 345 // Look for a following COMBINING DOT ABOVE 346 for (int i = index + Character.charCount(src.codePointAt(index)); 347 i < len; i += Character.charCount(ch)) { 348 349 ch = src.codePointAt(i); 350 351 if (ch == '\u0307') { 352 return true; 353 } else { 354 cc = Normalizer.getCombiningClass(ch); 355 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) { 356 return false; 357 } 358 } 359 } 360 361 return false; 362 } 363 364 /** 365 * Examines whether a character is 'cased'. 366 * 367 * A character C is defined to be 'cased' if and only if at least one of 368 * following are true for C: uppercase==true, or lowercase==true, or 369 * general_category==titlecase_letter. 370 * 371 * The uppercase and lowercase property values are specified in the data 372 * file DerivedCoreProperties.txt in the Unicode Character Database. 373 */ 374 private static boolean isCased(int ch) { 375 int type = Character.getType(ch); 376 if (type == Character.LOWERCASE_LETTER || 377 type == Character.UPPERCASE_LETTER || 378 type == Character.TITLECASE_LETTER) { 379 return true; 380 } else { 381 // Check for Other_Lowercase and Other_Uppercase 382 // 383 if ((ch >= 0x02B0) && (ch <= 0x02B8)) { 384 // MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y 385 return true; 386 } else if ((ch >= 0x02C0) && (ch <= 0x02C1)) { 387 // MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP 388 return true; 389 } else if ((ch >= 0x02E0) && (ch <= 0x02E4)) { 390 // MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 391 return true; 392 } else if (ch == 0x0345) { 393 // COMBINING GREEK YPOGEGRAMMENI 394 return true; 395 } else if (ch == 0x037A) { 396 // GREEK YPOGEGRAMMENI 397 return true; 398 } else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) { 399 // MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI 400 return true; 401 } else if ((ch >= 0x2160) && (ch <= 0x217F)) { 402 // ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND 403 // SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND 404 return true; 405 } else if ((ch >= 0x24B6) && (ch <= 0x24E9)) { 406 // CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z 407 // CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 408 return true; 409 } else { 410 return false; 411 } 412 } 413 } 414 415 private static boolean isSoftDotted(int ch) { 416 switch (ch) { 417 case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I 418 case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J 419 case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK 420 case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE 421 case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I 422 case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE 423 case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I 424 case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW 425 case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW 426 case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I 427 return true; 428 default: 429 return false; 430 } 431 } 432 433 /** 434 * An internal class that represents an entry in the Special Casing Properties. 435 */ 436 static class Entry { 437 int ch; 438 char [] lower; 439 char [] upper; 440 String lang; 441 int condition; 442 443 Entry(int ch, char[] lower, char[] upper, String lang, int condition) { 444 this.ch = ch; 445 this.lower = lower; 446 this.upper = upper; 447 this.lang = lang; 448 this.condition = condition; 449 } 450 451 int getCodePoint() { 452 return ch; 453 } 454 455 char[] getLowerCase() { 456 return lower; 457 } 458 459 char[] getUpperCase() { 460 return upper; 461 } 462 463 String getLanguage() { 464 return lang; 465 } 466 467 int getCondition() { 468 return condition; 469 } 470 } 471} 472