// ConditionalSpecialCasing.java revision 12745:f068a4ffddd2
/*
 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
25
26package java.lang;
27
28import java.text.BreakIterator;
29import java.util.HashSet;
30import java.util.Hashtable;
31import java.util.Iterator;
32import java.util.Locale;
33import sun.text.Normalizer;
34
35
36/**
37 * This is a utility class for <code>String.toLowerCase()</code> and
38 * <code>String.toUpperCase()</code>, that handles special casing with
39 * conditions.  In other words, it handles the mappings with conditions
40 * that are defined in
41 * <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special
42 * Casing Properties</a> file.
43 * <p>
44 * Note that the unconditional case mappings (including 1:M mappings)
45 * are handled in <code>Character.toLower/UpperCase()</code>.
46 */
47final class ConditionalSpecialCasing {
48
49    // context conditions.
50    static final int FINAL_CASED =              1;
51    static final int AFTER_SOFT_DOTTED =        2;
52    static final int MORE_ABOVE =               3;
53    static final int AFTER_I =                  4;
54    static final int NOT_BEFORE_DOT =           5;
55
56    // combining class definitions
57    static final int COMBINING_CLASS_ABOVE = 230;
58
59    // Special case mapping entries
60    static Entry[] entry = {
61        //# ================================================================================
62        //# Conditional mappings
63        //# ================================================================================
64        new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
65        new Entry(0x0130, new char[]{0x0069, 0x0307}, new char[]{0x0130}, null, 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
66
67        //# ================================================================================
68        //# Locale-sensitive mappings
69        //# ================================================================================
70        //# Lithuanian
71        new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt",  AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE
72        new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I
73        new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J
74        new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK
75        new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
76        new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
77        new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
78
79        //# ================================================================================
80        //# Turkish and Azeri
81        new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
82        new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
83        new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE
84        new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE
85        new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
86        new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
87        new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I
88        new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0)  // # LATIN SMALL LETTER I
89    };
90
91    // A hash table that contains the above entries
92    static Hashtable<Integer, HashSet<Entry>> entryTable = new Hashtable<>();
93    static {
94        // create hashtable from the entry
95        for (Entry cur : entry) {
96            Integer cp = cur.getCodePoint();
97            HashSet<Entry> set = entryTable.get(cp);
98            if (set == null) {
99                set = new HashSet<>();
100                entryTable.put(cp, set);
101            }
102            set.add(cur);
103        }
104    }
105
106    static int toLowerCaseEx(String src, int index, Locale locale) {
107        char[] result = lookUpTable(src, index, locale, true);
108
109        if (result != null) {
110            if (result.length == 1) {
111                return result[0];
112            } else {
113                return Character.ERROR;
114            }
115        } else {
116            // default to Character class' one
117            return Character.toLowerCase(src.codePointAt(index));
118        }
119    }
120
121    static int toUpperCaseEx(String src, int index, Locale locale) {
122        char[] result = lookUpTable(src, index, locale, false);
123
124        if (result != null) {
125            if (result.length == 1) {
126                return result[0];
127            } else {
128                return Character.ERROR;
129            }
130        } else {
131            // default to Character class' one
132            return Character.toUpperCaseEx(src.codePointAt(index));
133        }
134    }
135
136    static char[] toLowerCaseCharArray(String src, int index, Locale locale) {
137        return lookUpTable(src, index, locale, true);
138    }
139
140    static char[] toUpperCaseCharArray(String src, int index, Locale locale) {
141        char[] result = lookUpTable(src, index, locale, false);
142        if (result != null) {
143            return result;
144        } else {
145            return Character.toUpperCaseCharArray(src.codePointAt(index));
146        }
147    }
148
149    private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) {
150        HashSet<Entry> set = entryTable.get(src.codePointAt(index));
151        char[] ret = null;
152
153        if (set != null) {
154            Iterator<Entry> iter = set.iterator();
155            String currentLang = locale.getLanguage();
156            while (iter.hasNext()) {
157                Entry entry = iter.next();
158                String conditionLang = entry.getLanguage();
159                if (((conditionLang == null) || (conditionLang.equals(currentLang))) &&
160                        isConditionMet(src, index, locale, entry.getCondition())) {
161                    ret = bLowerCasing ? entry.getLowerCase() : entry.getUpperCase();
162                    if (conditionLang != null) {
163                        break;
164                    }
165                }
166            }
167        }
168
169        return ret;
170    }
171
172    private static boolean isConditionMet(String src, int index, Locale locale, int condition) {
173        switch (condition) {
174        case FINAL_CASED:
175            return isFinalCased(src, index, locale);
176
177        case AFTER_SOFT_DOTTED:
178            return isAfterSoftDotted(src, index);
179
180        case MORE_ABOVE:
181            return isMoreAbove(src, index);
182
183        case AFTER_I:
184            return isAfterI(src, index);
185
186        case NOT_BEFORE_DOT:
187            return !isBeforeDot(src, index);
188
189        default:
190            return true;
191        }
192    }
193
194    /**
195     * Implements the "Final_Cased" condition
196     *
197     * Specification: Within the closest word boundaries containing C, there is a cased
198     * letter before C, and there is no cased letter after C.
199     *
200     * Regular Expression:
201     *   Before C: [{cased==true}][{wordBoundary!=true}]*
202     *   After C: !([{wordBoundary!=true}]*[{cased}])
203     */
204    private static boolean isFinalCased(String src, int index, Locale locale) {
205        BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);
206        wordBoundary.setText(src);
207        int ch;
208
209        // Look for a preceding 'cased' letter
210        for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
211                i -= Character.charCount(ch)) {
212
213            ch = src.codePointBefore(i);
214            if (isCased(ch)) {
215
216                int len = src.length();
217                // Check that there is no 'cased' letter after the index
218                for (i = index + Character.charCount(src.codePointAt(index));
219                        (i < len) && !wordBoundary.isBoundary(i);
220                        i += Character.charCount(ch)) {
221
222                    ch = src.codePointAt(i);
223                    if (isCased(ch)) {
224                        return false;
225                    }
226                }
227
228                return true;
229            }
230        }
231
232        return false;
233    }
234
235    /**
236     * Implements the "After_I" condition
237     *
238     * Specification: The last preceding base character was an uppercase I,
239     * and there is no intervening combining character class 230 (ABOVE).
240     *
241     * Regular Expression:
242     *   Before C: [I]([{cc!=230}&{cc!=0}])*
243     */
244    private static boolean isAfterI(String src, int index) {
245        int ch;
246        int cc;
247
248        // Look for the last preceding base character
249        for (int i = index; i > 0; i -= Character.charCount(ch)) {
250
251            ch = src.codePointBefore(i);
252
253            if (ch == 'I') {
254                return true;
255            } else {
256                cc = Normalizer.getCombiningClass(ch);
257                if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
258                    return false;
259                }
260            }
261        }
262
263        return false;
264    }
265
266    /**
267     * Implements the "After_Soft_Dotted" condition
268     *
269     * Specification: The last preceding character with combining class
270     * of zero before C was Soft_Dotted, and there is no intervening
271     * combining character class 230 (ABOVE).
272     *
273     * Regular Expression:
274     *   Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*
275     */
276    private static boolean isAfterSoftDotted(String src, int index) {
277        int ch;
278        int cc;
279
280        // Look for the last preceding character
281        for (int i = index; i > 0; i -= Character.charCount(ch)) {
282
283            ch = src.codePointBefore(i);
284
285            if (isSoftDotted(ch)) {
286                return true;
287            } else {
288                cc = Normalizer.getCombiningClass(ch);
289                if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
290                    return false;
291                }
292            }
293        }
294
295        return false;
296    }
297
298    /**
299     * Implements the "More_Above" condition
300     *
301     * Specification: C is followed by one or more characters of combining
302     * class 230 (ABOVE) in the combining character sequence.
303     *
304     * Regular Expression:
305     *   After C: [{cc!=0}]*[{cc==230}]
306     */
307    private static boolean isMoreAbove(String src, int index) {
308        int ch;
309        int cc;
310        int len = src.length();
311
312        // Look for a following ABOVE combining class character
313        for (int i = index + Character.charCount(src.codePointAt(index));
314                i < len; i += Character.charCount(ch)) {
315
316            ch = src.codePointAt(i);
317            cc = Normalizer.getCombiningClass(ch);
318
319            if (cc == COMBINING_CLASS_ABOVE) {
320                return true;
321            } else if (cc == 0) {
322                return false;
323            }
324        }
325
326        return false;
327    }
328
329    /**
330     * Implements the "Before_Dot" condition
331     *
332     * Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.
333     * Any sequence of characters with a combining class that is
334     * neither 0 nor 230 may intervene between the current character
335     * and the combining dot above.
336     *
337     * Regular Expression:
338     *   After C: ([{cc!=230}&{cc!=0}])*[\u0307]
339     */
340    private static boolean isBeforeDot(String src, int index) {
341        int ch;
342        int cc;
343        int len = src.length();
344
345        // Look for a following COMBINING DOT ABOVE
346        for (int i = index + Character.charCount(src.codePointAt(index));
347                i < len; i += Character.charCount(ch)) {
348
349            ch = src.codePointAt(i);
350
351            if (ch == '\u0307') {
352                return true;
353            } else {
354                cc = Normalizer.getCombiningClass(ch);
355                if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
356                    return false;
357                }
358            }
359        }
360
361        return false;
362    }
363
364    /**
365     * Examines whether a character is 'cased'.
366     *
367     * A character C is defined to be 'cased' if and only if at least one of
368     * following are true for C: uppercase==true, or lowercase==true, or
369     * general_category==titlecase_letter.
370     *
371     * The uppercase and lowercase property values are specified in the data
372     * file DerivedCoreProperties.txt in the Unicode Character Database.
373     */
374    private static boolean isCased(int ch) {
375        int type = Character.getType(ch);
376        if (type == Character.LOWERCASE_LETTER ||
377                type == Character.UPPERCASE_LETTER ||
378                type == Character.TITLECASE_LETTER) {
379            return true;
380        } else {
381            // Check for Other_Lowercase and Other_Uppercase
382            //
383            if ((ch >= 0x02B0) && (ch <= 0x02B8)) {
384                // MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y
385                return true;
386            } else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {
387                // MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP
388                return true;
389            } else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {
390                // MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
391                return true;
392            } else if (ch == 0x0345) {
393                // COMBINING GREEK YPOGEGRAMMENI
394                return true;
395            } else if (ch == 0x037A) {
396                // GREEK YPOGEGRAMMENI
397                return true;
398            } else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {
399                // MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI
400                return true;
401            } else if ((ch >= 0x2160) && (ch <= 0x217F)) {
402                // ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
403                // SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
404                return true;
405            } else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {
406                // CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
407                // CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
408                return true;
409            } else {
410                return false;
411            }
412        }
413    }
414
415    private static boolean isSoftDotted(int ch) {
416        switch (ch) {
417        case 0x0069: // Soft_Dotted # L&       LATIN SMALL LETTER I
418        case 0x006A: // Soft_Dotted # L&       LATIN SMALL LETTER J
419        case 0x012F: // Soft_Dotted # L&       LATIN SMALL LETTER I WITH OGONEK
420        case 0x0268: // Soft_Dotted # L&       LATIN SMALL LETTER I WITH STROKE
421        case 0x0456: // Soft_Dotted # L&       CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
422        case 0x0458: // Soft_Dotted # L&       CYRILLIC SMALL LETTER JE
423        case 0x1D62: // Soft_Dotted # L&       LATIN SUBSCRIPT SMALL LETTER I
424        case 0x1E2D: // Soft_Dotted # L&       LATIN SMALL LETTER I WITH TILDE BELOW
425        case 0x1ECB: // Soft_Dotted # L&       LATIN SMALL LETTER I WITH DOT BELOW
426        case 0x2071: // Soft_Dotted # L&       SUPERSCRIPT LATIN SMALL LETTER I
427            return true;
428        default:
429            return false;
430        }
431    }
432
433    /**
434     * An internal class that represents an entry in the Special Casing Properties.
435     */
436    static class Entry {
437        int ch;
438        char [] lower;
439        char [] upper;
440        String lang;
441        int condition;
442
443        Entry(int ch, char[] lower, char[] upper, String lang, int condition) {
444            this.ch = ch;
445            this.lower = lower;
446            this.upper = upper;
447            this.lang = lang;
448            this.condition = condition;
449        }
450
451        int getCodePoint() {
452            return ch;
453        }
454
455        char[] getLowerCase() {
456            return lower;
457        }
458
459        char[] getUpperCase() {
460            return upper;
461        }
462
463        String getLanguage() {
464            return lang;
465        }
466
467        int getCondition() {
468            return condition;
469        }
470    }
471}
472