1/*
2 * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package com.sun.xml.internal.dtdparser;
27
28
/**
 * Character-classification predicates for XML text.  The static methods
 * here answer whether a character may appear in a given role (document
 * content, whitespace, name start, name continuation), which is needed
 * both when parsing XML documents and when generating them.
 *
 * <p>Letter and name-character tests are driven by the general category
 * reported by {@link Character#getType(char)}, adjusted by explicit
 * inclusion and exclusion lists.
 *
 * @author David Brownell
 * @version 1.1, 00/08/05
 */
public class XmlChars {

    //
    // Inclusive [low, high] pairs, two array slots per range, listing the
    // "compatibility" characters on a single 256-character page.  Only
    // consulted through inAnyRange().
    //

    // Page 0x01: Latin Extended-A and (parts of) Latin Extended-B.
    private static final char[] LATIN_EXTENDED_COMPAT = {
        0x0132, 0x0133,
        0x013f, 0x0140,
        0x0149, 0x0149,
        0x017f, 0x017f,
        0x01c4, 0x01cc,
        0x01f1, 0x01f3,
    };

    // Page 0x11: big chunks of Hangul Jamo are all "compatibility".
    private static final char[] HANGUL_JAMO_COMPAT = {
        0x1101, 0x1101,
        0x1104, 0x1104,
        0x1108, 0x1108,
        0x110a, 0x110a,
        0x110d, 0x110d,
        0x1113, 0x113b,
        0x113d, 0x113d,
        0x113f, 0x113f,
        0x1141, 0x114b,
        0x114d, 0x114d,
        0x114f, 0x114f,
        0x1151, 0x1153,
        0x1156, 0x1158,
        0x1162, 0x1162,
        0x1164, 0x1164,
        0x1166, 0x1166,
        0x1168, 0x1168,
        0x116a, 0x116c,
        0x116f, 0x1171,
        0x1174, 0x1174,
        0x1176, 0x119d,
        0x119f, 0x11a2,
        0x11a9, 0x11aa,
        0x11ac, 0x11ad,
        0x11b0, 0x11b6,
        0x11b9, 0x11b9,
        0x11bb, 0x11bb,
        0x11c3, 0x11ea,
        0x11ec, 0x11ef,
        0x11f1, 0x11f8,
    };

    // Page 0x21: various letterlike symbols, then most Roman numerals
    // (less 1K, 5K, 10K).
    private static final char[] LETTERLIKE_COMPAT = {
        0x2102, 0x2102,
        0x2107, 0x2107,
        0x210a, 0x2113,
        0x2115, 0x2115,
        0x2118, 0x211d,
        0x2124, 0x2124,
        0x2128, 0x2128,
        0x212c, 0x212d,
        0x212f, 0x2138,
        0x2160, 0x217f,
    };

    // Static utility holder: instances cannot be constructed.
    private XmlChars() {
    }

    /**
     * Tests whether a UCS-4 character code is valid in XML documents.
     * Unicode characters fit into the low sixteen bits of a UCS-4
     * character, and pairs of Unicode <em>surrogate characters</em> can
     * be combined to encode UCS-4 characters in documents containing
     * only Unicode.  (The <code>char</code> datatype in the Java
     * Programming Language represents Unicode characters, including
     * unpaired surrogates.)
     *
     * <p>In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>,
     * which happens to refer to a character that is disallowed in XML
     * documents.  UCS-4 characters allowed in XML documents can be
     * expressed with one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code is tab, line feed, carriage return, or
     *         lies in 0x0020-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF;
     *         false otherwise (including for negative values).
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        // The ladder below walks the code space from low to high.
        if (ucs4char < 0x0020) {
            // only three control characters are legal
            return ucs4char == 0x0009
                    || ucs4char == 0x000A
                    || ucs4char == 0x000D;
        }
        if (ucs4char <= 0xD7FF) {
            return true;
        }
        if (ucs4char < 0xE000) {
            // surrogates excluded!
            return false;
        }
        if (ucs4char <= 0xFFFD) {
            return true;
        }
        // 0xFFFE and 0xFFFF fall through here and are rejected
        return ucs4char >= 0x10000 && ucs4char <= 0x10ffff;
    }

    /**
     * Tests whether the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if the character may follow the first character of
     *         an XML name.
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //            | CombiningChar | Extender

        // Letters, digits and combining characters.
        if (isLetter2(c)) {
            return true;
        }
        switch (c) {
        case '.':
        case '-':
        case '_':
        case ':':
            return true;
        default:
            // everything else (e.g. '>') is a name character only if
            // it is one of the listed extenders
            return isExtender(c);
        }
    }

    /**
     * Tests whether the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Apart from the colon, which
     * separates names from their scopes and is therefore rejected, the
     * answer is the same as for {@link #isNameChar(char)}.
     *
     * @param c the character being tested.
     * @return true if the character is a name character other than
     *         the colon.
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //            | CombiningChar | Extender
        if (c == ':') {
            return false;
        }
        return isNameChar(c);
    }

    /**
     * Tests whether the character is allowed where XML supports
     * whitespace characters.
     *
     * @param c the character being tested.
     * @return true for space, tab, line feed and carriage return;
     *         false otherwise.
     */
    public static boolean isSpace(char c) {
        switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            return true;
        default:
            return false;
        }
    }

    /**
     * Tests whether the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested.
     * @return true if the character is treated as an XML letter.
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        // Fast paths for the typical case.
        if (isAsciiLetter(c)) {
            return true;
        }
        if (c == '/') {
            return false;
        }

        // The tables in the recommendation are too large to use in
        // code, so the test is driven by the Unicode general category.
        if (isNameStartCategory(Character.getType(c))) {
            // right category; only the exclusions are left to check
            return isPermittedInNames(c);
        }

        // a few characters outside those categories are "alphabetic"
        return between(c, 0x02bb, 0x02c1)
                || c == 0x0559
                || c == 0x06e5
                || c == 0x06e6;
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // Dispatching on the high byte keeps the number of comparisons
        // actually executed small.  A char is an unsigned 16-bit value,
        // so the shifted result is always in 0x00..0xff.
        final int page = c >> 8;

        switch (page) {
        case 0x00:
            // ISO Latin/1 has a few compatibility characters
            return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

        case 0x01:
            return inAnyRange(c, LATIN_EXTENDED_COMPAT);

        case 0x02:
            // some spacing modifiers
            return between(c, 0x02b0, 0x02b8)
                    || between(c, 0x02e0, 0x02e4);

        case 0x03:
            // Greek
            return c == 0x037a;

        case 0x05:
            // Armenian
            return c == 0x0587;

        case 0x0e:
            // Laotian
            return between(c, 0x0edc, 0x0edd);

        case 0x11:
            return inAnyRange(c, HANGUL_JAMO_COMPAT);

        case 0x20:
            // superscript
            return c == 0x207f;

        case 0x21:
            return inAnyRange(c, LETTERLIKE_COMPAT);

        case 0x30:
            // some Hiragana
            return between(c, 0x309b, 0x309c);

        case 0x31:
            // all Hangul Compatibility Jamo
            return between(c, 0x3131, 0x318e);

        default:
            // Pages 0xf9..0xff form the "compatibility" area, which is
            // entirely for that purpose; the rest of Unicode is not
            // flagged as being for compatibility.
            return page >= 0xf9;
        }
    }

    // guts of isNameChar/isNCNameChar
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        // Fast paths for the typical case.
        if (isAsciiLetter(c)) {
            return true;
        }
        if (c == '>') {
            return false;
        }

        // As in isLetter(), the general category drives the test.
        final int category = Character.getType(c);
        if (isNameStartCategory(category)
                || isNameContinuationCategory(category)) {
            // right category; only the exclusions are left to check
            return isPermittedInNames(c);
        }

        // one extra character is accepted regardless of category
        return c == 0x0387;
    }

    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        switch (c) {
        case 0x00b7:
        case 0x02d0:
        case 0x02d1:
        case 0x0387:
        case 0x0640:
        case 0x0e46:
        case 0x0ec6:
        case 0x3005:
            return true;
        default:
            return between(c, 0x3031, 0x3035)
                    || between(c, 0x309d, 0x309e)
                    || between(c, 0x30fc, 0x30fe);
        }
    }

    // True for the unaccented ASCII letters a-z and A-Z.
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    // True for the Character.getType() categories which the app. B
    // footnote says are 'name start' chars: Ll, Lu, Lo, Lt and Nl.
    private static boolean isNameStartCategory(int category) {
        switch (category) {
        case Character.LOWERCASE_LETTER:
        case Character.UPPERCASE_LETTER:
        case Character.OTHER_LETTER:
        case Character.TITLECASE_LETTER:
        case Character.LETTER_NUMBER:
            return true;
        default:
            return false;
        }
    }

    // True for the Character.getType() categories of name characters
    // 'other than name start characters': Mc, Me, Mn, Lm and Nd.
    private static boolean isNameContinuationCategory(int category) {
        switch (category) {
        case Character.COMBINING_SPACING_MARK:
        case Character.ENCLOSING_MARK:
        case Character.NON_SPACING_MARK:
        case Character.MODIFIER_LETTER:
        case Character.DECIMAL_DIGIT_NUMBER:
            return true;
        default:
            return false;
        }
    }

    // Applies the exclusions shared by isLetter() and isLetter2():
    // compatibility characters and, per "5.14 of Unicode", the
    // combiners 0x20dd..0x20e0 are not permitted.
    private static boolean isPermittedInNames(char c) {
        if (isCompatibilityChar(c)) {
            return false;
        }
        return !between(c, 0x20dd, 0x20e0);
    }

    // True if low <= c <= high.
    private static boolean between(char c, int low, int high) {
        return c >= low && c <= high;
    }

    // True if c lies inside any of the inclusive [low, high] pairs
    // stored consecutively in bounds.
    private static boolean inAnyRange(char c, char[] bounds) {
        for (int i = 0; i < bounds.length; i += 2) {
            if (c >= bounds[i] && c <= bounds[i + 1]) {
                return true;
            }
        }
        return false;
    }
}
377