// UnicodeReader.java revision 2571:10fc81ac75b4
1/* 2 * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26package com.sun.tools.javac.parser; 27 28import java.nio.CharBuffer; 29import java.util.Arrays; 30 31import com.sun.tools.javac.file.JavacFileManager; 32import com.sun.tools.javac.util.ArrayUtils; 33import com.sun.tools.javac.util.Log; 34import com.sun.tools.javac.util.Name; 35import com.sun.tools.javac.util.Names; 36 37import static com.sun.tools.javac.util.LayoutCharacters.*; 38 39/** The char reader used by the javac lexer/tokenizer. Returns the sequence of 40 * characters contained in the input stream, handling unicode escape accordingly. 41 * Additionally, it provides features for saving chars into a buffer and to retrieve 42 * them at a later stage. 43 * 44 * <p><b>This is NOT part of any supported API. 
45 * If you write code that depends on this, you do so at your own risk. 46 * This code and its internal interfaces are subject to change or 47 * deletion without notice.</b> 48 */ 49public class UnicodeReader { 50 51 /** The input buffer, index of next character to be read, 52 * index of one past last character in buffer. 53 */ 54 protected char[] buf; 55 protected int bp; 56 protected final int buflen; 57 58 /** The current character. 59 */ 60 protected char ch; 61 62 /** The buffer index of the last converted unicode character 63 */ 64 protected int unicodeConversionBp = -1; 65 66 protected Log log; 67 protected Names names; 68 69 /** A character buffer for saved chars. 70 */ 71 protected char[] sbuf = new char[128]; 72 protected int sp; 73 74 /** 75 * Create a scanner from the input array. This method might 76 * modify the array. To avoid copying the input array, ensure 77 * that {@code inputLength < input.length} or 78 * {@code input[input.length -1]} is a white space character. 79 * 80 * @param sf the factory which created this Scanner 81 * @param buffer the input, might be modified 82 * Must be positive and less than or equal to input.length. 83 */ 84 protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) { 85 this(sf, JavacFileManager.toArray(buffer), buffer.limit()); 86 } 87 88 protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) { 89 log = sf.log; 90 names = sf.names; 91 if (inputLength == input.length) { 92 if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) { 93 inputLength--; 94 } else { 95 input = Arrays.copyOf(input, inputLength + 1); 96 } 97 } 98 buf = input; 99 buflen = inputLength; 100 buf[buflen] = EOI; 101 bp = -1; 102 scanChar(); 103 } 104 105 /** Read next character. 106 */ 107 protected void scanChar() { 108 if (bp < buflen) { 109 ch = buf[++bp]; 110 if (ch == '\\') { 111 convertUnicode(); 112 } 113 } 114 } 115 116 /** Read next character in comment, skipping over double '\' characters. 
117 */ 118 protected void scanCommentChar() { 119 scanChar(); 120 if (ch == '\\') { 121 if (peekChar() == '\\' && !isUnicode()) { 122 skipChar(); 123 } else { 124 convertUnicode(); 125 } 126 } 127 } 128 129 /** Append a character to sbuf. 130 */ 131 protected void putChar(char ch, boolean scan) { 132 sbuf = ArrayUtils.ensureCapacity(sbuf, sp); 133 sbuf[sp++] = ch; 134 if (scan) 135 scanChar(); 136 } 137 138 protected void putChar(char ch) { 139 putChar(ch, false); 140 } 141 142 protected void putChar(boolean scan) { 143 putChar(ch, scan); 144 } 145 146 Name name() { 147 return names.fromChars(sbuf, 0, sp); 148 } 149 150 String chars() { 151 return new String(sbuf, 0, sp); 152 } 153 154 /** Convert unicode escape; bp points to initial '\' character 155 * (Spec 3.3). 156 */ 157 protected void convertUnicode() { 158 if (ch == '\\' && unicodeConversionBp != bp) { 159 bp++; ch = buf[bp]; 160 if (ch == 'u') { 161 do { 162 bp++; ch = buf[bp]; 163 } while (ch == 'u'); 164 int limit = bp + 3; 165 if (limit < buflen) { 166 int d = digit(bp, 16); 167 int code = d; 168 while (bp < limit && d >= 0) { 169 bp++; ch = buf[bp]; 170 d = digit(bp, 16); 171 code = (code << 4) + d; 172 } 173 if (d >= 0) { 174 ch = (char)code; 175 unicodeConversionBp = bp; 176 return; 177 } 178 } 179 log.error(bp, "illegal.unicode.esc"); 180 } else { 181 bp--; 182 ch = '\\'; 183 } 184 } 185 } 186 187 /** Are surrogates supported? 188 */ 189 final static boolean surrogatesSupported = surrogatesSupported(); 190 private static boolean surrogatesSupported() { 191 try { 192 Character.isHighSurrogate('a'); 193 return true; 194 } catch (NoSuchMethodError ex) { 195 return false; 196 } 197 } 198 199 /** Scan surrogate pairs. If 'ch' is a high surrogate and 200 * the next character is a low surrogate, returns the code point 201 * constructed from these surrogates. Otherwise, returns -1. 202 * This method will not consume any of the characters. 
203 */ 204 protected int peekSurrogates() { 205 if (surrogatesSupported && Character.isHighSurrogate(ch)) { 206 char high = ch; 207 int prevBP = bp; 208 209 scanChar(); 210 211 char low = ch; 212 213 ch = high; 214 bp = prevBP; 215 216 if (Character.isLowSurrogate(low)) { 217 return Character.toCodePoint(high, low); 218 } 219 } 220 221 return -1; 222 } 223 224 /** Convert an ASCII digit from its base (8, 10, or 16) 225 * to its value. 226 */ 227 protected int digit(int pos, int base) { 228 char c = ch; 229 if ('0' <= c && c <= '9') 230 return Character.digit(c, base); //a fast common case 231 int codePoint = peekSurrogates(); 232 int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base); 233 if (result >= 0 && c > 0x7f) { 234 log.error(pos + 1, "illegal.nonascii.digit"); 235 if (codePoint >= 0) 236 scanChar(); 237 ch = "0123456789abcdef".charAt(result); 238 } 239 return result; 240 } 241 242 protected boolean isUnicode() { 243 return unicodeConversionBp == bp; 244 } 245 246 protected void skipChar() { 247 bp++; 248 } 249 250 protected char peekChar() { 251 return buf[bp + 1]; 252 } 253 254 /** 255 * Returns a copy of the input buffer, up to its inputLength. 256 * Unicode escape sequences are not translated. 257 */ 258 public char[] getRawCharacters() { 259 char[] chars = new char[buflen]; 260 System.arraycopy(buf, 0, chars, 0, buflen); 261 return chars; 262 } 263 264 /** 265 * Returns a copy of a character array subset of the input buffer. 266 * The returned array begins at the {@code beginIndex} and 267 * extends to the character at index {@code endIndex - 1}. 268 * Thus the length of the substring is {@code endIndex-beginIndex}. 269 * This behavior is like 270 * {@code String.substring(beginIndex, endIndex)}. 271 * Unicode escape sequences are not translated. 272 * 273 * @param beginIndex the beginning index, inclusive. 274 * @param endIndex the ending index, exclusive. 
275 * @throws ArrayIndexOutOfBoundsException if either offset is outside of the 276 * array bounds 277 */ 278 public char[] getRawCharacters(int beginIndex, int endIndex) { 279 int length = endIndex - beginIndex; 280 char[] chars = new char[length]; 281 System.arraycopy(buf, beginIndex, chars, 0, length); 282 return chars; 283 } 284} 285