UnicodeReader.java revision 2571:10fc81ac75b4
/*
 * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.tools.javac.parser;

import java.nio.CharBuffer;
import java.util.Arrays;

import com.sun.tools.javac.file.JavacFileManager;
import com.sun.tools.javac.util.ArrayUtils;
import com.sun.tools.javac.util.Log;
import com.sun.tools.javac.util.Name;
import com.sun.tools.javac.util.Names;

import static com.sun.tools.javac.util.LayoutCharacters.*;

/** The char reader used by the javac lexer/tokenizer. Returns the sequence of
 * characters contained in the input stream, handling unicode escapes accordingly.
 * Additionally, it provides features for saving chars into a buffer and for
 * retrieving them at a later stage.
 *
 * <p><b>This is NOT part of any supported API.
 * If you write code that depends on this, you do so at your own risk.
 * This code and its internal interfaces are subject to change or
 * deletion without notice.</b>
 */
public class UnicodeReader {

    /** The input buffer, index of the current character ({@link #ch}),
     * index of one past last character in buffer. The slot {@code buf[buflen]}
     * holds the {@code EOI} sentinel written by the constructor.
     */
    protected char[] buf;
    protected int bp;
    protected final int buflen;

    /** The current character.
     */
    protected char ch;

    /** The buffer index of the last converted unicode character
     * (the index of the final hex digit of the escape), or -1 if no
     * escape has been converted yet.
     */
    protected int unicodeConversionBp = -1;

    protected Log log;
    protected Names names;

    /** A character buffer for saved chars, and the number of chars saved in it.
     */
    protected char[] sbuf = new char[128];
    protected int sp;

    /**
     * Create a reader over the contents of a character buffer. Delegates to
     * {@link #UnicodeReader(ScannerFactory, char[], int)} with the array
     * obtained from {@code JavacFileManager.toArray(buffer)} and
     * {@code buffer.limit()} as the input length.
     *
     * @param sf the factory which created this Scanner
     * @param buffer the input, might be modified
     */
    protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
        this(sf, JavacFileManager.toArray(buffer), buffer.limit());
    }

    /**
     * Create a reader from the input array. This constructor might
     * modify the array. To avoid copying the input array, ensure
     * that {@code inputLength < input.length} or
     * {@code input[input.length - 1]} is a white space character.
     *
     * <p>The reader needs one slot past the input for the {@code EOI}
     * sentinel. When the input fills the whole array, a trailing white space
     * character is sacrificed (overwritten by the sentinel); otherwise the
     * array is copied into one that is a single element longer.
     *
     * @param sf the factory which created this Scanner; supplies the log
     *        and the name table
     * @param input the input, might be modified
     * @param inputLength the size of the input.
     *        Must be non-negative and less than or equal to input.length.
     */
    protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
        log = sf.log;
        names = sf.names;
        if (inputLength == input.length) {
            if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
                // Reuse the slot of the trailing white space for the sentinel.
                inputLength--;
            } else {
                // No spare slot: grow by one to make room for the sentinel.
                input = Arrays.copyOf(input, inputLength + 1);
            }
        }
        buf = input;
        buflen = inputLength;
        buf[buflen] = EOI;
        bp = -1;
        scanChar();
    }

    /** Read next character. Does nothing once {@code bp} has reached
     * {@code buflen}, so {@code ch} then stays at the end-of-input sentinel.
     * A backslash triggers unicode escape conversion.
     */
    protected void scanChar() {
        if (bp < buflen) {
            ch = buf[++bp];
            if (ch == '\\') {
                convertUnicode();
            }
        }
    }

    /** Read next character in comment, skipping over double '\' characters.
     */
    protected void scanCommentChar() {
        scanChar();
        if (ch == '\\') {
            if (peekChar() == '\\' && !isUnicode()) {
                skipChar();
            } else {
                convertUnicode();
            }
        }
    }

    /** Append a character to sbuf.
     *
     * @param ch the character to append
     * @param scan if true, advance to the next input character afterwards
     */
    protected void putChar(char ch, boolean scan) {
        sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
        sbuf[sp++] = ch;
        if (scan) {
            scanChar();
        }
    }

    /** Append a character to sbuf without advancing the input.
     */
    protected void putChar(char ch) {
        putChar(ch, false);
    }

    /** Append the current character to sbuf.
     *
     * @param scan if true, advance to the next input character afterwards
     */
    protected void putChar(boolean scan) {
        putChar(ch, scan);
    }

    /** Returns the saved characters as a {@code Name} from the name table.
     */
    Name name() {
        return names.fromChars(sbuf, 0, sp);
    }

    /** Returns the saved characters as a string.
     */
    String chars() {
        return new String(sbuf, 0, sp);
    }

    /** Convert unicode escape; bp points to initial '\' character
     * (Spec 3.3).
     *
     * <p>On success, {@code ch} holds the decoded character and both
     * {@code bp} and {@code unicodeConversionBp} point at the last hex digit
     * of the escape. If the backslash is not followed by 'u', {@code bp} and
     * {@code ch} are left on the backslash. A malformed or truncated escape
     * is reported as {@code illegal.unicode.esc}. Nothing happens when the
     * current backslash is itself the result of an escape conversion.
     */
    protected void convertUnicode() {
        if (ch == '\\' && unicodeConversionBp != bp) {
            bp++; ch = buf[bp];
            if (ch == 'u') {
                // Any number of 'u' characters may follow the backslash.
                do {
                    bp++; ch = buf[bp];
                } while (ch == 'u');
                // Index of the fourth (last) hex digit; it must lie inside the input.
                int limit = bp + 3;
                if (limit < buflen) {
                    int d = digit(bp, 16);
                    int code = d;
                    while (bp < limit && d >= 0) {
                        bp++; ch = buf[bp];
                        d = digit(bp, 16);
                        code = (code << 4) + d;
                    }
                    if (d >= 0) {
                        ch = (char)code;
                        unicodeConversionBp = bp;
                        return;
                    }
                }
                log.error(bp, "illegal.unicode.esc");
            } else {
                // Plain backslash: undo the one-character lookahead.
                bp--;
                ch = '\\';
            }
        }
    }

    /** Are surrogates supported?
     */
    static final boolean surrogatesSupported = surrogatesSupported();

    /** Probes for {@code Character.isHighSurrogate}; returns false if the
     * running class library does not provide it.
     */
    private static boolean surrogatesSupported() {
        try {
            Character.isHighSurrogate('a');
            return true;
        } catch (NoSuchMethodError ex) {
            return false;
        }
    }

    /** Scan surrogate pairs. If 'ch' is a high surrogate and
     * the next character is a low surrogate, returns the code point
     * constructed from these surrogates. Otherwise, returns -1.
     * This method will not consume any of the characters: {@code ch},
     * {@code bp} and {@code unicodeConversionBp} are all restored after
     * the lookahead.
     */
    protected int peekSurrogates() {
        if (surrogatesSupported && Character.isHighSurrogate(ch)) {
            char high = ch;
            int prevBP = bp;
            // The lookahead may decode a unicode escape and move the marker;
            // remember it so isUnicode() keeps describing the current character.
            int prevUnicodeConversionBp = unicodeConversionBp;

            scanChar();

            char low = ch;

            ch = high;
            bp = prevBP;
            unicodeConversionBp = prevUnicodeConversionBp;

            if (Character.isLowSurrogate(low)) {
                return Character.toCodePoint(high, low);
            }
        }

        return -1;
    }

    /** Convert an ASCII digit from its base (8, 10, or 16)
     * to its value.
     *
     * <p>The digit examined is the current character {@code ch} (or the code
     * point formed by {@code ch} and the following low surrogate). A valid
     * digit outside the ASCII range is reported as
     * {@code illegal.nonascii.digit}; {@code ch} is then replaced by the
     * equivalent ASCII digit, and the low surrogate, if any, is consumed.
     *
     * @param pos buffer position used for error reporting only
     * @param base the radix of the digit
     * @return the digit's value, or a negative number if the current
     *         character is not a digit in the given base
     */
    protected int digit(int pos, int base) {
        char c = ch;
        if ('0' <= c && c <= '9')
            return Character.digit(c, base); //a fast common case
        int codePoint = peekSurrogates();
        int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
        if (result >= 0 && c > 0x7f) {
            log.error(pos + 1, "illegal.nonascii.digit");
            if (codePoint >= 0)
                scanChar();
            ch = "0123456789abcdef".charAt(result);
        }
        return result;
    }

    /** Returns true if the current position is that of the most recently
     * converted unicode escape.
     */
    protected boolean isUnicode() {
        return unicodeConversionBp == bp;
    }

    /** Advance the buffer index by one; {@code ch} is not updated.
     */
    protected void skipChar() {
        bp++;
    }

    /** Returns the raw buffer character following the current position.
     * Unicode escape sequences are not translated.
     */
    protected char peekChar() {
        return buf[bp + 1];
    }

    /**
     * Returns a copy of the input buffer, up to its inputLength.
     * Unicode escape sequences are not translated.
     */
    public char[] getRawCharacters() {
        return Arrays.copyOf(buf, buflen);
    }

    /**
     * Returns a copy of a character array subset of the input buffer.
     * The returned array begins at the {@code beginIndex} and
     * extends to the character at index {@code endIndex - 1}.
     * Thus the length of the substring is {@code endIndex-beginIndex}.
     * This behavior is like
     * {@code String.substring(beginIndex, endIndex)}.
     * Unicode escape sequences are not translated.
     *
     * @param beginIndex the beginning index, inclusive.
     * @param endIndex the ending index, exclusive.
     * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
     *         array bounds
     * @throws NegativeArraySizeException if {@code endIndex < beginIndex}
     */
    public char[] getRawCharacters(int beginIndex, int endIndex) {
        int length = endIndex - beginIndex;
        char[] chars = new char[length];
        System.arraycopy(buf, beginIndex, chars, 0, length);
        return chars;
    }
}