// JavadocTokenizer.java revision 2681:14e1d2a15822
161452Sdfr/* 261452Sdfr * Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved. 361452Sdfr * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 461452Sdfr * 561452Sdfr * This code is free software; you can redistribute it and/or modify it 661452Sdfr * under the terms of the GNU General Public License version 2 only, as 761452Sdfr * published by the Free Software Foundation. Oracle designates this 861452Sdfr * particular file as subject to the "Classpath" exception as provided 961452Sdfr * by Oracle in the LICENSE file that accompanied this code. 1061452Sdfr * 1161452Sdfr * This code is distributed in the hope that it will be useful, but WITHOUT 1261452Sdfr * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 1361452Sdfr * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 1461452Sdfr * version 2 for more details (a copy is included in the LICENSE file that 1561452Sdfr * accompanied this code). 1661452Sdfr * 1761452Sdfr * You should have received a copy of the GNU General Public License version 1861452Sdfr * 2 along with this work; if not, write to the Free Software Foundation, 1961452Sdfr * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 2061452Sdfr * 2161452Sdfr * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 2261452Sdfr * or visit www.oracle.com if you need additional information or have any 2361452Sdfr * questions. 
2461452Sdfr */ 2561452Sdfr 2661452Sdfrpackage com.sun.tools.javac.parser; 27116192Sobrien 28116192Sobrienimport com.sun.tools.javac.parser.Tokens.Comment; 29116192Sobrienimport com.sun.tools.javac.parser.Tokens.Comment.CommentStyle; 3061452Sdfrimport com.sun.tools.javac.util.*; 3161452Sdfr 3261452Sdfrimport java.nio.*; 3361452Sdfrimport java.util.regex.Pattern; 3461452Sdfr 3561452Sdfrimport static com.sun.tools.javac.util.LayoutCharacters.*; 36129878Sphk 3761452Sdfr/** An extension to the base lexical analyzer that captures 3861452Sdfr * and processes the contents of doc comments. It does so by 3976827Salfred * translating Unicode escape sequences and by stripping the 4079339Sjhb * leading whitespace and starts from each line of the comment. 4161452Sdfr * 42119288Simp * <p><b>This is NOT part of any supported API. 43119288Simp * If you write code that depends on this, you do so at your own risk. 4461452Sdfr * This code and its internal interfaces are subject to change or 4561452Sdfr * deletion without notice.</b> 4661452Sdfr */ 4761452Sdfrpublic class JavadocTokenizer extends JavaTokenizer { 4861452Sdfr 4961452Sdfr /** Create a scanner from the input buffer. buffer must implement 5061452Sdfr * array() and compact(), and remaining() must be less than limit(). 5161452Sdfr */ 5261452Sdfr protected JavadocTokenizer(ScannerFactory fac, CharBuffer buffer) { 5361452Sdfr super(fac, buffer); 5461501Sdfr } 5561501Sdfr 5661452Sdfr /** Create a scanner from the input array. The array must have at 5761452Sdfr * least a single character of extra space. 
5861452Sdfr */ 5961452Sdfr protected JavadocTokenizer(ScannerFactory fac, char[] input, int inputLength) { 6061452Sdfr super(fac, input, inputLength); 6161501Sdfr } 6261501Sdfr 6387479Scokane @Override 6487479Scokane protected Comment processComment(int pos, int endPos, CommentStyle style) { 6561501Sdfr char[] buf = reader.getRawCharacters(pos, endPos); 6661501Sdfr return new JavadocComment(new DocReader(fac, buf, buf.length, pos), style); 6761501Sdfr } 6861501Sdfr 6961452Sdfr /** 70133851Sobrien * This is a specialized version of UnicodeReader that keeps track of the 71133851Sobrien * column position within a given character stream (used for Javadoc processing), 72133851Sobrien * and which builds a table for mapping positions in the comment string to 73133851Sobrien * positions in the source file. 74133851Sobrien */ 75133851Sobrien static class DocReader extends UnicodeReader { 7661452Sdfr 7761452Sdfr int col; 7861501Sdfr int startPos; 7961501Sdfr 8061501Sdfr /** 8161501Sdfr * A buffer for building a table for mapping positions in {@link #sbuf} 8261501Sdfr * to positions in the source buffer. 8361501Sdfr * 8487479Scokane * The array is organized as a series of pairs of integers: the first 8561501Sdfr * number in each pair specifies a position in the comment text, 8661501Sdfr * the second number in each pair specifies the corresponding position 8761501Sdfr * in the source buffer. The pairs are sorted in ascending order. 8861501Sdfr * 8961501Sdfr * Since the mapping function is generally continuous, with successive 9061501Sdfr * positions in the string corresponding to successive positions in the 9161501Sdfr * source buffer, the table only needs to record discontinuities in 9261501Sdfr * the mapping. The values of intermediate positions can be inferred. 
9361501Sdfr * 9461501Sdfr * Discontinuities may occur in a number of places: when a newline 9561501Sdfr * is followed by whitespace and asterisks (which are ignored), 9661501Sdfr * when a tab is expanded into spaces, and when unicode escapes 9761501Sdfr * are used in the source buffer. 9887479Scokane * 9987479Scokane * Thus, to find the source position of any position, p, in the comment 10061501Sdfr * string, find the index, i, of the pair whose string offset 10161501Sdfr * ({@code pbuf[i] }) is closest to but not greater than p. Then, 10261501Sdfr * {@code sourcePos(p) = pbuf[i+1] + (p - pbuf[i]) }. 10361501Sdfr */ 10461501Sdfr int[] pbuf = new int[128]; 10561501Sdfr 10661501Sdfr /** 10761501Sdfr * The index of the next empty slot in the pbuf buffer. 10861501Sdfr */ 10961501Sdfr int pp = 0; 11061501Sdfr 11161501Sdfr /** The buffer index of the last double backslash sequence 11261501Sdfr */ 11361501Sdfr private int doubleBackslashBp = -1; 11461501Sdfr 11561501Sdfr DocReader(ScannerFactory fac, char[] input, int inputLength, int startPos) { 11661501Sdfr super(fac, input, inputLength); 11761501Sdfr this.startPos = startPos; 11861501Sdfr } 11961501Sdfr 12061501Sdfr @Override 12161501Sdfr protected void convertUnicode() { 12261501Sdfr if (ch == '\\' && unicodeConversionBp != bp) { 12361501Sdfr bp++; ch = buf[bp]; col++; 12494790Scokane if (ch == 'u') { 12594790Scokane do { 12661501Sdfr bp++; ch = buf[bp]; col++; 12787479Scokane } while (ch == 'u'); 128105145Smarcel int limit = bp + 3; 129105145Smarcel if (limit < buflen) { 13087479Scokane int d = digit(bp, 16); 13187479Scokane int code = d; 13287479Scokane while (bp < limit && d >= 0) { 13387479Scokane bp++; ch = buf[bp]; col++; 13487479Scokane d = digit(bp, 16); 13587479Scokane code = (code << 4) + d; 13687479Scokane } 13794790Scokane if (d >= 0) { 13887479Scokane ch = (char)code; 13961501Sdfr unicodeConversionBp = bp; 14061501Sdfr return; 14161501Sdfr } 14287479Scokane } 14387479Scokane // "illegal.Unicode.esc", 
reported by base scanner 14487479Scokane } else { 14587479Scokane bp--; 14687479Scokane ch = '\\'; 14761501Sdfr col--; 14887479Scokane } 14987479Scokane } 15087479Scokane } 15161501Sdfr 15261501Sdfr @Override 15387479Scokane protected void scanCommentChar() { 15461501Sdfr scanChar(); 15561501Sdfr if (ch == '\\') { 15661501Sdfr if (peekChar() == '\\' && !isUnicode()) { 15761501Sdfr bp++; col++; 15861501Sdfr doubleBackslashBp = bp; 15961501Sdfr } else { 16087479Scokane convertUnicode(); 16161501Sdfr } 16261501Sdfr } 16361501Sdfr } 16461501Sdfr 16561501Sdfr @Override 16661501Sdfr protected void scanChar() { 16761501Sdfr bp++; 16861501Sdfr ch = buf[bp]; 16961501Sdfr switch (ch) { 17061501Sdfr case '\r': // return 17161501Sdfr col = 0; 17261501Sdfr break; 17361501Sdfr case '\n': // newline 17461501Sdfr if (bp == 0 || buf[bp-1] != '\r') { 17561501Sdfr col = 0; 17661501Sdfr } 17761501Sdfr break; 17861501Sdfr case '\t': // tab 17961452Sdfr col = (col / TabInc * TabInc) + TabInc; 18061452Sdfr break; 18161452Sdfr case '\\': // possible Unicode 18261452Sdfr col++; 18361452Sdfr convertUnicode(); 18461452Sdfr break; 18561452Sdfr default: 18661452Sdfr col++; 18761452Sdfr break; 18861452Sdfr } 18961452Sdfr } 190133851Sobrien 191133851Sobrien @Override 19283699Scokane public void putChar(char ch, boolean scan) { 19383699Scokane // At this point, bp is the position of the current character in buf, 19487479Scokane // and sp is the position in sbuf where this character will be put. 19587479Scokane // Record a new entry in pbuf if pbuf is empty or if sp and its 19661452Sdfr // corresponding source position are not equidistant from the 19761452Sdfr // corresponding values in the latest entry in the pbuf array. 19861452Sdfr // (i.e. there is a discontinuity in the map function.) 
19961452Sdfr if ((pp == 0) 20061452Sdfr || (sp - pbuf[pp - 2] != (startPos + bp) - pbuf[pp - 1])) { 20161452Sdfr if (pp + 1 >= pbuf.length) { 20261452Sdfr int[] new_pbuf = new int[pbuf.length * 2]; 20361452Sdfr System.arraycopy(pbuf, 0, new_pbuf, 0, pbuf.length); 20461452Sdfr pbuf = new_pbuf; 20561452Sdfr } 206127815Snjl pbuf[pp] = sp; 207127815Snjl pbuf[pp + 1] = startPos + bp; 20861452Sdfr pp += 2; 20961452Sdfr } 21061452Sdfr super.putChar(ch, scan); 21161452Sdfr } 212142398Simp 21361452Sdfr /** Whether the ch represents a sequence of two backslashes. */ 21461452Sdfr boolean isDoubleBackslash() { 21561452Sdfr return doubleBackslashBp == bp; 21661452Sdfr } 21761452Sdfr 21861452Sdfr 21961452Sdfr } 22061452Sdfr 22161452Sdfr protected static class JavadocComment extends JavaTokenizer.BasicComment<DocReader> { 22261501Sdfr 22361452Sdfr /** 22461452Sdfr * Translated and stripped contents of doc comment 22561452Sdfr */ 22661452Sdfr private String docComment = null; 22761452Sdfr private int[] docPosns = null; 22861452Sdfr 22961452Sdfr JavadocComment(DocReader reader, CommentStyle cs) { 230127135Snjl super(reader, cs); 231127135Snjl } 23261452Sdfr 23361452Sdfr @Override 23461452Sdfr public String getText() { 23561452Sdfr if (!scanned && cs == CommentStyle.JAVADOC) { 23661452Sdfr scanDocComment(); 23761452Sdfr } 23861452Sdfr return docComment; 23961452Sdfr } 24061452Sdfr 24161452Sdfr @Override 24261452Sdfr public int getSourcePos(int pos) { 24361501Sdfr // Binary search to find the entry for which the string index is 24461452Sdfr // less than pos. Since docPosns is a list of pairs of integers 24561452Sdfr // we must make sure the index is always even. 24661452Sdfr // If we find an exact match for pos, the other item in the pair 24761452Sdfr // gives the source pos; otherwise, compute the source position 24861452Sdfr // relative to the best match found in the array. 
24961452Sdfr if (pos == Position.NOPOS) 25061452Sdfr return Position.NOPOS; 25161452Sdfr if (pos < 0 || pos > docComment.length()) 25261452Sdfr throw new StringIndexOutOfBoundsException(String.valueOf(pos)); 25361452Sdfr if (docPosns == null) 25461452Sdfr return Position.NOPOS; 25561452Sdfr int start = 0; 25661452Sdfr int end = docPosns.length; 25761501Sdfr while (start < end - 2) { 25861452Sdfr // find an even index midway between start and end 25961452Sdfr int index = ((start + end) / 4) * 2; 26061501Sdfr if (docPosns[index] < pos) 26161501Sdfr start = index; 26261501Sdfr else if (docPosns[index] == pos) 26361452Sdfr return docPosns[index + 1]; 26461501Sdfr else 26561501Sdfr end = index; 26661501Sdfr } 26761501Sdfr return docPosns[start + 1] + (pos - docPosns[start]); 26861501Sdfr } 26961452Sdfr 27061452Sdfr @Override 27161452Sdfr @SuppressWarnings("fallthrough") 27261452Sdfr protected void scanDocComment() { 27361452Sdfr try { 27461501Sdfr boolean firstLine = true; 27561452Sdfr 27661452Sdfr // Skip over first slash 27761452Sdfr comment_reader.scanCommentChar(); 27861452Sdfr // Skip over first star 27961452Sdfr comment_reader.scanCommentChar(); 28061452Sdfr 28161503Sdfr // consume any number of stars 28261452Sdfr while (comment_reader.bp < comment_reader.buflen && comment_reader.ch == '*') { 28361503Sdfr comment_reader.scanCommentChar(); 28461503Sdfr } 28561503Sdfr // is the comment in the form /**/, /***/, /****/, etc. ? 28661503Sdfr if (comment_reader.bp < comment_reader.buflen && comment_reader.ch == '/') { 28761452Sdfr docComment = ""; 28861452Sdfr return; 28961452Sdfr } 29061452Sdfr 29161452Sdfr // skip a newline on the first line of the comment. 
29261452Sdfr if (comment_reader.bp < comment_reader.buflen) { 29361452Sdfr if (comment_reader.ch == LF) { 29461452Sdfr comment_reader.scanCommentChar(); 29561452Sdfr firstLine = false; 29661452Sdfr } else if (comment_reader.ch == CR) { 29761452Sdfr comment_reader.scanCommentChar(); 29861452Sdfr if (comment_reader.ch == LF) { 29961452Sdfr comment_reader.scanCommentChar(); 30061501Sdfr firstLine = false; 30161503Sdfr } 30261503Sdfr } 30361503Sdfr } 30461503Sdfr 30561452Sdfr outerLoop: 30661452Sdfr 30761452Sdfr // The outerLoop processes the doc comment, looping once 30861452Sdfr // for each line. For each line, it first strips off 30961452Sdfr // whitespace, then it consumes any stars, then it 31061452Sdfr // puts the rest of the line into our buffer. 31161452Sdfr while (comment_reader.bp < comment_reader.buflen) { 31261452Sdfr int begin_bp = comment_reader.bp; 31361452Sdfr char begin_ch = comment_reader.ch; 31461452Sdfr // The wsLoop consumes whitespace from the beginning 31561452Sdfr // of each line. 31661452Sdfr wsLoop: 31761452Sdfr 31861452Sdfr while (comment_reader.bp < comment_reader.buflen) { 31961452Sdfr switch(comment_reader.ch) { 32061452Sdfr case ' ': 32161452Sdfr comment_reader.scanCommentChar(); 32261452Sdfr break; 32361452Sdfr case '\t': 32461452Sdfr comment_reader.col = ((comment_reader.col - 1) / TabInc * TabInc) + TabInc; 32561452Sdfr comment_reader.scanCommentChar(); 32661452Sdfr break; 32761452Sdfr case FF: 32861452Sdfr comment_reader.col = 0; 32961452Sdfr comment_reader.scanCommentChar(); 33061452Sdfr break; 33161452Sdfr // Treat newline at beginning of line (blank line, no star) 33261452Sdfr // as comment text. Old Javadoc compatibility requires this. 
33361452Sdfr /*---------------------------------* 33461452Sdfr case CR: // (Spec 3.4) 33561452Sdfr doc_reader.scanCommentChar(); 33687479Scokane if (ch == LF) { 33787479Scokane col = 0; 33887479Scokane doc_reader.scanCommentChar(); 33987479Scokane } 34061452Sdfr break; 34187479Scokane case LF: // (Spec 3.4) 34287479Scokane doc_reader.scanCommentChar(); 34361452Sdfr break; 34461452Sdfr *---------------------------------*/ 34561452Sdfr default: 34661452Sdfr // we've seen something that isn't whitespace; 34761452Sdfr // jump out. 34861452Sdfr break wsLoop; 34961452Sdfr } 35061452Sdfr } 35161452Sdfr 35261452Sdfr // Are there stars here? If so, consume them all 35361452Sdfr // and check for the end of comment. 35461452Sdfr if (comment_reader.ch == '*') { 35561452Sdfr // skip all of the stars 35687479Scokane do { 35787479Scokane comment_reader.scanCommentChar(); 35887479Scokane } while (comment_reader.ch == '*'); 35961452Sdfr 36061452Sdfr // check for the closing slash. 36161452Sdfr if (comment_reader.ch == '/') { 36261452Sdfr // We're done with the doc comment 36361452Sdfr // scanChar() and breakout. 36461452Sdfr break outerLoop; 36561452Sdfr } 36661452Sdfr } else if (! firstLine) { 36761452Sdfr // The current line does not begin with a '*' so we will 36861452Sdfr // treat it as comment 36961452Sdfr comment_reader.bp = begin_bp; 37061452Sdfr comment_reader.ch = begin_ch; 37161452Sdfr } 37261452Sdfr // The textLoop processes the rest of the characters 37361452Sdfr // on the line, adding them to our buffer. 37461452Sdfr textLoop: 37561452Sdfr while (comment_reader.bp < comment_reader.buflen) { 37661452Sdfr switch (comment_reader.ch) { 37761452Sdfr case '*': 37861452Sdfr // Is this just a star? Or is this the 37961452Sdfr // end of a comment? 38061452Sdfr comment_reader.scanCommentChar(); 38161452Sdfr if (comment_reader.ch == '/') { 38261452Sdfr // This is the end of the comment, 38361452Sdfr // set ch and return our buffer. 
38461452Sdfr break outerLoop; 38561452Sdfr } 38661452Sdfr // This is just an ordinary star. Add it to 38761452Sdfr // the buffer. 38861452Sdfr comment_reader.putChar('*', false); 38961452Sdfr break; 39061452Sdfr case '\\': 39161452Sdfr comment_reader.putChar('\\', false); 39261452Sdfr // If a double backslash was found, write two 39361452Sdfr if (comment_reader.isDoubleBackslash()) { 39461452Sdfr comment_reader.putChar('\\', false); 39561452Sdfr } 39661452Sdfr comment_reader.scanCommentChar(); 39761452Sdfr case ' ': 39861452Sdfr case '\t': 39961452Sdfr comment_reader.putChar(comment_reader.ch, false); 40061452Sdfr comment_reader.scanCommentChar(); 40161452Sdfr break; 40261452Sdfr case FF: 40361452Sdfr comment_reader.scanCommentChar(); 40461452Sdfr break textLoop; // treat as end of line 40561452Sdfr case CR: // (Spec 3.4) 40661452Sdfr comment_reader.scanCommentChar(); 40761452Sdfr if (comment_reader.ch != LF) { 40861452Sdfr // Canonicalize CR-only line terminator to LF 40961452Sdfr comment_reader.putChar((char)LF, false); 41061452Sdfr break textLoop; 41161452Sdfr } 41261452Sdfr /* fall through to LF case */ 41361452Sdfr case LF: // (Spec 3.4) 41461452Sdfr // We've seen a newline. Add it to our 41561452Sdfr // buffer and break out of this loop, 41661452Sdfr // starting fresh on a new line. 41761452Sdfr comment_reader.putChar(comment_reader.ch, false); 41861452Sdfr comment_reader.scanCommentChar(); 419113506Smdodd break textLoop; 420113506Smdodd default: 421 // Add the character to our buffer. 
422 comment_reader.putChar(comment_reader.ch, false); 423 comment_reader.scanCommentChar(); 424 } 425 } // end textLoop 426 firstLine = false; 427 } // end outerLoop 428 429 if (comment_reader.sp > 0) { 430 int i = comment_reader.sp - 1; 431 trailLoop: 432 while (i > -1) { 433 switch (comment_reader.sbuf[i]) { 434 case '*': 435 i--; 436 break; 437 default: 438 break trailLoop; 439 } 440 } 441 comment_reader.sp = i + 1; 442 443 // Store the text of the doc comment 444 docComment = comment_reader.chars(); 445 docPosns = new int[comment_reader.pp]; 446 System.arraycopy(comment_reader.pbuf, 0, docPosns, 0, docPosns.length); 447 } else { 448 docComment = ""; 449 } 450 } finally { 451 scanned = true; 452 comment_reader = null; 453 if (docComment != null && 454 DEPRECATED_PATTERN.matcher(docComment).matches()) { 455 deprecatedFlag = true; 456 } 457 } 458 } 459 //where: 460 private static final Pattern DEPRECATED_PATTERN = 461 Pattern.compile("(?sm).*^\\s*@deprecated( |$).*"); 462 463 } 464 465 @Override 466 public Position.LineMap getLineMap() { 467 char[] buf = reader.getRawCharacters(); 468 return Position.makeLineMap(buf, buf.length, true); 469 } 470} 471