JavadocTokenizer.java revision 2681:14e1d2a15822
161452Sdfr/*
261452Sdfr * Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved.
361452Sdfr * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
461452Sdfr *
561452Sdfr * This code is free software; you can redistribute it and/or modify it
661452Sdfr * under the terms of the GNU General Public License version 2 only, as
761452Sdfr * published by the Free Software Foundation.  Oracle designates this
861452Sdfr * particular file as subject to the "Classpath" exception as provided
961452Sdfr * by Oracle in the LICENSE file that accompanied this code.
1061452Sdfr *
1161452Sdfr * This code is distributed in the hope that it will be useful, but WITHOUT
1261452Sdfr * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
1361452Sdfr * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
1461452Sdfr * version 2 for more details (a copy is included in the LICENSE file that
1561452Sdfr * accompanied this code).
1661452Sdfr *
1761452Sdfr * You should have received a copy of the GNU General Public License version
1861452Sdfr * 2 along with this work; if not, write to the Free Software Foundation,
1961452Sdfr * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
2061452Sdfr *
2161452Sdfr * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
2261452Sdfr * or visit www.oracle.com if you need additional information or have any
2361452Sdfr * questions.
2461452Sdfr */
2561452Sdfr
2661452Sdfrpackage com.sun.tools.javac.parser;
27116192Sobrien
28116192Sobrienimport com.sun.tools.javac.parser.Tokens.Comment;
29116192Sobrienimport com.sun.tools.javac.parser.Tokens.Comment.CommentStyle;
3061452Sdfrimport com.sun.tools.javac.util.*;
3161452Sdfr
3261452Sdfrimport java.nio.*;
3361452Sdfrimport java.util.regex.Pattern;
3461452Sdfr
3561452Sdfrimport static com.sun.tools.javac.util.LayoutCharacters.*;
36129878Sphk
/** An extension to the base lexical analyzer that captures
 *  and processes the contents of doc comments.  It does so by
 *  translating Unicode escape sequences and by stripping the
 *  leading whitespace and stars from each line of the comment.
 *
 *  <p><b>This is NOT part of any supported API.
 *  If you write code that depends on this, you do so at your own risk.
 *  This code and its internal interfaces are subject to change or
 *  deletion without notice.</b>
 */
4761452Sdfrpublic class JavadocTokenizer extends JavaTokenizer {
4861452Sdfr
4961452Sdfr    /** Create a scanner from the input buffer.  buffer must implement
5061452Sdfr     *  array() and compact(), and remaining() must be less than limit().
5161452Sdfr     */
5261452Sdfr    protected JavadocTokenizer(ScannerFactory fac, CharBuffer buffer) {
5361452Sdfr        super(fac, buffer);
5461501Sdfr    }
5561501Sdfr
5661452Sdfr    /** Create a scanner from the input array.  The array must have at
5761452Sdfr     *  least a single character of extra space.
5861452Sdfr     */
5961452Sdfr    protected JavadocTokenizer(ScannerFactory fac, char[] input, int inputLength) {
6061452Sdfr        super(fac, input, inputLength);
6161501Sdfr    }
6261501Sdfr
6387479Scokane    @Override
6487479Scokane    protected Comment processComment(int pos, int endPos, CommentStyle style) {
6561501Sdfr        char[] buf = reader.getRawCharacters(pos, endPos);
6661501Sdfr        return new JavadocComment(new DocReader(fac, buf, buf.length, pos), style);
6761501Sdfr    }
6861501Sdfr
6961452Sdfr    /**
70133851Sobrien     * This is a specialized version of UnicodeReader that keeps track of the
71133851Sobrien     * column position within a given character stream (used for Javadoc processing),
72133851Sobrien     * and which builds a table for mapping positions in the comment string to
73133851Sobrien     * positions in the source file.
74133851Sobrien     */
75133851Sobrien    static class DocReader extends UnicodeReader {
7661452Sdfr
7761452Sdfr         int col;
7861501Sdfr         int startPos;
7961501Sdfr
8061501Sdfr         /**
8161501Sdfr          * A buffer for building a table for mapping positions in {@link #sbuf}
8261501Sdfr          * to positions in the source buffer.
8361501Sdfr          *
8487479Scokane          * The array is organized as a series of pairs of integers: the first
8561501Sdfr          * number in each pair specifies a position in the comment text,
8661501Sdfr          * the second number in each pair specifies the corresponding position
8761501Sdfr          * in the source buffer. The pairs are sorted in ascending order.
8861501Sdfr          *
8961501Sdfr          * Since the mapping function is generally continuous, with successive
9061501Sdfr          * positions in the string corresponding to successive positions in the
9161501Sdfr          * source buffer, the table only needs to record discontinuities in
9261501Sdfr          * the mapping. The values of intermediate positions can be inferred.
9361501Sdfr          *
9461501Sdfr          * Discontinuities may occur in a number of places: when a newline
9561501Sdfr          * is followed by whitespace and asterisks (which are ignored),
9661501Sdfr          * when a tab is expanded into spaces, and when unicode escapes
9761501Sdfr          * are used in the source buffer.
9887479Scokane          *
9987479Scokane          * Thus, to find the source position of any position, p, in the comment
10061501Sdfr          * string, find the index, i, of the pair whose string offset
10161501Sdfr          * ({@code pbuf[i] }) is closest to but not greater than p. Then,
10261501Sdfr          * {@code sourcePos(p) = pbuf[i+1] + (p - pbuf[i]) }.
10361501Sdfr          */
10461501Sdfr         int[] pbuf = new int[128];
10561501Sdfr
10661501Sdfr         /**
10761501Sdfr          * The index of the next empty slot in the pbuf buffer.
10861501Sdfr          */
10961501Sdfr         int pp = 0;
11061501Sdfr
11161501Sdfr         /** The buffer index of the last double backslash sequence
11261501Sdfr          */
11361501Sdfr         private int doubleBackslashBp = -1;
11461501Sdfr
11561501Sdfr         DocReader(ScannerFactory fac, char[] input, int inputLength, int startPos) {
11661501Sdfr             super(fac, input, inputLength);
11761501Sdfr             this.startPos = startPos;
11861501Sdfr         }
11961501Sdfr
12061501Sdfr         @Override
12161501Sdfr         protected void convertUnicode() {
12261501Sdfr             if (ch == '\\' && unicodeConversionBp != bp) {
12361501Sdfr                 bp++; ch = buf[bp]; col++;
12494790Scokane                 if (ch == 'u') {
12594790Scokane                     do {
12661501Sdfr                         bp++; ch = buf[bp]; col++;
12787479Scokane                     } while (ch == 'u');
128105145Smarcel                     int limit = bp + 3;
129105145Smarcel                     if (limit < buflen) {
13087479Scokane                         int d = digit(bp, 16);
13187479Scokane                         int code = d;
13287479Scokane                         while (bp < limit && d >= 0) {
13387479Scokane                             bp++; ch = buf[bp]; col++;
13487479Scokane                             d = digit(bp, 16);
13587479Scokane                             code = (code << 4) + d;
13687479Scokane                         }
13794790Scokane                         if (d >= 0) {
13887479Scokane                             ch = (char)code;
13961501Sdfr                             unicodeConversionBp = bp;
14061501Sdfr                             return;
14161501Sdfr                         }
14287479Scokane                     }
14387479Scokane                     // "illegal.Unicode.esc", reported by base scanner
14487479Scokane                 } else {
14587479Scokane                     bp--;
14687479Scokane                     ch = '\\';
14761501Sdfr                     col--;
14887479Scokane                 }
14987479Scokane             }
15087479Scokane         }
15161501Sdfr
15261501Sdfr         @Override
15387479Scokane         protected void scanCommentChar() {
15461501Sdfr             scanChar();
15561501Sdfr             if (ch == '\\') {
15661501Sdfr                 if (peekChar() == '\\' && !isUnicode()) {
15761501Sdfr                     bp++; col++;
15861501Sdfr                     doubleBackslashBp = bp;
15961501Sdfr                 } else {
16087479Scokane                     convertUnicode();
16161501Sdfr                 }
16261501Sdfr             }
16361501Sdfr         }
16461501Sdfr
16561501Sdfr         @Override
16661501Sdfr         protected void scanChar() {
16761501Sdfr             bp++;
16861501Sdfr             ch = buf[bp];
16961501Sdfr             switch (ch) {
17061501Sdfr             case '\r': // return
17161501Sdfr                 col = 0;
17261501Sdfr                 break;
17361501Sdfr             case '\n': // newline
17461501Sdfr                 if (bp == 0 || buf[bp-1] != '\r') {
17561501Sdfr                     col = 0;
17661501Sdfr                 }
17761501Sdfr                 break;
17861501Sdfr             case '\t': // tab
17961452Sdfr                 col = (col / TabInc * TabInc) + TabInc;
18061452Sdfr                 break;
18161452Sdfr             case '\\': // possible Unicode
18261452Sdfr                 col++;
18361452Sdfr                 convertUnicode();
18461452Sdfr                 break;
18561452Sdfr             default:
18661452Sdfr                 col++;
18761452Sdfr                 break;
18861452Sdfr             }
18961452Sdfr         }
190133851Sobrien
191133851Sobrien         @Override
19283699Scokane         public void putChar(char ch, boolean scan) {
19383699Scokane             // At this point, bp is the position of the current character in buf,
19487479Scokane             // and sp is the position in sbuf where this character will be put.
19587479Scokane             // Record a new entry in pbuf if pbuf is empty or if sp and its
19661452Sdfr             // corresponding source position are not equidistant from the
19761452Sdfr             // corresponding values in the latest entry in the pbuf array.
19861452Sdfr             // (i.e. there is a discontinuity in the map function.)
19961452Sdfr             if ((pp == 0)
20061452Sdfr                     || (sp - pbuf[pp - 2] != (startPos + bp) - pbuf[pp - 1])) {
20161452Sdfr                 if (pp + 1 >= pbuf.length) {
20261452Sdfr                     int[] new_pbuf = new int[pbuf.length * 2];
20361452Sdfr                     System.arraycopy(pbuf, 0, new_pbuf, 0, pbuf.length);
20461452Sdfr                     pbuf = new_pbuf;
20561452Sdfr                 }
206127815Snjl                 pbuf[pp] = sp;
207127815Snjl                 pbuf[pp + 1] = startPos + bp;
20861452Sdfr                 pp += 2;
20961452Sdfr             }
21061452Sdfr             super.putChar(ch, scan);
21161452Sdfr         }
212142398Simp
21361452Sdfr         /** Whether the ch represents a sequence of two backslashes. */
21461452Sdfr         boolean isDoubleBackslash() {
21561452Sdfr             return doubleBackslashBp == bp;
21661452Sdfr         }
21761452Sdfr
21861452Sdfr
21961452Sdfr     }
22061452Sdfr
22161452Sdfr     protected static class JavadocComment extends JavaTokenizer.BasicComment<DocReader> {
22261501Sdfr
22361452Sdfr        /**
22461452Sdfr        * Translated and stripped contents of doc comment
22561452Sdfr        */
22661452Sdfr        private String docComment = null;
22761452Sdfr        private int[] docPosns = null;
22861452Sdfr
22961452Sdfr        JavadocComment(DocReader reader, CommentStyle cs) {
230127135Snjl            super(reader, cs);
231127135Snjl        }
23261452Sdfr
23361452Sdfr        @Override
23461452Sdfr        public String getText() {
23561452Sdfr            if (!scanned && cs == CommentStyle.JAVADOC) {
23661452Sdfr                scanDocComment();
23761452Sdfr            }
23861452Sdfr            return docComment;
23961452Sdfr        }
24061452Sdfr
24161452Sdfr        @Override
24261452Sdfr        public int getSourcePos(int pos) {
24361501Sdfr            // Binary search to find the entry for which the string index is
24461452Sdfr            // less than pos. Since docPosns is a list of pairs of integers
24561452Sdfr            // we must make sure the index is always even.
24661452Sdfr            // If we find an exact match for pos, the other item in the pair
24761452Sdfr            // gives the source pos; otherwise, compute the source position
24861452Sdfr            // relative to the best match found in the array.
24961452Sdfr            if (pos == Position.NOPOS)
25061452Sdfr                return Position.NOPOS;
25161452Sdfr            if (pos < 0 || pos > docComment.length())
25261452Sdfr                throw new StringIndexOutOfBoundsException(String.valueOf(pos));
25361452Sdfr            if (docPosns == null)
25461452Sdfr                return Position.NOPOS;
25561452Sdfr            int start = 0;
25661452Sdfr            int end = docPosns.length;
25761501Sdfr            while (start < end - 2) {
25861452Sdfr                // find an even index midway between start and end
25961452Sdfr                int index = ((start  + end) / 4) * 2;
26061501Sdfr                if (docPosns[index] < pos)
26161501Sdfr                    start = index;
26261501Sdfr                else if (docPosns[index] == pos)
26361452Sdfr                    return docPosns[index + 1];
26461501Sdfr                else
26561501Sdfr                    end = index;
26661501Sdfr            }
26761501Sdfr            return docPosns[start + 1] + (pos - docPosns[start]);
26861501Sdfr        }
26961452Sdfr
27061452Sdfr        @Override
27161452Sdfr        @SuppressWarnings("fallthrough")
27261452Sdfr        protected void scanDocComment() {
27361452Sdfr             try {
27461501Sdfr                 boolean firstLine = true;
27561452Sdfr
27661452Sdfr                 // Skip over first slash
27761452Sdfr                 comment_reader.scanCommentChar();
27861452Sdfr                 // Skip over first star
27961452Sdfr                 comment_reader.scanCommentChar();
28061452Sdfr
28161503Sdfr                 // consume any number of stars
28261452Sdfr                 while (comment_reader.bp < comment_reader.buflen && comment_reader.ch == '*') {
28361503Sdfr                     comment_reader.scanCommentChar();
28461503Sdfr                 }
28561503Sdfr                 // is the comment in the form /**/, /***/, /****/, etc. ?
28661503Sdfr                 if (comment_reader.bp < comment_reader.buflen && comment_reader.ch == '/') {
28761452Sdfr                     docComment = "";
28861452Sdfr                     return;
28961452Sdfr                 }
29061452Sdfr
29161452Sdfr                 // skip a newline on the first line of the comment.
29261452Sdfr                 if (comment_reader.bp < comment_reader.buflen) {
29361452Sdfr                     if (comment_reader.ch == LF) {
29461452Sdfr                         comment_reader.scanCommentChar();
29561452Sdfr                         firstLine = false;
29661452Sdfr                     } else if (comment_reader.ch == CR) {
29761452Sdfr                         comment_reader.scanCommentChar();
29861452Sdfr                         if (comment_reader.ch == LF) {
29961452Sdfr                             comment_reader.scanCommentChar();
30061501Sdfr                             firstLine = false;
30161503Sdfr                         }
30261503Sdfr                     }
30361503Sdfr                 }
30461503Sdfr
30561452Sdfr             outerLoop:
30661452Sdfr
30761452Sdfr                 // The outerLoop processes the doc comment, looping once
30861452Sdfr                 // for each line.  For each line, it first strips off
30961452Sdfr                 // whitespace, then it consumes any stars, then it
31061452Sdfr                 // puts the rest of the line into our buffer.
31161452Sdfr                 while (comment_reader.bp < comment_reader.buflen) {
31261452Sdfr                     int begin_bp = comment_reader.bp;
31361452Sdfr                     char begin_ch = comment_reader.ch;
31461452Sdfr                     // The wsLoop consumes whitespace from the beginning
31561452Sdfr                     // of each line.
31661452Sdfr                 wsLoop:
31761452Sdfr
31861452Sdfr                     while (comment_reader.bp < comment_reader.buflen) {
31961452Sdfr                         switch(comment_reader.ch) {
32061452Sdfr                         case ' ':
32161452Sdfr                             comment_reader.scanCommentChar();
32261452Sdfr                             break;
32361452Sdfr                         case '\t':
32461452Sdfr                             comment_reader.col = ((comment_reader.col - 1) / TabInc * TabInc) + TabInc;
32561452Sdfr                             comment_reader.scanCommentChar();
32661452Sdfr                             break;
32761452Sdfr                         case FF:
32861452Sdfr                             comment_reader.col = 0;
32961452Sdfr                             comment_reader.scanCommentChar();
33061452Sdfr                             break;
33161452Sdfr         // Treat newline at beginning of line (blank line, no star)
33261452Sdfr         // as comment text.  Old Javadoc compatibility requires this.
33361452Sdfr         /*---------------------------------*
33461452Sdfr                         case CR: // (Spec 3.4)
33561452Sdfr                             doc_reader.scanCommentChar();
33687479Scokane                             if (ch == LF) {
33787479Scokane                                 col = 0;
33887479Scokane                                 doc_reader.scanCommentChar();
33987479Scokane                             }
34061452Sdfr                             break;
34187479Scokane                         case LF: // (Spec 3.4)
34287479Scokane                             doc_reader.scanCommentChar();
34361452Sdfr                             break;
34461452Sdfr         *---------------------------------*/
34561452Sdfr                         default:
34661452Sdfr                             // we've seen something that isn't whitespace;
34761452Sdfr                             // jump out.
34861452Sdfr                             break wsLoop;
34961452Sdfr                         }
35061452Sdfr                     }
35161452Sdfr
35261452Sdfr                     // Are there stars here?  If so, consume them all
35361452Sdfr                     // and check for the end of comment.
35461452Sdfr                     if (comment_reader.ch == '*') {
35561452Sdfr                         // skip all of the stars
35687479Scokane                         do {
35787479Scokane                             comment_reader.scanCommentChar();
35887479Scokane                         } while (comment_reader.ch == '*');
35961452Sdfr
36061452Sdfr                         // check for the closing slash.
36161452Sdfr                         if (comment_reader.ch == '/') {
36261452Sdfr                             // We're done with the doc comment
36361452Sdfr                             // scanChar() and breakout.
36461452Sdfr                             break outerLoop;
36561452Sdfr                         }
36661452Sdfr                     } else if (! firstLine) {
36761452Sdfr                         // The current line does not begin with a '*' so we will
36861452Sdfr                         // treat it as comment
36961452Sdfr                         comment_reader.bp = begin_bp;
37061452Sdfr                         comment_reader.ch = begin_ch;
37161452Sdfr                     }
37261452Sdfr                     // The textLoop processes the rest of the characters
37361452Sdfr                     // on the line, adding them to our buffer.
37461452Sdfr                 textLoop:
37561452Sdfr                     while (comment_reader.bp < comment_reader.buflen) {
37661452Sdfr                         switch (comment_reader.ch) {
37761452Sdfr                         case '*':
37861452Sdfr                             // Is this just a star?  Or is this the
37961452Sdfr                             // end of a comment?
38061452Sdfr                             comment_reader.scanCommentChar();
38161452Sdfr                             if (comment_reader.ch == '/') {
38261452Sdfr                                 // This is the end of the comment,
38361452Sdfr                                 // set ch and return our buffer.
38461452Sdfr                                 break outerLoop;
38561452Sdfr                             }
38661452Sdfr                             // This is just an ordinary star.  Add it to
38761452Sdfr                             // the buffer.
38861452Sdfr                             comment_reader.putChar('*', false);
38961452Sdfr                             break;
39061452Sdfr                         case '\\':
39161452Sdfr                             comment_reader.putChar('\\', false);
39261452Sdfr                             // If a double backslash was found, write two
39361452Sdfr                             if (comment_reader.isDoubleBackslash()) {
39461452Sdfr                                 comment_reader.putChar('\\', false);
39561452Sdfr                             }
39661452Sdfr                             comment_reader.scanCommentChar();
39761452Sdfr                         case ' ':
39861452Sdfr                         case '\t':
39961452Sdfr                             comment_reader.putChar(comment_reader.ch, false);
40061452Sdfr                             comment_reader.scanCommentChar();
40161452Sdfr                             break;
40261452Sdfr                         case FF:
40361452Sdfr                             comment_reader.scanCommentChar();
40461452Sdfr                             break textLoop; // treat as end of line
40561452Sdfr                         case CR: // (Spec 3.4)
40661452Sdfr                             comment_reader.scanCommentChar();
40761452Sdfr                             if (comment_reader.ch != LF) {
40861452Sdfr                                 // Canonicalize CR-only line terminator to LF
40961452Sdfr                                 comment_reader.putChar((char)LF, false);
41061452Sdfr                                 break textLoop;
41161452Sdfr                             }
41261452Sdfr                             /* fall through to LF case */
41361452Sdfr                         case LF: // (Spec 3.4)
41461452Sdfr                             // We've seen a newline.  Add it to our
41561452Sdfr                             // buffer and break out of this loop,
41661452Sdfr                             // starting fresh on a new line.
41761452Sdfr                             comment_reader.putChar(comment_reader.ch, false);
41861452Sdfr                             comment_reader.scanCommentChar();
419113506Smdodd                             break textLoop;
420113506Smdodd                         default:
421                             // Add the character to our buffer.
422                             comment_reader.putChar(comment_reader.ch, false);
423                             comment_reader.scanCommentChar();
424                         }
425                     } // end textLoop
426                     firstLine = false;
427                 } // end outerLoop
428
429                 if (comment_reader.sp > 0) {
430                     int i = comment_reader.sp - 1;
431                 trailLoop:
432                     while (i > -1) {
433                         switch (comment_reader.sbuf[i]) {
434                         case '*':
435                             i--;
436                             break;
437                         default:
438                             break trailLoop;
439                         }
440                     }
441                     comment_reader.sp = i + 1;
442
443                     // Store the text of the doc comment
444                    docComment = comment_reader.chars();
445                    docPosns = new int[comment_reader.pp];
446                    System.arraycopy(comment_reader.pbuf, 0, docPosns, 0, docPosns.length);
447                } else {
448                    docComment = "";
449                }
450            } finally {
451                scanned = true;
452                comment_reader = null;
453                if (docComment != null &&
454                        DEPRECATED_PATTERN.matcher(docComment).matches()) {
455                    deprecatedFlag = true;
456                }
457            }
458        }
459        //where:
460            private static final Pattern DEPRECATED_PATTERN =
461                    Pattern.compile("(?sm).*^\\s*@deprecated( |$).*");
462
463    }
464
465    @Override
466    public Position.LineMap getLineMap() {
467        char[] buf = reader.getRawCharacters();
468        return Position.makeLineMap(buf, buf.length, true);
469    }
470}
471