JavadocTokenizer.java revision 2571:10fc81ac75b4
1/*
2 * Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package com.sun.tools.javac.parser;
27
28import com.sun.tools.javac.parser.Tokens.Comment;
29import com.sun.tools.javac.parser.Tokens.Comment.CommentStyle;
30import com.sun.tools.javac.util.*;
31
32import java.nio.*;
33
34import static com.sun.tools.javac.util.LayoutCharacters.*;
35
36/** An extension to the base lexical analyzer that captures
37 *  and processes the contents of doc comments.  It does so by
38 *  translating Unicode escape sequences and by stripping the
39 *  leading whitespace and stars from each line of the comment.
40 *
41 *  <p><b>This is NOT part of any supported API.
42 *  If you write code that depends on this, you do so at your own risk.
43 *  This code and its internal interfaces are subject to change or
44 *  deletion without notice.</b>
45 */
46public class JavadocTokenizer extends JavaTokenizer {
47
48    /** Create a scanner from the input buffer.  buffer must implement
49     *  array() and compact(), and remaining() must be less than limit().
50     */
51    protected JavadocTokenizer(ScannerFactory fac, CharBuffer buffer) {
52        super(fac, buffer);
53    }
54
55    /** Create a scanner from the input array.  The array must have at
56     *  least a single character of extra space.
57     */
58    protected JavadocTokenizer(ScannerFactory fac, char[] input, int inputLength) {
59        super(fac, input, inputLength);
60    }
61
62    @Override
63    protected Comment processComment(int pos, int endPos, CommentStyle style) {
64        char[] buf = reader.getRawCharacters(pos, endPos);
65        return new JavadocComment(new DocReader(fac, buf, buf.length, pos), style);
66    }
67
68    /**
69     * This is a specialized version of UnicodeReader that keeps track of the
70     * column position within a given character stream (used for Javadoc processing),
71     * and which builds a table for mapping positions in the comment string to
72     * positions in the source file.
73     */
74    static class DocReader extends UnicodeReader {
75
76         int col;
77         int startPos;
78
79         /**
80          * A buffer for building a table for mapping positions in {@link #sbuf}
81          * to positions in the source buffer.
82          *
83          * The array is organized as a series of pairs of integers: the first
84          * number in each pair specifies a position in the comment text,
85          * the second number in each pair specifies the corresponding position
86          * in the source buffer. The pairs are sorted in ascending order.
87          *
88          * Since the mapping function is generally continuous, with successive
89          * positions in the string corresponding to successive positions in the
90          * source buffer, the table only needs to record discontinuities in
91          * the mapping. The values of intermediate positions can be inferred.
92          *
93          * Discontinuities may occur in a number of places: when a newline
94          * is followed by whitespace and asterisks (which are ignored),
95          * when a tab is expanded into spaces, and when unicode escapes
96          * are used in the source buffer.
97          *
98          * Thus, to find the source position of any position, p, in the comment
99          * string, find the index, i, of the pair whose string offset
100          * ({@code pbuf[i] }) is closest to but not greater than p. Then,
101          * {@code sourcePos(p) = pbuf[i+1] + (p - pbuf[i]) }.
102          */
103         int[] pbuf = new int[128];
104
105         /**
106          * The index of the next empty slot in the pbuf buffer.
107          */
108         int pp = 0;
109
110         /** The buffer index of the last double backslash sequence
111          */
112         private int doubleBackslashBp = -1;
113
114         DocReader(ScannerFactory fac, char[] input, int inputLength, int startPos) {
115             super(fac, input, inputLength);
116             this.startPos = startPos;
117         }
118
119         @Override
120         protected void convertUnicode() {
121             if (ch == '\\' && unicodeConversionBp != bp) {
122                 bp++; ch = buf[bp]; col++;
123                 if (ch == 'u') {
124                     do {
125                         bp++; ch = buf[bp]; col++;
126                     } while (ch == 'u');
127                     int limit = bp + 3;
128                     if (limit < buflen) {
129                         int d = digit(bp, 16);
130                         int code = d;
131                         while (bp < limit && d >= 0) {
132                             bp++; ch = buf[bp]; col++;
133                             d = digit(bp, 16);
134                             code = (code << 4) + d;
135                         }
136                         if (d >= 0) {
137                             ch = (char)code;
138                             unicodeConversionBp = bp;
139                             return;
140                         }
141                     }
142                     // "illegal.Unicode.esc", reported by base scanner
143                 } else {
144                     bp--;
145                     ch = '\\';
146                     col--;
147                 }
148             }
149         }
150
151         @Override
152         protected void scanCommentChar() {
153             scanChar();
154             if (ch == '\\') {
155                 if (peekChar() == '\\' && !isUnicode()) {
156                     bp++; col++;
157                     doubleBackslashBp = bp;
158                 } else {
159                     convertUnicode();
160                 }
161             }
162         }
163
164         @Override
165         protected void scanChar() {
166             bp++;
167             ch = buf[bp];
168             switch (ch) {
169             case '\r': // return
170                 col = 0;
171                 break;
172             case '\n': // newline
173                 if (bp == 0 || buf[bp-1] != '\r') {
174                     col = 0;
175                 }
176                 break;
177             case '\t': // tab
178                 col = (col / TabInc * TabInc) + TabInc;
179                 break;
180             case '\\': // possible Unicode
181                 col++;
182                 convertUnicode();
183                 break;
184             default:
185                 col++;
186                 break;
187             }
188         }
189
190         @Override
191         public void putChar(char ch, boolean scan) {
192             // At this point, bp is the position of the current character in buf,
193             // and sp is the position in sbuf where this character will be put.
194             // Record a new entry in pbuf if pbuf is empty or if sp and its
195             // corresponding source position are not equidistant from the
196             // corresponding values in the latest entry in the pbuf array.
197             // (i.e. there is a discontinuity in the map function.)
198             if ((pp == 0)
199                     || (sp - pbuf[pp - 2] != (startPos + bp) - pbuf[pp - 1])) {
200                 if (pp + 1 >= pbuf.length) {
201                     int[] new_pbuf = new int[pbuf.length * 2];
202                     System.arraycopy(pbuf, 0, new_pbuf, 0, pbuf.length);
203                     pbuf = new_pbuf;
204                 }
205                 pbuf[pp] = sp;
206                 pbuf[pp + 1] = startPos + bp;
207                 pp += 2;
208             }
209             super.putChar(ch, scan);
210         }
211
212         /** Whether the ch represents a sequence of two backslashes. */
213         boolean isDoubleBackslash() {
214             return doubleBackslashBp == bp;
215         }
216
217
218     }
219
220     protected static class JavadocComment extends JavaTokenizer.BasicComment<DocReader> {
221
222        /**
223        * Translated and stripped contents of doc comment
224        */
225        private String docComment = null;
226        private int[] docPosns = null;
227
228        JavadocComment(DocReader reader, CommentStyle cs) {
229            super(reader, cs);
230        }
231
232        @Override
233        public String getText() {
234            if (!scanned && cs == CommentStyle.JAVADOC) {
235                scanDocComment();
236            }
237            return docComment;
238        }
239
240        @Override
241        public int getSourcePos(int pos) {
242            // Binary search to find the entry for which the string index is
243            // less than pos. Since docPosns is a list of pairs of integers
244            // we must make sure the index is always even.
245            // If we find an exact match for pos, the other item in the pair
246            // gives the source pos; otherwise, compute the source position
247            // relative to the best match found in the array.
248            if (pos == Position.NOPOS)
249                return Position.NOPOS;
250            if (pos < 0 || pos > docComment.length())
251                throw new StringIndexOutOfBoundsException(String.valueOf(pos));
252            if (docPosns == null)
253                return Position.NOPOS;
254            int start = 0;
255            int end = docPosns.length;
256            while (start < end - 2) {
257                // find an even index midway between start and end
258                int index = ((start  + end) / 4) * 2;
259                if (docPosns[index] < pos)
260                    start = index;
261                else if (docPosns[index] == pos)
262                    return docPosns[index + 1];
263                else
264                    end = index;
265            }
266            return docPosns[start + 1] + (pos - docPosns[start]);
267        }
268
269        @Override
270        @SuppressWarnings("fallthrough")
271        protected void scanDocComment() {
272             try {
273                 boolean firstLine = true;
274
275                 // Skip over first slash
276                 comment_reader.scanCommentChar();
277                 // Skip over first star
278                 comment_reader.scanCommentChar();
279
280                 // consume any number of stars
281                 while (comment_reader.bp < comment_reader.buflen && comment_reader.ch == '*') {
282                     comment_reader.scanCommentChar();
283                 }
284                 // is the comment in the form /**/, /***/, /****/, etc. ?
285                 if (comment_reader.bp < comment_reader.buflen && comment_reader.ch == '/') {
286                     docComment = "";
287                     return;
288                 }
289
290                 // skip a newline on the first line of the comment.
291                 if (comment_reader.bp < comment_reader.buflen) {
292                     if (comment_reader.ch == LF) {
293                         comment_reader.scanCommentChar();
294                         firstLine = false;
295                     } else if (comment_reader.ch == CR) {
296                         comment_reader.scanCommentChar();
297                         if (comment_reader.ch == LF) {
298                             comment_reader.scanCommentChar();
299                             firstLine = false;
300                         }
301                     }
302                 }
303
304             outerLoop:
305
306                 // The outerLoop processes the doc comment, looping once
307                 // for each line.  For each line, it first strips off
308                 // whitespace, then it consumes any stars, then it
309                 // puts the rest of the line into our buffer.
310                 while (comment_reader.bp < comment_reader.buflen) {
311                     int begin_bp = comment_reader.bp;
312                     char begin_ch = comment_reader.ch;
313                     // The wsLoop consumes whitespace from the beginning
314                     // of each line.
315                 wsLoop:
316
317                     while (comment_reader.bp < comment_reader.buflen) {
318                         switch(comment_reader.ch) {
319                         case ' ':
320                             comment_reader.scanCommentChar();
321                             break;
322                         case '\t':
323                             comment_reader.col = ((comment_reader.col - 1) / TabInc * TabInc) + TabInc;
324                             comment_reader.scanCommentChar();
325                             break;
326                         case FF:
327                             comment_reader.col = 0;
328                             comment_reader.scanCommentChar();
329                             break;
330         // Treat newline at beginning of line (blank line, no star)
331         // as comment text.  Old Javadoc compatibility requires this.
332         /*---------------------------------*
333                         case CR: // (Spec 3.4)
334                             doc_reader.scanCommentChar();
335                             if (ch == LF) {
336                                 col = 0;
337                                 doc_reader.scanCommentChar();
338                             }
339                             break;
340                         case LF: // (Spec 3.4)
341                             doc_reader.scanCommentChar();
342                             break;
343         *---------------------------------*/
344                         default:
345                             // we've seen something that isn't whitespace;
346                             // jump out.
347                             break wsLoop;
348                         }
349                     }
350
351                     // Are there stars here?  If so, consume them all
352                     // and check for the end of comment.
353                     if (comment_reader.ch == '*') {
354                         // skip all of the stars
355                         do {
356                             comment_reader.scanCommentChar();
357                         } while (comment_reader.ch == '*');
358
359                         // check for the closing slash.
360                         if (comment_reader.ch == '/') {
361                             // We're done with the doc comment
362                             // scanChar() and breakout.
363                             break outerLoop;
364                         }
365                     } else if (! firstLine) {
366                         // The current line does not begin with a '*' so we will
367                         // treat it as comment
368                         comment_reader.bp = begin_bp;
369                         comment_reader.ch = begin_ch;
370                     }
371                     // The textLoop processes the rest of the characters
372                     // on the line, adding them to our buffer.
373                 textLoop:
374                     while (comment_reader.bp < comment_reader.buflen) {
375                         switch (comment_reader.ch) {
376                         case '*':
377                             // Is this just a star?  Or is this the
378                             // end of a comment?
379                             comment_reader.scanCommentChar();
380                             if (comment_reader.ch == '/') {
381                                 // This is the end of the comment,
382                                 // set ch and return our buffer.
383                                 break outerLoop;
384                             }
385                             // This is just an ordinary star.  Add it to
386                             // the buffer.
387                             comment_reader.putChar('*', false);
388                             break;
389                         case '\\':
390                             comment_reader.putChar('\\', false);
391                             // If a double backslash was found, write two
392                             if (comment_reader.isDoubleBackslash()) {
393                                 comment_reader.putChar('\\', false);
394                             }
395                             comment_reader.scanCommentChar();
396                         case ' ':
397                         case '\t':
398                             comment_reader.putChar(comment_reader.ch, false);
399                             comment_reader.scanCommentChar();
400                             break;
401                         case FF:
402                             comment_reader.scanCommentChar();
403                             break textLoop; // treat as end of line
404                         case CR: // (Spec 3.4)
405                             comment_reader.scanCommentChar();
406                             if (comment_reader.ch != LF) {
407                                 // Canonicalize CR-only line terminator to LF
408                                 comment_reader.putChar((char)LF, false);
409                                 break textLoop;
410                             }
411                             /* fall through to LF case */
412                         case LF: // (Spec 3.4)
413                             // We've seen a newline.  Add it to our
414                             // buffer and break out of this loop,
415                             // starting fresh on a new line.
416                             comment_reader.putChar(comment_reader.ch, false);
417                             comment_reader.scanCommentChar();
418                             break textLoop;
419                         default:
420                             // Add the character to our buffer.
421                             comment_reader.putChar(comment_reader.ch, false);
422                             comment_reader.scanCommentChar();
423                         }
424                     } // end textLoop
425                     firstLine = false;
426                 } // end outerLoop
427
428                 if (comment_reader.sp > 0) {
429                     int i = comment_reader.sp - 1;
430                 trailLoop:
431                     while (i > -1) {
432                         switch (comment_reader.sbuf[i]) {
433                         case '*':
434                             i--;
435                             break;
436                         default:
437                             break trailLoop;
438                         }
439                     }
440                     comment_reader.sp = i + 1;
441
442                     // Store the text of the doc comment
443                    docComment = comment_reader.chars();
444                    docPosns = new int[comment_reader.pp];
445                    System.arraycopy(comment_reader.pbuf, 0, docPosns, 0, docPosns.length);
446                } else {
447                    docComment = "";
448                }
449            } finally {
450                scanned = true;
451                comment_reader = null;
452                if (docComment != null &&
453                        docComment.matches("(?sm).*^\\s*@deprecated( |$).*")) {
454                    deprecatedFlag = true;
455                }
456            }
457        }
458    }
459
460    @Override
461    public Position.LineMap getLineMap() {
462        char[] buf = reader.getRawCharacters();
463        return Position.makeLineMap(buf, buf.length, true);
464    }
465}
466