1//===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang/AST/RawCommentList.h"
10#include "clang/AST/ASTContext.h"
11#include "clang/AST/Comment.h"
12#include "clang/AST/CommentBriefParser.h"
13#include "clang/AST/CommentCommandTraits.h"
14#include "clang/AST/CommentLexer.h"
15#include "clang/AST/CommentParser.h"
16#include "clang/AST/CommentSema.h"
17#include "clang/Basic/CharInfo.h"
18#include "llvm/ADT/STLExtras.h"
19#include "llvm/ADT/StringExtras.h"
20#include "llvm/Support/Allocator.h"
21
22using namespace clang;
23
24namespace {
25/// Get comment kind and bool describing if it is a trailing comment.
26std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment,
27                                                        bool ParseAllComments) {
28  const size_t MinCommentLength = ParseAllComments ? 2 : 3;
29  if ((Comment.size() < MinCommentLength) || Comment[0] != '/')
30    return std::make_pair(RawComment::RCK_Invalid, false);
31
32  RawComment::CommentKind K;
33  if (Comment[1] == '/') {
34    if (Comment.size() < 3)
35      return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
36
37    if (Comment[2] == '/')
38      K = RawComment::RCK_BCPLSlash;
39    else if (Comment[2] == '!')
40      K = RawComment::RCK_BCPLExcl;
41    else
42      return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
43  } else {
44    assert(Comment.size() >= 4);
45
46    // Comment lexer does not understand escapes in comment markers, so pretend
47    // that this is not a comment.
48    if (Comment[1] != '*' ||
49        Comment[Comment.size() - 2] != '*' ||
50        Comment[Comment.size() - 1] != '/')
51      return std::make_pair(RawComment::RCK_Invalid, false);
52
53    if (Comment[2] == '*')
54      K = RawComment::RCK_JavaDoc;
55    else if (Comment[2] == '!')
56      K = RawComment::RCK_Qt;
57    else
58      return std::make_pair(RawComment::RCK_OrdinaryC, false);
59  }
60  const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
61  return std::make_pair(K, TrailingComment);
62}
63
64bool mergedCommentIsTrailingComment(StringRef Comment) {
65  return (Comment.size() > 3) && (Comment[3] == '<');
66}
67
68/// Returns true if R1 and R2 both have valid locations that start on the same
69/// column.
70bool commentsStartOnSameColumn(const SourceManager &SM, const RawComment &R1,
71                               const RawComment &R2) {
72  SourceLocation L1 = R1.getBeginLoc();
73  SourceLocation L2 = R2.getBeginLoc();
74  bool Invalid = false;
75  unsigned C1 = SM.getPresumedColumnNumber(L1, &Invalid);
76  if (!Invalid) {
77    unsigned C2 = SM.getPresumedColumnNumber(L2, &Invalid);
78    return !Invalid && (C1 == C2);
79  }
80  return false;
81}
82} // unnamed namespace
83
84/// Determines whether there is only whitespace in `Buffer` between `P`
85/// and the previous line.
86/// \param Buffer The buffer to search in.
87/// \param P The offset from the beginning of `Buffer` to start from.
88/// \return true if all of the characters in `Buffer` ranging from the closest
89/// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
90/// are whitespace.
91static bool onlyWhitespaceOnLineBefore(const char *Buffer, unsigned P) {
92  // Search backwards until we see linefeed or carriage return.
93  for (unsigned I = P; I != 0; --I) {
94    char C = Buffer[I - 1];
95    if (isVerticalWhitespace(C))
96      return true;
97    if (!isHorizontalWhitespace(C))
98      return false;
99  }
100  // We hit the beginning of the buffer.
101  return true;
102}
103
104/// Returns whether `K` is an ordinary comment kind.
105static bool isOrdinaryKind(RawComment::CommentKind K) {
106  return (K == RawComment::RCK_OrdinaryBCPL) ||
107         (K == RawComment::RCK_OrdinaryC);
108}
109
110RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
111                       const CommentOptions &CommentOpts, bool Merged) :
112    Range(SR), RawTextValid(false), BriefTextValid(false),
113    IsAttached(false), IsTrailingComment(false),
114    IsAlmostTrailingComment(false) {
115  // Extract raw comment text, if possible.
116  if (SR.getBegin() == SR.getEnd() || getRawText(SourceMgr).empty()) {
117    Kind = RCK_Invalid;
118    return;
119  }
120
121  // Guess comment kind.
122  std::pair<CommentKind, bool> K =
123      getCommentKind(RawText, CommentOpts.ParseAllComments);
124
125  // Guess whether an ordinary comment is trailing.
126  if (CommentOpts.ParseAllComments && isOrdinaryKind(K.first)) {
127    FileID BeginFileID;
128    unsigned BeginOffset;
129    std::tie(BeginFileID, BeginOffset) =
130        SourceMgr.getDecomposedLoc(Range.getBegin());
131    if (BeginOffset != 0) {
132      bool Invalid = false;
133      const char *Buffer =
134          SourceMgr.getBufferData(BeginFileID, &Invalid).data();
135      IsTrailingComment |=
136          (!Invalid && !onlyWhitespaceOnLineBefore(Buffer, BeginOffset));
137    }
138  }
139
140  if (!Merged) {
141    Kind = K.first;
142    IsTrailingComment |= K.second;
143
144    IsAlmostTrailingComment =
145        RawText.starts_with("//<") || RawText.starts_with("/*<");
146  } else {
147    Kind = RCK_Merged;
148    IsTrailingComment =
149        IsTrailingComment || mergedCommentIsTrailingComment(RawText);
150  }
151}
152
153StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
154  FileID BeginFileID;
155  FileID EndFileID;
156  unsigned BeginOffset;
157  unsigned EndOffset;
158
159  std::tie(BeginFileID, BeginOffset) =
160      SourceMgr.getDecomposedLoc(Range.getBegin());
161  std::tie(EndFileID, EndOffset) = SourceMgr.getDecomposedLoc(Range.getEnd());
162
163  const unsigned Length = EndOffset - BeginOffset;
164  if (Length < 2)
165    return StringRef();
166
167  // The comment can't begin in one file and end in another.
168  assert(BeginFileID == EndFileID);
169
170  bool Invalid = false;
171  const char *BufferStart = SourceMgr.getBufferData(BeginFileID,
172                                                    &Invalid).data();
173  if (Invalid)
174    return StringRef();
175
176  return StringRef(BufferStart + BeginOffset, Length);
177}
178
179const char *RawComment::extractBriefText(const ASTContext &Context) const {
180  // Lazily initialize RawText using the accessor before using it.
181  (void)getRawText(Context.getSourceManager());
182
183  // Since we will be copying the resulting text, all allocations made during
184  // parsing are garbage after resulting string is formed.  Thus we can use
185  // a separate allocator for all temporary stuff.
186  llvm::BumpPtrAllocator Allocator;
187
188  comments::Lexer L(Allocator, Context.getDiagnostics(),
189                    Context.getCommentCommandTraits(),
190                    Range.getBegin(),
191                    RawText.begin(), RawText.end());
192  comments::BriefParser P(L, Context.getCommentCommandTraits());
193
194  const std::string Result = P.Parse();
195  const unsigned BriefTextLength = Result.size();
196  char *BriefTextPtr = new (Context) char[BriefTextLength + 1];
197  memcpy(BriefTextPtr, Result.c_str(), BriefTextLength + 1);
198  BriefText = BriefTextPtr;
199  BriefTextValid = true;
200
201  return BriefTextPtr;
202}
203
204comments::FullComment *RawComment::parse(const ASTContext &Context,
205                                         const Preprocessor *PP,
206                                         const Decl *D) const {
207  // Lazily initialize RawText using the accessor before using it.
208  (void)getRawText(Context.getSourceManager());
209
210  comments::Lexer L(Context.getAllocator(), Context.getDiagnostics(),
211                    Context.getCommentCommandTraits(),
212                    getSourceRange().getBegin(),
213                    RawText.begin(), RawText.end());
214  comments::Sema S(Context.getAllocator(), Context.getSourceManager(),
215                   Context.getDiagnostics(),
216                   Context.getCommentCommandTraits(),
217                   PP);
218  S.setDecl(D);
219  comments::Parser P(L, S, Context.getAllocator(), Context.getSourceManager(),
220                     Context.getDiagnostics(),
221                     Context.getCommentCommandTraits());
222
223  return P.parseFullComment();
224}
225
226static bool onlyWhitespaceBetween(SourceManager &SM,
227                                  SourceLocation Loc1, SourceLocation Loc2,
228                                  unsigned MaxNewlinesAllowed) {
229  std::pair<FileID, unsigned> Loc1Info = SM.getDecomposedLoc(Loc1);
230  std::pair<FileID, unsigned> Loc2Info = SM.getDecomposedLoc(Loc2);
231
232  // Question does not make sense if locations are in different files.
233  if (Loc1Info.first != Loc2Info.first)
234    return false;
235
236  bool Invalid = false;
237  const char *Buffer = SM.getBufferData(Loc1Info.first, &Invalid).data();
238  if (Invalid)
239    return false;
240
241  unsigned NumNewlines = 0;
242  assert(Loc1Info.second <= Loc2Info.second && "Loc1 after Loc2!");
243  // Look for non-whitespace characters and remember any newlines seen.
244  for (unsigned I = Loc1Info.second; I != Loc2Info.second; ++I) {
245    switch (Buffer[I]) {
246    default:
247      return false;
248    case ' ':
249    case '\t':
250    case '\f':
251    case '\v':
252      break;
253    case '\r':
254    case '\n':
255      ++NumNewlines;
256
257      // Check if we have found more than the maximum allowed number of
258      // newlines.
259      if (NumNewlines > MaxNewlinesAllowed)
260        return false;
261
262      // Collapse \r\n and \n\r into a single newline.
263      if (I + 1 != Loc2Info.second &&
264          (Buffer[I + 1] == '\n' || Buffer[I + 1] == '\r') &&
265          Buffer[I] != Buffer[I + 1])
266        ++I;
267      break;
268    }
269  }
270
271  return true;
272}
273
274void RawCommentList::addComment(const RawComment &RC,
275                                const CommentOptions &CommentOpts,
276                                llvm::BumpPtrAllocator &Allocator) {
277  if (RC.isInvalid())
278    return;
279
280  // Ordinary comments are not interesting for us.
281  if (RC.isOrdinary() && !CommentOpts.ParseAllComments)
282    return;
283
284  std::pair<FileID, unsigned> Loc =
285      SourceMgr.getDecomposedLoc(RC.getBeginLoc());
286
287  const FileID CommentFile = Loc.first;
288  const unsigned CommentOffset = Loc.second;
289
290  // If this is the first Doxygen comment, save it (because there isn't
291  // anything to merge it with).
292  if (OrderedComments[CommentFile].empty()) {
293    OrderedComments[CommentFile][CommentOffset] =
294        new (Allocator) RawComment(RC);
295    return;
296  }
297
298  const RawComment &C1 = *OrderedComments[CommentFile].rbegin()->second;
299  const RawComment &C2 = RC;
300
301  // Merge comments only if there is only whitespace between them.
302  // Can't merge trailing and non-trailing comments unless the second is
303  // non-trailing ordinary in the same column, as in the case:
304  //   int x; // documents x
305  //          // more text
306  // versus:
307  //   int x; // documents x
308  //   int y; // documents y
309  // or:
310  //   int x; // documents x
311  //   // documents y
312  //   int y;
313  // Merge comments if they are on same or consecutive lines.
314  if ((C1.isTrailingComment() == C2.isTrailingComment() ||
315       (C1.isTrailingComment() && !C2.isTrailingComment() &&
316        isOrdinaryKind(C2.getKind()) &&
317        commentsStartOnSameColumn(SourceMgr, C1, C2))) &&
318      onlyWhitespaceBetween(SourceMgr, C1.getEndLoc(), C2.getBeginLoc(),
319                            /*MaxNewlinesAllowed=*/1)) {
320    SourceRange MergedRange(C1.getBeginLoc(), C2.getEndLoc());
321    *OrderedComments[CommentFile].rbegin()->second =
322        RawComment(SourceMgr, MergedRange, CommentOpts, true);
323  } else {
324    OrderedComments[CommentFile][CommentOffset] =
325        new (Allocator) RawComment(RC);
326  }
327}
328
329const std::map<unsigned, RawComment *> *
330RawCommentList::getCommentsInFile(FileID File) const {
331  auto CommentsInFile = OrderedComments.find(File);
332  if (CommentsInFile == OrderedComments.end())
333    return nullptr;
334
335  return &CommentsInFile->second;
336}
337
338bool RawCommentList::empty() const { return OrderedComments.empty(); }
339
340unsigned RawCommentList::getCommentBeginLine(RawComment *C, FileID File,
341                                             unsigned Offset) const {
342  auto Cached = CommentBeginLine.find(C);
343  if (Cached != CommentBeginLine.end())
344    return Cached->second;
345  const unsigned Line = SourceMgr.getLineNumber(File, Offset);
346  CommentBeginLine[C] = Line;
347  return Line;
348}
349
350unsigned RawCommentList::getCommentEndOffset(RawComment *C) const {
351  auto Cached = CommentEndOffset.find(C);
352  if (Cached != CommentEndOffset.end())
353    return Cached->second;
354  const unsigned Offset =
355      SourceMgr.getDecomposedLoc(C->getSourceRange().getEnd()).second;
356  CommentEndOffset[C] = Offset;
357  return Offset;
358}
359
360std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
361                                         DiagnosticsEngine &Diags) const {
362  llvm::StringRef CommentText = getRawText(SourceMgr);
363  if (CommentText.empty())
364    return "";
365
366  std::string Result;
367  for (const RawComment::CommentLine &Line :
368       getFormattedLines(SourceMgr, Diags))
369    Result += Line.Text + "\n";
370
371  auto LastChar = Result.find_last_not_of('\n');
372  Result.erase(LastChar + 1, Result.size());
373
374  return Result;
375}
376
/// Splits this comment into lines with the comment markers' indentation
/// removed.
///
/// The raw text is lexed with command parsing disabled. Each recorded line
/// holds its text together with presumed begin and end locations.
///
/// \param SourceMgr Source manager used to fetch the raw text, token
/// spellings, columns and presumed locations.
/// \param Diags Diagnostics engine handed to the comment lexer.
/// \return The recorded lines, or an empty vector if the raw text is empty.
std::vector<RawComment::CommentLine>
RawComment::getFormattedLines(const SourceManager &SourceMgr,
                              DiagnosticsEngine &Diags) const {
  llvm::StringRef CommentText = getRawText(SourceMgr);
  if (CommentText.empty())
    return {};

  llvm::BumpPtrAllocator Allocator;
  // We do not parse any commands, so CommentOptions are ignored by
  // comments::Lexer. Therefore, we just use default-constructed options.
  CommentOptions DefOpts;
  comments::CommandTraits EmptyTraits(Allocator, DefOpts);
  comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
                    CommentText.begin(), CommentText.end(),
                    /*ParseCommands=*/false);

  std::vector<RawComment::CommentLine> Result;
  // A column number of the first non-whitespace token in the comment text.
  // We skip whitespace up to this column, but keep the whitespace after this
  // column. IndentColumn is calculated when lexing the first line and reused
  // for the rest of lines.
  unsigned IndentColumn = 0;

  // Record the line number of the last processed comment line.
  // For block-style comments, an extra newline token will be produced after
  // the end-comment marker, e.g.:
  //   /** This is a multi-line comment block.
  //       The lexer will produce two newline tokens here > */
  // PreviousLine will record the line number when we previously saw a newline
  // token and recorded a comment line. If we see another newline token on the
  // same line, don't record anything in between.
  unsigned PreviousLine = 0;

  // Processes one line of the comment and adds it to the result.
  // Handles skipping the indent at the start of the line.
  // Returns false when eof is reached and true otherwise.
  auto LexLine = [&](bool IsFirstLine) -> bool {
    comments::Token Tok;
    // Lex the first token on the line. We handle it separately, because we
    // need to fix up its indentation.
    L.lex(Tok);
    if (Tok.is(comments::tok::eof))
      return false;
    if (Tok.is(comments::tok::newline)) {
      // The line has no content: record an empty line, unless a line was
      // already recorded for this line number.
      PresumedLoc Loc = SourceMgr.getPresumedLoc(Tok.getLocation());
      if (Loc.getLine() != PreviousLine) {
        Result.emplace_back("", Loc, Loc);
        PreviousLine = Loc.getLine();
      }
      return true;
    }
    SmallString<124> Line;
    llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
    bool LocInvalid = false;
    unsigned TokColumn =
        SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
    assert(!LocInvalid && "getFormattedText for invalid location");

    // Amount of leading whitespace in TokText.
    size_t WhitespaceLen = TokText.find_first_not_of(" \t");
    if (WhitespaceLen == StringRef::npos)
      WhitespaceLen = TokText.size();
    // Remember the amount of whitespace we skipped in the first line to remove
    // indent up to that column in the following lines.
    if (IsFirstLine)
      IndentColumn = TokColumn + WhitespaceLen;

    // Amount of leading whitespace we actually want to skip.
    // For the first line we skip all the whitespace.
    // For the rest of the lines, we skip whitespace up to IndentColumn; the
    // distance to IndentColumn is clamped to zero for tokens that already
    // start at or past that column.
    unsigned SkipLen =
        IsFirstLine
            ? WhitespaceLen
            : std::min<size_t>(
                  WhitespaceLen,
                  std::max<int>(static_cast<int>(IndentColumn) - TokColumn, 0));
    llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
    Line += Trimmed;
    // Get the beginning location of the adjusted comment line.
    PresumedLoc Begin =
        SourceMgr.getPresumedLoc(Tok.getLocation().getLocWithOffset(SkipLen));

    // Lex all tokens in the rest of the line.
    for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
      if (Tok.is(comments::tok::newline)) {
        // Get the ending location of the comment line.
        PresumedLoc End = SourceMgr.getPresumedLoc(Tok.getLocation());
        if (End.getLine() != PreviousLine) {
          Result.emplace_back(Line, Begin, End);
          PreviousLine = End.getLine();
        }
        return true;
      }
      Line += L.getSpelling(Tok, SourceMgr);
    }
    // The loop ended on the eof token: record what was collected for this
    // last line, using the eof token's location as its end.
    PresumedLoc End = SourceMgr.getPresumedLoc(Tok.getLocation());
    Result.emplace_back(Line, Begin, End);
    // We've reached the end of file token.
    return false;
  };

  // Process first line separately to remember indent for the following lines.
  if (!LexLine(/*IsFirstLine=*/true))
    return Result;
  // Process the rest of the lines.
  while (LexLine(/*IsFirstLine=*/false))
    ;
  return Result;
}
486