1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9//  This file defines lexer for structured comments and supporting token class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14#define LLVM_CLANG_AST_COMMENTLEXER_H
15
16#include "clang/Basic/Diagnostic.h"
17#include "clang/Basic/SourceManager.h"
18#include "llvm/ADT/SmallString.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/Support/Allocator.h"
21#include "llvm/Support/raw_ostream.h"
22
23namespace clang {
24namespace comments {
25
26class Lexer;
27class TextTokenRetokenizer;
28struct CommandInfo;
29class CommandTraits;
30
namespace tok {
/// Kinds of tokens found in a structured comment.
///
/// The kind decides which payload accessors of Token may be used; each
/// accessor asserts the kind it expects.
enum TokenKind {
  // Basic tokens.
  eof,
  newline,
  text,

  // Commands.

  /// Command that does not have an ID.
  unknown_command,
  /// Command with an ID, that used backslash marker.
  backslash_command,
  /// Command with an ID, that used 'at' marker.
  at_command,

  // Verbatim blocks and verbatim lines.
  verbatim_block_begin,
  verbatim_block_line,
  verbatim_block_end,
  verbatim_line_name,
  verbatim_line_text,

  // HTML.

  /// Opening of a start tag, i.e. "<tag".
  html_start_tag,
  /// Identifier inside a tag, i.e. "attr".
  html_ident,
  /// The "=" sign.
  html_equals,
  /// Quoted string: "blah\"blah" or 'blah\'blah'.
  html_quoted_string,
  /// The ">" sign.
  html_greater,
  /// The "/>" sequence.
  html_slash_greater,
  /// Opening of an end tag, i.e. "</tag".
  html_end_tag
};
} // end namespace tok
53
54/// Comment token.
55class Token {
56  friend class Lexer;
57  friend class TextTokenRetokenizer;
58
59  /// The location of the token.
60  SourceLocation Loc;
61
62  /// The actual kind of the token.
63  tok::TokenKind Kind;
64
65  /// Length of the token spelling in comment.  Can be 0 for synthenized
66  /// tokens.
67  unsigned Length;
68
69  /// Contains text value associated with a token.
70  const char *TextPtr;
71
72  /// Integer value associated with a token.
73  ///
74  /// If the token is a known command, contains command ID and TextPtr is
75  /// unused (command spelling can be found with CommandTraits).  Otherwise,
76  /// contains the length of the string that starts at TextPtr.
77  unsigned IntVal;
78
79public:
80  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81  void setLocation(SourceLocation SL) { Loc = SL; }
82
83  SourceLocation getEndLocation() const LLVM_READONLY {
84    if (Length == 0 || Length == 1)
85      return Loc;
86    return Loc.getLocWithOffset(Length - 1);
87  }
88
89  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90  void setKind(tok::TokenKind K) { Kind = K; }
91
92  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94
95  unsigned getLength() const LLVM_READONLY { return Length; }
96  void setLength(unsigned L) { Length = L; }
97
98  StringRef getText() const LLVM_READONLY {
99    assert(is(tok::text));
100    return StringRef(TextPtr, IntVal);
101  }
102
103  void setText(StringRef Text) {
104    assert(is(tok::text));
105    TextPtr = Text.data();
106    IntVal = Text.size();
107  }
108
109  StringRef getUnknownCommandName() const LLVM_READONLY {
110    assert(is(tok::unknown_command));
111    return StringRef(TextPtr, IntVal);
112  }
113
114  void setUnknownCommandName(StringRef Name) {
115    assert(is(tok::unknown_command));
116    TextPtr = Name.data();
117    IntVal = Name.size();
118  }
119
120  unsigned getCommandID() const LLVM_READONLY {
121    assert(is(tok::backslash_command) || is(tok::at_command));
122    return IntVal;
123  }
124
125  void setCommandID(unsigned ID) {
126    assert(is(tok::backslash_command) || is(tok::at_command));
127    IntVal = ID;
128  }
129
130  unsigned getVerbatimBlockID() const LLVM_READONLY {
131    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
132    return IntVal;
133  }
134
135  void setVerbatimBlockID(unsigned ID) {
136    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
137    IntVal = ID;
138  }
139
140  StringRef getVerbatimBlockText() const LLVM_READONLY {
141    assert(is(tok::verbatim_block_line));
142    return StringRef(TextPtr, IntVal);
143  }
144
145  void setVerbatimBlockText(StringRef Text) {
146    assert(is(tok::verbatim_block_line));
147    TextPtr = Text.data();
148    IntVal = Text.size();
149  }
150
151  unsigned getVerbatimLineID() const LLVM_READONLY {
152    assert(is(tok::verbatim_line_name));
153    return IntVal;
154  }
155
156  void setVerbatimLineID(unsigned ID) {
157    assert(is(tok::verbatim_line_name));
158    IntVal = ID;
159  }
160
161  StringRef getVerbatimLineText() const LLVM_READONLY {
162    assert(is(tok::verbatim_line_text));
163    return StringRef(TextPtr, IntVal);
164  }
165
166  void setVerbatimLineText(StringRef Text) {
167    assert(is(tok::verbatim_line_text));
168    TextPtr = Text.data();
169    IntVal = Text.size();
170  }
171
172  StringRef getHTMLTagStartName() const LLVM_READONLY {
173    assert(is(tok::html_start_tag));
174    return StringRef(TextPtr, IntVal);
175  }
176
177  void setHTMLTagStartName(StringRef Name) {
178    assert(is(tok::html_start_tag));
179    TextPtr = Name.data();
180    IntVal = Name.size();
181  }
182
183  StringRef getHTMLIdent() const LLVM_READONLY {
184    assert(is(tok::html_ident));
185    return StringRef(TextPtr, IntVal);
186  }
187
188  void setHTMLIdent(StringRef Name) {
189    assert(is(tok::html_ident));
190    TextPtr = Name.data();
191    IntVal = Name.size();
192  }
193
194  StringRef getHTMLQuotedString() const LLVM_READONLY {
195    assert(is(tok::html_quoted_string));
196    return StringRef(TextPtr, IntVal);
197  }
198
199  void setHTMLQuotedString(StringRef Str) {
200    assert(is(tok::html_quoted_string));
201    TextPtr = Str.data();
202    IntVal = Str.size();
203  }
204
205  StringRef getHTMLTagEndName() const LLVM_READONLY {
206    assert(is(tok::html_end_tag));
207    return StringRef(TextPtr, IntVal);
208  }
209
210  void setHTMLTagEndName(StringRef Name) {
211    assert(is(tok::html_end_tag));
212    TextPtr = Name.data();
213    IntVal = Name.size();
214  }
215
216  void dump(const Lexer &L, const SourceManager &SM) const;
217};
218
219/// Comment lexer.
220class Lexer {
221private:
222  Lexer(const Lexer &) = delete;
223  void operator=(const Lexer &) = delete;
224
225  /// Allocator for strings that are semantic values of tokens and have to be
226  /// computed (for example, resolved decimal character references).
227  llvm::BumpPtrAllocator &Allocator;
228
229  DiagnosticsEngine &Diags;
230
231  const CommandTraits &Traits;
232
233  const char *const BufferStart;
234  const char *const BufferEnd;
235  SourceLocation FileLoc;
236
237  const char *BufferPtr;
238
239  /// One past end pointer for the current comment.  For BCPL comments points
240  /// to newline or BufferEnd, for C comments points to star in '*/'.
241  const char *CommentEnd;
242
243  enum LexerCommentState {
244    LCS_BeforeComment,
245    LCS_InsideBCPLComment,
246    LCS_InsideCComment,
247    LCS_BetweenComments
248  };
249
250  /// Low-level lexer state, track if we are inside or outside of comment.
251  LexerCommentState CommentState;
252
253  enum LexerState {
254    /// Lexing normal comment text
255    LS_Normal,
256
257    /// Finished lexing verbatim block beginning command, will lex first body
258    /// line.
259    LS_VerbatimBlockFirstLine,
260
261    /// Lexing verbatim block body line-by-line, skipping line-starting
262    /// decorations.
263    LS_VerbatimBlockBody,
264
265    /// Finished lexing verbatim line beginning command, will lex text (one
266    /// line).
267    LS_VerbatimLineText,
268
269    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
270    LS_HTMLStartTag,
271
272    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
273    LS_HTMLEndTag
274  };
275
276  /// Current lexing mode.
277  LexerState State;
278
279  /// If State is LS_VerbatimBlock, contains the name of verbatim end
280  /// command, including command marker.
281  SmallString<16> VerbatimBlockEndCommandName;
282
283  /// If true, the commands, html tags, etc will be parsed and reported as
284  /// separate tokens inside the comment body. If false, the comment text will
285  /// be parsed into text and newline tokens.
286  bool ParseCommands;
287
288  /// Given a character reference name (e.g., "lt"), return the character that
289  /// it stands for (e.g., "<").
290  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
291
292  /// Given a Unicode codepoint as base-10 integer, return the character.
293  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
294
295  /// Given a Unicode codepoint as base-16 integer, return the character.
296  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
297
298  void formTokenWithChars(Token &Result, const char *TokEnd,
299                          tok::TokenKind Kind);
300
301  void formTextToken(Token &Result, const char *TokEnd) {
302    StringRef Text(BufferPtr, TokEnd - BufferPtr);
303    formTokenWithChars(Result, TokEnd, tok::text);
304    Result.setText(Text);
305  }
306
307  SourceLocation getSourceLocation(const char *Loc) const {
308    assert(Loc >= BufferStart && Loc <= BufferEnd &&
309           "Location out of range for this buffer!");
310
311    const unsigned CharNo = Loc - BufferStart;
312    return FileLoc.getLocWithOffset(CharNo);
313  }
314
315  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
316    return Diags.Report(Loc, DiagID);
317  }
318
319  /// Eat string matching regexp \code \s*\* \endcode.
320  void skipLineStartingDecorations();
321
322  /// Lex comment text, including commands if ParseCommands is set to true.
323  void lexCommentText(Token &T);
324
325  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
326                                const CommandInfo *Info);
327
328  void lexVerbatimBlockFirstLine(Token &T);
329
330  void lexVerbatimBlockBody(Token &T);
331
332  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
333                               const CommandInfo *Info);
334
335  void lexVerbatimLineText(Token &T);
336
337  void lexHTMLCharacterReference(Token &T);
338
339  void setupAndLexHTMLStartTag(Token &T);
340
341  void lexHTMLStartTag(Token &T);
342
343  void setupAndLexHTMLEndTag(Token &T);
344
345  void lexHTMLEndTag(Token &T);
346
347public:
348  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
349        const CommandTraits &Traits, SourceLocation FileLoc,
350        const char *BufferStart, const char *BufferEnd,
351        bool ParseCommands = true);
352
353  void lex(Token &T);
354
355  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
356};
357
358} // end namespace comments
359} // end namespace clang
360
361#endif
362
363