1239313Sdim//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2239313Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6239313Sdim//
7239313Sdim//===----------------------------------------------------------------------===//
8239313Sdim//
//  This file defines the lexer for structured comments and the supporting
//  token class.
10239313Sdim//
11239313Sdim//===----------------------------------------------------------------------===//
12239313Sdim
13280031Sdim#ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14280031Sdim#define LLVM_CLANG_AST_COMMENTLEXER_H
15239313Sdim
16276479Sdim#include "clang/Basic/Diagnostic.h"
17239313Sdim#include "clang/Basic/SourceManager.h"
18239313Sdim#include "llvm/ADT/SmallString.h"
19249423Sdim#include "llvm/ADT/StringRef.h"
20239313Sdim#include "llvm/Support/Allocator.h"
21239313Sdim#include "llvm/Support/raw_ostream.h"
22239313Sdim
23239313Sdimnamespace clang {
24239313Sdimnamespace comments {
25239313Sdim
// Forward declarations.  Lexer is defined further down in this header; the
// other types are only used here through pointers, references or friend
// declarations, so their full definitions are not required.
struct CommandInfo;
class CommandTraits;
class Lexer;
class TextTokenRetokenizer;
30239313Sdim
namespace tok {
/// Kinds of comment tokens.
///
/// The enumerators are unscoped and rely on their implicit numbering, so
/// their order must not be changed.
enum TokenKind {
  // Basic tokens.
  eof,
  newline,
  text,

  // Commands.
  unknown_command,   // Command that does not have an ID.
  backslash_command, // Command with an ID, that used backslash marker.
  at_command,        // Command with an ID, that used 'at' marker.

  // Verbatim blocks.
  verbatim_block_begin,
  verbatim_block_line,
  verbatim_block_end,

  // Verbatim lines.
  verbatim_line_name,
  verbatim_line_text,

  // HTML tags and their attributes.
  html_start_tag,     // <tag
  html_ident,         // attr
  html_equals,        // =
  html_quoted_string, // "blah\"blah" or 'blah\'blah'
  html_greater,       // >
  html_slash_greater, // />
  html_end_tag        // </tag
};
} // end namespace tok
53239313Sdim
54341825Sdim/// Comment token.
55239313Sdimclass Token {
56239313Sdim  friend class Lexer;
57239313Sdim  friend class TextTokenRetokenizer;
58239313Sdim
59239313Sdim  /// The location of the token.
60239313Sdim  SourceLocation Loc;
61239313Sdim
62239313Sdim  /// The actual kind of the token.
63239313Sdim  tok::TokenKind Kind;
64239313Sdim
65239313Sdim  /// Length of the token spelling in comment.  Can be 0 for synthenized
66239313Sdim  /// tokens.
67239313Sdim  unsigned Length;
68239313Sdim
69239313Sdim  /// Contains text value associated with a token.
70243830Sdim  const char *TextPtr;
71239313Sdim
72243830Sdim  /// Integer value associated with a token.
73243830Sdim  ///
74341825Sdim  /// If the token is a known command, contains command ID and TextPtr is
75243830Sdim  /// unused (command spelling can be found with CommandTraits).  Otherwise,
76243830Sdim  /// contains the length of the string that starts at TextPtr.
77243830Sdim  unsigned IntVal;
78341825Sdim
79239313Sdimpublic:
80239313Sdim  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81239313Sdim  void setLocation(SourceLocation SL) { Loc = SL; }
82239313Sdim
83239313Sdim  SourceLocation getEndLocation() const LLVM_READONLY {
84239313Sdim    if (Length == 0 || Length == 1)
85239313Sdim      return Loc;
86239313Sdim    return Loc.getLocWithOffset(Length - 1);
87239313Sdim  }
88239313Sdim
89239313Sdim  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90239313Sdim  void setKind(tok::TokenKind K) { Kind = K; }
91239313Sdim
92239313Sdim  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93239313Sdim  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94239313Sdim
95239313Sdim  unsigned getLength() const LLVM_READONLY { return Length; }
96239313Sdim  void setLength(unsigned L) { Length = L; }
97239313Sdim
98239313Sdim  StringRef getText() const LLVM_READONLY {
99239313Sdim    assert(is(tok::text));
100243830Sdim    return StringRef(TextPtr, IntVal);
101239313Sdim  }
102239313Sdim
103239313Sdim  void setText(StringRef Text) {
104239313Sdim    assert(is(tok::text));
105243830Sdim    TextPtr = Text.data();
106243830Sdim    IntVal = Text.size();
107239313Sdim  }
108239313Sdim
109243830Sdim  StringRef getUnknownCommandName() const LLVM_READONLY {
110243830Sdim    assert(is(tok::unknown_command));
111243830Sdim    return StringRef(TextPtr, IntVal);
112243830Sdim  }
113243830Sdim
114243830Sdim  void setUnknownCommandName(StringRef Name) {
115243830Sdim    assert(is(tok::unknown_command));
116243830Sdim    TextPtr = Name.data();
117243830Sdim    IntVal = Name.size();
118243830Sdim  }
119243830Sdim
120243830Sdim  unsigned getCommandID() const LLVM_READONLY {
121249423Sdim    assert(is(tok::backslash_command) || is(tok::at_command));
122243830Sdim    return IntVal;
123239313Sdim  }
124239313Sdim
125243830Sdim  void setCommandID(unsigned ID) {
126249423Sdim    assert(is(tok::backslash_command) || is(tok::at_command));
127243830Sdim    IntVal = ID;
128239313Sdim  }
129239313Sdim
130243830Sdim  unsigned getVerbatimBlockID() const LLVM_READONLY {
131239313Sdim    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
132243830Sdim    return IntVal;
133239313Sdim  }
134239313Sdim
135243830Sdim  void setVerbatimBlockID(unsigned ID) {
136239313Sdim    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
137243830Sdim    IntVal = ID;
138239313Sdim  }
139239313Sdim
140239313Sdim  StringRef getVerbatimBlockText() const LLVM_READONLY {
141239313Sdim    assert(is(tok::verbatim_block_line));
142243830Sdim    return StringRef(TextPtr, IntVal);
143239313Sdim  }
144239313Sdim
145239313Sdim  void setVerbatimBlockText(StringRef Text) {
146239313Sdim    assert(is(tok::verbatim_block_line));
147243830Sdim    TextPtr = Text.data();
148243830Sdim    IntVal = Text.size();
149239313Sdim  }
150239313Sdim
151243830Sdim  unsigned getVerbatimLineID() const LLVM_READONLY {
152239313Sdim    assert(is(tok::verbatim_line_name));
153243830Sdim    return IntVal;
154239313Sdim  }
155239313Sdim
156243830Sdim  void setVerbatimLineID(unsigned ID) {
157239313Sdim    assert(is(tok::verbatim_line_name));
158243830Sdim    IntVal = ID;
159239313Sdim  }
160239313Sdim
161239313Sdim  StringRef getVerbatimLineText() const LLVM_READONLY {
162239313Sdim    assert(is(tok::verbatim_line_text));
163243830Sdim    return StringRef(TextPtr, IntVal);
164239313Sdim  }
165239313Sdim
166239313Sdim  void setVerbatimLineText(StringRef Text) {
167239313Sdim    assert(is(tok::verbatim_line_text));
168243830Sdim    TextPtr = Text.data();
169243830Sdim    IntVal = Text.size();
170239313Sdim  }
171239313Sdim
172239313Sdim  StringRef getHTMLTagStartName() const LLVM_READONLY {
173239313Sdim    assert(is(tok::html_start_tag));
174243830Sdim    return StringRef(TextPtr, IntVal);
175239313Sdim  }
176239313Sdim
177239313Sdim  void setHTMLTagStartName(StringRef Name) {
178239313Sdim    assert(is(tok::html_start_tag));
179243830Sdim    TextPtr = Name.data();
180243830Sdim    IntVal = Name.size();
181239313Sdim  }
182239313Sdim
183239313Sdim  StringRef getHTMLIdent() const LLVM_READONLY {
184239313Sdim    assert(is(tok::html_ident));
185243830Sdim    return StringRef(TextPtr, IntVal);
186239313Sdim  }
187239313Sdim
188239313Sdim  void setHTMLIdent(StringRef Name) {
189239313Sdim    assert(is(tok::html_ident));
190243830Sdim    TextPtr = Name.data();
191243830Sdim    IntVal = Name.size();
192239313Sdim  }
193239313Sdim
194239313Sdim  StringRef getHTMLQuotedString() const LLVM_READONLY {
195239313Sdim    assert(is(tok::html_quoted_string));
196243830Sdim    return StringRef(TextPtr, IntVal);
197239313Sdim  }
198239313Sdim
199239313Sdim  void setHTMLQuotedString(StringRef Str) {
200239313Sdim    assert(is(tok::html_quoted_string));
201243830Sdim    TextPtr = Str.data();
202243830Sdim    IntVal = Str.size();
203239313Sdim  }
204239313Sdim
205239313Sdim  StringRef getHTMLTagEndName() const LLVM_READONLY {
206239313Sdim    assert(is(tok::html_end_tag));
207243830Sdim    return StringRef(TextPtr, IntVal);
208239313Sdim  }
209239313Sdim
210239313Sdim  void setHTMLTagEndName(StringRef Name) {
211239313Sdim    assert(is(tok::html_end_tag));
212243830Sdim    TextPtr = Name.data();
213243830Sdim    IntVal = Name.size();
214239313Sdim  }
215239313Sdim
216239313Sdim  void dump(const Lexer &L, const SourceManager &SM) const;
217239313Sdim};
218239313Sdim
219341825Sdim/// Comment lexer.
220239313Sdimclass Lexer {
221239313Sdimprivate:
222288943Sdim  Lexer(const Lexer &) = delete;
223288943Sdim  void operator=(const Lexer &) = delete;
224239313Sdim
225239313Sdim  /// Allocator for strings that are semantic values of tokens and have to be
226239313Sdim  /// computed (for example, resolved decimal character references).
227239313Sdim  llvm::BumpPtrAllocator &Allocator;
228239313Sdim
229251662Sdim  DiagnosticsEngine &Diags;
230341825Sdim
231239313Sdim  const CommandTraits &Traits;
232239313Sdim
233239313Sdim  const char *const BufferStart;
234239313Sdim  const char *const BufferEnd;
235239313Sdim  SourceLocation FileLoc;
236239313Sdim
237239313Sdim  const char *BufferPtr;
238239313Sdim
239239313Sdim  /// One past end pointer for the current comment.  For BCPL comments points
240239313Sdim  /// to newline or BufferEnd, for C comments points to star in '*/'.
241239313Sdim  const char *CommentEnd;
242239313Sdim
243239313Sdim  enum LexerCommentState {
244239313Sdim    LCS_BeforeComment,
245239313Sdim    LCS_InsideBCPLComment,
246239313Sdim    LCS_InsideCComment,
247239313Sdim    LCS_BetweenComments
248239313Sdim  };
249239313Sdim
250239313Sdim  /// Low-level lexer state, track if we are inside or outside of comment.
251239313Sdim  LexerCommentState CommentState;
252239313Sdim
253239313Sdim  enum LexerState {
254239313Sdim    /// Lexing normal comment text
255239313Sdim    LS_Normal,
256239313Sdim
257239313Sdim    /// Finished lexing verbatim block beginning command, will lex first body
258239313Sdim    /// line.
259239313Sdim    LS_VerbatimBlockFirstLine,
260239313Sdim
261239313Sdim    /// Lexing verbatim block body line-by-line, skipping line-starting
262239313Sdim    /// decorations.
263239313Sdim    LS_VerbatimBlockBody,
264239313Sdim
265239313Sdim    /// Finished lexing verbatim line beginning command, will lex text (one
266239313Sdim    /// line).
267239313Sdim    LS_VerbatimLineText,
268239313Sdim
269239313Sdim    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
270239313Sdim    LS_HTMLStartTag,
271239313Sdim
272239313Sdim    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
273239313Sdim    LS_HTMLEndTag
274239313Sdim  };
275239313Sdim
276239313Sdim  /// Current lexing mode.
277239313Sdim  LexerState State;
278239313Sdim
279239313Sdim  /// If State is LS_VerbatimBlock, contains the name of verbatim end
280239313Sdim  /// command, including command marker.
281239313Sdim  SmallString<16> VerbatimBlockEndCommandName;
282239313Sdim
283341825Sdim  /// If true, the commands, html tags, etc will be parsed and reported as
284341825Sdim  /// separate tokens inside the comment body. If false, the comment text will
285341825Sdim  /// be parsed into text and newline tokens.
286341825Sdim  bool ParseCommands;
287341825Sdim
288239313Sdim  /// Given a character reference name (e.g., "lt"), return the character that
289239313Sdim  /// it stands for (e.g., "<").
290239313Sdim  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
291239313Sdim
292239313Sdim  /// Given a Unicode codepoint as base-10 integer, return the character.
293239313Sdim  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
294239313Sdim
295239313Sdim  /// Given a Unicode codepoint as base-16 integer, return the character.
296239313Sdim  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
297239313Sdim
298239313Sdim  void formTokenWithChars(Token &Result, const char *TokEnd,
299276479Sdim                          tok::TokenKind Kind);
300239313Sdim
301239313Sdim  void formTextToken(Token &Result, const char *TokEnd) {
302239313Sdim    StringRef Text(BufferPtr, TokEnd - BufferPtr);
303239313Sdim    formTokenWithChars(Result, TokEnd, tok::text);
304239313Sdim    Result.setText(Text);
305239313Sdim  }
306239313Sdim
307239313Sdim  SourceLocation getSourceLocation(const char *Loc) const {
308239313Sdim    assert(Loc >= BufferStart && Loc <= BufferEnd &&
309239313Sdim           "Location out of range for this buffer!");
310239313Sdim
311239313Sdim    const unsigned CharNo = Loc - BufferStart;
312239313Sdim    return FileLoc.getLocWithOffset(CharNo);
313239313Sdim  }
314239313Sdim
315251662Sdim  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
316251662Sdim    return Diags.Report(Loc, DiagID);
317251662Sdim  }
318251662Sdim
319239313Sdim  /// Eat string matching regexp \code \s*\* \endcode.
320239313Sdim  void skipLineStartingDecorations();
321239313Sdim
322341825Sdim  /// Lex comment text, including commands if ParseCommands is set to true.
323239313Sdim  void lexCommentText(Token &T);
324239313Sdim
325341825Sdim  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
326341825Sdim                                const CommandInfo *Info);
327239313Sdim
328239313Sdim  void lexVerbatimBlockFirstLine(Token &T);
329239313Sdim
330239313Sdim  void lexVerbatimBlockBody(Token &T);
331239313Sdim
332243830Sdim  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
333243830Sdim                               const CommandInfo *Info);
334239313Sdim
335239313Sdim  void lexVerbatimLineText(Token &T);
336239313Sdim
337239313Sdim  void lexHTMLCharacterReference(Token &T);
338239313Sdim
339239313Sdim  void setupAndLexHTMLStartTag(Token &T);
340239313Sdim
341239313Sdim  void lexHTMLStartTag(Token &T);
342239313Sdim
343239313Sdim  void setupAndLexHTMLEndTag(Token &T);
344239313Sdim
345239313Sdim  void lexHTMLEndTag(Token &T);
346239313Sdim
347239313Sdimpublic:
348251662Sdim  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
349341825Sdim        const CommandTraits &Traits, SourceLocation FileLoc,
350341825Sdim        const char *BufferStart, const char *BufferEnd,
351341825Sdim        bool ParseCommands = true);
352239313Sdim
353239313Sdim  void lex(Token &T);
354239313Sdim
355360784Sdim  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
356239313Sdim};
357239313Sdim
358239313Sdim} // end namespace comments
359239313Sdim} // end namespace clang
360239313Sdim
361239313Sdim#endif
362239313Sdim
363