1239313Sdim//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2239313Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6239313Sdim// 7239313Sdim//===----------------------------------------------------------------------===// 8239313Sdim// 9239313Sdim// This file defines lexer for structured comments and supporting token class. 10239313Sdim// 11239313Sdim//===----------------------------------------------------------------------===// 12239313Sdim 13280031Sdim#ifndef LLVM_CLANG_AST_COMMENTLEXER_H 14280031Sdim#define LLVM_CLANG_AST_COMMENTLEXER_H 15239313Sdim 16276479Sdim#include "clang/Basic/Diagnostic.h" 17239313Sdim#include "clang/Basic/SourceManager.h" 18239313Sdim#include "llvm/ADT/SmallString.h" 19249423Sdim#include "llvm/ADT/StringRef.h" 20239313Sdim#include "llvm/Support/Allocator.h" 21239313Sdim#include "llvm/Support/raw_ostream.h" 22239313Sdim 23239313Sdimnamespace clang { 24239313Sdimnamespace comments { 25239313Sdim 26239313Sdimclass Lexer; 27239313Sdimclass TextTokenRetokenizer; 28243830Sdimstruct CommandInfo; 29239313Sdimclass CommandTraits; 30239313Sdim 31239313Sdimnamespace tok { 32239313Sdimenum TokenKind { 33239313Sdim eof, 34239313Sdim newline, 35239313Sdim text, 36249423Sdim unknown_command, // Command that does not have an ID. 37249423Sdim backslash_command, // Command with an ID, that used backslash marker. 38249423Sdim at_command, // Command with an ID, that used 'at' marker. 39239313Sdim verbatim_block_begin, 40239313Sdim verbatim_block_line, 41239313Sdim verbatim_block_end, 42239313Sdim verbatim_line_name, 43239313Sdim verbatim_line_text, 44239313Sdim html_start_tag, // <tag 45239313Sdim html_ident, // attr 46239313Sdim html_equals, // = 47239313Sdim html_quoted_string, // "blah\"blah" or 'blah\'blah' 48239313Sdim html_greater, // > 49239313Sdim html_slash_greater, // /> 50239313Sdim html_end_tag // </tag 51239313Sdim}; 52239313Sdim} // end namespace tok 53239313Sdim 54341825Sdim/// Comment token. 55239313Sdimclass Token { 56239313Sdim friend class Lexer; 57239313Sdim friend class TextTokenRetokenizer; 58239313Sdim 59239313Sdim /// The location of the token. 60239313Sdim SourceLocation Loc; 61239313Sdim 62239313Sdim /// The actual kind of the token. 63239313Sdim tok::TokenKind Kind; 64239313Sdim 65239313Sdim /// Length of the token spelling in comment. Can be 0 for synthenized 66239313Sdim /// tokens. 67239313Sdim unsigned Length; 68239313Sdim 69239313Sdim /// Contains text value associated with a token. 70243830Sdim const char *TextPtr; 71239313Sdim 72243830Sdim /// Integer value associated with a token. 73243830Sdim /// 74341825Sdim /// If the token is a known command, contains command ID and TextPtr is 75243830Sdim /// unused (command spelling can be found with CommandTraits). Otherwise, 76243830Sdim /// contains the length of the string that starts at TextPtr. 77243830Sdim unsigned IntVal; 78341825Sdim 79239313Sdimpublic: 80239313Sdim SourceLocation getLocation() const LLVM_READONLY { return Loc; } 81239313Sdim void setLocation(SourceLocation SL) { Loc = SL; } 82239313Sdim 83239313Sdim SourceLocation getEndLocation() const LLVM_READONLY { 84239313Sdim if (Length == 0 || Length == 1) 85239313Sdim return Loc; 86239313Sdim return Loc.getLocWithOffset(Length - 1); 87239313Sdim } 88239313Sdim 89239313Sdim tok::TokenKind getKind() const LLVM_READONLY { return Kind; } 90239313Sdim void setKind(tok::TokenKind K) { Kind = K; } 91239313Sdim 92239313Sdim bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } 93239313Sdim bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 94239313Sdim 95239313Sdim unsigned getLength() const LLVM_READONLY { return Length; } 96239313Sdim void setLength(unsigned L) { Length = L; } 97239313Sdim 98239313Sdim StringRef getText() const LLVM_READONLY { 99239313Sdim assert(is(tok::text)); 100243830Sdim return StringRef(TextPtr, IntVal); 101239313Sdim } 102239313Sdim 103239313Sdim void setText(StringRef Text) { 104239313Sdim assert(is(tok::text)); 105243830Sdim TextPtr = Text.data(); 106243830Sdim IntVal = Text.size(); 107239313Sdim } 108239313Sdim 109243830Sdim StringRef getUnknownCommandName() const LLVM_READONLY { 110243830Sdim assert(is(tok::unknown_command)); 111243830Sdim return StringRef(TextPtr, IntVal); 112243830Sdim } 113243830Sdim 114243830Sdim void setUnknownCommandName(StringRef Name) { 115243830Sdim assert(is(tok::unknown_command)); 116243830Sdim TextPtr = Name.data(); 117243830Sdim IntVal = Name.size(); 118243830Sdim } 119243830Sdim 120243830Sdim unsigned getCommandID() const LLVM_READONLY { 121249423Sdim assert(is(tok::backslash_command) || is(tok::at_command)); 122243830Sdim return IntVal; 123239313Sdim } 124239313Sdim 125243830Sdim void setCommandID(unsigned ID) { 126249423Sdim assert(is(tok::backslash_command) || is(tok::at_command)); 127243830Sdim IntVal = ID; 128239313Sdim } 129239313Sdim 130243830Sdim unsigned getVerbatimBlockID() const LLVM_READONLY { 131239313Sdim assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 132243830Sdim return IntVal; 133239313Sdim } 134239313Sdim 135243830Sdim void setVerbatimBlockID(unsigned ID) { 136239313Sdim assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 137243830Sdim IntVal = ID; 138239313Sdim } 139239313Sdim 140239313Sdim StringRef getVerbatimBlockText() const LLVM_READONLY { 141239313Sdim assert(is(tok::verbatim_block_line)); 142243830Sdim return StringRef(TextPtr, IntVal); 143239313Sdim } 144239313Sdim 145239313Sdim void setVerbatimBlockText(StringRef Text) { 146239313Sdim assert(is(tok::verbatim_block_line)); 147243830Sdim TextPtr = Text.data(); 148243830Sdim IntVal = Text.size(); 149239313Sdim } 150239313Sdim 151243830Sdim unsigned getVerbatimLineID() const LLVM_READONLY { 152239313Sdim assert(is(tok::verbatim_line_name)); 153243830Sdim return IntVal; 154239313Sdim } 155239313Sdim 156243830Sdim void setVerbatimLineID(unsigned ID) { 157239313Sdim assert(is(tok::verbatim_line_name)); 158243830Sdim IntVal = ID; 159239313Sdim } 160239313Sdim 161239313Sdim StringRef getVerbatimLineText() const LLVM_READONLY { 162239313Sdim assert(is(tok::verbatim_line_text)); 163243830Sdim return StringRef(TextPtr, IntVal); 164239313Sdim } 165239313Sdim 166239313Sdim void setVerbatimLineText(StringRef Text) { 167239313Sdim assert(is(tok::verbatim_line_text)); 168243830Sdim TextPtr = Text.data(); 169243830Sdim IntVal = Text.size(); 170239313Sdim } 171239313Sdim 172239313Sdim StringRef getHTMLTagStartName() const LLVM_READONLY { 173239313Sdim assert(is(tok::html_start_tag)); 174243830Sdim return StringRef(TextPtr, IntVal); 175239313Sdim } 176239313Sdim 177239313Sdim void setHTMLTagStartName(StringRef Name) { 178239313Sdim assert(is(tok::html_start_tag)); 179243830Sdim TextPtr = Name.data(); 180243830Sdim IntVal = Name.size(); 181239313Sdim } 182239313Sdim 183239313Sdim StringRef getHTMLIdent() const LLVM_READONLY { 184239313Sdim assert(is(tok::html_ident)); 185243830Sdim return StringRef(TextPtr, IntVal); 186239313Sdim } 187239313Sdim 188239313Sdim void setHTMLIdent(StringRef Name) { 189239313Sdim assert(is(tok::html_ident)); 190243830Sdim TextPtr = Name.data(); 191243830Sdim IntVal = Name.size(); 192239313Sdim } 193239313Sdim 194239313Sdim StringRef getHTMLQuotedString() const LLVM_READONLY { 195239313Sdim assert(is(tok::html_quoted_string)); 196243830Sdim return StringRef(TextPtr, IntVal); 197239313Sdim } 198239313Sdim 199239313Sdim void setHTMLQuotedString(StringRef Str) { 200239313Sdim assert(is(tok::html_quoted_string)); 201243830Sdim TextPtr = Str.data(); 202243830Sdim IntVal = Str.size(); 203239313Sdim } 204239313Sdim 205239313Sdim StringRef getHTMLTagEndName() const LLVM_READONLY { 206239313Sdim assert(is(tok::html_end_tag)); 207243830Sdim return StringRef(TextPtr, IntVal); 208239313Sdim } 209239313Sdim 210239313Sdim void setHTMLTagEndName(StringRef Name) { 211239313Sdim assert(is(tok::html_end_tag)); 212243830Sdim TextPtr = Name.data(); 213243830Sdim IntVal = Name.size(); 214239313Sdim } 215239313Sdim 216239313Sdim void dump(const Lexer &L, const SourceManager &SM) const; 217239313Sdim}; 218239313Sdim 219341825Sdim/// Comment lexer. 220239313Sdimclass Lexer { 221239313Sdimprivate: 222288943Sdim Lexer(const Lexer &) = delete; 223288943Sdim void operator=(const Lexer &) = delete; 224239313Sdim 225239313Sdim /// Allocator for strings that are semantic values of tokens and have to be 226239313Sdim /// computed (for example, resolved decimal character references). 227239313Sdim llvm::BumpPtrAllocator &Allocator; 228239313Sdim 229251662Sdim DiagnosticsEngine &Diags; 230341825Sdim 231239313Sdim const CommandTraits &Traits; 232239313Sdim 233239313Sdim const char *const BufferStart; 234239313Sdim const char *const BufferEnd; 235239313Sdim SourceLocation FileLoc; 236239313Sdim 237239313Sdim const char *BufferPtr; 238239313Sdim 239239313Sdim /// One past end pointer for the current comment. For BCPL comments points 240239313Sdim /// to newline or BufferEnd, for C comments points to star in '*/'. 241239313Sdim const char *CommentEnd; 242239313Sdim 243239313Sdim enum LexerCommentState { 244239313Sdim LCS_BeforeComment, 245239313Sdim LCS_InsideBCPLComment, 246239313Sdim LCS_InsideCComment, 247239313Sdim LCS_BetweenComments 248239313Sdim }; 249239313Sdim 250239313Sdim /// Low-level lexer state, track if we are inside or outside of comment. 251239313Sdim LexerCommentState CommentState; 252239313Sdim 253239313Sdim enum LexerState { 254239313Sdim /// Lexing normal comment text 255239313Sdim LS_Normal, 256239313Sdim 257239313Sdim /// Finished lexing verbatim block beginning command, will lex first body 258239313Sdim /// line. 259239313Sdim LS_VerbatimBlockFirstLine, 260239313Sdim 261239313Sdim /// Lexing verbatim block body line-by-line, skipping line-starting 262239313Sdim /// decorations. 263239313Sdim LS_VerbatimBlockBody, 264239313Sdim 265239313Sdim /// Finished lexing verbatim line beginning command, will lex text (one 266239313Sdim /// line). 267239313Sdim LS_VerbatimLineText, 268239313Sdim 269239313Sdim /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 270239313Sdim LS_HTMLStartTag, 271239313Sdim 272239313Sdim /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 273239313Sdim LS_HTMLEndTag 274239313Sdim }; 275239313Sdim 276239313Sdim /// Current lexing mode. 277239313Sdim LexerState State; 278239313Sdim 279239313Sdim /// If State is LS_VerbatimBlock, contains the name of verbatim end 280239313Sdim /// command, including command marker. 281239313Sdim SmallString<16> VerbatimBlockEndCommandName; 282239313Sdim 283341825Sdim /// If true, the commands, html tags, etc will be parsed and reported as 284341825Sdim /// separate tokens inside the comment body. If false, the comment text will 285341825Sdim /// be parsed into text and newline tokens. 286341825Sdim bool ParseCommands; 287341825Sdim 288239313Sdim /// Given a character reference name (e.g., "lt"), return the character that 289239313Sdim /// it stands for (e.g., "<"). 290239313Sdim StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 291239313Sdim 292239313Sdim /// Given a Unicode codepoint as base-10 integer, return the character. 293239313Sdim StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 294239313Sdim 295239313Sdim /// Given a Unicode codepoint as base-16 integer, return the character. 296239313Sdim StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 297239313Sdim 298239313Sdim void formTokenWithChars(Token &Result, const char *TokEnd, 299276479Sdim tok::TokenKind Kind); 300239313Sdim 301239313Sdim void formTextToken(Token &Result, const char *TokEnd) { 302239313Sdim StringRef Text(BufferPtr, TokEnd - BufferPtr); 303239313Sdim formTokenWithChars(Result, TokEnd, tok::text); 304239313Sdim Result.setText(Text); 305239313Sdim } 306239313Sdim 307239313Sdim SourceLocation getSourceLocation(const char *Loc) const { 308239313Sdim assert(Loc >= BufferStart && Loc <= BufferEnd && 309239313Sdim "Location out of range for this buffer!"); 310239313Sdim 311239313Sdim const unsigned CharNo = Loc - BufferStart; 312239313Sdim return FileLoc.getLocWithOffset(CharNo); 313239313Sdim } 314239313Sdim 315251662Sdim DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { 316251662Sdim return Diags.Report(Loc, DiagID); 317251662Sdim } 318251662Sdim 319239313Sdim /// Eat string matching regexp \code \s*\* \endcode. 320239313Sdim void skipLineStartingDecorations(); 321239313Sdim 322341825Sdim /// Lex comment text, including commands if ParseCommands is set to true. 323239313Sdim void lexCommentText(Token &T); 324239313Sdim 325341825Sdim void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, 326341825Sdim const CommandInfo *Info); 327239313Sdim 328239313Sdim void lexVerbatimBlockFirstLine(Token &T); 329239313Sdim 330239313Sdim void lexVerbatimBlockBody(Token &T); 331239313Sdim 332243830Sdim void setupAndLexVerbatimLine(Token &T, const char *TextBegin, 333243830Sdim const CommandInfo *Info); 334239313Sdim 335239313Sdim void lexVerbatimLineText(Token &T); 336239313Sdim 337239313Sdim void lexHTMLCharacterReference(Token &T); 338239313Sdim 339239313Sdim void setupAndLexHTMLStartTag(Token &T); 340239313Sdim 341239313Sdim void lexHTMLStartTag(Token &T); 342239313Sdim 343239313Sdim void setupAndLexHTMLEndTag(Token &T); 344239313Sdim 345239313Sdim void lexHTMLEndTag(Token &T); 346239313Sdim 347239313Sdimpublic: 348251662Sdim Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 349341825Sdim const CommandTraits &Traits, SourceLocation FileLoc, 350341825Sdim const char *BufferStart, const char *BufferEnd, 351341825Sdim bool ParseCommands = true); 352239313Sdim 353239313Sdim void lex(Token &T); 354239313Sdim 355360784Sdim StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const; 356239313Sdim}; 357239313Sdim 358239313Sdim} // end namespace comments 359239313Sdim} // end namespace clang 360239313Sdim 361239313Sdim#endif 362239313Sdim 363