1239313Sdim//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2239313Sdim// 3239313Sdim// The LLVM Compiler Infrastructure 4239313Sdim// 5239313Sdim// This file is distributed under the University of Illinois Open Source 6239313Sdim// License. See LICENSE.TXT for details. 7239313Sdim// 8239313Sdim//===----------------------------------------------------------------------===// 9239313Sdim// 10239313Sdim// This file defines lexer for structured comments and supporting token class. 11239313Sdim// 12239313Sdim//===----------------------------------------------------------------------===// 13239313Sdim 14239313Sdim#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H 15239313Sdim#define LLVM_CLANG_AST_COMMENT_LEXER_H 16239313Sdim 17239313Sdim#include "clang/Basic/SourceManager.h" 18251662Sdim#include "clang/Basic/Diagnostic.h" 19239313Sdim#include "llvm/ADT/SmallString.h" 20239313Sdim#include "llvm/ADT/SmallVector.h" 21249423Sdim#include "llvm/ADT/StringRef.h" 22239313Sdim#include "llvm/Support/Allocator.h" 23239313Sdim#include "llvm/Support/raw_ostream.h" 24239313Sdim 25239313Sdimnamespace clang { 26239313Sdimnamespace comments { 27239313Sdim 28239313Sdimclass Lexer; 29239313Sdimclass TextTokenRetokenizer; 30243830Sdimstruct CommandInfo; 31239313Sdimclass CommandTraits; 32239313Sdim 33239313Sdimnamespace tok { 34239313Sdimenum TokenKind { 35239313Sdim eof, 36239313Sdim newline, 37239313Sdim text, 38249423Sdim unknown_command, // Command that does not have an ID. 39249423Sdim backslash_command, // Command with an ID, that used backslash marker. 40249423Sdim at_command, // Command with an ID, that used 'at' marker. 41239313Sdim verbatim_block_begin, 42239313Sdim verbatim_block_line, 43239313Sdim verbatim_block_end, 44239313Sdim verbatim_line_name, 45239313Sdim verbatim_line_text, 46239313Sdim html_start_tag, // <tag 47239313Sdim html_ident, // attr 48239313Sdim html_equals, // = 49239313Sdim html_quoted_string, // "blah\"blah" or 'blah\'blah' 50239313Sdim html_greater, // > 51239313Sdim html_slash_greater, // /> 52239313Sdim html_end_tag // </tag 53239313Sdim}; 54239313Sdim} // end namespace tok 55239313Sdim 56239313Sdim/// \brief Comment token. 57239313Sdimclass Token { 58239313Sdim friend class Lexer; 59239313Sdim friend class TextTokenRetokenizer; 60239313Sdim 61239313Sdim /// The location of the token. 62239313Sdim SourceLocation Loc; 63239313Sdim 64239313Sdim /// The actual kind of the token. 65239313Sdim tok::TokenKind Kind; 66239313Sdim 67239313Sdim /// Length of the token spelling in comment. Can be 0 for synthenized 68239313Sdim /// tokens. 69239313Sdim unsigned Length; 70239313Sdim 71239313Sdim /// Contains text value associated with a token. 72243830Sdim const char *TextPtr; 73239313Sdim 74243830Sdim /// Integer value associated with a token. 75243830Sdim /// 76243830Sdim /// If the token is a konwn command, contains command ID and TextPtr is 77243830Sdim /// unused (command spelling can be found with CommandTraits). Otherwise, 78243830Sdim /// contains the length of the string that starts at TextPtr. 79243830Sdim unsigned IntVal; 80249423Sdim 81239313Sdimpublic: 82239313Sdim SourceLocation getLocation() const LLVM_READONLY { return Loc; } 83239313Sdim void setLocation(SourceLocation SL) { Loc = SL; } 84239313Sdim 85239313Sdim SourceLocation getEndLocation() const LLVM_READONLY { 86239313Sdim if (Length == 0 || Length == 1) 87239313Sdim return Loc; 88239313Sdim return Loc.getLocWithOffset(Length - 1); 89239313Sdim } 90239313Sdim 91239313Sdim tok::TokenKind getKind() const LLVM_READONLY { return Kind; } 92239313Sdim void setKind(tok::TokenKind K) { Kind = K; } 93239313Sdim 94239313Sdim bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } 95239313Sdim bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 96239313Sdim 97239313Sdim unsigned getLength() const LLVM_READONLY { return Length; } 98239313Sdim void setLength(unsigned L) { Length = L; } 99239313Sdim 100239313Sdim StringRef getText() const LLVM_READONLY { 101239313Sdim assert(is(tok::text)); 102243830Sdim return StringRef(TextPtr, IntVal); 103239313Sdim } 104239313Sdim 105239313Sdim void setText(StringRef Text) { 106239313Sdim assert(is(tok::text)); 107243830Sdim TextPtr = Text.data(); 108243830Sdim IntVal = Text.size(); 109239313Sdim } 110239313Sdim 111243830Sdim StringRef getUnknownCommandName() const LLVM_READONLY { 112243830Sdim assert(is(tok::unknown_command)); 113243830Sdim return StringRef(TextPtr, IntVal); 114243830Sdim } 115243830Sdim 116243830Sdim void setUnknownCommandName(StringRef Name) { 117243830Sdim assert(is(tok::unknown_command)); 118243830Sdim TextPtr = Name.data(); 119243830Sdim IntVal = Name.size(); 120243830Sdim } 121243830Sdim 122243830Sdim unsigned getCommandID() const LLVM_READONLY { 123249423Sdim assert(is(tok::backslash_command) || is(tok::at_command)); 124243830Sdim return IntVal; 125239313Sdim } 126239313Sdim 127243830Sdim void setCommandID(unsigned ID) { 128249423Sdim assert(is(tok::backslash_command) || is(tok::at_command)); 129243830Sdim IntVal = ID; 130239313Sdim } 131239313Sdim 132243830Sdim unsigned getVerbatimBlockID() const LLVM_READONLY { 133239313Sdim assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 134243830Sdim return IntVal; 135239313Sdim } 136239313Sdim 137243830Sdim void setVerbatimBlockID(unsigned ID) { 138239313Sdim assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 139243830Sdim IntVal = ID; 140239313Sdim } 141239313Sdim 142239313Sdim StringRef getVerbatimBlockText() const LLVM_READONLY { 143239313Sdim assert(is(tok::verbatim_block_line)); 144243830Sdim return StringRef(TextPtr, IntVal); 145239313Sdim } 146239313Sdim 147239313Sdim void setVerbatimBlockText(StringRef Text) { 148239313Sdim assert(is(tok::verbatim_block_line)); 149243830Sdim TextPtr = Text.data(); 150243830Sdim IntVal = Text.size(); 151239313Sdim } 152239313Sdim 153243830Sdim unsigned getVerbatimLineID() const LLVM_READONLY { 154239313Sdim assert(is(tok::verbatim_line_name)); 155243830Sdim return IntVal; 156239313Sdim } 157239313Sdim 158243830Sdim void setVerbatimLineID(unsigned ID) { 159239313Sdim assert(is(tok::verbatim_line_name)); 160243830Sdim IntVal = ID; 161239313Sdim } 162239313Sdim 163239313Sdim StringRef getVerbatimLineText() const LLVM_READONLY { 164239313Sdim assert(is(tok::verbatim_line_text)); 165243830Sdim return StringRef(TextPtr, IntVal); 166239313Sdim } 167239313Sdim 168239313Sdim void setVerbatimLineText(StringRef Text) { 169239313Sdim assert(is(tok::verbatim_line_text)); 170243830Sdim TextPtr = Text.data(); 171243830Sdim IntVal = Text.size(); 172239313Sdim } 173239313Sdim 174239313Sdim StringRef getHTMLTagStartName() const LLVM_READONLY { 175239313Sdim assert(is(tok::html_start_tag)); 176243830Sdim return StringRef(TextPtr, IntVal); 177239313Sdim } 178239313Sdim 179239313Sdim void setHTMLTagStartName(StringRef Name) { 180239313Sdim assert(is(tok::html_start_tag)); 181243830Sdim TextPtr = Name.data(); 182243830Sdim IntVal = Name.size(); 183239313Sdim } 184239313Sdim 185239313Sdim StringRef getHTMLIdent() const LLVM_READONLY { 186239313Sdim assert(is(tok::html_ident)); 187243830Sdim return StringRef(TextPtr, IntVal); 188239313Sdim } 189239313Sdim 190239313Sdim void setHTMLIdent(StringRef Name) { 191239313Sdim assert(is(tok::html_ident)); 192243830Sdim TextPtr = Name.data(); 193243830Sdim IntVal = Name.size(); 194239313Sdim } 195239313Sdim 196239313Sdim StringRef getHTMLQuotedString() const LLVM_READONLY { 197239313Sdim assert(is(tok::html_quoted_string)); 198243830Sdim return StringRef(TextPtr, IntVal); 199239313Sdim } 200239313Sdim 201239313Sdim void setHTMLQuotedString(StringRef Str) { 202239313Sdim assert(is(tok::html_quoted_string)); 203243830Sdim TextPtr = Str.data(); 204243830Sdim IntVal = Str.size(); 205239313Sdim } 206239313Sdim 207239313Sdim StringRef getHTMLTagEndName() const LLVM_READONLY { 208239313Sdim assert(is(tok::html_end_tag)); 209243830Sdim return StringRef(TextPtr, IntVal); 210239313Sdim } 211239313Sdim 212239313Sdim void setHTMLTagEndName(StringRef Name) { 213239313Sdim assert(is(tok::html_end_tag)); 214243830Sdim TextPtr = Name.data(); 215243830Sdim IntVal = Name.size(); 216239313Sdim } 217239313Sdim 218239313Sdim void dump(const Lexer &L, const SourceManager &SM) const; 219239313Sdim}; 220239313Sdim 221239313Sdim/// \brief Comment lexer. 222239313Sdimclass Lexer { 223239313Sdimprivate: 224243830Sdim Lexer(const Lexer &) LLVM_DELETED_FUNCTION; 225243830Sdim void operator=(const Lexer &) LLVM_DELETED_FUNCTION; 226239313Sdim 227239313Sdim /// Allocator for strings that are semantic values of tokens and have to be 228239313Sdim /// computed (for example, resolved decimal character references). 229239313Sdim llvm::BumpPtrAllocator &Allocator; 230239313Sdim 231251662Sdim DiagnosticsEngine &Diags; 232251662Sdim 233239313Sdim const CommandTraits &Traits; 234239313Sdim 235239313Sdim const char *const BufferStart; 236239313Sdim const char *const BufferEnd; 237239313Sdim SourceLocation FileLoc; 238239313Sdim 239239313Sdim const char *BufferPtr; 240239313Sdim 241239313Sdim /// One past end pointer for the current comment. For BCPL comments points 242239313Sdim /// to newline or BufferEnd, for C comments points to star in '*/'. 243239313Sdim const char *CommentEnd; 244239313Sdim 245239313Sdim enum LexerCommentState { 246239313Sdim LCS_BeforeComment, 247239313Sdim LCS_InsideBCPLComment, 248239313Sdim LCS_InsideCComment, 249239313Sdim LCS_BetweenComments 250239313Sdim }; 251239313Sdim 252239313Sdim /// Low-level lexer state, track if we are inside or outside of comment. 253239313Sdim LexerCommentState CommentState; 254239313Sdim 255239313Sdim enum LexerState { 256239313Sdim /// Lexing normal comment text 257239313Sdim LS_Normal, 258239313Sdim 259239313Sdim /// Finished lexing verbatim block beginning command, will lex first body 260239313Sdim /// line. 261239313Sdim LS_VerbatimBlockFirstLine, 262239313Sdim 263239313Sdim /// Lexing verbatim block body line-by-line, skipping line-starting 264239313Sdim /// decorations. 265239313Sdim LS_VerbatimBlockBody, 266239313Sdim 267239313Sdim /// Finished lexing verbatim line beginning command, will lex text (one 268239313Sdim /// line). 269239313Sdim LS_VerbatimLineText, 270239313Sdim 271239313Sdim /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 272239313Sdim LS_HTMLStartTag, 273239313Sdim 274239313Sdim /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 275239313Sdim LS_HTMLEndTag 276239313Sdim }; 277239313Sdim 278239313Sdim /// Current lexing mode. 279239313Sdim LexerState State; 280239313Sdim 281239313Sdim /// If State is LS_VerbatimBlock, contains the name of verbatim end 282239313Sdim /// command, including command marker. 283239313Sdim SmallString<16> VerbatimBlockEndCommandName; 284239313Sdim 285239313Sdim /// Given a character reference name (e.g., "lt"), return the character that 286239313Sdim /// it stands for (e.g., "<"). 287239313Sdim StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 288239313Sdim 289239313Sdim /// Given a Unicode codepoint as base-10 integer, return the character. 290239313Sdim StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 291239313Sdim 292239313Sdim /// Given a Unicode codepoint as base-16 integer, return the character. 293239313Sdim StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 294239313Sdim 295239313Sdim void formTokenWithChars(Token &Result, const char *TokEnd, 296239313Sdim tok::TokenKind Kind) { 297239313Sdim const unsigned TokLen = TokEnd - BufferPtr; 298239313Sdim Result.setLocation(getSourceLocation(BufferPtr)); 299239313Sdim Result.setKind(Kind); 300239313Sdim Result.setLength(TokLen); 301239313Sdim#ifndef NDEBUG 302243830Sdim Result.TextPtr = "<UNSET>"; 303243830Sdim Result.IntVal = 7; 304239313Sdim#endif 305239313Sdim BufferPtr = TokEnd; 306239313Sdim } 307239313Sdim 308239313Sdim void formTextToken(Token &Result, const char *TokEnd) { 309239313Sdim StringRef Text(BufferPtr, TokEnd - BufferPtr); 310239313Sdim formTokenWithChars(Result, TokEnd, tok::text); 311239313Sdim Result.setText(Text); 312239313Sdim } 313239313Sdim 314239313Sdim SourceLocation getSourceLocation(const char *Loc) const { 315239313Sdim assert(Loc >= BufferStart && Loc <= BufferEnd && 316239313Sdim "Location out of range for this buffer!"); 317239313Sdim 318239313Sdim const unsigned CharNo = Loc - BufferStart; 319239313Sdim return FileLoc.getLocWithOffset(CharNo); 320239313Sdim } 321239313Sdim 322251662Sdim DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { 323251662Sdim return Diags.Report(Loc, DiagID); 324251662Sdim } 325251662Sdim 326239313Sdim /// Eat string matching regexp \code \s*\* \endcode. 327239313Sdim void skipLineStartingDecorations(); 328239313Sdim 329239313Sdim /// Lex stuff inside comments. CommentEnd should be set correctly. 330239313Sdim void lexCommentText(Token &T); 331239313Sdim 332239313Sdim void setupAndLexVerbatimBlock(Token &T, 333239313Sdim const char *TextBegin, 334243830Sdim char Marker, const CommandInfo *Info); 335239313Sdim 336239313Sdim void lexVerbatimBlockFirstLine(Token &T); 337239313Sdim 338239313Sdim void lexVerbatimBlockBody(Token &T); 339239313Sdim 340243830Sdim void setupAndLexVerbatimLine(Token &T, const char *TextBegin, 341243830Sdim const CommandInfo *Info); 342239313Sdim 343239313Sdim void lexVerbatimLineText(Token &T); 344239313Sdim 345239313Sdim void lexHTMLCharacterReference(Token &T); 346239313Sdim 347239313Sdim void setupAndLexHTMLStartTag(Token &T); 348239313Sdim 349239313Sdim void lexHTMLStartTag(Token &T); 350239313Sdim 351239313Sdim void setupAndLexHTMLEndTag(Token &T); 352239313Sdim 353239313Sdim void lexHTMLEndTag(Token &T); 354239313Sdim 355239313Sdimpublic: 356251662Sdim Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 357251662Sdim const CommandTraits &Traits, 358243830Sdim SourceLocation FileLoc, 359239313Sdim const char *BufferStart, const char *BufferEnd); 360239313Sdim 361239313Sdim void lex(Token &T); 362239313Sdim 363239313Sdim StringRef getSpelling(const Token &Tok, 364239313Sdim const SourceManager &SourceMgr, 365239313Sdim bool *Invalid = NULL) const; 366239313Sdim}; 367239313Sdim 368239313Sdim} // end namespace comments 369239313Sdim} // end namespace clang 370239313Sdim 371239313Sdim#endif 372239313Sdim 373