1239313Sdim#include "clang/AST/CommentLexer.h" 2239313Sdim#include "clang/AST/CommentCommandTraits.h" 3251662Sdim#include "clang/AST/CommentDiagnostic.h" 4249423Sdim#include "clang/Basic/CharInfo.h" 5249423Sdim#include "llvm/ADT/StringExtras.h" 6239313Sdim#include "llvm/ADT/StringSwitch.h" 7249423Sdim#include "llvm/Support/ConvertUTF.h" 8239313Sdim#include "llvm/Support/ErrorHandling.h" 9239313Sdim 10239313Sdimnamespace clang { 11239313Sdimnamespace comments { 12239313Sdim 13239313Sdimvoid Token::dump(const Lexer &L, const SourceManager &SM) const { 14239313Sdim llvm::errs() << "comments::Token Kind=" << Kind << " "; 15239313Sdim Loc.dump(SM); 16239313Sdim llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 17239313Sdim} 18239313Sdim 19249423Sdimstatic inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 20249423Sdim return isLetter(C); 21239313Sdim} 22239313Sdim 23249423Sdimstatic inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 24249423Sdim return isDigit(C); 25239313Sdim} 26239313Sdim 27249423Sdimstatic inline bool isHTMLHexCharacterReferenceCharacter(char C) { 28249423Sdim return isHexDigit(C); 29239313Sdim} 30243830Sdim 31249423Sdimstatic inline StringRef convertCodePointToUTF8( 32249423Sdim llvm::BumpPtrAllocator &Allocator, 33249423Sdim unsigned CodePoint) { 34249423Sdim char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 35249423Sdim char *ResolvedPtr = Resolved; 36249423Sdim if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 37249423Sdim return StringRef(Resolved, ResolvedPtr - Resolved); 38249423Sdim else 39249423Sdim return StringRef(); 40249423Sdim} 41249423Sdim 42249423Sdimnamespace { 43249423Sdim 44243830Sdim#include "clang/AST/CommentHTMLTags.inc" 45249423Sdim#include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 46243830Sdim 47239313Sdim} // unnamed namespace 48239313Sdim 49239313SdimStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 50249423Sdim // Fast path, first check a few most widely used named character references. 51239313Sdim return llvm::StringSwitch<StringRef>(Name) 52239313Sdim .Case("amp", "&") 53239313Sdim .Case("lt", "<") 54239313Sdim .Case("gt", ">") 55239313Sdim .Case("quot", "\"") 56239313Sdim .Case("apos", "\'") 57249423Sdim // Slow path. 58249423Sdim .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 59239313Sdim} 60239313Sdim 61239313SdimStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 62239313Sdim unsigned CodePoint = 0; 63239313Sdim for (unsigned i = 0, e = Name.size(); i != e; ++i) { 64239313Sdim assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 65239313Sdim CodePoint *= 10; 66239313Sdim CodePoint += Name[i] - '0'; 67239313Sdim } 68249423Sdim return convertCodePointToUTF8(Allocator, CodePoint); 69239313Sdim} 70239313Sdim 71239313SdimStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 72239313Sdim unsigned CodePoint = 0; 73239313Sdim for (unsigned i = 0, e = Name.size(); i != e; ++i) { 74239313Sdim CodePoint *= 16; 75239313Sdim const char C = Name[i]; 76239313Sdim assert(isHTMLHexCharacterReferenceCharacter(C)); 77249423Sdim CodePoint += llvm::hexDigitValue(C); 78239313Sdim } 79249423Sdim return convertCodePointToUTF8(Allocator, CodePoint); 80239313Sdim} 81239313Sdim 82239313Sdimvoid Lexer::skipLineStartingDecorations() { 83239313Sdim // This function should be called only for C comments 84239313Sdim assert(CommentState == LCS_InsideCComment); 85239313Sdim 86239313Sdim if (BufferPtr == CommentEnd) 87239313Sdim return; 88239313Sdim 89239313Sdim switch (*BufferPtr) { 90239313Sdim case ' ': 91239313Sdim case '\t': 92239313Sdim case '\f': 93239313Sdim case '\v': { 94239313Sdim const char *NewBufferPtr = BufferPtr; 95239313Sdim NewBufferPtr++; 96239313Sdim if (NewBufferPtr == CommentEnd) 97239313Sdim return; 98239313Sdim 99239313Sdim char C = *NewBufferPtr; 100249423Sdim while (isHorizontalWhitespace(C)) { 101239313Sdim NewBufferPtr++; 102239313Sdim if (NewBufferPtr == CommentEnd) 103239313Sdim return; 104239313Sdim C = *NewBufferPtr; 105239313Sdim } 106239313Sdim if (C == '*') 107239313Sdim BufferPtr = NewBufferPtr + 1; 108239313Sdim break; 109239313Sdim } 110239313Sdim case '*': 111239313Sdim BufferPtr++; 112239313Sdim break; 113239313Sdim } 114239313Sdim} 115239313Sdim 116239313Sdimnamespace { 117239313Sdim/// Returns pointer to the first newline character in the string. 118239313Sdimconst char *findNewline(const char *BufferPtr, const char *BufferEnd) { 119239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 120249423Sdim if (isVerticalWhitespace(*BufferPtr)) 121239313Sdim return BufferPtr; 122239313Sdim } 123239313Sdim return BufferEnd; 124239313Sdim} 125239313Sdim 126239313Sdimconst char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 127239313Sdim if (BufferPtr == BufferEnd) 128239313Sdim return BufferPtr; 129239313Sdim 130239313Sdim if (*BufferPtr == '\n') 131239313Sdim BufferPtr++; 132239313Sdim else { 133239313Sdim assert(*BufferPtr == '\r'); 134239313Sdim BufferPtr++; 135239313Sdim if (BufferPtr != BufferEnd && *BufferPtr == '\n') 136239313Sdim BufferPtr++; 137239313Sdim } 138239313Sdim return BufferPtr; 139239313Sdim} 140239313Sdim 141239313Sdimconst char *skipNamedCharacterReference(const char *BufferPtr, 142239313Sdim const char *BufferEnd) { 143239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 144239313Sdim if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 145239313Sdim return BufferPtr; 146239313Sdim } 147239313Sdim return BufferEnd; 148239313Sdim} 149239313Sdim 150239313Sdimconst char *skipDecimalCharacterReference(const char *BufferPtr, 151239313Sdim const char *BufferEnd) { 152239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153239313Sdim if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 154239313Sdim return BufferPtr; 155239313Sdim } 156239313Sdim return BufferEnd; 157239313Sdim} 158239313Sdim 159239313Sdimconst char *skipHexCharacterReference(const char *BufferPtr, 160263508Sdim const char *BufferEnd) { 161239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162239313Sdim if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 163239313Sdim return BufferPtr; 164239313Sdim } 165239313Sdim return BufferEnd; 166239313Sdim} 167239313Sdim 168239313Sdimbool isHTMLIdentifierStartingCharacter(char C) { 169249423Sdim return isLetter(C); 170239313Sdim} 171239313Sdim 172239313Sdimbool isHTMLIdentifierCharacter(char C) { 173249423Sdim return isAlphanumeric(C); 174239313Sdim} 175239313Sdim 176239313Sdimconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 177239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 178239313Sdim if (!isHTMLIdentifierCharacter(*BufferPtr)) 179239313Sdim return BufferPtr; 180239313Sdim } 181239313Sdim return BufferEnd; 182239313Sdim} 183239313Sdim 184239313Sdim/// Skip HTML string quoted in single or double quotes. Escaping quotes inside 185239313Sdim/// string allowed. 186239313Sdim/// 187239313Sdim/// Returns pointer to closing quote. 188239313Sdimconst char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 189239313Sdim{ 190239313Sdim const char Quote = *BufferPtr; 191239313Sdim assert(Quote == '\"' || Quote == '\''); 192239313Sdim 193239313Sdim BufferPtr++; 194239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 195239313Sdim const char C = *BufferPtr; 196239313Sdim if (C == Quote && BufferPtr[-1] != '\\') 197239313Sdim return BufferPtr; 198239313Sdim } 199239313Sdim return BufferEnd; 200239313Sdim} 201239313Sdim 202239313Sdimconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 203239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 204239313Sdim if (!isWhitespace(*BufferPtr)) 205239313Sdim return BufferPtr; 206239313Sdim } 207239313Sdim return BufferEnd; 208239313Sdim} 209239313Sdim 210239313Sdimbool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 211239313Sdim return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 212239313Sdim} 213239313Sdim 214243830Sdimbool isCommandNameStartCharacter(char C) { 215249423Sdim return isLetter(C); 216243830Sdim} 217243830Sdim 218239313Sdimbool isCommandNameCharacter(char C) { 219249423Sdim return isAlphanumeric(C); 220239313Sdim} 221239313Sdim 222239313Sdimconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 223239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 224239313Sdim if (!isCommandNameCharacter(*BufferPtr)) 225239313Sdim return BufferPtr; 226239313Sdim } 227239313Sdim return BufferEnd; 228239313Sdim} 229239313Sdim 230239313Sdim/// Return the one past end pointer for BCPL comments. 231239313Sdim/// Handles newlines escaped with backslash or trigraph for backslahs. 232239313Sdimconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 233239313Sdim const char *CurPtr = BufferPtr; 234239313Sdim while (CurPtr != BufferEnd) { 235249423Sdim while (!isVerticalWhitespace(*CurPtr)) { 236239313Sdim CurPtr++; 237239313Sdim if (CurPtr == BufferEnd) 238239313Sdim return BufferEnd; 239239313Sdim } 240239313Sdim // We found a newline, check if it is escaped. 241239313Sdim const char *EscapePtr = CurPtr - 1; 242239313Sdim while(isHorizontalWhitespace(*EscapePtr)) 243239313Sdim EscapePtr--; 244239313Sdim 245239313Sdim if (*EscapePtr == '\\' || 246239313Sdim (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 247239313Sdim EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 248239313Sdim // We found an escaped newline. 249239313Sdim CurPtr = skipNewline(CurPtr, BufferEnd); 250239313Sdim } else 251239313Sdim return CurPtr; // Not an escaped newline. 252239313Sdim } 253239313Sdim return BufferEnd; 254239313Sdim} 255239313Sdim 256239313Sdim/// Return the one past end pointer for C comments. 257239313Sdim/// Very dumb, does not handle escaped newlines or trigraphs. 258239313Sdimconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 259239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 260239313Sdim if (*BufferPtr == '*') { 261239313Sdim assert(BufferPtr + 1 != BufferEnd); 262239313Sdim if (*(BufferPtr + 1) == '/') 263239313Sdim return BufferPtr; 264239313Sdim } 265239313Sdim } 266239313Sdim llvm_unreachable("buffer end hit before '*/' was seen"); 267239313Sdim} 268263508Sdim 269239313Sdim} // unnamed namespace 270239313Sdim 271239313Sdimvoid Lexer::lexCommentText(Token &T) { 272239313Sdim assert(CommentState == LCS_InsideBCPLComment || 273239313Sdim CommentState == LCS_InsideCComment); 274239313Sdim 275239313Sdim switch (State) { 276239313Sdim case LS_Normal: 277239313Sdim break; 278239313Sdim case LS_VerbatimBlockFirstLine: 279239313Sdim lexVerbatimBlockFirstLine(T); 280239313Sdim return; 281239313Sdim case LS_VerbatimBlockBody: 282239313Sdim lexVerbatimBlockBody(T); 283239313Sdim return; 284239313Sdim case LS_VerbatimLineText: 285239313Sdim lexVerbatimLineText(T); 286239313Sdim return; 287239313Sdim case LS_HTMLStartTag: 288239313Sdim lexHTMLStartTag(T); 289239313Sdim return; 290239313Sdim case LS_HTMLEndTag: 291239313Sdim lexHTMLEndTag(T); 292239313Sdim return; 293239313Sdim } 294239313Sdim 295239313Sdim assert(State == LS_Normal); 296239313Sdim 297239313Sdim const char *TokenPtr = BufferPtr; 298239313Sdim assert(TokenPtr < CommentEnd); 299239313Sdim while (TokenPtr != CommentEnd) { 300239313Sdim switch(*TokenPtr) { 301239313Sdim case '\\': 302239313Sdim case '@': { 303249423Sdim // Commands that start with a backslash and commands that start with 304249423Sdim // 'at' have equivalent semantics. But we keep information about the 305249423Sdim // exact syntax in AST for comments. 306249423Sdim tok::TokenKind CommandKind = 307249423Sdim (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 308239313Sdim TokenPtr++; 309239313Sdim if (TokenPtr == CommentEnd) { 310239313Sdim formTextToken(T, TokenPtr); 311239313Sdim return; 312239313Sdim } 313239313Sdim char C = *TokenPtr; 314239313Sdim switch (C) { 315239313Sdim default: 316239313Sdim break; 317239313Sdim 318239313Sdim case '\\': case '@': case '&': case '$': 319239313Sdim case '#': case '<': case '>': case '%': 320239313Sdim case '\"': case '.': case ':': 321239313Sdim // This is one of \\ \@ \& \$ etc escape sequences. 322239313Sdim TokenPtr++; 323239313Sdim if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 324239313Sdim // This is the \:: escape sequence. 325239313Sdim TokenPtr++; 326239313Sdim } 327239313Sdim StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 328239313Sdim formTokenWithChars(T, TokenPtr, tok::text); 329239313Sdim T.setText(UnescapedText); 330239313Sdim return; 331239313Sdim } 332239313Sdim 333239313Sdim // Don't make zero-length commands. 334243830Sdim if (!isCommandNameStartCharacter(*TokenPtr)) { 335239313Sdim formTextToken(T, TokenPtr); 336239313Sdim return; 337239313Sdim } 338239313Sdim 339239313Sdim TokenPtr = skipCommandName(TokenPtr, CommentEnd); 340239313Sdim unsigned Length = TokenPtr - (BufferPtr + 1); 341239313Sdim 342239313Sdim // Hardcoded support for lexing LaTeX formula commands 343239313Sdim // \f$ \f[ \f] \f{ \f} as a single command. 344239313Sdim if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 345239313Sdim C = *TokenPtr; 346239313Sdim if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 347239313Sdim TokenPtr++; 348239313Sdim Length++; 349239313Sdim } 350239313Sdim } 351239313Sdim 352239313Sdim const StringRef CommandName(BufferPtr + 1, Length); 353239313Sdim 354243830Sdim const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 355243830Sdim if (!Info) { 356263508Sdim if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { 357263508Sdim StringRef CorrectedName = Info->Name; 358263508Sdim SourceLocation Loc = getSourceLocation(BufferPtr); 359263508Sdim SourceRange CommandRange(Loc.getLocWithOffset(1), 360263508Sdim getSourceLocation(TokenPtr)); 361263508Sdim Diag(Loc, diag::warn_correct_comment_command_name) 362263508Sdim << CommandName << CorrectedName 363263508Sdim << FixItHint::CreateReplacement(CommandRange, CorrectedName); 364263508Sdim } else { 365263508Sdim formTokenWithChars(T, TokenPtr, tok::unknown_command); 366263508Sdim T.setUnknownCommandName(CommandName); 367263508Sdim Diag(T.getLocation(), diag::warn_unknown_comment_command_name); 368263508Sdim return; 369263508Sdim } 370239313Sdim } 371243830Sdim if (Info->IsVerbatimBlockCommand) { 372243830Sdim setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 373239313Sdim return; 374239313Sdim } 375243830Sdim if (Info->IsVerbatimLineCommand) { 376243830Sdim setupAndLexVerbatimLine(T, TokenPtr, Info); 377243830Sdim return; 378243830Sdim } 379249423Sdim formTokenWithChars(T, TokenPtr, CommandKind); 380243830Sdim T.setCommandID(Info->getID()); 381239313Sdim return; 382239313Sdim } 383239313Sdim 384239313Sdim case '&': 385239313Sdim lexHTMLCharacterReference(T); 386239313Sdim return; 387239313Sdim 388239313Sdim case '<': { 389239313Sdim TokenPtr++; 390239313Sdim if (TokenPtr == CommentEnd) { 391239313Sdim formTextToken(T, TokenPtr); 392239313Sdim return; 393239313Sdim } 394239313Sdim const char C = *TokenPtr; 395239313Sdim if (isHTMLIdentifierStartingCharacter(C)) 396239313Sdim setupAndLexHTMLStartTag(T); 397239313Sdim else if (C == '/') 398239313Sdim setupAndLexHTMLEndTag(T); 399239313Sdim else 400239313Sdim formTextToken(T, TokenPtr); 401239313Sdim 402239313Sdim return; 403239313Sdim } 404239313Sdim 405239313Sdim case '\n': 406239313Sdim case '\r': 407239313Sdim TokenPtr = skipNewline(TokenPtr, CommentEnd); 408239313Sdim formTokenWithChars(T, TokenPtr, tok::newline); 409239313Sdim 410239313Sdim if (CommentState == LCS_InsideCComment) 411239313Sdim skipLineStartingDecorations(); 412239313Sdim return; 413239313Sdim 414239313Sdim default: { 415249423Sdim size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 416249423Sdim find_first_of("\n\r\\@&<"); 417249423Sdim if (End != StringRef::npos) 418249423Sdim TokenPtr += End; 419249423Sdim else 420249423Sdim TokenPtr = CommentEnd; 421239313Sdim formTextToken(T, TokenPtr); 422239313Sdim return; 423239313Sdim } 424239313Sdim } 425239313Sdim } 426239313Sdim} 427239313Sdim 428239313Sdimvoid Lexer::setupAndLexVerbatimBlock(Token &T, 429239313Sdim const char *TextBegin, 430243830Sdim char Marker, const CommandInfo *Info) { 431243830Sdim assert(Info->IsVerbatimBlockCommand); 432243830Sdim 433239313Sdim VerbatimBlockEndCommandName.clear(); 434239313Sdim VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 435243830Sdim VerbatimBlockEndCommandName.append(Info->EndCommandName); 436239313Sdim 437239313Sdim formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 438243830Sdim T.setVerbatimBlockID(Info->getID()); 439239313Sdim 440239313Sdim // If there is a newline following the verbatim opening command, skip the 441239313Sdim // newline so that we don't create an tok::verbatim_block_line with empty 442239313Sdim // text content. 443249423Sdim if (BufferPtr != CommentEnd && 444249423Sdim isVerticalWhitespace(*BufferPtr)) { 445249423Sdim BufferPtr = skipNewline(BufferPtr, CommentEnd); 446249423Sdim State = LS_VerbatimBlockBody; 447249423Sdim return; 448239313Sdim } 449239313Sdim 450239313Sdim State = LS_VerbatimBlockFirstLine; 451239313Sdim} 452239313Sdim 453239313Sdimvoid Lexer::lexVerbatimBlockFirstLine(Token &T) { 454239313Sdimagain: 455239313Sdim assert(BufferPtr < CommentEnd); 456239313Sdim 457239313Sdim // FIXME: It would be better to scan the text once, finding either the block 458239313Sdim // end command or newline. 459239313Sdim // 460239313Sdim // Extract current line. 461239313Sdim const char *Newline = findNewline(BufferPtr, CommentEnd); 462239313Sdim StringRef Line(BufferPtr, Newline - BufferPtr); 463239313Sdim 464239313Sdim // Look for end command in current line. 465239313Sdim size_t Pos = Line.find(VerbatimBlockEndCommandName); 466239313Sdim const char *TextEnd; 467239313Sdim const char *NextLine; 468239313Sdim if (Pos == StringRef::npos) { 469239313Sdim // Current line is completely verbatim. 470239313Sdim TextEnd = Newline; 471239313Sdim NextLine = skipNewline(Newline, CommentEnd); 472239313Sdim } else if (Pos == 0) { 473239313Sdim // Current line contains just an end command. 474239313Sdim const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 475239313Sdim StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 476239313Sdim formTokenWithChars(T, End, tok::verbatim_block_end); 477243830Sdim T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 478239313Sdim State = LS_Normal; 479239313Sdim return; 480239313Sdim } else { 481239313Sdim // There is some text, followed by end command. Extract text first. 482239313Sdim TextEnd = BufferPtr + Pos; 483239313Sdim NextLine = TextEnd; 484239313Sdim // If there is only whitespace before end command, skip whitespace. 485239313Sdim if (isWhitespace(BufferPtr, TextEnd)) { 486239313Sdim BufferPtr = TextEnd; 487239313Sdim goto again; 488239313Sdim } 489239313Sdim } 490239313Sdim 491239313Sdim StringRef Text(BufferPtr, TextEnd - BufferPtr); 492239313Sdim formTokenWithChars(T, NextLine, tok::verbatim_block_line); 493239313Sdim T.setVerbatimBlockText(Text); 494239313Sdim 495239313Sdim State = LS_VerbatimBlockBody; 496239313Sdim} 497239313Sdim 498239313Sdimvoid Lexer::lexVerbatimBlockBody(Token &T) { 499239313Sdim assert(State == LS_VerbatimBlockBody); 500239313Sdim 501239313Sdim if (CommentState == LCS_InsideCComment) 502239313Sdim skipLineStartingDecorations(); 503239313Sdim 504239313Sdim lexVerbatimBlockFirstLine(T); 505239313Sdim} 506239313Sdim 507243830Sdimvoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 508243830Sdim const CommandInfo *Info) { 509243830Sdim assert(Info->IsVerbatimLineCommand); 510239313Sdim formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 511243830Sdim T.setVerbatimLineID(Info->getID()); 512239313Sdim 513239313Sdim State = LS_VerbatimLineText; 514239313Sdim} 515239313Sdim 516239313Sdimvoid Lexer::lexVerbatimLineText(Token &T) { 517239313Sdim assert(State == LS_VerbatimLineText); 518239313Sdim 519239313Sdim // Extract current line. 520239313Sdim const char *Newline = findNewline(BufferPtr, CommentEnd); 521239313Sdim const StringRef Text(BufferPtr, Newline - BufferPtr); 522239313Sdim formTokenWithChars(T, Newline, tok::verbatim_line_text); 523239313Sdim T.setVerbatimLineText(Text); 524239313Sdim 525239313Sdim State = LS_Normal; 526239313Sdim} 527239313Sdim 528239313Sdimvoid Lexer::lexHTMLCharacterReference(Token &T) { 529239313Sdim const char *TokenPtr = BufferPtr; 530239313Sdim assert(*TokenPtr == '&'); 531239313Sdim TokenPtr++; 532239313Sdim if (TokenPtr == CommentEnd) { 533239313Sdim formTextToken(T, TokenPtr); 534239313Sdim return; 535239313Sdim } 536239313Sdim const char *NamePtr; 537239313Sdim bool isNamed = false; 538239313Sdim bool isDecimal = false; 539239313Sdim char C = *TokenPtr; 540239313Sdim if (isHTMLNamedCharacterReferenceCharacter(C)) { 541239313Sdim NamePtr = TokenPtr; 542239313Sdim TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 543239313Sdim isNamed = true; 544239313Sdim } else if (C == '#') { 545239313Sdim TokenPtr++; 546239313Sdim if (TokenPtr == CommentEnd) { 547239313Sdim formTextToken(T, TokenPtr); 548239313Sdim return; 549239313Sdim } 550239313Sdim C = *TokenPtr; 551239313Sdim if (isHTMLDecimalCharacterReferenceCharacter(C)) { 552239313Sdim NamePtr = TokenPtr; 553239313Sdim TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 554239313Sdim isDecimal = true; 555239313Sdim } else if (C == 'x' || C == 'X') { 556239313Sdim TokenPtr++; 557239313Sdim NamePtr = TokenPtr; 558239313Sdim TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 559239313Sdim } else { 560239313Sdim formTextToken(T, TokenPtr); 561239313Sdim return; 562239313Sdim } 563239313Sdim } else { 564239313Sdim formTextToken(T, TokenPtr); 565239313Sdim return; 566239313Sdim } 567239313Sdim if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 568239313Sdim *TokenPtr != ';') { 569239313Sdim formTextToken(T, TokenPtr); 570239313Sdim return; 571239313Sdim } 572239313Sdim StringRef Name(NamePtr, TokenPtr - NamePtr); 573239313Sdim TokenPtr++; // Skip semicolon. 574239313Sdim StringRef Resolved; 575239313Sdim if (isNamed) 576239313Sdim Resolved = resolveHTMLNamedCharacterReference(Name); 577239313Sdim else if (isDecimal) 578239313Sdim Resolved = resolveHTMLDecimalCharacterReference(Name); 579239313Sdim else 580239313Sdim Resolved = resolveHTMLHexCharacterReference(Name); 581239313Sdim 582239313Sdim if (Resolved.empty()) { 583239313Sdim formTextToken(T, TokenPtr); 584239313Sdim return; 585239313Sdim } 586239313Sdim formTokenWithChars(T, TokenPtr, tok::text); 587239313Sdim T.setText(Resolved); 588239313Sdim return; 589239313Sdim} 590239313Sdim 591239313Sdimvoid Lexer::setupAndLexHTMLStartTag(Token &T) { 592239313Sdim assert(BufferPtr[0] == '<' && 593239313Sdim isHTMLIdentifierStartingCharacter(BufferPtr[1])); 594239313Sdim const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 595243830Sdim StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 596243830Sdim if (!isHTMLTagName(Name)) { 597243830Sdim formTextToken(T, TagNameEnd); 598243830Sdim return; 599243830Sdim } 600239313Sdim 601239313Sdim formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 602239313Sdim T.setHTMLTagStartName(Name); 603239313Sdim 604239313Sdim BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 605239313Sdim 606239313Sdim const char C = *BufferPtr; 607239313Sdim if (BufferPtr != CommentEnd && 608239313Sdim (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 609239313Sdim State = LS_HTMLStartTag; 610239313Sdim} 611239313Sdim 612239313Sdimvoid Lexer::lexHTMLStartTag(Token &T) { 613239313Sdim assert(State == LS_HTMLStartTag); 614239313Sdim 615239313Sdim const char *TokenPtr = BufferPtr; 616239313Sdim char C = *TokenPtr; 617239313Sdim if (isHTMLIdentifierCharacter(C)) { 618239313Sdim TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 619239313Sdim StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 620239313Sdim formTokenWithChars(T, TokenPtr, tok::html_ident); 621239313Sdim T.setHTMLIdent(Ident); 622239313Sdim } else { 623239313Sdim switch (C) { 624239313Sdim case '=': 625239313Sdim TokenPtr++; 626239313Sdim formTokenWithChars(T, TokenPtr, tok::html_equals); 627239313Sdim break; 628239313Sdim case '\"': 629239313Sdim case '\'': { 630239313Sdim const char *OpenQuote = TokenPtr; 631239313Sdim TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 632239313Sdim const char *ClosingQuote = TokenPtr; 633239313Sdim if (TokenPtr != CommentEnd) // Skip closing quote. 634239313Sdim TokenPtr++; 635239313Sdim formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 636239313Sdim T.setHTMLQuotedString(StringRef(OpenQuote + 1, 637239313Sdim ClosingQuote - (OpenQuote + 1))); 638239313Sdim break; 639239313Sdim } 640239313Sdim case '>': 641239313Sdim TokenPtr++; 642239313Sdim formTokenWithChars(T, TokenPtr, tok::html_greater); 643239313Sdim State = LS_Normal; 644239313Sdim return; 645239313Sdim case '/': 646239313Sdim TokenPtr++; 647239313Sdim if (TokenPtr != CommentEnd && *TokenPtr == '>') { 648239313Sdim TokenPtr++; 649239313Sdim formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 650239313Sdim } else 651239313Sdim formTextToken(T, TokenPtr); 652239313Sdim 653239313Sdim State = LS_Normal; 654239313Sdim return; 655239313Sdim } 656239313Sdim } 657239313Sdim 658239313Sdim // Now look ahead and return to normal state if we don't see any HTML tokens 659239313Sdim // ahead. 660239313Sdim BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 661239313Sdim if (BufferPtr == CommentEnd) { 662239313Sdim State = LS_Normal; 663239313Sdim return; 664239313Sdim } 665239313Sdim 666239313Sdim C = *BufferPtr; 667239313Sdim if (!isHTMLIdentifierStartingCharacter(C) && 668239313Sdim C != '=' && C != '\"' && C != '\'' && C != '>') { 669239313Sdim State = LS_Normal; 670239313Sdim return; 671239313Sdim } 672239313Sdim} 673239313Sdim 674239313Sdimvoid Lexer::setupAndLexHTMLEndTag(Token &T) { 675239313Sdim assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 676239313Sdim 677239313Sdim const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 678239313Sdim const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 679243830Sdim StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 680243830Sdim if (!isHTMLTagName(Name)) { 681243830Sdim formTextToken(T, TagNameEnd); 682243830Sdim return; 683243830Sdim } 684239313Sdim 685239313Sdim const char *End = skipWhitespace(TagNameEnd, CommentEnd); 686239313Sdim 687239313Sdim formTokenWithChars(T, End, tok::html_end_tag); 688243830Sdim T.setHTMLTagEndName(Name); 689239313Sdim 690239313Sdim if (BufferPtr != CommentEnd && *BufferPtr == '>') 691239313Sdim State = LS_HTMLEndTag; 692239313Sdim} 693239313Sdim 694239313Sdimvoid Lexer::lexHTMLEndTag(Token &T) { 695239313Sdim assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 696239313Sdim 697239313Sdim formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 698239313Sdim State = LS_Normal; 699239313Sdim} 700239313Sdim 701251662SdimLexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 702251662Sdim const CommandTraits &Traits, 703243830Sdim SourceLocation FileLoc, 704239313Sdim const char *BufferStart, const char *BufferEnd): 705251662Sdim Allocator(Allocator), Diags(Diags), Traits(Traits), 706239313Sdim BufferStart(BufferStart), BufferEnd(BufferEnd), 707243830Sdim FileLoc(FileLoc), BufferPtr(BufferStart), 708239313Sdim CommentState(LCS_BeforeComment), State(LS_Normal) { 709239313Sdim} 710239313Sdim 711239313Sdimvoid Lexer::lex(Token &T) { 712239313Sdimagain: 713239313Sdim switch (CommentState) { 714239313Sdim case LCS_BeforeComment: 715239313Sdim if (BufferPtr == BufferEnd) { 716239313Sdim formTokenWithChars(T, BufferPtr, tok::eof); 717239313Sdim return; 718239313Sdim } 719239313Sdim 720239313Sdim assert(*BufferPtr == '/'); 721239313Sdim BufferPtr++; // Skip first slash. 722239313Sdim switch(*BufferPtr) { 723239313Sdim case '/': { // BCPL comment. 724239313Sdim BufferPtr++; // Skip second slash. 725239313Sdim 726239313Sdim if (BufferPtr != BufferEnd) { 727239313Sdim // Skip Doxygen magic marker, if it is present. 728239313Sdim // It might be missing because of a typo //< or /*<, or because we 729239313Sdim // merged this non-Doxygen comment into a bunch of Doxygen comments 730239313Sdim // around it: /** ... */ /* ... */ /** ... */ 731239313Sdim const char C = *BufferPtr; 732239313Sdim if (C == '/' || C == '!') 733239313Sdim BufferPtr++; 734239313Sdim } 735239313Sdim 736239313Sdim // Skip less-than symbol that marks trailing comments. 737239313Sdim // Skip it even if the comment is not a Doxygen one, because //< and /*< 738239313Sdim // are frequent typos. 739239313Sdim if (BufferPtr != BufferEnd && *BufferPtr == '<') 740239313Sdim BufferPtr++; 741239313Sdim 742239313Sdim CommentState = LCS_InsideBCPLComment; 743239313Sdim if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 744239313Sdim State = LS_Normal; 745239313Sdim CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 746239313Sdim goto again; 747239313Sdim } 748239313Sdim case '*': { // C comment. 749239313Sdim BufferPtr++; // Skip star. 750239313Sdim 751239313Sdim // Skip Doxygen magic marker. 752239313Sdim const char C = *BufferPtr; 753239313Sdim if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 754239313Sdim BufferPtr++; 755239313Sdim 756239313Sdim // Skip less-than symbol that marks trailing comments. 757239313Sdim if (BufferPtr != BufferEnd && *BufferPtr == '<') 758239313Sdim BufferPtr++; 759239313Sdim 760239313Sdim CommentState = LCS_InsideCComment; 761239313Sdim State = LS_Normal; 762239313Sdim CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 763239313Sdim goto again; 764239313Sdim } 765239313Sdim default: 766239313Sdim llvm_unreachable("second character of comment should be '/' or '*'"); 767239313Sdim } 768239313Sdim 769239313Sdim case LCS_BetweenComments: { 770239313Sdim // Consecutive comments are extracted only if there is only whitespace 771239313Sdim // between them. So we can search for the start of the next comment. 772239313Sdim const char *EndWhitespace = BufferPtr; 773239313Sdim while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 774239313Sdim EndWhitespace++; 775239313Sdim 776239313Sdim // Turn any whitespace between comments (and there is only whitespace 777239313Sdim // between them -- guaranteed by comment extraction) into a newline. We 778239313Sdim // have two newlines between C comments in total (first one was synthesized 779239313Sdim // after a comment). 780239313Sdim formTokenWithChars(T, EndWhitespace, tok::newline); 781239313Sdim 782239313Sdim CommentState = LCS_BeforeComment; 783239313Sdim break; 784239313Sdim } 785239313Sdim 786239313Sdim case LCS_InsideBCPLComment: 787239313Sdim case LCS_InsideCComment: 788239313Sdim if (BufferPtr != CommentEnd) { 789239313Sdim lexCommentText(T); 790239313Sdim break; 791239313Sdim } else { 792239313Sdim // Skip C comment closing sequence. 793239313Sdim if (CommentState == LCS_InsideCComment) { 794239313Sdim assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 795239313Sdim BufferPtr += 2; 796239313Sdim assert(BufferPtr <= BufferEnd); 797239313Sdim 798239313Sdim // Synthenize newline just after the C comment, regardless if there is 799239313Sdim // actually a newline. 800239313Sdim formTokenWithChars(T, BufferPtr, tok::newline); 801239313Sdim 802239313Sdim CommentState = LCS_BetweenComments; 803239313Sdim break; 804239313Sdim } else { 805239313Sdim // Don't synthesized a newline after BCPL comment. 806239313Sdim CommentState = LCS_BetweenComments; 807239313Sdim goto again; 808239313Sdim } 809239313Sdim } 810239313Sdim } 811239313Sdim} 812239313Sdim 813239313SdimStringRef Lexer::getSpelling(const Token &Tok, 814239313Sdim const SourceManager &SourceMgr, 815239313Sdim bool *Invalid) const { 816239313Sdim SourceLocation Loc = Tok.getLocation(); 817239313Sdim std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 818239313Sdim 819239313Sdim bool InvalidTemp = false; 820239313Sdim StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 821239313Sdim if (InvalidTemp) { 822239313Sdim *Invalid = true; 823239313Sdim return StringRef(); 824239313Sdim } 825239313Sdim 826239313Sdim const char *Begin = File.data() + LocInfo.second; 827239313Sdim return StringRef(Begin, Tok.getLength()); 828239313Sdim} 829239313Sdim 830239313Sdim} // end namespace comments 831239313Sdim} // end namespace clang 832239313Sdim 833