CommentLexer.cpp revision 251662
1239313Sdim#include "clang/AST/CommentLexer.h" 2239313Sdim#include "clang/AST/CommentCommandTraits.h" 3251662Sdim#include "clang/AST/CommentDiagnostic.h" 4249423Sdim#include "clang/Basic/CharInfo.h" 5249423Sdim#include "llvm/ADT/StringExtras.h" 6239313Sdim#include "llvm/ADT/StringSwitch.h" 7249423Sdim#include "llvm/Support/ConvertUTF.h" 8239313Sdim#include "llvm/Support/ErrorHandling.h" 9239313Sdim 10239313Sdimnamespace clang { 11239313Sdimnamespace comments { 12239313Sdim 13239313Sdimvoid Token::dump(const Lexer &L, const SourceManager &SM) const { 14239313Sdim llvm::errs() << "comments::Token Kind=" << Kind << " "; 15239313Sdim Loc.dump(SM); 16239313Sdim llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 17239313Sdim} 18239313Sdim 19249423Sdimstatic inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 20249423Sdim return isLetter(C); 21239313Sdim} 22239313Sdim 23249423Sdimstatic inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 24249423Sdim return isDigit(C); 25239313Sdim} 26239313Sdim 27249423Sdimstatic inline bool isHTMLHexCharacterReferenceCharacter(char C) { 28249423Sdim return isHexDigit(C); 29239313Sdim} 30243830Sdim 31249423Sdimstatic inline StringRef convertCodePointToUTF8( 32249423Sdim llvm::BumpPtrAllocator &Allocator, 33249423Sdim unsigned CodePoint) { 34249423Sdim char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 35249423Sdim char *ResolvedPtr = Resolved; 36249423Sdim if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 37249423Sdim return StringRef(Resolved, ResolvedPtr - Resolved); 38249423Sdim else 39249423Sdim return StringRef(); 40249423Sdim} 41249423Sdim 42249423Sdimnamespace { 43249423Sdim 44243830Sdim#include "clang/AST/CommentHTMLTags.inc" 45249423Sdim#include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 46243830Sdim 47239313Sdim} // unnamed namespace 48239313Sdim 49239313SdimStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 50249423Sdim // Fast path, first check a few most widely used named character references. 51239313Sdim return llvm::StringSwitch<StringRef>(Name) 52239313Sdim .Case("amp", "&") 53239313Sdim .Case("lt", "<") 54239313Sdim .Case("gt", ">") 55239313Sdim .Case("quot", "\"") 56239313Sdim .Case("apos", "\'") 57249423Sdim // Slow path. 58249423Sdim .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 59239313Sdim} 60239313Sdim 61239313SdimStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 62239313Sdim unsigned CodePoint = 0; 63239313Sdim for (unsigned i = 0, e = Name.size(); i != e; ++i) { 64239313Sdim assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 65239313Sdim CodePoint *= 10; 66239313Sdim CodePoint += Name[i] - '0'; 67239313Sdim } 68249423Sdim return convertCodePointToUTF8(Allocator, CodePoint); 69239313Sdim} 70239313Sdim 71239313SdimStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 72239313Sdim unsigned CodePoint = 0; 73239313Sdim for (unsigned i = 0, e = Name.size(); i != e; ++i) { 74239313Sdim CodePoint *= 16; 75239313Sdim const char C = Name[i]; 76239313Sdim assert(isHTMLHexCharacterReferenceCharacter(C)); 77249423Sdim CodePoint += llvm::hexDigitValue(C); 78239313Sdim } 79249423Sdim return convertCodePointToUTF8(Allocator, CodePoint); 80239313Sdim} 81239313Sdim 82239313Sdimvoid Lexer::skipLineStartingDecorations() { 83239313Sdim // This function should be called only for C comments 84239313Sdim assert(CommentState == LCS_InsideCComment); 85239313Sdim 86239313Sdim if (BufferPtr == CommentEnd) 87239313Sdim return; 88239313Sdim 89239313Sdim switch (*BufferPtr) { 90239313Sdim case ' ': 91239313Sdim case '\t': 92239313Sdim case '\f': 93239313Sdim case '\v': { 94239313Sdim const char *NewBufferPtr = BufferPtr; 95239313Sdim NewBufferPtr++; 96239313Sdim if (NewBufferPtr == CommentEnd) 97239313Sdim return; 98239313Sdim 99239313Sdim char C = *NewBufferPtr; 100249423Sdim while (isHorizontalWhitespace(C)) { 101239313Sdim NewBufferPtr++; 102239313Sdim if (NewBufferPtr == CommentEnd) 103239313Sdim return; 104239313Sdim C = *NewBufferPtr; 105239313Sdim } 106239313Sdim if (C == '*') 107239313Sdim BufferPtr = NewBufferPtr + 1; 108239313Sdim break; 109239313Sdim } 110239313Sdim case '*': 111239313Sdim BufferPtr++; 112239313Sdim break; 113239313Sdim } 114239313Sdim} 115239313Sdim 116239313Sdimnamespace { 117239313Sdim/// Returns pointer to the first newline character in the string. 118239313Sdimconst char *findNewline(const char *BufferPtr, const char *BufferEnd) { 119239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 120249423Sdim if (isVerticalWhitespace(*BufferPtr)) 121239313Sdim return BufferPtr; 122239313Sdim } 123239313Sdim return BufferEnd; 124239313Sdim} 125239313Sdim 126239313Sdimconst char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 127239313Sdim if (BufferPtr == BufferEnd) 128239313Sdim return BufferPtr; 129239313Sdim 130239313Sdim if (*BufferPtr == '\n') 131239313Sdim BufferPtr++; 132239313Sdim else { 133239313Sdim assert(*BufferPtr == '\r'); 134239313Sdim BufferPtr++; 135239313Sdim if (BufferPtr != BufferEnd && *BufferPtr == '\n') 136239313Sdim BufferPtr++; 137239313Sdim } 138239313Sdim return BufferPtr; 139239313Sdim} 140239313Sdim 141239313Sdimconst char *skipNamedCharacterReference(const char *BufferPtr, 142239313Sdim const char *BufferEnd) { 143239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 144239313Sdim if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 145239313Sdim return BufferPtr; 146239313Sdim } 147239313Sdim return BufferEnd; 148239313Sdim} 149239313Sdim 150239313Sdimconst char *skipDecimalCharacterReference(const char *BufferPtr, 151239313Sdim const char *BufferEnd) { 152239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153239313Sdim if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 154239313Sdim return BufferPtr; 155239313Sdim } 156239313Sdim return BufferEnd; 157239313Sdim} 158239313Sdim 159239313Sdimconst char *skipHexCharacterReference(const char *BufferPtr, 160239313Sdim const char *BufferEnd) { 161239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162239313Sdim if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 163239313Sdim return BufferPtr; 164239313Sdim } 165239313Sdim return BufferEnd; 166239313Sdim} 167239313Sdim 168239313Sdimbool isHTMLIdentifierStartingCharacter(char C) { 169249423Sdim return isLetter(C); 170239313Sdim} 171239313Sdim 172239313Sdimbool isHTMLIdentifierCharacter(char C) { 173249423Sdim return isAlphanumeric(C); 174239313Sdim} 175239313Sdim 176239313Sdimconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 177239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 178239313Sdim if (!isHTMLIdentifierCharacter(*BufferPtr)) 179239313Sdim return BufferPtr; 180239313Sdim } 181239313Sdim return BufferEnd; 182239313Sdim} 183239313Sdim 184239313Sdim/// Skip HTML string quoted in single or double quotes. Escaping quotes inside 185239313Sdim/// string allowed. 186239313Sdim/// 187239313Sdim/// Returns pointer to closing quote. 188239313Sdimconst char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 189239313Sdim{ 190239313Sdim const char Quote = *BufferPtr; 191239313Sdim assert(Quote == '\"' || Quote == '\''); 192239313Sdim 193239313Sdim BufferPtr++; 194239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 195239313Sdim const char C = *BufferPtr; 196239313Sdim if (C == Quote && BufferPtr[-1] != '\\') 197239313Sdim return BufferPtr; 198239313Sdim } 199239313Sdim return BufferEnd; 200239313Sdim} 201239313Sdim 202239313Sdimconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 203239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 204239313Sdim if (!isWhitespace(*BufferPtr)) 205239313Sdim return BufferPtr; 206239313Sdim } 207239313Sdim return BufferEnd; 208239313Sdim} 209239313Sdim 210239313Sdimbool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 211239313Sdim return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 212239313Sdim} 213239313Sdim 214243830Sdimbool isCommandNameStartCharacter(char C) { 215249423Sdim return isLetter(C); 216243830Sdim} 217243830Sdim 218239313Sdimbool isCommandNameCharacter(char C) { 219249423Sdim return isAlphanumeric(C); 220239313Sdim} 221239313Sdim 222239313Sdimconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 223239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 224239313Sdim if (!isCommandNameCharacter(*BufferPtr)) 225239313Sdim return BufferPtr; 226239313Sdim } 227239313Sdim return BufferEnd; 228239313Sdim} 229239313Sdim 230239313Sdim/// Return the one past end pointer for BCPL comments. 231239313Sdim/// Handles newlines escaped with backslash or trigraph for backslahs. 232239313Sdimconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 233239313Sdim const char *CurPtr = BufferPtr; 234239313Sdim while (CurPtr != BufferEnd) { 235249423Sdim while (!isVerticalWhitespace(*CurPtr)) { 236239313Sdim CurPtr++; 237239313Sdim if (CurPtr == BufferEnd) 238239313Sdim return BufferEnd; 239239313Sdim } 240239313Sdim // We found a newline, check if it is escaped. 241239313Sdim const char *EscapePtr = CurPtr - 1; 242239313Sdim while(isHorizontalWhitespace(*EscapePtr)) 243239313Sdim EscapePtr--; 244239313Sdim 245239313Sdim if (*EscapePtr == '\\' || 246239313Sdim (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 247239313Sdim EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 248239313Sdim // We found an escaped newline. 249239313Sdim CurPtr = skipNewline(CurPtr, BufferEnd); 250239313Sdim } else 251239313Sdim return CurPtr; // Not an escaped newline. 252239313Sdim } 253239313Sdim return BufferEnd; 254239313Sdim} 255239313Sdim 256239313Sdim/// Return the one past end pointer for C comments. 257239313Sdim/// Very dumb, does not handle escaped newlines or trigraphs. 258239313Sdimconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 259239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 260239313Sdim if (*BufferPtr == '*') { 261239313Sdim assert(BufferPtr + 1 != BufferEnd); 262239313Sdim if (*(BufferPtr + 1) == '/') 263239313Sdim return BufferPtr; 264239313Sdim } 265239313Sdim } 266239313Sdim llvm_unreachable("buffer end hit before '*/' was seen"); 267239313Sdim} 268239313Sdim} // unnamed namespace 269239313Sdim 270239313Sdimvoid Lexer::lexCommentText(Token &T) { 271239313Sdim assert(CommentState == LCS_InsideBCPLComment || 272239313Sdim CommentState == LCS_InsideCComment); 273239313Sdim 274239313Sdim switch (State) { 275239313Sdim case LS_Normal: 276239313Sdim break; 277239313Sdim case LS_VerbatimBlockFirstLine: 278239313Sdim lexVerbatimBlockFirstLine(T); 279239313Sdim return; 280239313Sdim case LS_VerbatimBlockBody: 281239313Sdim lexVerbatimBlockBody(T); 282239313Sdim return; 283239313Sdim case LS_VerbatimLineText: 284239313Sdim lexVerbatimLineText(T); 285239313Sdim return; 286239313Sdim case LS_HTMLStartTag: 287239313Sdim lexHTMLStartTag(T); 288239313Sdim return; 289239313Sdim case LS_HTMLEndTag: 290239313Sdim lexHTMLEndTag(T); 291239313Sdim return; 292239313Sdim } 293239313Sdim 294239313Sdim assert(State == LS_Normal); 295239313Sdim 296239313Sdim const char *TokenPtr = BufferPtr; 297239313Sdim assert(TokenPtr < CommentEnd); 298239313Sdim while (TokenPtr != CommentEnd) { 299239313Sdim switch(*TokenPtr) { 300239313Sdim case '\\': 301239313Sdim case '@': { 302249423Sdim // Commands that start with a backslash and commands that start with 303249423Sdim // 'at' have equivalent semantics. But we keep information about the 304249423Sdim // exact syntax in AST for comments. 305249423Sdim tok::TokenKind CommandKind = 306249423Sdim (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 307239313Sdim TokenPtr++; 308239313Sdim if (TokenPtr == CommentEnd) { 309239313Sdim formTextToken(T, TokenPtr); 310239313Sdim return; 311239313Sdim } 312239313Sdim char C = *TokenPtr; 313239313Sdim switch (C) { 314239313Sdim default: 315239313Sdim break; 316239313Sdim 317239313Sdim case '\\': case '@': case '&': case '$': 318239313Sdim case '#': case '<': case '>': case '%': 319239313Sdim case '\"': case '.': case ':': 320239313Sdim // This is one of \\ \@ \& \$ etc escape sequences. 321239313Sdim TokenPtr++; 322239313Sdim if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 323239313Sdim // This is the \:: escape sequence. 324239313Sdim TokenPtr++; 325239313Sdim } 326239313Sdim StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 327239313Sdim formTokenWithChars(T, TokenPtr, tok::text); 328239313Sdim T.setText(UnescapedText); 329239313Sdim return; 330239313Sdim } 331239313Sdim 332239313Sdim // Don't make zero-length commands. 333243830Sdim if (!isCommandNameStartCharacter(*TokenPtr)) { 334239313Sdim formTextToken(T, TokenPtr); 335239313Sdim return; 336239313Sdim } 337239313Sdim 338239313Sdim TokenPtr = skipCommandName(TokenPtr, CommentEnd); 339239313Sdim unsigned Length = TokenPtr - (BufferPtr + 1); 340239313Sdim 341239313Sdim // Hardcoded support for lexing LaTeX formula commands 342239313Sdim // \f$ \f[ \f] \f{ \f} as a single command. 343239313Sdim if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 344239313Sdim C = *TokenPtr; 345239313Sdim if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 346239313Sdim TokenPtr++; 347239313Sdim Length++; 348239313Sdim } 349239313Sdim } 350239313Sdim 351239313Sdim const StringRef CommandName(BufferPtr + 1, Length); 352239313Sdim 353243830Sdim const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 354243830Sdim if (!Info) { 355243830Sdim formTokenWithChars(T, TokenPtr, tok::unknown_command); 356243830Sdim T.setUnknownCommandName(CommandName); 357251662Sdim Diag(T.getLocation(), diag::warn_unknown_comment_command_name); 358239313Sdim return; 359239313Sdim } 360243830Sdim if (Info->IsVerbatimBlockCommand) { 361243830Sdim setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 362239313Sdim return; 363239313Sdim } 364243830Sdim if (Info->IsVerbatimLineCommand) { 365243830Sdim setupAndLexVerbatimLine(T, TokenPtr, Info); 366243830Sdim return; 367243830Sdim } 368249423Sdim formTokenWithChars(T, TokenPtr, CommandKind); 369243830Sdim T.setCommandID(Info->getID()); 370239313Sdim return; 371239313Sdim } 372239313Sdim 373239313Sdim case '&': 374239313Sdim lexHTMLCharacterReference(T); 375239313Sdim return; 376239313Sdim 377239313Sdim case '<': { 378239313Sdim TokenPtr++; 379239313Sdim if (TokenPtr == CommentEnd) { 380239313Sdim formTextToken(T, TokenPtr); 381239313Sdim return; 382239313Sdim } 383239313Sdim const char C = *TokenPtr; 384239313Sdim if (isHTMLIdentifierStartingCharacter(C)) 385239313Sdim setupAndLexHTMLStartTag(T); 386239313Sdim else if (C == '/') 387239313Sdim setupAndLexHTMLEndTag(T); 388239313Sdim else 389239313Sdim formTextToken(T, TokenPtr); 390239313Sdim 391239313Sdim return; 392239313Sdim } 393239313Sdim 394239313Sdim case '\n': 395239313Sdim case '\r': 396239313Sdim TokenPtr = skipNewline(TokenPtr, CommentEnd); 397239313Sdim formTokenWithChars(T, TokenPtr, tok::newline); 398239313Sdim 399239313Sdim if (CommentState == LCS_InsideCComment) 400239313Sdim skipLineStartingDecorations(); 401239313Sdim return; 402239313Sdim 403239313Sdim default: { 404249423Sdim size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 405249423Sdim find_first_of("\n\r\\@&<"); 406249423Sdim if (End != StringRef::npos) 407249423Sdim TokenPtr += End; 408249423Sdim else 409249423Sdim TokenPtr = CommentEnd; 410239313Sdim formTextToken(T, TokenPtr); 411239313Sdim return; 412239313Sdim } 413239313Sdim } 414239313Sdim } 415239313Sdim} 416239313Sdim 417239313Sdimvoid Lexer::setupAndLexVerbatimBlock(Token &T, 418239313Sdim const char *TextBegin, 419243830Sdim char Marker, const CommandInfo *Info) { 420243830Sdim assert(Info->IsVerbatimBlockCommand); 421243830Sdim 422239313Sdim VerbatimBlockEndCommandName.clear(); 423239313Sdim VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 424243830Sdim VerbatimBlockEndCommandName.append(Info->EndCommandName); 425239313Sdim 426239313Sdim formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 427243830Sdim T.setVerbatimBlockID(Info->getID()); 428239313Sdim 429239313Sdim // If there is a newline following the verbatim opening command, skip the 430239313Sdim // newline so that we don't create an tok::verbatim_block_line with empty 431239313Sdim // text content. 432249423Sdim if (BufferPtr != CommentEnd && 433249423Sdim isVerticalWhitespace(*BufferPtr)) { 434249423Sdim BufferPtr = skipNewline(BufferPtr, CommentEnd); 435249423Sdim State = LS_VerbatimBlockBody; 436249423Sdim return; 437239313Sdim } 438239313Sdim 439239313Sdim State = LS_VerbatimBlockFirstLine; 440239313Sdim} 441239313Sdim 442239313Sdimvoid Lexer::lexVerbatimBlockFirstLine(Token &T) { 443239313Sdimagain: 444239313Sdim assert(BufferPtr < CommentEnd); 445239313Sdim 446239313Sdim // FIXME: It would be better to scan the text once, finding either the block 447239313Sdim // end command or newline. 448239313Sdim // 449239313Sdim // Extract current line. 450239313Sdim const char *Newline = findNewline(BufferPtr, CommentEnd); 451239313Sdim StringRef Line(BufferPtr, Newline - BufferPtr); 452239313Sdim 453239313Sdim // Look for end command in current line. 454239313Sdim size_t Pos = Line.find(VerbatimBlockEndCommandName); 455239313Sdim const char *TextEnd; 456239313Sdim const char *NextLine; 457239313Sdim if (Pos == StringRef::npos) { 458239313Sdim // Current line is completely verbatim. 459239313Sdim TextEnd = Newline; 460239313Sdim NextLine = skipNewline(Newline, CommentEnd); 461239313Sdim } else if (Pos == 0) { 462239313Sdim // Current line contains just an end command. 463239313Sdim const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 464239313Sdim StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 465239313Sdim formTokenWithChars(T, End, tok::verbatim_block_end); 466243830Sdim T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 467239313Sdim State = LS_Normal; 468239313Sdim return; 469239313Sdim } else { 470239313Sdim // There is some text, followed by end command. Extract text first. 471239313Sdim TextEnd = BufferPtr + Pos; 472239313Sdim NextLine = TextEnd; 473239313Sdim // If there is only whitespace before end command, skip whitespace. 474239313Sdim if (isWhitespace(BufferPtr, TextEnd)) { 475239313Sdim BufferPtr = TextEnd; 476239313Sdim goto again; 477239313Sdim } 478239313Sdim } 479239313Sdim 480239313Sdim StringRef Text(BufferPtr, TextEnd - BufferPtr); 481239313Sdim formTokenWithChars(T, NextLine, tok::verbatim_block_line); 482239313Sdim T.setVerbatimBlockText(Text); 483239313Sdim 484239313Sdim State = LS_VerbatimBlockBody; 485239313Sdim} 486239313Sdim 487239313Sdimvoid Lexer::lexVerbatimBlockBody(Token &T) { 488239313Sdim assert(State == LS_VerbatimBlockBody); 489239313Sdim 490239313Sdim if (CommentState == LCS_InsideCComment) 491239313Sdim skipLineStartingDecorations(); 492239313Sdim 493239313Sdim lexVerbatimBlockFirstLine(T); 494239313Sdim} 495239313Sdim 496243830Sdimvoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 497243830Sdim const CommandInfo *Info) { 498243830Sdim assert(Info->IsVerbatimLineCommand); 499239313Sdim formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 500243830Sdim T.setVerbatimLineID(Info->getID()); 501239313Sdim 502239313Sdim State = LS_VerbatimLineText; 503239313Sdim} 504239313Sdim 505239313Sdimvoid Lexer::lexVerbatimLineText(Token &T) { 506239313Sdim assert(State == LS_VerbatimLineText); 507239313Sdim 508239313Sdim // Extract current line. 509239313Sdim const char *Newline = findNewline(BufferPtr, CommentEnd); 510239313Sdim const StringRef Text(BufferPtr, Newline - BufferPtr); 511239313Sdim formTokenWithChars(T, Newline, tok::verbatim_line_text); 512239313Sdim T.setVerbatimLineText(Text); 513239313Sdim 514239313Sdim State = LS_Normal; 515239313Sdim} 516239313Sdim 517239313Sdimvoid Lexer::lexHTMLCharacterReference(Token &T) { 518239313Sdim const char *TokenPtr = BufferPtr; 519239313Sdim assert(*TokenPtr == '&'); 520239313Sdim TokenPtr++; 521239313Sdim if (TokenPtr == CommentEnd) { 522239313Sdim formTextToken(T, TokenPtr); 523239313Sdim return; 524239313Sdim } 525239313Sdim const char *NamePtr; 526239313Sdim bool isNamed = false; 527239313Sdim bool isDecimal = false; 528239313Sdim char C = *TokenPtr; 529239313Sdim if (isHTMLNamedCharacterReferenceCharacter(C)) { 530239313Sdim NamePtr = TokenPtr; 531239313Sdim TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 532239313Sdim isNamed = true; 533239313Sdim } else if (C == '#') { 534239313Sdim TokenPtr++; 535239313Sdim if (TokenPtr == CommentEnd) { 536239313Sdim formTextToken(T, TokenPtr); 537239313Sdim return; 538239313Sdim } 539239313Sdim C = *TokenPtr; 540239313Sdim if (isHTMLDecimalCharacterReferenceCharacter(C)) { 541239313Sdim NamePtr = TokenPtr; 542239313Sdim TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 543239313Sdim isDecimal = true; 544239313Sdim } else if (C == 'x' || C == 'X') { 545239313Sdim TokenPtr++; 546239313Sdim NamePtr = TokenPtr; 547239313Sdim TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 548239313Sdim } else { 549239313Sdim formTextToken(T, TokenPtr); 550239313Sdim return; 551239313Sdim } 552239313Sdim } else { 553239313Sdim formTextToken(T, TokenPtr); 554239313Sdim return; 555239313Sdim } 556239313Sdim if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 557239313Sdim *TokenPtr != ';') { 558239313Sdim formTextToken(T, TokenPtr); 559239313Sdim return; 560239313Sdim } 561239313Sdim StringRef Name(NamePtr, TokenPtr - NamePtr); 562239313Sdim TokenPtr++; // Skip semicolon. 563239313Sdim StringRef Resolved; 564239313Sdim if (isNamed) 565239313Sdim Resolved = resolveHTMLNamedCharacterReference(Name); 566239313Sdim else if (isDecimal) 567239313Sdim Resolved = resolveHTMLDecimalCharacterReference(Name); 568239313Sdim else 569239313Sdim Resolved = resolveHTMLHexCharacterReference(Name); 570239313Sdim 571239313Sdim if (Resolved.empty()) { 572239313Sdim formTextToken(T, TokenPtr); 573239313Sdim return; 574239313Sdim } 575239313Sdim formTokenWithChars(T, TokenPtr, tok::text); 576239313Sdim T.setText(Resolved); 577239313Sdim return; 578239313Sdim} 579239313Sdim 580239313Sdimvoid Lexer::setupAndLexHTMLStartTag(Token &T) { 581239313Sdim assert(BufferPtr[0] == '<' && 582239313Sdim isHTMLIdentifierStartingCharacter(BufferPtr[1])); 583239313Sdim const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 584243830Sdim StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 585243830Sdim if (!isHTMLTagName(Name)) { 586243830Sdim formTextToken(T, TagNameEnd); 587243830Sdim return; 588243830Sdim } 589239313Sdim 590239313Sdim formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 591239313Sdim T.setHTMLTagStartName(Name); 592239313Sdim 593239313Sdim BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 594239313Sdim 595239313Sdim const char C = *BufferPtr; 596239313Sdim if (BufferPtr != CommentEnd && 597239313Sdim (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 598239313Sdim State = LS_HTMLStartTag; 599239313Sdim} 600239313Sdim 601239313Sdimvoid Lexer::lexHTMLStartTag(Token &T) { 602239313Sdim assert(State == LS_HTMLStartTag); 603239313Sdim 604239313Sdim const char *TokenPtr = BufferPtr; 605239313Sdim char C = *TokenPtr; 606239313Sdim if (isHTMLIdentifierCharacter(C)) { 607239313Sdim TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 608239313Sdim StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 609239313Sdim formTokenWithChars(T, TokenPtr, tok::html_ident); 610239313Sdim T.setHTMLIdent(Ident); 611239313Sdim } else { 612239313Sdim switch (C) { 613239313Sdim case '=': 614239313Sdim TokenPtr++; 615239313Sdim formTokenWithChars(T, TokenPtr, tok::html_equals); 616239313Sdim break; 617239313Sdim case '\"': 618239313Sdim case '\'': { 619239313Sdim const char *OpenQuote = TokenPtr; 620239313Sdim TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 621239313Sdim const char *ClosingQuote = TokenPtr; 622239313Sdim if (TokenPtr != CommentEnd) // Skip closing quote. 623239313Sdim TokenPtr++; 624239313Sdim formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 625239313Sdim T.setHTMLQuotedString(StringRef(OpenQuote + 1, 626239313Sdim ClosingQuote - (OpenQuote + 1))); 627239313Sdim break; 628239313Sdim } 629239313Sdim case '>': 630239313Sdim TokenPtr++; 631239313Sdim formTokenWithChars(T, TokenPtr, tok::html_greater); 632239313Sdim State = LS_Normal; 633239313Sdim return; 634239313Sdim case '/': 635239313Sdim TokenPtr++; 636239313Sdim if (TokenPtr != CommentEnd && *TokenPtr == '>') { 637239313Sdim TokenPtr++; 638239313Sdim formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 639239313Sdim } else 640239313Sdim formTextToken(T, TokenPtr); 641239313Sdim 642239313Sdim State = LS_Normal; 643239313Sdim return; 644239313Sdim } 645239313Sdim } 646239313Sdim 647239313Sdim // Now look ahead and return to normal state if we don't see any HTML tokens 648239313Sdim // ahead. 649239313Sdim BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 650239313Sdim if (BufferPtr == CommentEnd) { 651239313Sdim State = LS_Normal; 652239313Sdim return; 653239313Sdim } 654239313Sdim 655239313Sdim C = *BufferPtr; 656239313Sdim if (!isHTMLIdentifierStartingCharacter(C) && 657239313Sdim C != '=' && C != '\"' && C != '\'' && C != '>') { 658239313Sdim State = LS_Normal; 659239313Sdim return; 660239313Sdim } 661239313Sdim} 662239313Sdim 663239313Sdimvoid Lexer::setupAndLexHTMLEndTag(Token &T) { 664239313Sdim assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 665239313Sdim 666239313Sdim const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 667239313Sdim const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 668243830Sdim StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 669243830Sdim if (!isHTMLTagName(Name)) { 670243830Sdim formTextToken(T, TagNameEnd); 671243830Sdim return; 672243830Sdim } 673239313Sdim 674239313Sdim const char *End = skipWhitespace(TagNameEnd, CommentEnd); 675239313Sdim 676239313Sdim formTokenWithChars(T, End, tok::html_end_tag); 677243830Sdim T.setHTMLTagEndName(Name); 678239313Sdim 679239313Sdim if (BufferPtr != CommentEnd && *BufferPtr == '>') 680239313Sdim State = LS_HTMLEndTag; 681239313Sdim} 682239313Sdim 683239313Sdimvoid Lexer::lexHTMLEndTag(Token &T) { 684239313Sdim assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 685239313Sdim 686239313Sdim formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 687239313Sdim State = LS_Normal; 688239313Sdim} 689239313Sdim 690251662SdimLexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 691251662Sdim const CommandTraits &Traits, 692243830Sdim SourceLocation FileLoc, 693239313Sdim const char *BufferStart, const char *BufferEnd): 694251662Sdim Allocator(Allocator), Diags(Diags), Traits(Traits), 695239313Sdim BufferStart(BufferStart), BufferEnd(BufferEnd), 696243830Sdim FileLoc(FileLoc), BufferPtr(BufferStart), 697239313Sdim CommentState(LCS_BeforeComment), State(LS_Normal) { 698239313Sdim} 699239313Sdim 700239313Sdimvoid Lexer::lex(Token &T) { 701239313Sdimagain: 702239313Sdim switch (CommentState) { 703239313Sdim case LCS_BeforeComment: 704239313Sdim if (BufferPtr == BufferEnd) { 705239313Sdim formTokenWithChars(T, BufferPtr, tok::eof); 706239313Sdim return; 707239313Sdim } 708239313Sdim 709239313Sdim assert(*BufferPtr == '/'); 710239313Sdim BufferPtr++; // Skip first slash. 711239313Sdim switch(*BufferPtr) { 712239313Sdim case '/': { // BCPL comment. 713239313Sdim BufferPtr++; // Skip second slash. 714239313Sdim 715239313Sdim if (BufferPtr != BufferEnd) { 716239313Sdim // Skip Doxygen magic marker, if it is present. 717239313Sdim // It might be missing because of a typo //< or /*<, or because we 718239313Sdim // merged this non-Doxygen comment into a bunch of Doxygen comments 719239313Sdim // around it: /** ... */ /* ... */ /** ... */ 720239313Sdim const char C = *BufferPtr; 721239313Sdim if (C == '/' || C == '!') 722239313Sdim BufferPtr++; 723239313Sdim } 724239313Sdim 725239313Sdim // Skip less-than symbol that marks trailing comments. 726239313Sdim // Skip it even if the comment is not a Doxygen one, because //< and /*< 727239313Sdim // are frequent typos. 728239313Sdim if (BufferPtr != BufferEnd && *BufferPtr == '<') 729239313Sdim BufferPtr++; 730239313Sdim 731239313Sdim CommentState = LCS_InsideBCPLComment; 732239313Sdim if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 733239313Sdim State = LS_Normal; 734239313Sdim CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 735239313Sdim goto again; 736239313Sdim } 737239313Sdim case '*': { // C comment. 738239313Sdim BufferPtr++; // Skip star. 739239313Sdim 740239313Sdim // Skip Doxygen magic marker. 741239313Sdim const char C = *BufferPtr; 742239313Sdim if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 743239313Sdim BufferPtr++; 744239313Sdim 745239313Sdim // Skip less-than symbol that marks trailing comments. 746239313Sdim if (BufferPtr != BufferEnd && *BufferPtr == '<') 747239313Sdim BufferPtr++; 748239313Sdim 749239313Sdim CommentState = LCS_InsideCComment; 750239313Sdim State = LS_Normal; 751239313Sdim CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 752239313Sdim goto again; 753239313Sdim } 754239313Sdim default: 755239313Sdim llvm_unreachable("second character of comment should be '/' or '*'"); 756239313Sdim } 757239313Sdim 758239313Sdim case LCS_BetweenComments: { 759239313Sdim // Consecutive comments are extracted only if there is only whitespace 760239313Sdim // between them. So we can search for the start of the next comment. 761239313Sdim const char *EndWhitespace = BufferPtr; 762239313Sdim while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 763239313Sdim EndWhitespace++; 764239313Sdim 765239313Sdim // Turn any whitespace between comments (and there is only whitespace 766239313Sdim // between them -- guaranteed by comment extraction) into a newline. We 767239313Sdim // have two newlines between C comments in total (first one was synthesized 768239313Sdim // after a comment). 769239313Sdim formTokenWithChars(T, EndWhitespace, tok::newline); 770239313Sdim 771239313Sdim CommentState = LCS_BeforeComment; 772239313Sdim break; 773239313Sdim } 774239313Sdim 775239313Sdim case LCS_InsideBCPLComment: 776239313Sdim case LCS_InsideCComment: 777239313Sdim if (BufferPtr != CommentEnd) { 778239313Sdim lexCommentText(T); 779239313Sdim break; 780239313Sdim } else { 781239313Sdim // Skip C comment closing sequence. 782239313Sdim if (CommentState == LCS_InsideCComment) { 783239313Sdim assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 784239313Sdim BufferPtr += 2; 785239313Sdim assert(BufferPtr <= BufferEnd); 786239313Sdim 787239313Sdim // Synthenize newline just after the C comment, regardless if there is 788239313Sdim // actually a newline. 789239313Sdim formTokenWithChars(T, BufferPtr, tok::newline); 790239313Sdim 791239313Sdim CommentState = LCS_BetweenComments; 792239313Sdim break; 793239313Sdim } else { 794239313Sdim // Don't synthesized a newline after BCPL comment. 795239313Sdim CommentState = LCS_BetweenComments; 796239313Sdim goto again; 797239313Sdim } 798239313Sdim } 799239313Sdim } 800239313Sdim} 801239313Sdim 802239313SdimStringRef Lexer::getSpelling(const Token &Tok, 803239313Sdim const SourceManager &SourceMgr, 804239313Sdim bool *Invalid) const { 805239313Sdim SourceLocation Loc = Tok.getLocation(); 806239313Sdim std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 807239313Sdim 808239313Sdim bool InvalidTemp = false; 809239313Sdim StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 810239313Sdim if (InvalidTemp) { 811239313Sdim *Invalid = true; 812239313Sdim return StringRef(); 813239313Sdim } 814239313Sdim 815239313Sdim const char *Begin = File.data() + LocInfo.second; 816239313Sdim return StringRef(Begin, Tok.getLength()); 817239313Sdim} 818239313Sdim 819239313Sdim} // end namespace comments 820239313Sdim} // end namespace clang 821239313Sdim 822