CommentLexer.cpp revision 249423
1239313Sdim#include "clang/AST/CommentLexer.h" 2239313Sdim#include "clang/AST/CommentCommandTraits.h" 3249423Sdim#include "clang/Basic/CharInfo.h" 4249423Sdim#include "llvm/ADT/StringExtras.h" 5239313Sdim#include "llvm/ADT/StringSwitch.h" 6249423Sdim#include "llvm/Support/ConvertUTF.h" 7239313Sdim#include "llvm/Support/ErrorHandling.h" 8239313Sdim 9239313Sdimnamespace clang { 10239313Sdimnamespace comments { 11239313Sdim 12239313Sdimvoid Token::dump(const Lexer &L, const SourceManager &SM) const { 13239313Sdim llvm::errs() << "comments::Token Kind=" << Kind << " "; 14239313Sdim Loc.dump(SM); 15239313Sdim llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 16239313Sdim} 17239313Sdim 18249423Sdimstatic inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 19249423Sdim return isLetter(C); 20239313Sdim} 21239313Sdim 22249423Sdimstatic inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 23249423Sdim return isDigit(C); 24239313Sdim} 25239313Sdim 26249423Sdimstatic inline bool isHTMLHexCharacterReferenceCharacter(char C) { 27249423Sdim return isHexDigit(C); 28239313Sdim} 29243830Sdim 30249423Sdimstatic inline StringRef convertCodePointToUTF8( 31249423Sdim llvm::BumpPtrAllocator &Allocator, 32249423Sdim unsigned CodePoint) { 33249423Sdim char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 34249423Sdim char *ResolvedPtr = Resolved; 35249423Sdim if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 36249423Sdim return StringRef(Resolved, ResolvedPtr - Resolved); 37249423Sdim else 38249423Sdim return StringRef(); 39249423Sdim} 40249423Sdim 41249423Sdimnamespace { 42249423Sdim 43243830Sdim#include "clang/AST/CommentHTMLTags.inc" 44249423Sdim#include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 45243830Sdim 46239313Sdim} // unnamed namespace 47239313Sdim 48239313SdimStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 49249423Sdim // Fast path, first check a few most widely used named character references. 50239313Sdim return llvm::StringSwitch<StringRef>(Name) 51239313Sdim .Case("amp", "&") 52239313Sdim .Case("lt", "<") 53239313Sdim .Case("gt", ">") 54239313Sdim .Case("quot", "\"") 55239313Sdim .Case("apos", "\'") 56249423Sdim // Slow path. 57249423Sdim .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 58239313Sdim} 59239313Sdim 60239313SdimStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 61239313Sdim unsigned CodePoint = 0; 62239313Sdim for (unsigned i = 0, e = Name.size(); i != e; ++i) { 63239313Sdim assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 64239313Sdim CodePoint *= 10; 65239313Sdim CodePoint += Name[i] - '0'; 66239313Sdim } 67249423Sdim return convertCodePointToUTF8(Allocator, CodePoint); 68239313Sdim} 69239313Sdim 70239313SdimStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 71239313Sdim unsigned CodePoint = 0; 72239313Sdim for (unsigned i = 0, e = Name.size(); i != e; ++i) { 73239313Sdim CodePoint *= 16; 74239313Sdim const char C = Name[i]; 75239313Sdim assert(isHTMLHexCharacterReferenceCharacter(C)); 76249423Sdim CodePoint += llvm::hexDigitValue(C); 77239313Sdim } 78249423Sdim return convertCodePointToUTF8(Allocator, CodePoint); 79239313Sdim} 80239313Sdim 81239313Sdimvoid Lexer::skipLineStartingDecorations() { 82239313Sdim // This function should be called only for C comments 83239313Sdim assert(CommentState == LCS_InsideCComment); 84239313Sdim 85239313Sdim if (BufferPtr == CommentEnd) 86239313Sdim return; 87239313Sdim 88239313Sdim switch (*BufferPtr) { 89239313Sdim case ' ': 90239313Sdim case '\t': 91239313Sdim case '\f': 92239313Sdim case '\v': { 93239313Sdim const char *NewBufferPtr = BufferPtr; 94239313Sdim NewBufferPtr++; 95239313Sdim if (NewBufferPtr == CommentEnd) 96239313Sdim return; 97239313Sdim 98239313Sdim char C = *NewBufferPtr; 99249423Sdim while (isHorizontalWhitespace(C)) { 100239313Sdim NewBufferPtr++; 101239313Sdim if (NewBufferPtr == CommentEnd) 102239313Sdim return; 103239313Sdim C = *NewBufferPtr; 104239313Sdim } 105239313Sdim if (C == '*') 106239313Sdim BufferPtr = NewBufferPtr + 1; 107239313Sdim break; 108239313Sdim } 109239313Sdim case '*': 110239313Sdim BufferPtr++; 111239313Sdim break; 112239313Sdim } 113239313Sdim} 114239313Sdim 115239313Sdimnamespace { 116239313Sdim/// Returns pointer to the first newline character in the string. 117239313Sdimconst char *findNewline(const char *BufferPtr, const char *BufferEnd) { 118239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 119249423Sdim if (isVerticalWhitespace(*BufferPtr)) 120239313Sdim return BufferPtr; 121239313Sdim } 122239313Sdim return BufferEnd; 123239313Sdim} 124239313Sdim 125239313Sdimconst char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 126239313Sdim if (BufferPtr == BufferEnd) 127239313Sdim return BufferPtr; 128239313Sdim 129239313Sdim if (*BufferPtr == '\n') 130239313Sdim BufferPtr++; 131239313Sdim else { 132239313Sdim assert(*BufferPtr == '\r'); 133239313Sdim BufferPtr++; 134239313Sdim if (BufferPtr != BufferEnd && *BufferPtr == '\n') 135239313Sdim BufferPtr++; 136239313Sdim } 137239313Sdim return BufferPtr; 138239313Sdim} 139239313Sdim 140239313Sdimconst char *skipNamedCharacterReference(const char *BufferPtr, 141239313Sdim const char *BufferEnd) { 142239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 143239313Sdim if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 144239313Sdim return BufferPtr; 145239313Sdim } 146239313Sdim return BufferEnd; 147239313Sdim} 148239313Sdim 149239313Sdimconst char *skipDecimalCharacterReference(const char *BufferPtr, 150239313Sdim const char *BufferEnd) { 151239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 152239313Sdim if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 153239313Sdim return BufferPtr; 154239313Sdim } 155239313Sdim return BufferEnd; 156239313Sdim} 157239313Sdim 158239313Sdimconst char *skipHexCharacterReference(const char *BufferPtr, 159239313Sdim const char *BufferEnd) { 160239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 161239313Sdim if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 162239313Sdim return BufferPtr; 163239313Sdim } 164239313Sdim return BufferEnd; 165239313Sdim} 166239313Sdim 167239313Sdimbool isHTMLIdentifierStartingCharacter(char C) { 168249423Sdim return isLetter(C); 169239313Sdim} 170239313Sdim 171239313Sdimbool isHTMLIdentifierCharacter(char C) { 172249423Sdim return isAlphanumeric(C); 173239313Sdim} 174239313Sdim 175239313Sdimconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 176239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 177239313Sdim if (!isHTMLIdentifierCharacter(*BufferPtr)) 178239313Sdim return BufferPtr; 179239313Sdim } 180239313Sdim return BufferEnd; 181239313Sdim} 182239313Sdim 183239313Sdim/// Skip HTML string quoted in single or double quotes. Escaping quotes inside 184239313Sdim/// string allowed. 185239313Sdim/// 186239313Sdim/// Returns pointer to closing quote. 187239313Sdimconst char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 188239313Sdim{ 189239313Sdim const char Quote = *BufferPtr; 190239313Sdim assert(Quote == '\"' || Quote == '\''); 191239313Sdim 192239313Sdim BufferPtr++; 193239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 194239313Sdim const char C = *BufferPtr; 195239313Sdim if (C == Quote && BufferPtr[-1] != '\\') 196239313Sdim return BufferPtr; 197239313Sdim } 198239313Sdim return BufferEnd; 199239313Sdim} 200239313Sdim 201239313Sdimconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 202239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 203239313Sdim if (!isWhitespace(*BufferPtr)) 204239313Sdim return BufferPtr; 205239313Sdim } 206239313Sdim return BufferEnd; 207239313Sdim} 208239313Sdim 209239313Sdimbool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 210239313Sdim return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 211239313Sdim} 212239313Sdim 213243830Sdimbool isCommandNameStartCharacter(char C) { 214249423Sdim return isLetter(C); 215243830Sdim} 216243830Sdim 217239313Sdimbool isCommandNameCharacter(char C) { 218249423Sdim return isAlphanumeric(C); 219239313Sdim} 220239313Sdim 221239313Sdimconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 222239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 223239313Sdim if (!isCommandNameCharacter(*BufferPtr)) 224239313Sdim return BufferPtr; 225239313Sdim } 226239313Sdim return BufferEnd; 227239313Sdim} 228239313Sdim 229239313Sdim/// Return the one past end pointer for BCPL comments. 230239313Sdim/// Handles newlines escaped with backslash or trigraph for backslahs. 231239313Sdimconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 232239313Sdim const char *CurPtr = BufferPtr; 233239313Sdim while (CurPtr != BufferEnd) { 234249423Sdim while (!isVerticalWhitespace(*CurPtr)) { 235239313Sdim CurPtr++; 236239313Sdim if (CurPtr == BufferEnd) 237239313Sdim return BufferEnd; 238239313Sdim } 239239313Sdim // We found a newline, check if it is escaped. 240239313Sdim const char *EscapePtr = CurPtr - 1; 241239313Sdim while(isHorizontalWhitespace(*EscapePtr)) 242239313Sdim EscapePtr--; 243239313Sdim 244239313Sdim if (*EscapePtr == '\\' || 245239313Sdim (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 246239313Sdim EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 247239313Sdim // We found an escaped newline. 248239313Sdim CurPtr = skipNewline(CurPtr, BufferEnd); 249239313Sdim } else 250239313Sdim return CurPtr; // Not an escaped newline. 251239313Sdim } 252239313Sdim return BufferEnd; 253239313Sdim} 254239313Sdim 255239313Sdim/// Return the one past end pointer for C comments. 256239313Sdim/// Very dumb, does not handle escaped newlines or trigraphs. 257239313Sdimconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 258239313Sdim for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 259239313Sdim if (*BufferPtr == '*') { 260239313Sdim assert(BufferPtr + 1 != BufferEnd); 261239313Sdim if (*(BufferPtr + 1) == '/') 262239313Sdim return BufferPtr; 263239313Sdim } 264239313Sdim } 265239313Sdim llvm_unreachable("buffer end hit before '*/' was seen"); 266239313Sdim} 267239313Sdim} // unnamed namespace 268239313Sdim 269239313Sdimvoid Lexer::lexCommentText(Token &T) { 270239313Sdim assert(CommentState == LCS_InsideBCPLComment || 271239313Sdim CommentState == LCS_InsideCComment); 272239313Sdim 273239313Sdim switch (State) { 274239313Sdim case LS_Normal: 275239313Sdim break; 276239313Sdim case LS_VerbatimBlockFirstLine: 277239313Sdim lexVerbatimBlockFirstLine(T); 278239313Sdim return; 279239313Sdim case LS_VerbatimBlockBody: 280239313Sdim lexVerbatimBlockBody(T); 281239313Sdim return; 282239313Sdim case LS_VerbatimLineText: 283239313Sdim lexVerbatimLineText(T); 284239313Sdim return; 285239313Sdim case LS_HTMLStartTag: 286239313Sdim lexHTMLStartTag(T); 287239313Sdim return; 288239313Sdim case LS_HTMLEndTag: 289239313Sdim lexHTMLEndTag(T); 290239313Sdim return; 291239313Sdim } 292239313Sdim 293239313Sdim assert(State == LS_Normal); 294239313Sdim 295239313Sdim const char *TokenPtr = BufferPtr; 296239313Sdim assert(TokenPtr < CommentEnd); 297239313Sdim while (TokenPtr != CommentEnd) { 298239313Sdim switch(*TokenPtr) { 299239313Sdim case '\\': 300239313Sdim case '@': { 301249423Sdim // Commands that start with a backslash and commands that start with 302249423Sdim // 'at' have equivalent semantics. But we keep information about the 303249423Sdim // exact syntax in AST for comments. 304249423Sdim tok::TokenKind CommandKind = 305249423Sdim (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 306239313Sdim TokenPtr++; 307239313Sdim if (TokenPtr == CommentEnd) { 308239313Sdim formTextToken(T, TokenPtr); 309239313Sdim return; 310239313Sdim } 311239313Sdim char C = *TokenPtr; 312239313Sdim switch (C) { 313239313Sdim default: 314239313Sdim break; 315239313Sdim 316239313Sdim case '\\': case '@': case '&': case '$': 317239313Sdim case '#': case '<': case '>': case '%': 318239313Sdim case '\"': case '.': case ':': 319239313Sdim // This is one of \\ \@ \& \$ etc escape sequences. 320239313Sdim TokenPtr++; 321239313Sdim if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 322239313Sdim // This is the \:: escape sequence. 323239313Sdim TokenPtr++; 324239313Sdim } 325239313Sdim StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 326239313Sdim formTokenWithChars(T, TokenPtr, tok::text); 327239313Sdim T.setText(UnescapedText); 328239313Sdim return; 329239313Sdim } 330239313Sdim 331239313Sdim // Don't make zero-length commands. 332243830Sdim if (!isCommandNameStartCharacter(*TokenPtr)) { 333239313Sdim formTextToken(T, TokenPtr); 334239313Sdim return; 335239313Sdim } 336239313Sdim 337239313Sdim TokenPtr = skipCommandName(TokenPtr, CommentEnd); 338239313Sdim unsigned Length = TokenPtr - (BufferPtr + 1); 339239313Sdim 340239313Sdim // Hardcoded support for lexing LaTeX formula commands 341239313Sdim // \f$ \f[ \f] \f{ \f} as a single command. 342239313Sdim if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 343239313Sdim C = *TokenPtr; 344239313Sdim if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 345239313Sdim TokenPtr++; 346239313Sdim Length++; 347239313Sdim } 348239313Sdim } 349239313Sdim 350239313Sdim const StringRef CommandName(BufferPtr + 1, Length); 351239313Sdim 352243830Sdim const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 353243830Sdim if (!Info) { 354243830Sdim formTokenWithChars(T, TokenPtr, tok::unknown_command); 355243830Sdim T.setUnknownCommandName(CommandName); 356239313Sdim return; 357239313Sdim } 358243830Sdim if (Info->IsVerbatimBlockCommand) { 359243830Sdim setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 360239313Sdim return; 361239313Sdim } 362243830Sdim if (Info->IsVerbatimLineCommand) { 363243830Sdim setupAndLexVerbatimLine(T, TokenPtr, Info); 364243830Sdim return; 365243830Sdim } 366249423Sdim formTokenWithChars(T, TokenPtr, CommandKind); 367243830Sdim T.setCommandID(Info->getID()); 368239313Sdim return; 369239313Sdim } 370239313Sdim 371239313Sdim case '&': 372239313Sdim lexHTMLCharacterReference(T); 373239313Sdim return; 374239313Sdim 375239313Sdim case '<': { 376239313Sdim TokenPtr++; 377239313Sdim if (TokenPtr == CommentEnd) { 378239313Sdim formTextToken(T, TokenPtr); 379239313Sdim return; 380239313Sdim } 381239313Sdim const char C = *TokenPtr; 382239313Sdim if (isHTMLIdentifierStartingCharacter(C)) 383239313Sdim setupAndLexHTMLStartTag(T); 384239313Sdim else if (C == '/') 385239313Sdim setupAndLexHTMLEndTag(T); 386239313Sdim else 387239313Sdim formTextToken(T, TokenPtr); 388239313Sdim 389239313Sdim return; 390239313Sdim } 391239313Sdim 392239313Sdim case '\n': 393239313Sdim case '\r': 394239313Sdim TokenPtr = skipNewline(TokenPtr, CommentEnd); 395239313Sdim formTokenWithChars(T, TokenPtr, tok::newline); 396239313Sdim 397239313Sdim if (CommentState == LCS_InsideCComment) 398239313Sdim skipLineStartingDecorations(); 399239313Sdim return; 400239313Sdim 401239313Sdim default: { 402249423Sdim size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 403249423Sdim find_first_of("\n\r\\@&<"); 404249423Sdim if (End != StringRef::npos) 405249423Sdim TokenPtr += End; 406249423Sdim else 407249423Sdim TokenPtr = CommentEnd; 408239313Sdim formTextToken(T, TokenPtr); 409239313Sdim return; 410239313Sdim } 411239313Sdim } 412239313Sdim } 413239313Sdim} 414239313Sdim 415239313Sdimvoid Lexer::setupAndLexVerbatimBlock(Token &T, 416239313Sdim const char *TextBegin, 417243830Sdim char Marker, const CommandInfo *Info) { 418243830Sdim assert(Info->IsVerbatimBlockCommand); 419243830Sdim 420239313Sdim VerbatimBlockEndCommandName.clear(); 421239313Sdim VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 422243830Sdim VerbatimBlockEndCommandName.append(Info->EndCommandName); 423239313Sdim 424239313Sdim formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 425243830Sdim T.setVerbatimBlockID(Info->getID()); 426239313Sdim 427239313Sdim // If there is a newline following the verbatim opening command, skip the 428239313Sdim // newline so that we don't create an tok::verbatim_block_line with empty 429239313Sdim // text content. 430249423Sdim if (BufferPtr != CommentEnd && 431249423Sdim isVerticalWhitespace(*BufferPtr)) { 432249423Sdim BufferPtr = skipNewline(BufferPtr, CommentEnd); 433249423Sdim State = LS_VerbatimBlockBody; 434249423Sdim return; 435239313Sdim } 436239313Sdim 437239313Sdim State = LS_VerbatimBlockFirstLine; 438239313Sdim} 439239313Sdim 440239313Sdimvoid Lexer::lexVerbatimBlockFirstLine(Token &T) { 441239313Sdimagain: 442239313Sdim assert(BufferPtr < CommentEnd); 443239313Sdim 444239313Sdim // FIXME: It would be better to scan the text once, finding either the block 445239313Sdim // end command or newline. 446239313Sdim // 447239313Sdim // Extract current line. 448239313Sdim const char *Newline = findNewline(BufferPtr, CommentEnd); 449239313Sdim StringRef Line(BufferPtr, Newline - BufferPtr); 450239313Sdim 451239313Sdim // Look for end command in current line. 452239313Sdim size_t Pos = Line.find(VerbatimBlockEndCommandName); 453239313Sdim const char *TextEnd; 454239313Sdim const char *NextLine; 455239313Sdim if (Pos == StringRef::npos) { 456239313Sdim // Current line is completely verbatim. 457239313Sdim TextEnd = Newline; 458239313Sdim NextLine = skipNewline(Newline, CommentEnd); 459239313Sdim } else if (Pos == 0) { 460239313Sdim // Current line contains just an end command. 461239313Sdim const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 462239313Sdim StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 463239313Sdim formTokenWithChars(T, End, tok::verbatim_block_end); 464243830Sdim T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 465239313Sdim State = LS_Normal; 466239313Sdim return; 467239313Sdim } else { 468239313Sdim // There is some text, followed by end command. Extract text first. 469239313Sdim TextEnd = BufferPtr + Pos; 470239313Sdim NextLine = TextEnd; 471239313Sdim // If there is only whitespace before end command, skip whitespace. 472239313Sdim if (isWhitespace(BufferPtr, TextEnd)) { 473239313Sdim BufferPtr = TextEnd; 474239313Sdim goto again; 475239313Sdim } 476239313Sdim } 477239313Sdim 478239313Sdim StringRef Text(BufferPtr, TextEnd - BufferPtr); 479239313Sdim formTokenWithChars(T, NextLine, tok::verbatim_block_line); 480239313Sdim T.setVerbatimBlockText(Text); 481239313Sdim 482239313Sdim State = LS_VerbatimBlockBody; 483239313Sdim} 484239313Sdim 485239313Sdimvoid Lexer::lexVerbatimBlockBody(Token &T) { 486239313Sdim assert(State == LS_VerbatimBlockBody); 487239313Sdim 488239313Sdim if (CommentState == LCS_InsideCComment) 489239313Sdim skipLineStartingDecorations(); 490239313Sdim 491239313Sdim lexVerbatimBlockFirstLine(T); 492239313Sdim} 493239313Sdim 494243830Sdimvoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 495243830Sdim const CommandInfo *Info) { 496243830Sdim assert(Info->IsVerbatimLineCommand); 497239313Sdim formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 498243830Sdim T.setVerbatimLineID(Info->getID()); 499239313Sdim 500239313Sdim State = LS_VerbatimLineText; 501239313Sdim} 502239313Sdim 503239313Sdimvoid Lexer::lexVerbatimLineText(Token &T) { 504239313Sdim assert(State == LS_VerbatimLineText); 505239313Sdim 506239313Sdim // Extract current line. 507239313Sdim const char *Newline = findNewline(BufferPtr, CommentEnd); 508239313Sdim const StringRef Text(BufferPtr, Newline - BufferPtr); 509239313Sdim formTokenWithChars(T, Newline, tok::verbatim_line_text); 510239313Sdim T.setVerbatimLineText(Text); 511239313Sdim 512239313Sdim State = LS_Normal; 513239313Sdim} 514239313Sdim 515239313Sdimvoid Lexer::lexHTMLCharacterReference(Token &T) { 516239313Sdim const char *TokenPtr = BufferPtr; 517239313Sdim assert(*TokenPtr == '&'); 518239313Sdim TokenPtr++; 519239313Sdim if (TokenPtr == CommentEnd) { 520239313Sdim formTextToken(T, TokenPtr); 521239313Sdim return; 522239313Sdim } 523239313Sdim const char *NamePtr; 524239313Sdim bool isNamed = false; 525239313Sdim bool isDecimal = false; 526239313Sdim char C = *TokenPtr; 527239313Sdim if (isHTMLNamedCharacterReferenceCharacter(C)) { 528239313Sdim NamePtr = TokenPtr; 529239313Sdim TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 530239313Sdim isNamed = true; 531239313Sdim } else if (C == '#') { 532239313Sdim TokenPtr++; 533239313Sdim if (TokenPtr == CommentEnd) { 534239313Sdim formTextToken(T, TokenPtr); 535239313Sdim return; 536239313Sdim } 537239313Sdim C = *TokenPtr; 538239313Sdim if (isHTMLDecimalCharacterReferenceCharacter(C)) { 539239313Sdim NamePtr = TokenPtr; 540239313Sdim TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 541239313Sdim isDecimal = true; 542239313Sdim } else if (C == 'x' || C == 'X') { 543239313Sdim TokenPtr++; 544239313Sdim NamePtr = TokenPtr; 545239313Sdim TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 546239313Sdim } else { 547239313Sdim formTextToken(T, TokenPtr); 548239313Sdim return; 549239313Sdim } 550239313Sdim } else { 551239313Sdim formTextToken(T, TokenPtr); 552239313Sdim return; 553239313Sdim } 554239313Sdim if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 555239313Sdim *TokenPtr != ';') { 556239313Sdim formTextToken(T, TokenPtr); 557239313Sdim return; 558239313Sdim } 559239313Sdim StringRef Name(NamePtr, TokenPtr - NamePtr); 560239313Sdim TokenPtr++; // Skip semicolon. 561239313Sdim StringRef Resolved; 562239313Sdim if (isNamed) 563239313Sdim Resolved = resolveHTMLNamedCharacterReference(Name); 564239313Sdim else if (isDecimal) 565239313Sdim Resolved = resolveHTMLDecimalCharacterReference(Name); 566239313Sdim else 567239313Sdim Resolved = resolveHTMLHexCharacterReference(Name); 568239313Sdim 569239313Sdim if (Resolved.empty()) { 570239313Sdim formTextToken(T, TokenPtr); 571239313Sdim return; 572239313Sdim } 573239313Sdim formTokenWithChars(T, TokenPtr, tok::text); 574239313Sdim T.setText(Resolved); 575239313Sdim return; 576239313Sdim} 577239313Sdim 578239313Sdimvoid Lexer::setupAndLexHTMLStartTag(Token &T) { 579239313Sdim assert(BufferPtr[0] == '<' && 580239313Sdim isHTMLIdentifierStartingCharacter(BufferPtr[1])); 581239313Sdim const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 582243830Sdim StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 583243830Sdim if (!isHTMLTagName(Name)) { 584243830Sdim formTextToken(T, TagNameEnd); 585243830Sdim return; 586243830Sdim } 587239313Sdim 588239313Sdim formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 589239313Sdim T.setHTMLTagStartName(Name); 590239313Sdim 591239313Sdim BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 592239313Sdim 593239313Sdim const char C = *BufferPtr; 594239313Sdim if (BufferPtr != CommentEnd && 595239313Sdim (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 596239313Sdim State = LS_HTMLStartTag; 597239313Sdim} 598239313Sdim 599239313Sdimvoid Lexer::lexHTMLStartTag(Token &T) { 600239313Sdim assert(State == LS_HTMLStartTag); 601239313Sdim 602239313Sdim const char *TokenPtr = BufferPtr; 603239313Sdim char C = *TokenPtr; 604239313Sdim if (isHTMLIdentifierCharacter(C)) { 605239313Sdim TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 606239313Sdim StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 607239313Sdim formTokenWithChars(T, TokenPtr, tok::html_ident); 608239313Sdim T.setHTMLIdent(Ident); 609239313Sdim } else { 610239313Sdim switch (C) { 611239313Sdim case '=': 612239313Sdim TokenPtr++; 613239313Sdim formTokenWithChars(T, TokenPtr, tok::html_equals); 614239313Sdim break; 615239313Sdim case '\"': 616239313Sdim case '\'': { 617239313Sdim const char *OpenQuote = TokenPtr; 618239313Sdim TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 619239313Sdim const char *ClosingQuote = TokenPtr; 620239313Sdim if (TokenPtr != CommentEnd) // Skip closing quote. 621239313Sdim TokenPtr++; 622239313Sdim formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 623239313Sdim T.setHTMLQuotedString(StringRef(OpenQuote + 1, 624239313Sdim ClosingQuote - (OpenQuote + 1))); 625239313Sdim break; 626239313Sdim } 627239313Sdim case '>': 628239313Sdim TokenPtr++; 629239313Sdim formTokenWithChars(T, TokenPtr, tok::html_greater); 630239313Sdim State = LS_Normal; 631239313Sdim return; 632239313Sdim case '/': 633239313Sdim TokenPtr++; 634239313Sdim if (TokenPtr != CommentEnd && *TokenPtr == '>') { 635239313Sdim TokenPtr++; 636239313Sdim formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 637239313Sdim } else 638239313Sdim formTextToken(T, TokenPtr); 639239313Sdim 640239313Sdim State = LS_Normal; 641239313Sdim return; 642239313Sdim } 643239313Sdim } 644239313Sdim 645239313Sdim // Now look ahead and return to normal state if we don't see any HTML tokens 646239313Sdim // ahead. 647239313Sdim BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 648239313Sdim if (BufferPtr == CommentEnd) { 649239313Sdim State = LS_Normal; 650239313Sdim return; 651239313Sdim } 652239313Sdim 653239313Sdim C = *BufferPtr; 654239313Sdim if (!isHTMLIdentifierStartingCharacter(C) && 655239313Sdim C != '=' && C != '\"' && C != '\'' && C != '>') { 656239313Sdim State = LS_Normal; 657239313Sdim return; 658239313Sdim } 659239313Sdim} 660239313Sdim 661239313Sdimvoid Lexer::setupAndLexHTMLEndTag(Token &T) { 662239313Sdim assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 663239313Sdim 664239313Sdim const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 665239313Sdim const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 666243830Sdim StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 667243830Sdim if (!isHTMLTagName(Name)) { 668243830Sdim formTextToken(T, TagNameEnd); 669243830Sdim return; 670243830Sdim } 671239313Sdim 672239313Sdim const char *End = skipWhitespace(TagNameEnd, CommentEnd); 673239313Sdim 674239313Sdim formTokenWithChars(T, End, tok::html_end_tag); 675243830Sdim T.setHTMLTagEndName(Name); 676239313Sdim 677239313Sdim if (BufferPtr != CommentEnd && *BufferPtr == '>') 678239313Sdim State = LS_HTMLEndTag; 679239313Sdim} 680239313Sdim 681239313Sdimvoid Lexer::lexHTMLEndTag(Token &T) { 682239313Sdim assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 683239313Sdim 684239313Sdim formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 685239313Sdim State = LS_Normal; 686239313Sdim} 687239313Sdim 688239313SdimLexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits, 689243830Sdim SourceLocation FileLoc, 690239313Sdim const char *BufferStart, const char *BufferEnd): 691239313Sdim Allocator(Allocator), Traits(Traits), 692239313Sdim BufferStart(BufferStart), BufferEnd(BufferEnd), 693243830Sdim FileLoc(FileLoc), BufferPtr(BufferStart), 694239313Sdim CommentState(LCS_BeforeComment), State(LS_Normal) { 695239313Sdim} 696239313Sdim 697239313Sdimvoid Lexer::lex(Token &T) { 698239313Sdimagain: 699239313Sdim switch (CommentState) { 700239313Sdim case LCS_BeforeComment: 701239313Sdim if (BufferPtr == BufferEnd) { 702239313Sdim formTokenWithChars(T, BufferPtr, tok::eof); 703239313Sdim return; 704239313Sdim } 705239313Sdim 706239313Sdim assert(*BufferPtr == '/'); 707239313Sdim BufferPtr++; // Skip first slash. 708239313Sdim switch(*BufferPtr) { 709239313Sdim case '/': { // BCPL comment. 710239313Sdim BufferPtr++; // Skip second slash. 711239313Sdim 712239313Sdim if (BufferPtr != BufferEnd) { 713239313Sdim // Skip Doxygen magic marker, if it is present. 714239313Sdim // It might be missing because of a typo //< or /*<, or because we 715239313Sdim // merged this non-Doxygen comment into a bunch of Doxygen comments 716239313Sdim // around it: /** ... */ /* ... */ /** ... */ 717239313Sdim const char C = *BufferPtr; 718239313Sdim if (C == '/' || C == '!') 719239313Sdim BufferPtr++; 720239313Sdim } 721239313Sdim 722239313Sdim // Skip less-than symbol that marks trailing comments. 723239313Sdim // Skip it even if the comment is not a Doxygen one, because //< and /*< 724239313Sdim // are frequent typos. 725239313Sdim if (BufferPtr != BufferEnd && *BufferPtr == '<') 726239313Sdim BufferPtr++; 727239313Sdim 728239313Sdim CommentState = LCS_InsideBCPLComment; 729239313Sdim if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 730239313Sdim State = LS_Normal; 731239313Sdim CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 732239313Sdim goto again; 733239313Sdim } 734239313Sdim case '*': { // C comment. 735239313Sdim BufferPtr++; // Skip star. 736239313Sdim 737239313Sdim // Skip Doxygen magic marker. 738239313Sdim const char C = *BufferPtr; 739239313Sdim if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 740239313Sdim BufferPtr++; 741239313Sdim 742239313Sdim // Skip less-than symbol that marks trailing comments. 743239313Sdim if (BufferPtr != BufferEnd && *BufferPtr == '<') 744239313Sdim BufferPtr++; 745239313Sdim 746239313Sdim CommentState = LCS_InsideCComment; 747239313Sdim State = LS_Normal; 748239313Sdim CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 749239313Sdim goto again; 750239313Sdim } 751239313Sdim default: 752239313Sdim llvm_unreachable("second character of comment should be '/' or '*'"); 753239313Sdim } 754239313Sdim 755239313Sdim case LCS_BetweenComments: { 756239313Sdim // Consecutive comments are extracted only if there is only whitespace 757239313Sdim // between them. So we can search for the start of the next comment. 758239313Sdim const char *EndWhitespace = BufferPtr; 759239313Sdim while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 760239313Sdim EndWhitespace++; 761239313Sdim 762239313Sdim // Turn any whitespace between comments (and there is only whitespace 763239313Sdim // between them -- guaranteed by comment extraction) into a newline. We 764239313Sdim // have two newlines between C comments in total (first one was synthesized 765239313Sdim // after a comment). 766239313Sdim formTokenWithChars(T, EndWhitespace, tok::newline); 767239313Sdim 768239313Sdim CommentState = LCS_BeforeComment; 769239313Sdim break; 770239313Sdim } 771239313Sdim 772239313Sdim case LCS_InsideBCPLComment: 773239313Sdim case LCS_InsideCComment: 774239313Sdim if (BufferPtr != CommentEnd) { 775239313Sdim lexCommentText(T); 776239313Sdim break; 777239313Sdim } else { 778239313Sdim // Skip C comment closing sequence. 779239313Sdim if (CommentState == LCS_InsideCComment) { 780239313Sdim assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 781239313Sdim BufferPtr += 2; 782239313Sdim assert(BufferPtr <= BufferEnd); 783239313Sdim 784239313Sdim // Synthenize newline just after the C comment, regardless if there is 785239313Sdim // actually a newline. 786239313Sdim formTokenWithChars(T, BufferPtr, tok::newline); 787239313Sdim 788239313Sdim CommentState = LCS_BetweenComments; 789239313Sdim break; 790239313Sdim } else { 791239313Sdim // Don't synthesized a newline after BCPL comment. 792239313Sdim CommentState = LCS_BetweenComments; 793239313Sdim goto again; 794239313Sdim } 795239313Sdim } 796239313Sdim } 797239313Sdim} 798239313Sdim 799239313SdimStringRef Lexer::getSpelling(const Token &Tok, 800239313Sdim const SourceManager &SourceMgr, 801239313Sdim bool *Invalid) const { 802239313Sdim SourceLocation Loc = Tok.getLocation(); 803239313Sdim std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 804239313Sdim 805239313Sdim bool InvalidTemp = false; 806239313Sdim StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 807239313Sdim if (InvalidTemp) { 808239313Sdim *Invalid = true; 809239313Sdim return StringRef(); 810239313Sdim } 811239313Sdim 812239313Sdim const char *Begin = File.data() + LocInfo.second; 813239313Sdim return StringRef(Begin, Tok.getLength()); 814239313Sdim} 815239313Sdim 816239313Sdim} // end namespace comments 817239313Sdim} // end namespace clang 818239313Sdim 819