1239313Sdim#include "clang/AST/CommentLexer.h"
2239313Sdim#include "clang/AST/CommentCommandTraits.h"
3251662Sdim#include "clang/AST/CommentDiagnostic.h"
4249423Sdim#include "clang/Basic/CharInfo.h"
5249423Sdim#include "llvm/ADT/StringExtras.h"
6239313Sdim#include "llvm/ADT/StringSwitch.h"
7249423Sdim#include "llvm/Support/ConvertUTF.h"
8239313Sdim#include "llvm/Support/ErrorHandling.h"
9239313Sdim
10239313Sdimnamespace clang {
11239313Sdimnamespace comments {
12239313Sdim
13239313Sdimvoid Token::dump(const Lexer &L, const SourceManager &SM) const {
14239313Sdim  llvm::errs() << "comments::Token Kind=" << Kind << " ";
15239313Sdim  Loc.dump(SM);
16239313Sdim  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
17239313Sdim}
18239313Sdim
19249423Sdimstatic inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
20249423Sdim  return isLetter(C);
21239313Sdim}
22239313Sdim
23249423Sdimstatic inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
24249423Sdim  return isDigit(C);
25239313Sdim}
26239313Sdim
27249423Sdimstatic inline bool isHTMLHexCharacterReferenceCharacter(char C) {
28249423Sdim  return isHexDigit(C);
29239313Sdim}
30243830Sdim
31249423Sdimstatic inline StringRef convertCodePointToUTF8(
32249423Sdim                                      llvm::BumpPtrAllocator &Allocator,
33249423Sdim                                      unsigned CodePoint) {
34249423Sdim  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35249423Sdim  char *ResolvedPtr = Resolved;
36249423Sdim  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37249423Sdim    return StringRef(Resolved, ResolvedPtr - Resolved);
38249423Sdim  else
39249423Sdim    return StringRef();
40249423Sdim}
41249423Sdim
42249423Sdimnamespace {
43249423Sdim
44243830Sdim#include "clang/AST/CommentHTMLTags.inc"
45249423Sdim#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
46243830Sdim
47239313Sdim} // unnamed namespace
48239313Sdim
49239313SdimStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
50249423Sdim  // Fast path, first check a few most widely used named character references.
51239313Sdim  return llvm::StringSwitch<StringRef>(Name)
52239313Sdim      .Case("amp", "&")
53239313Sdim      .Case("lt", "<")
54239313Sdim      .Case("gt", ">")
55239313Sdim      .Case("quot", "\"")
56239313Sdim      .Case("apos", "\'")
57249423Sdim      // Slow path.
58249423Sdim      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59239313Sdim}
60239313Sdim
61239313SdimStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62239313Sdim  unsigned CodePoint = 0;
63239313Sdim  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64239313Sdim    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65239313Sdim    CodePoint *= 10;
66239313Sdim    CodePoint += Name[i] - '0';
67239313Sdim  }
68249423Sdim  return convertCodePointToUTF8(Allocator, CodePoint);
69239313Sdim}
70239313Sdim
71239313SdimStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
72239313Sdim  unsigned CodePoint = 0;
73239313Sdim  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74239313Sdim    CodePoint *= 16;
75239313Sdim    const char C = Name[i];
76239313Sdim    assert(isHTMLHexCharacterReferenceCharacter(C));
77249423Sdim    CodePoint += llvm::hexDigitValue(C);
78239313Sdim  }
79249423Sdim  return convertCodePointToUTF8(Allocator, CodePoint);
80239313Sdim}
81239313Sdim
82239313Sdimvoid Lexer::skipLineStartingDecorations() {
83239313Sdim  // This function should be called only for C comments
84239313Sdim  assert(CommentState == LCS_InsideCComment);
85239313Sdim
86239313Sdim  if (BufferPtr == CommentEnd)
87239313Sdim    return;
88239313Sdim
89239313Sdim  switch (*BufferPtr) {
90239313Sdim  case ' ':
91239313Sdim  case '\t':
92239313Sdim  case '\f':
93239313Sdim  case '\v': {
94239313Sdim    const char *NewBufferPtr = BufferPtr;
95239313Sdim    NewBufferPtr++;
96239313Sdim    if (NewBufferPtr == CommentEnd)
97239313Sdim      return;
98239313Sdim
99239313Sdim    char C = *NewBufferPtr;
100249423Sdim    while (isHorizontalWhitespace(C)) {
101239313Sdim      NewBufferPtr++;
102239313Sdim      if (NewBufferPtr == CommentEnd)
103239313Sdim        return;
104239313Sdim      C = *NewBufferPtr;
105239313Sdim    }
106239313Sdim    if (C == '*')
107239313Sdim      BufferPtr = NewBufferPtr + 1;
108239313Sdim    break;
109239313Sdim  }
110239313Sdim  case '*':
111239313Sdim    BufferPtr++;
112239313Sdim    break;
113239313Sdim  }
114239313Sdim}
115239313Sdim
116239313Sdimnamespace {
117239313Sdim/// Returns pointer to the first newline character in the string.
118239313Sdimconst char *findNewline(const char *BufferPtr, const char *BufferEnd) {
119239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120249423Sdim    if (isVerticalWhitespace(*BufferPtr))
121239313Sdim      return BufferPtr;
122239313Sdim  }
123239313Sdim  return BufferEnd;
124239313Sdim}
125239313Sdim
/// Skip one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \returns the position just past the terminator, or \p BufferPtr unchanged
/// when it already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  const char *Next = BufferPtr + 1;
  if (First == '\n')
    return Next;

  // Otherwise this must be a carriage return, optionally followed by a
  // line feed that belongs to the same terminator.
  assert(First == '\r');
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
140239313Sdim
141239313Sdimconst char *skipNamedCharacterReference(const char *BufferPtr,
142239313Sdim                                        const char *BufferEnd) {
143239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144239313Sdim    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145239313Sdim      return BufferPtr;
146239313Sdim  }
147239313Sdim  return BufferEnd;
148239313Sdim}
149239313Sdim
150239313Sdimconst char *skipDecimalCharacterReference(const char *BufferPtr,
151239313Sdim                                          const char *BufferEnd) {
152239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153239313Sdim    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154239313Sdim      return BufferPtr;
155239313Sdim  }
156239313Sdim  return BufferEnd;
157239313Sdim}
158239313Sdim
159239313Sdimconst char *skipHexCharacterReference(const char *BufferPtr,
160263508Sdim                                      const char *BufferEnd) {
161239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162239313Sdim    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163239313Sdim      return BufferPtr;
164239313Sdim  }
165239313Sdim  return BufferEnd;
166239313Sdim}
167239313Sdim
168239313Sdimbool isHTMLIdentifierStartingCharacter(char C) {
169249423Sdim  return isLetter(C);
170239313Sdim}
171239313Sdim
172239313Sdimbool isHTMLIdentifierCharacter(char C) {
173249423Sdim  return isAlphanumeric(C);
174239313Sdim}
175239313Sdim
176239313Sdimconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
177239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178239313Sdim    if (!isHTMLIdentifierCharacter(*BufferPtr))
179239313Sdim      return BufferPtr;
180239313Sdim  }
181239313Sdim  return BufferEnd;
182239313Sdim}
183239313Sdim
/// Skip an HTML string delimited by single or double quotes.  \p BufferPtr
/// must point at the opening quote.  A quote character that is immediately
/// preceded by a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool IsUnescapedDelimiter = *Cur == Delimiter && Cur[-1] != '\\';
    if (IsUnescapedDelimiter)
      break;
    ++Cur;
  }
  return Cur;
}
201239313Sdim
202239313Sdimconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
203239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204239313Sdim    if (!isWhitespace(*BufferPtr))
205239313Sdim      return BufferPtr;
206239313Sdim  }
207239313Sdim  return BufferEnd;
208239313Sdim}
209239313Sdim
210239313Sdimbool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
211239313Sdim  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
212239313Sdim}
213239313Sdim
214243830Sdimbool isCommandNameStartCharacter(char C) {
215249423Sdim  return isLetter(C);
216243830Sdim}
217243830Sdim
218239313Sdimbool isCommandNameCharacter(char C) {
219249423Sdim  return isAlphanumeric(C);
220239313Sdim}
221239313Sdim
222239313Sdimconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
223239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224239313Sdim    if (!isCommandNameCharacter(*BufferPtr))
225239313Sdim      return BufferPtr;
226239313Sdim  }
227239313Sdim  return BufferEnd;
228239313Sdim}
229239313Sdim
230239313Sdim/// Return the one past end pointer for BCPL comments.
231239313Sdim/// Handles newlines escaped with backslash or trigraph for backslahs.
232239313Sdimconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
233239313Sdim  const char *CurPtr = BufferPtr;
234239313Sdim  while (CurPtr != BufferEnd) {
235249423Sdim    while (!isVerticalWhitespace(*CurPtr)) {
236239313Sdim      CurPtr++;
237239313Sdim      if (CurPtr == BufferEnd)
238239313Sdim        return BufferEnd;
239239313Sdim    }
240239313Sdim    // We found a newline, check if it is escaped.
241239313Sdim    const char *EscapePtr = CurPtr - 1;
242239313Sdim    while(isHorizontalWhitespace(*EscapePtr))
243239313Sdim      EscapePtr--;
244239313Sdim
245239313Sdim    if (*EscapePtr == '\\' ||
246239313Sdim        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
247239313Sdim         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
248239313Sdim      // We found an escaped newline.
249239313Sdim      CurPtr = skipNewline(CurPtr, BufferEnd);
250239313Sdim    } else
251239313Sdim      return CurPtr; // Not an escaped newline.
252239313Sdim  }
253239313Sdim  return BufferEnd;
254239313Sdim}
255239313Sdim
256239313Sdim/// Return the one past end pointer for C comments.
257239313Sdim/// Very dumb, does not handle escaped newlines or trigraphs.
258239313Sdimconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
259239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260239313Sdim    if (*BufferPtr == '*') {
261239313Sdim      assert(BufferPtr + 1 != BufferEnd);
262239313Sdim      if (*(BufferPtr + 1) == '/')
263239313Sdim        return BufferPtr;
264239313Sdim    }
265239313Sdim  }
266239313Sdim  llvm_unreachable("buffer end hit before '*/' was seen");
267239313Sdim}
268263508Sdim
269239313Sdim} // unnamed namespace
270239313Sdim
/// Lex one token from the body of the current comment, starting at BufferPtr.
///
/// When the lexer is in one of the verbatim or HTML tag states the work is
/// delegated to the matching helper.  In the normal state the first character
/// decides what is produced: a command or escape sequence ('\\' or '@'), an
/// HTML character reference ('&'), an HTML tag ('<'), a newline, or a run of
/// plain text.  Every path produces exactly one token in \p T and returns.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Non-normal states have dedicated lexing routines.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);

  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  // Every case below returns, so this loop body runs at most once.
  while (TokenPtr != CommentEnd) {
    switch(*TokenPtr) {
      case '\\':
      case '@': {
        // Commands that start with a backslash and commands that start with
        // 'at' have equivalent semantics.  But we keep information about the
        // exact syntax in AST for comments.
        tok::TokenKind CommandKind =
            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
        TokenPtr++;
        // A lone marker at the very end of the comment is plain text.
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        char C = *TokenPtr;
        switch (C) {
        default:
          break;

        case '\\': case '@': case '&': case '$':
        case '#':  case '<': case '>': case '%':
        case '\"': case '.': case ':':
          // This is one of \\ \@ \& \$ etc escape sequences.
          TokenPtr++;
          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
            // This is the \:: escape sequence.
            TokenPtr++;
          }
          // The token covers the marker too, but its text omits it.
          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(UnescapedText);
          return;
        }

        // Don't make zero-length commands.
        if (!isCommandNameStartCharacter(*TokenPtr)) {
          formTextToken(T, TokenPtr);
          return;
        }

        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
        // Length of the command name, not counting the marker character.
        unsigned Length = TokenPtr - (BufferPtr + 1);

        // Hardcoded support for lexing LaTeX formula commands
        // \f$ \f[ \f] \f{ \f} as a single command.
        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
          C = *TokenPtr;
          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
            TokenPtr++;
            Length++;
          }
        }

        const StringRef CommandName(BufferPtr + 1, Length);

        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
        if (!Info) {
          // Unknown name: either accept a typo correction (with a warning and
          // a fix-it) or emit an unknown-command token.
          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
            StringRef CorrectedName = Info->Name;
            SourceLocation Loc = getSourceLocation(BufferPtr);
            SourceRange CommandRange(Loc.getLocWithOffset(1),
                                     getSourceLocation(TokenPtr));
            Diag(Loc, diag::warn_correct_comment_command_name)
              << CommandName << CorrectedName
              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
          } else {
            formTokenWithChars(T, TokenPtr, tok::unknown_command);
            T.setUnknownCommandName(CommandName);
            Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
            return;
          }
        }
        // From here on Info is non-null (found directly or by correction).
        if (Info->IsVerbatimBlockCommand) {
          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
          return;
        }
        if (Info->IsVerbatimLineCommand) {
          setupAndLexVerbatimLine(T, TokenPtr, Info);
          return;
        }
        formTokenWithChars(T, TokenPtr, CommandKind);
        T.setCommandID(Info->getID());
        return;
      }

      case '&':
        lexHTMLCharacterReference(T);
        return;

      case '<': {
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        // Decide between a start tag, an end tag and a literal '<'.
        const char C = *TokenPtr;
        if (isHTMLIdentifierStartingCharacter(C))
          setupAndLexHTMLStartTag(T);
        else if (C == '/')
          setupAndLexHTMLEndTag(T);
        else
          formTextToken(T, TokenPtr);

        return;
      }

      case '\n':
      case '\r':
        TokenPtr = skipNewline(TokenPtr, CommentEnd);
        formTokenWithChars(T, TokenPtr, tok::newline);

        // In C comments the next line may start with " * " decoration.
        if (CommentState == LCS_InsideCComment)
          skipLineStartingDecorations();
        return;

      default: {
        // Plain text: extend up to the next character that starts some other
        // kind of token, or to the end of the comment.
        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
                         find_first_of("\n\r\\@&<");
        if (End != StringRef::npos)
          TokenPtr += End;
        else
          TokenPtr = CommentEnd;
        formTextToken(T, TokenPtr);
        return;
      }
    }
  }
}
427239313Sdim
428239313Sdimvoid Lexer::setupAndLexVerbatimBlock(Token &T,
429239313Sdim                                     const char *TextBegin,
430243830Sdim                                     char Marker, const CommandInfo *Info) {
431243830Sdim  assert(Info->IsVerbatimBlockCommand);
432243830Sdim
433239313Sdim  VerbatimBlockEndCommandName.clear();
434239313Sdim  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
435243830Sdim  VerbatimBlockEndCommandName.append(Info->EndCommandName);
436239313Sdim
437239313Sdim  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
438243830Sdim  T.setVerbatimBlockID(Info->getID());
439239313Sdim
440239313Sdim  // If there is a newline following the verbatim opening command, skip the
441239313Sdim  // newline so that we don't create an tok::verbatim_block_line with empty
442239313Sdim  // text content.
443249423Sdim  if (BufferPtr != CommentEnd &&
444249423Sdim      isVerticalWhitespace(*BufferPtr)) {
445249423Sdim    BufferPtr = skipNewline(BufferPtr, CommentEnd);
446249423Sdim    State = LS_VerbatimBlockBody;
447249423Sdim    return;
448239313Sdim  }
449239313Sdim
450239313Sdim  State = LS_VerbatimBlockFirstLine;
451239313Sdim}
452239313Sdim
453239313Sdimvoid Lexer::lexVerbatimBlockFirstLine(Token &T) {
454239313Sdimagain:
455239313Sdim  assert(BufferPtr < CommentEnd);
456239313Sdim
457239313Sdim  // FIXME: It would be better to scan the text once, finding either the block
458239313Sdim  // end command or newline.
459239313Sdim  //
460239313Sdim  // Extract current line.
461239313Sdim  const char *Newline = findNewline(BufferPtr, CommentEnd);
462239313Sdim  StringRef Line(BufferPtr, Newline - BufferPtr);
463239313Sdim
464239313Sdim  // Look for end command in current line.
465239313Sdim  size_t Pos = Line.find(VerbatimBlockEndCommandName);
466239313Sdim  const char *TextEnd;
467239313Sdim  const char *NextLine;
468239313Sdim  if (Pos == StringRef::npos) {
469239313Sdim    // Current line is completely verbatim.
470239313Sdim    TextEnd = Newline;
471239313Sdim    NextLine = skipNewline(Newline, CommentEnd);
472239313Sdim  } else if (Pos == 0) {
473239313Sdim    // Current line contains just an end command.
474239313Sdim    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
475239313Sdim    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
476239313Sdim    formTokenWithChars(T, End, tok::verbatim_block_end);
477243830Sdim    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
478239313Sdim    State = LS_Normal;
479239313Sdim    return;
480239313Sdim  } else {
481239313Sdim    // There is some text, followed by end command.  Extract text first.
482239313Sdim    TextEnd = BufferPtr + Pos;
483239313Sdim    NextLine = TextEnd;
484239313Sdim    // If there is only whitespace before end command, skip whitespace.
485239313Sdim    if (isWhitespace(BufferPtr, TextEnd)) {
486239313Sdim      BufferPtr = TextEnd;
487239313Sdim      goto again;
488239313Sdim    }
489239313Sdim  }
490239313Sdim
491239313Sdim  StringRef Text(BufferPtr, TextEnd - BufferPtr);
492239313Sdim  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
493239313Sdim  T.setVerbatimBlockText(Text);
494239313Sdim
495239313Sdim  State = LS_VerbatimBlockBody;
496239313Sdim}
497239313Sdim
498239313Sdimvoid Lexer::lexVerbatimBlockBody(Token &T) {
499239313Sdim  assert(State == LS_VerbatimBlockBody);
500239313Sdim
501239313Sdim  if (CommentState == LCS_InsideCComment)
502239313Sdim    skipLineStartingDecorations();
503239313Sdim
504239313Sdim  lexVerbatimBlockFirstLine(T);
505239313Sdim}
506239313Sdim
507243830Sdimvoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
508243830Sdim                                    const CommandInfo *Info) {
509243830Sdim  assert(Info->IsVerbatimLineCommand);
510239313Sdim  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
511243830Sdim  T.setVerbatimLineID(Info->getID());
512239313Sdim
513239313Sdim  State = LS_VerbatimLineText;
514239313Sdim}
515239313Sdim
516239313Sdimvoid Lexer::lexVerbatimLineText(Token &T) {
517239313Sdim  assert(State == LS_VerbatimLineText);
518239313Sdim
519239313Sdim  // Extract current line.
520239313Sdim  const char *Newline = findNewline(BufferPtr, CommentEnd);
521239313Sdim  const StringRef Text(BufferPtr, Newline - BufferPtr);
522239313Sdim  formTokenWithChars(T, Newline, tok::verbatim_line_text);
523239313Sdim  T.setVerbatimLineText(Text);
524239313Sdim
525239313Sdim  State = LS_Normal;
526239313Sdim}
527239313Sdim
528239313Sdimvoid Lexer::lexHTMLCharacterReference(Token &T) {
529239313Sdim  const char *TokenPtr = BufferPtr;
530239313Sdim  assert(*TokenPtr == '&');
531239313Sdim  TokenPtr++;
532239313Sdim  if (TokenPtr == CommentEnd) {
533239313Sdim    formTextToken(T, TokenPtr);
534239313Sdim    return;
535239313Sdim  }
536239313Sdim  const char *NamePtr;
537239313Sdim  bool isNamed = false;
538239313Sdim  bool isDecimal = false;
539239313Sdim  char C = *TokenPtr;
540239313Sdim  if (isHTMLNamedCharacterReferenceCharacter(C)) {
541239313Sdim    NamePtr = TokenPtr;
542239313Sdim    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
543239313Sdim    isNamed = true;
544239313Sdim  } else if (C == '#') {
545239313Sdim    TokenPtr++;
546239313Sdim    if (TokenPtr == CommentEnd) {
547239313Sdim      formTextToken(T, TokenPtr);
548239313Sdim      return;
549239313Sdim    }
550239313Sdim    C = *TokenPtr;
551239313Sdim    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
552239313Sdim      NamePtr = TokenPtr;
553239313Sdim      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
554239313Sdim      isDecimal = true;
555239313Sdim    } else if (C == 'x' || C == 'X') {
556239313Sdim      TokenPtr++;
557239313Sdim      NamePtr = TokenPtr;
558239313Sdim      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
559239313Sdim    } else {
560239313Sdim      formTextToken(T, TokenPtr);
561239313Sdim      return;
562239313Sdim    }
563239313Sdim  } else {
564239313Sdim    formTextToken(T, TokenPtr);
565239313Sdim    return;
566239313Sdim  }
567239313Sdim  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
568239313Sdim      *TokenPtr != ';') {
569239313Sdim    formTextToken(T, TokenPtr);
570239313Sdim    return;
571239313Sdim  }
572239313Sdim  StringRef Name(NamePtr, TokenPtr - NamePtr);
573239313Sdim  TokenPtr++; // Skip semicolon.
574239313Sdim  StringRef Resolved;
575239313Sdim  if (isNamed)
576239313Sdim    Resolved = resolveHTMLNamedCharacterReference(Name);
577239313Sdim  else if (isDecimal)
578239313Sdim    Resolved = resolveHTMLDecimalCharacterReference(Name);
579239313Sdim  else
580239313Sdim    Resolved = resolveHTMLHexCharacterReference(Name);
581239313Sdim
582239313Sdim  if (Resolved.empty()) {
583239313Sdim    formTextToken(T, TokenPtr);
584239313Sdim    return;
585239313Sdim  }
586239313Sdim  formTokenWithChars(T, TokenPtr, tok::text);
587239313Sdim  T.setText(Resolved);
588239313Sdim  return;
589239313Sdim}
590239313Sdim
591239313Sdimvoid Lexer::setupAndLexHTMLStartTag(Token &T) {
592239313Sdim  assert(BufferPtr[0] == '<' &&
593239313Sdim         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
594239313Sdim  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
595243830Sdim  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
596243830Sdim  if (!isHTMLTagName(Name)) {
597243830Sdim    formTextToken(T, TagNameEnd);
598243830Sdim    return;
599243830Sdim  }
600239313Sdim
601239313Sdim  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
602239313Sdim  T.setHTMLTagStartName(Name);
603239313Sdim
604239313Sdim  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
605239313Sdim
606239313Sdim  const char C = *BufferPtr;
607239313Sdim  if (BufferPtr != CommentEnd &&
608239313Sdim      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
609239313Sdim    State = LS_HTMLStartTag;
610239313Sdim}
611239313Sdim
612239313Sdimvoid Lexer::lexHTMLStartTag(Token &T) {
613239313Sdim  assert(State == LS_HTMLStartTag);
614239313Sdim
615239313Sdim  const char *TokenPtr = BufferPtr;
616239313Sdim  char C = *TokenPtr;
617239313Sdim  if (isHTMLIdentifierCharacter(C)) {
618239313Sdim    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
619239313Sdim    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
620239313Sdim    formTokenWithChars(T, TokenPtr, tok::html_ident);
621239313Sdim    T.setHTMLIdent(Ident);
622239313Sdim  } else {
623239313Sdim    switch (C) {
624239313Sdim    case '=':
625239313Sdim      TokenPtr++;
626239313Sdim      formTokenWithChars(T, TokenPtr, tok::html_equals);
627239313Sdim      break;
628239313Sdim    case '\"':
629239313Sdim    case '\'': {
630239313Sdim      const char *OpenQuote = TokenPtr;
631239313Sdim      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
632239313Sdim      const char *ClosingQuote = TokenPtr;
633239313Sdim      if (TokenPtr != CommentEnd) // Skip closing quote.
634239313Sdim        TokenPtr++;
635239313Sdim      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
636239313Sdim      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
637239313Sdim                                      ClosingQuote - (OpenQuote + 1)));
638239313Sdim      break;
639239313Sdim    }
640239313Sdim    case '>':
641239313Sdim      TokenPtr++;
642239313Sdim      formTokenWithChars(T, TokenPtr, tok::html_greater);
643239313Sdim      State = LS_Normal;
644239313Sdim      return;
645239313Sdim    case '/':
646239313Sdim      TokenPtr++;
647239313Sdim      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
648239313Sdim        TokenPtr++;
649239313Sdim        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
650239313Sdim      } else
651239313Sdim        formTextToken(T, TokenPtr);
652239313Sdim
653239313Sdim      State = LS_Normal;
654239313Sdim      return;
655239313Sdim    }
656239313Sdim  }
657239313Sdim
658239313Sdim  // Now look ahead and return to normal state if we don't see any HTML tokens
659239313Sdim  // ahead.
660239313Sdim  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
661239313Sdim  if (BufferPtr == CommentEnd) {
662239313Sdim    State = LS_Normal;
663239313Sdim    return;
664239313Sdim  }
665239313Sdim
666239313Sdim  C = *BufferPtr;
667239313Sdim  if (!isHTMLIdentifierStartingCharacter(C) &&
668239313Sdim      C != '=' && C != '\"' && C != '\'' && C != '>') {
669239313Sdim    State = LS_Normal;
670239313Sdim    return;
671239313Sdim  }
672239313Sdim}
673239313Sdim
674239313Sdimvoid Lexer::setupAndLexHTMLEndTag(Token &T) {
675239313Sdim  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
676239313Sdim
677239313Sdim  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
678239313Sdim  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
679243830Sdim  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
680243830Sdim  if (!isHTMLTagName(Name)) {
681243830Sdim    formTextToken(T, TagNameEnd);
682243830Sdim    return;
683243830Sdim  }
684239313Sdim
685239313Sdim  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
686239313Sdim
687239313Sdim  formTokenWithChars(T, End, tok::html_end_tag);
688243830Sdim  T.setHTMLTagEndName(Name);
689239313Sdim
690239313Sdim  if (BufferPtr != CommentEnd && *BufferPtr == '>')
691239313Sdim    State = LS_HTMLEndTag;
692239313Sdim}
693239313Sdim
694239313Sdimvoid Lexer::lexHTMLEndTag(Token &T) {
695239313Sdim  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
696239313Sdim
697239313Sdim  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
698239313Sdim  State = LS_Normal;
699239313Sdim}
700239313Sdim
701251662SdimLexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
702251662Sdim             const CommandTraits &Traits,
703243830Sdim             SourceLocation FileLoc,
704239313Sdim             const char *BufferStart, const char *BufferEnd):
705251662Sdim    Allocator(Allocator), Diags(Diags), Traits(Traits),
706239313Sdim    BufferStart(BufferStart), BufferEnd(BufferEnd),
707243830Sdim    FileLoc(FileLoc), BufferPtr(BufferStart),
708239313Sdim    CommentState(LCS_BeforeComment), State(LS_Normal) {
709239313Sdim}
710239313Sdim
/// Produce the next token in \p T.
///
/// This is a state machine over CommentState:
///  - before a comment: skip the comment opener and Doxygen markers, compute
///    where the comment ends, then lex its contents;
///  - inside a comment: lex comment text until the end of the comment, then
///    (for C comments) skip the closing sequence and emit a newline token;
///  - between comments: turn the gap up to the next '/' into a single
///    newline token.
/// A tok::eof token is produced once the buffer is exhausted.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block state is kept when entering a BCPL comment;
      // every other state is reset to normal.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      // A '*' directly followed by '/' is the closing sequence of an empty
      // comment, not a marker, so it is left in place.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
812239313Sdim
813239313SdimStringRef Lexer::getSpelling(const Token &Tok,
814239313Sdim                             const SourceManager &SourceMgr,
815239313Sdim                             bool *Invalid) const {
816239313Sdim  SourceLocation Loc = Tok.getLocation();
817239313Sdim  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
818239313Sdim
819239313Sdim  bool InvalidTemp = false;
820239313Sdim  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
821239313Sdim  if (InvalidTemp) {
822239313Sdim    *Invalid = true;
823239313Sdim    return StringRef();
824239313Sdim  }
825239313Sdim
826239313Sdim  const char *Begin = File.data() + LocInfo.second;
827239313Sdim  return StringRef(Begin, Tok.getLength());
828239313Sdim}
829239313Sdim
830239313Sdim} // end namespace comments
831239313Sdim} // end namespace clang
832239313Sdim
833