CommentLexer.cpp revision 249423
1239313Sdim#include "clang/AST/CommentLexer.h"
2239313Sdim#include "clang/AST/CommentCommandTraits.h"
3249423Sdim#include "clang/Basic/CharInfo.h"
4249423Sdim#include "llvm/ADT/StringExtras.h"
5239313Sdim#include "llvm/ADT/StringSwitch.h"
6249423Sdim#include "llvm/Support/ConvertUTF.h"
7239313Sdim#include "llvm/Support/ErrorHandling.h"
8239313Sdim
9239313Sdimnamespace clang {
10239313Sdimnamespace comments {
11239313Sdim
12239313Sdimvoid Token::dump(const Lexer &L, const SourceManager &SM) const {
13239313Sdim  llvm::errs() << "comments::Token Kind=" << Kind << " ";
14239313Sdim  Loc.dump(SM);
15239313Sdim  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
16239313Sdim}
17239313Sdim
18249423Sdimstatic inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
19249423Sdim  return isLetter(C);
20239313Sdim}
21239313Sdim
22249423Sdimstatic inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
23249423Sdim  return isDigit(C);
24239313Sdim}
25239313Sdim
26249423Sdimstatic inline bool isHTMLHexCharacterReferenceCharacter(char C) {
27249423Sdim  return isHexDigit(C);
28239313Sdim}
29243830Sdim
30249423Sdimstatic inline StringRef convertCodePointToUTF8(
31249423Sdim                                      llvm::BumpPtrAllocator &Allocator,
32249423Sdim                                      unsigned CodePoint) {
33249423Sdim  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
34249423Sdim  char *ResolvedPtr = Resolved;
35249423Sdim  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
36249423Sdim    return StringRef(Resolved, ResolvedPtr - Resolved);
37249423Sdim  else
38249423Sdim    return StringRef();
39249423Sdim}
40249423Sdim
41249423Sdimnamespace {
42249423Sdim
43243830Sdim#include "clang/AST/CommentHTMLTags.inc"
44249423Sdim#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
45243830Sdim
46239313Sdim} // unnamed namespace
47239313Sdim
48239313SdimStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
49249423Sdim  // Fast path, first check a few most widely used named character references.
50239313Sdim  return llvm::StringSwitch<StringRef>(Name)
51239313Sdim      .Case("amp", "&")
52239313Sdim      .Case("lt", "<")
53239313Sdim      .Case("gt", ">")
54239313Sdim      .Case("quot", "\"")
55239313Sdim      .Case("apos", "\'")
56249423Sdim      // Slow path.
57249423Sdim      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
58239313Sdim}
59239313Sdim
60239313SdimStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
61239313Sdim  unsigned CodePoint = 0;
62239313Sdim  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
63239313Sdim    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
64239313Sdim    CodePoint *= 10;
65239313Sdim    CodePoint += Name[i] - '0';
66239313Sdim  }
67249423Sdim  return convertCodePointToUTF8(Allocator, CodePoint);
68239313Sdim}
69239313Sdim
70239313SdimStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
71239313Sdim  unsigned CodePoint = 0;
72239313Sdim  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
73239313Sdim    CodePoint *= 16;
74239313Sdim    const char C = Name[i];
75239313Sdim    assert(isHTMLHexCharacterReferenceCharacter(C));
76249423Sdim    CodePoint += llvm::hexDigitValue(C);
77239313Sdim  }
78249423Sdim  return convertCodePointToUTF8(Allocator, CodePoint);
79239313Sdim}
80239313Sdim
81239313Sdimvoid Lexer::skipLineStartingDecorations() {
82239313Sdim  // This function should be called only for C comments
83239313Sdim  assert(CommentState == LCS_InsideCComment);
84239313Sdim
85239313Sdim  if (BufferPtr == CommentEnd)
86239313Sdim    return;
87239313Sdim
88239313Sdim  switch (*BufferPtr) {
89239313Sdim  case ' ':
90239313Sdim  case '\t':
91239313Sdim  case '\f':
92239313Sdim  case '\v': {
93239313Sdim    const char *NewBufferPtr = BufferPtr;
94239313Sdim    NewBufferPtr++;
95239313Sdim    if (NewBufferPtr == CommentEnd)
96239313Sdim      return;
97239313Sdim
98239313Sdim    char C = *NewBufferPtr;
99249423Sdim    while (isHorizontalWhitespace(C)) {
100239313Sdim      NewBufferPtr++;
101239313Sdim      if (NewBufferPtr == CommentEnd)
102239313Sdim        return;
103239313Sdim      C = *NewBufferPtr;
104239313Sdim    }
105239313Sdim    if (C == '*')
106239313Sdim      BufferPtr = NewBufferPtr + 1;
107239313Sdim    break;
108239313Sdim  }
109239313Sdim  case '*':
110239313Sdim    BufferPtr++;
111239313Sdim    break;
112239313Sdim  }
113239313Sdim}
114239313Sdim
115239313Sdimnamespace {
116239313Sdim/// Returns pointer to the first newline character in the string.
117239313Sdimconst char *findNewline(const char *BufferPtr, const char *BufferEnd) {
118239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
119249423Sdim    if (isVerticalWhitespace(*BufferPtr))
120239313Sdim      return BufferPtr;
121239313Sdim  }
122239313Sdim  return BufferEnd;
123239313Sdim}
124239313Sdim
/// Skip one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd or point to '\n' or '\r'.
///
/// \returns a pointer just past the terminator, or \p BufferPtr unchanged if
/// it already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single line terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
139239313Sdim
140239313Sdimconst char *skipNamedCharacterReference(const char *BufferPtr,
141239313Sdim                                        const char *BufferEnd) {
142239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
143239313Sdim    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
144239313Sdim      return BufferPtr;
145239313Sdim  }
146239313Sdim  return BufferEnd;
147239313Sdim}
148239313Sdim
149239313Sdimconst char *skipDecimalCharacterReference(const char *BufferPtr,
150239313Sdim                                          const char *BufferEnd) {
151239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
152239313Sdim    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
153239313Sdim      return BufferPtr;
154239313Sdim  }
155239313Sdim  return BufferEnd;
156239313Sdim}
157239313Sdim
158239313Sdimconst char *skipHexCharacterReference(const char *BufferPtr,
159239313Sdim                                          const char *BufferEnd) {
160239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
161239313Sdim    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
162239313Sdim      return BufferPtr;
163239313Sdim  }
164239313Sdim  return BufferEnd;
165239313Sdim}
166239313Sdim
167239313Sdimbool isHTMLIdentifierStartingCharacter(char C) {
168249423Sdim  return isLetter(C);
169239313Sdim}
170239313Sdim
171239313Sdimbool isHTMLIdentifierCharacter(char C) {
172249423Sdim  return isAlphanumeric(C);
173239313Sdim}
174239313Sdim
175239313Sdimconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
176239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
177239313Sdim    if (!isHTMLIdentifierCharacter(*BufferPtr))
178239313Sdim      return BufferPtr;
179239313Sdim  }
180239313Sdim  return BufferEnd;
181239313Sdim}
182239313Sdim
/// Find the end of an HTML string quoted with single or double quotes.
///
/// \p BufferPtr must point to the opening quote.  A quote character that is
/// immediately preceded by a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  for (const char *Cursor = BufferPtr + 1; Cursor != BufferEnd; ++Cursor) {
    if (*Cursor != Quote)
      continue;
    // A quote right after a backslash is part of the string.
    if (Cursor[-1] == '\\')
      continue;
    return Cursor;
  }
  return BufferEnd;
}
200239313Sdim
201239313Sdimconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
202239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
203239313Sdim    if (!isWhitespace(*BufferPtr))
204239313Sdim      return BufferPtr;
205239313Sdim  }
206239313Sdim  return BufferEnd;
207239313Sdim}
208239313Sdim
209239313Sdimbool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
210239313Sdim  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
211239313Sdim}
212239313Sdim
213243830Sdimbool isCommandNameStartCharacter(char C) {
214249423Sdim  return isLetter(C);
215243830Sdim}
216243830Sdim
217239313Sdimbool isCommandNameCharacter(char C) {
218249423Sdim  return isAlphanumeric(C);
219239313Sdim}
220239313Sdim
221239313Sdimconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
222239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
223239313Sdim    if (!isCommandNameCharacter(*BufferPtr))
224239313Sdim      return BufferPtr;
225239313Sdim  }
226239313Sdim  return BufferEnd;
227239313Sdim}
228239313Sdim
229239313Sdim/// Return the one past end pointer for BCPL comments.
230239313Sdim/// Handles newlines escaped with backslash or trigraph for backslahs.
231239313Sdimconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
232239313Sdim  const char *CurPtr = BufferPtr;
233239313Sdim  while (CurPtr != BufferEnd) {
234249423Sdim    while (!isVerticalWhitespace(*CurPtr)) {
235239313Sdim      CurPtr++;
236239313Sdim      if (CurPtr == BufferEnd)
237239313Sdim        return BufferEnd;
238239313Sdim    }
239239313Sdim    // We found a newline, check if it is escaped.
240239313Sdim    const char *EscapePtr = CurPtr - 1;
241239313Sdim    while(isHorizontalWhitespace(*EscapePtr))
242239313Sdim      EscapePtr--;
243239313Sdim
244239313Sdim    if (*EscapePtr == '\\' ||
245239313Sdim        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
246239313Sdim         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
247239313Sdim      // We found an escaped newline.
248239313Sdim      CurPtr = skipNewline(CurPtr, BufferEnd);
249239313Sdim    } else
250239313Sdim      return CurPtr; // Not an escaped newline.
251239313Sdim  }
252239313Sdim  return BufferEnd;
253239313Sdim}
254239313Sdim
255239313Sdim/// Return the one past end pointer for C comments.
256239313Sdim/// Very dumb, does not handle escaped newlines or trigraphs.
257239313Sdimconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
258239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
259239313Sdim    if (*BufferPtr == '*') {
260239313Sdim      assert(BufferPtr + 1 != BufferEnd);
261239313Sdim      if (*(BufferPtr + 1) == '/')
262239313Sdim        return BufferPtr;
263239313Sdim    }
264239313Sdim  }
265239313Sdim  llvm_unreachable("buffer end hit before '*/' was seen");
266239313Sdim}
267239313Sdim} // unnamed namespace
268239313Sdim
269239313Sdimvoid Lexer::lexCommentText(Token &T) {
270239313Sdim  assert(CommentState == LCS_InsideBCPLComment ||
271239313Sdim         CommentState == LCS_InsideCComment);
272239313Sdim
273239313Sdim  switch (State) {
274239313Sdim  case LS_Normal:
275239313Sdim    break;
276239313Sdim  case LS_VerbatimBlockFirstLine:
277239313Sdim    lexVerbatimBlockFirstLine(T);
278239313Sdim    return;
279239313Sdim  case LS_VerbatimBlockBody:
280239313Sdim    lexVerbatimBlockBody(T);
281239313Sdim    return;
282239313Sdim  case LS_VerbatimLineText:
283239313Sdim    lexVerbatimLineText(T);
284239313Sdim    return;
285239313Sdim  case LS_HTMLStartTag:
286239313Sdim    lexHTMLStartTag(T);
287239313Sdim    return;
288239313Sdim  case LS_HTMLEndTag:
289239313Sdim    lexHTMLEndTag(T);
290239313Sdim    return;
291239313Sdim  }
292239313Sdim
293239313Sdim  assert(State == LS_Normal);
294239313Sdim
295239313Sdim  const char *TokenPtr = BufferPtr;
296239313Sdim  assert(TokenPtr < CommentEnd);
297239313Sdim  while (TokenPtr != CommentEnd) {
298239313Sdim    switch(*TokenPtr) {
299239313Sdim      case '\\':
300239313Sdim      case '@': {
301249423Sdim        // Commands that start with a backslash and commands that start with
302249423Sdim        // 'at' have equivalent semantics.  But we keep information about the
303249423Sdim        // exact syntax in AST for comments.
304249423Sdim        tok::TokenKind CommandKind =
305249423Sdim            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
306239313Sdim        TokenPtr++;
307239313Sdim        if (TokenPtr == CommentEnd) {
308239313Sdim          formTextToken(T, TokenPtr);
309239313Sdim          return;
310239313Sdim        }
311239313Sdim        char C = *TokenPtr;
312239313Sdim        switch (C) {
313239313Sdim        default:
314239313Sdim          break;
315239313Sdim
316239313Sdim        case '\\': case '@': case '&': case '$':
317239313Sdim        case '#':  case '<': case '>': case '%':
318239313Sdim        case '\"': case '.': case ':':
319239313Sdim          // This is one of \\ \@ \& \$ etc escape sequences.
320239313Sdim          TokenPtr++;
321239313Sdim          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
322239313Sdim            // This is the \:: escape sequence.
323239313Sdim            TokenPtr++;
324239313Sdim          }
325239313Sdim          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
326239313Sdim          formTokenWithChars(T, TokenPtr, tok::text);
327239313Sdim          T.setText(UnescapedText);
328239313Sdim          return;
329239313Sdim        }
330239313Sdim
331239313Sdim        // Don't make zero-length commands.
332243830Sdim        if (!isCommandNameStartCharacter(*TokenPtr)) {
333239313Sdim          formTextToken(T, TokenPtr);
334239313Sdim          return;
335239313Sdim        }
336239313Sdim
337239313Sdim        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
338239313Sdim        unsigned Length = TokenPtr - (BufferPtr + 1);
339239313Sdim
340239313Sdim        // Hardcoded support for lexing LaTeX formula commands
341239313Sdim        // \f$ \f[ \f] \f{ \f} as a single command.
342239313Sdim        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
343239313Sdim          C = *TokenPtr;
344239313Sdim          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
345239313Sdim            TokenPtr++;
346239313Sdim            Length++;
347239313Sdim          }
348239313Sdim        }
349239313Sdim
350239313Sdim        const StringRef CommandName(BufferPtr + 1, Length);
351239313Sdim
352243830Sdim        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
353243830Sdim        if (!Info) {
354243830Sdim          formTokenWithChars(T, TokenPtr, tok::unknown_command);
355243830Sdim          T.setUnknownCommandName(CommandName);
356239313Sdim          return;
357239313Sdim        }
358243830Sdim        if (Info->IsVerbatimBlockCommand) {
359243830Sdim          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
360239313Sdim          return;
361239313Sdim        }
362243830Sdim        if (Info->IsVerbatimLineCommand) {
363243830Sdim          setupAndLexVerbatimLine(T, TokenPtr, Info);
364243830Sdim          return;
365243830Sdim        }
366249423Sdim        formTokenWithChars(T, TokenPtr, CommandKind);
367243830Sdim        T.setCommandID(Info->getID());
368239313Sdim        return;
369239313Sdim      }
370239313Sdim
371239313Sdim      case '&':
372239313Sdim        lexHTMLCharacterReference(T);
373239313Sdim        return;
374239313Sdim
375239313Sdim      case '<': {
376239313Sdim        TokenPtr++;
377239313Sdim        if (TokenPtr == CommentEnd) {
378239313Sdim          formTextToken(T, TokenPtr);
379239313Sdim          return;
380239313Sdim        }
381239313Sdim        const char C = *TokenPtr;
382239313Sdim        if (isHTMLIdentifierStartingCharacter(C))
383239313Sdim          setupAndLexHTMLStartTag(T);
384239313Sdim        else if (C == '/')
385239313Sdim          setupAndLexHTMLEndTag(T);
386239313Sdim        else
387239313Sdim          formTextToken(T, TokenPtr);
388239313Sdim
389239313Sdim        return;
390239313Sdim      }
391239313Sdim
392239313Sdim      case '\n':
393239313Sdim      case '\r':
394239313Sdim        TokenPtr = skipNewline(TokenPtr, CommentEnd);
395239313Sdim        formTokenWithChars(T, TokenPtr, tok::newline);
396239313Sdim
397239313Sdim        if (CommentState == LCS_InsideCComment)
398239313Sdim          skipLineStartingDecorations();
399239313Sdim        return;
400239313Sdim
401239313Sdim      default: {
402249423Sdim        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
403249423Sdim                         find_first_of("\n\r\\@&<");
404249423Sdim        if (End != StringRef::npos)
405249423Sdim          TokenPtr += End;
406249423Sdim        else
407249423Sdim          TokenPtr = CommentEnd;
408239313Sdim        formTextToken(T, TokenPtr);
409239313Sdim        return;
410239313Sdim      }
411239313Sdim    }
412239313Sdim  }
413239313Sdim}
414239313Sdim
415239313Sdimvoid Lexer::setupAndLexVerbatimBlock(Token &T,
416239313Sdim                                     const char *TextBegin,
417243830Sdim                                     char Marker, const CommandInfo *Info) {
418243830Sdim  assert(Info->IsVerbatimBlockCommand);
419243830Sdim
420239313Sdim  VerbatimBlockEndCommandName.clear();
421239313Sdim  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
422243830Sdim  VerbatimBlockEndCommandName.append(Info->EndCommandName);
423239313Sdim
424239313Sdim  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
425243830Sdim  T.setVerbatimBlockID(Info->getID());
426239313Sdim
427239313Sdim  // If there is a newline following the verbatim opening command, skip the
428239313Sdim  // newline so that we don't create an tok::verbatim_block_line with empty
429239313Sdim  // text content.
430249423Sdim  if (BufferPtr != CommentEnd &&
431249423Sdim      isVerticalWhitespace(*BufferPtr)) {
432249423Sdim    BufferPtr = skipNewline(BufferPtr, CommentEnd);
433249423Sdim    State = LS_VerbatimBlockBody;
434249423Sdim    return;
435239313Sdim  }
436239313Sdim
437239313Sdim  State = LS_VerbatimBlockFirstLine;
438239313Sdim}
439239313Sdim
440239313Sdimvoid Lexer::lexVerbatimBlockFirstLine(Token &T) {
441239313Sdimagain:
442239313Sdim  assert(BufferPtr < CommentEnd);
443239313Sdim
444239313Sdim  // FIXME: It would be better to scan the text once, finding either the block
445239313Sdim  // end command or newline.
446239313Sdim  //
447239313Sdim  // Extract current line.
448239313Sdim  const char *Newline = findNewline(BufferPtr, CommentEnd);
449239313Sdim  StringRef Line(BufferPtr, Newline - BufferPtr);
450239313Sdim
451239313Sdim  // Look for end command in current line.
452239313Sdim  size_t Pos = Line.find(VerbatimBlockEndCommandName);
453239313Sdim  const char *TextEnd;
454239313Sdim  const char *NextLine;
455239313Sdim  if (Pos == StringRef::npos) {
456239313Sdim    // Current line is completely verbatim.
457239313Sdim    TextEnd = Newline;
458239313Sdim    NextLine = skipNewline(Newline, CommentEnd);
459239313Sdim  } else if (Pos == 0) {
460239313Sdim    // Current line contains just an end command.
461239313Sdim    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
462239313Sdim    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
463239313Sdim    formTokenWithChars(T, End, tok::verbatim_block_end);
464243830Sdim    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
465239313Sdim    State = LS_Normal;
466239313Sdim    return;
467239313Sdim  } else {
468239313Sdim    // There is some text, followed by end command.  Extract text first.
469239313Sdim    TextEnd = BufferPtr + Pos;
470239313Sdim    NextLine = TextEnd;
471239313Sdim    // If there is only whitespace before end command, skip whitespace.
472239313Sdim    if (isWhitespace(BufferPtr, TextEnd)) {
473239313Sdim      BufferPtr = TextEnd;
474239313Sdim      goto again;
475239313Sdim    }
476239313Sdim  }
477239313Sdim
478239313Sdim  StringRef Text(BufferPtr, TextEnd - BufferPtr);
479239313Sdim  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
480239313Sdim  T.setVerbatimBlockText(Text);
481239313Sdim
482239313Sdim  State = LS_VerbatimBlockBody;
483239313Sdim}
484239313Sdim
485239313Sdimvoid Lexer::lexVerbatimBlockBody(Token &T) {
486239313Sdim  assert(State == LS_VerbatimBlockBody);
487239313Sdim
488239313Sdim  if (CommentState == LCS_InsideCComment)
489239313Sdim    skipLineStartingDecorations();
490239313Sdim
491239313Sdim  lexVerbatimBlockFirstLine(T);
492239313Sdim}
493239313Sdim
494243830Sdimvoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
495243830Sdim                                    const CommandInfo *Info) {
496243830Sdim  assert(Info->IsVerbatimLineCommand);
497239313Sdim  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
498243830Sdim  T.setVerbatimLineID(Info->getID());
499239313Sdim
500239313Sdim  State = LS_VerbatimLineText;
501239313Sdim}
502239313Sdim
503239313Sdimvoid Lexer::lexVerbatimLineText(Token &T) {
504239313Sdim  assert(State == LS_VerbatimLineText);
505239313Sdim
506239313Sdim  // Extract current line.
507239313Sdim  const char *Newline = findNewline(BufferPtr, CommentEnd);
508239313Sdim  const StringRef Text(BufferPtr, Newline - BufferPtr);
509239313Sdim  formTokenWithChars(T, Newline, tok::verbatim_line_text);
510239313Sdim  T.setVerbatimLineText(Text);
511239313Sdim
512239313Sdim  State = LS_Normal;
513239313Sdim}
514239313Sdim
515239313Sdimvoid Lexer::lexHTMLCharacterReference(Token &T) {
516239313Sdim  const char *TokenPtr = BufferPtr;
517239313Sdim  assert(*TokenPtr == '&');
518239313Sdim  TokenPtr++;
519239313Sdim  if (TokenPtr == CommentEnd) {
520239313Sdim    formTextToken(T, TokenPtr);
521239313Sdim    return;
522239313Sdim  }
523239313Sdim  const char *NamePtr;
524239313Sdim  bool isNamed = false;
525239313Sdim  bool isDecimal = false;
526239313Sdim  char C = *TokenPtr;
527239313Sdim  if (isHTMLNamedCharacterReferenceCharacter(C)) {
528239313Sdim    NamePtr = TokenPtr;
529239313Sdim    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
530239313Sdim    isNamed = true;
531239313Sdim  } else if (C == '#') {
532239313Sdim    TokenPtr++;
533239313Sdim    if (TokenPtr == CommentEnd) {
534239313Sdim      formTextToken(T, TokenPtr);
535239313Sdim      return;
536239313Sdim    }
537239313Sdim    C = *TokenPtr;
538239313Sdim    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
539239313Sdim      NamePtr = TokenPtr;
540239313Sdim      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
541239313Sdim      isDecimal = true;
542239313Sdim    } else if (C == 'x' || C == 'X') {
543239313Sdim      TokenPtr++;
544239313Sdim      NamePtr = TokenPtr;
545239313Sdim      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
546239313Sdim    } else {
547239313Sdim      formTextToken(T, TokenPtr);
548239313Sdim      return;
549239313Sdim    }
550239313Sdim  } else {
551239313Sdim    formTextToken(T, TokenPtr);
552239313Sdim    return;
553239313Sdim  }
554239313Sdim  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
555239313Sdim      *TokenPtr != ';') {
556239313Sdim    formTextToken(T, TokenPtr);
557239313Sdim    return;
558239313Sdim  }
559239313Sdim  StringRef Name(NamePtr, TokenPtr - NamePtr);
560239313Sdim  TokenPtr++; // Skip semicolon.
561239313Sdim  StringRef Resolved;
562239313Sdim  if (isNamed)
563239313Sdim    Resolved = resolveHTMLNamedCharacterReference(Name);
564239313Sdim  else if (isDecimal)
565239313Sdim    Resolved = resolveHTMLDecimalCharacterReference(Name);
566239313Sdim  else
567239313Sdim    Resolved = resolveHTMLHexCharacterReference(Name);
568239313Sdim
569239313Sdim  if (Resolved.empty()) {
570239313Sdim    formTextToken(T, TokenPtr);
571239313Sdim    return;
572239313Sdim  }
573239313Sdim  formTokenWithChars(T, TokenPtr, tok::text);
574239313Sdim  T.setText(Resolved);
575239313Sdim  return;
576239313Sdim}
577239313Sdim
578239313Sdimvoid Lexer::setupAndLexHTMLStartTag(Token &T) {
579239313Sdim  assert(BufferPtr[0] == '<' &&
580239313Sdim         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
581239313Sdim  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
582243830Sdim  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
583243830Sdim  if (!isHTMLTagName(Name)) {
584243830Sdim    formTextToken(T, TagNameEnd);
585243830Sdim    return;
586243830Sdim  }
587239313Sdim
588239313Sdim  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
589239313Sdim  T.setHTMLTagStartName(Name);
590239313Sdim
591239313Sdim  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
592239313Sdim
593239313Sdim  const char C = *BufferPtr;
594239313Sdim  if (BufferPtr != CommentEnd &&
595239313Sdim      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
596239313Sdim    State = LS_HTMLStartTag;
597239313Sdim}
598239313Sdim
599239313Sdimvoid Lexer::lexHTMLStartTag(Token &T) {
600239313Sdim  assert(State == LS_HTMLStartTag);
601239313Sdim
602239313Sdim  const char *TokenPtr = BufferPtr;
603239313Sdim  char C = *TokenPtr;
604239313Sdim  if (isHTMLIdentifierCharacter(C)) {
605239313Sdim    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
606239313Sdim    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
607239313Sdim    formTokenWithChars(T, TokenPtr, tok::html_ident);
608239313Sdim    T.setHTMLIdent(Ident);
609239313Sdim  } else {
610239313Sdim    switch (C) {
611239313Sdim    case '=':
612239313Sdim      TokenPtr++;
613239313Sdim      formTokenWithChars(T, TokenPtr, tok::html_equals);
614239313Sdim      break;
615239313Sdim    case '\"':
616239313Sdim    case '\'': {
617239313Sdim      const char *OpenQuote = TokenPtr;
618239313Sdim      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
619239313Sdim      const char *ClosingQuote = TokenPtr;
620239313Sdim      if (TokenPtr != CommentEnd) // Skip closing quote.
621239313Sdim        TokenPtr++;
622239313Sdim      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
623239313Sdim      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
624239313Sdim                                      ClosingQuote - (OpenQuote + 1)));
625239313Sdim      break;
626239313Sdim    }
627239313Sdim    case '>':
628239313Sdim      TokenPtr++;
629239313Sdim      formTokenWithChars(T, TokenPtr, tok::html_greater);
630239313Sdim      State = LS_Normal;
631239313Sdim      return;
632239313Sdim    case '/':
633239313Sdim      TokenPtr++;
634239313Sdim      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
635239313Sdim        TokenPtr++;
636239313Sdim        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
637239313Sdim      } else
638239313Sdim        formTextToken(T, TokenPtr);
639239313Sdim
640239313Sdim      State = LS_Normal;
641239313Sdim      return;
642239313Sdim    }
643239313Sdim  }
644239313Sdim
645239313Sdim  // Now look ahead and return to normal state if we don't see any HTML tokens
646239313Sdim  // ahead.
647239313Sdim  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
648239313Sdim  if (BufferPtr == CommentEnd) {
649239313Sdim    State = LS_Normal;
650239313Sdim    return;
651239313Sdim  }
652239313Sdim
653239313Sdim  C = *BufferPtr;
654239313Sdim  if (!isHTMLIdentifierStartingCharacter(C) &&
655239313Sdim      C != '=' && C != '\"' && C != '\'' && C != '>') {
656239313Sdim    State = LS_Normal;
657239313Sdim    return;
658239313Sdim  }
659239313Sdim}
660239313Sdim
661239313Sdimvoid Lexer::setupAndLexHTMLEndTag(Token &T) {
662239313Sdim  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
663239313Sdim
664239313Sdim  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
665239313Sdim  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
666243830Sdim  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
667243830Sdim  if (!isHTMLTagName(Name)) {
668243830Sdim    formTextToken(T, TagNameEnd);
669243830Sdim    return;
670243830Sdim  }
671239313Sdim
672239313Sdim  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
673239313Sdim
674239313Sdim  formTokenWithChars(T, End, tok::html_end_tag);
675243830Sdim  T.setHTMLTagEndName(Name);
676239313Sdim
677239313Sdim  if (BufferPtr != CommentEnd && *BufferPtr == '>')
678239313Sdim    State = LS_HTMLEndTag;
679239313Sdim}
680239313Sdim
681239313Sdimvoid Lexer::lexHTMLEndTag(Token &T) {
682239313Sdim  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
683239313Sdim
684239313Sdim  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
685239313Sdim  State = LS_Normal;
686239313Sdim}
687239313Sdim
688239313SdimLexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
689243830Sdim             SourceLocation FileLoc,
690239313Sdim             const char *BufferStart, const char *BufferEnd):
691239313Sdim    Allocator(Allocator), Traits(Traits),
692239313Sdim    BufferStart(BufferStart), BufferEnd(BufferEnd),
693243830Sdim    FileLoc(FileLoc), BufferPtr(BufferStart),
694239313Sdim    CommentState(LCS_BeforeComment), State(LS_Normal) {
695239313Sdim}
696239313Sdim
697239313Sdimvoid Lexer::lex(Token &T) {
698239313Sdimagain:
699239313Sdim  switch (CommentState) {
700239313Sdim  case LCS_BeforeComment:
701239313Sdim    if (BufferPtr == BufferEnd) {
702239313Sdim      formTokenWithChars(T, BufferPtr, tok::eof);
703239313Sdim      return;
704239313Sdim    }
705239313Sdim
706239313Sdim    assert(*BufferPtr == '/');
707239313Sdim    BufferPtr++; // Skip first slash.
708239313Sdim    switch(*BufferPtr) {
709239313Sdim    case '/': { // BCPL comment.
710239313Sdim      BufferPtr++; // Skip second slash.
711239313Sdim
712239313Sdim      if (BufferPtr != BufferEnd) {
713239313Sdim        // Skip Doxygen magic marker, if it is present.
714239313Sdim        // It might be missing because of a typo //< or /*<, or because we
715239313Sdim        // merged this non-Doxygen comment into a bunch of Doxygen comments
716239313Sdim        // around it: /** ... */ /* ... */ /** ... */
717239313Sdim        const char C = *BufferPtr;
718239313Sdim        if (C == '/' || C == '!')
719239313Sdim          BufferPtr++;
720239313Sdim      }
721239313Sdim
722239313Sdim      // Skip less-than symbol that marks trailing comments.
723239313Sdim      // Skip it even if the comment is not a Doxygen one, because //< and /*<
724239313Sdim      // are frequent typos.
725239313Sdim      if (BufferPtr != BufferEnd && *BufferPtr == '<')
726239313Sdim        BufferPtr++;
727239313Sdim
728239313Sdim      CommentState = LCS_InsideBCPLComment;
729239313Sdim      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
730239313Sdim        State = LS_Normal;
731239313Sdim      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
732239313Sdim      goto again;
733239313Sdim    }
734239313Sdim    case '*': { // C comment.
735239313Sdim      BufferPtr++; // Skip star.
736239313Sdim
737239313Sdim      // Skip Doxygen magic marker.
738239313Sdim      const char C = *BufferPtr;
739239313Sdim      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
740239313Sdim        BufferPtr++;
741239313Sdim
742239313Sdim      // Skip less-than symbol that marks trailing comments.
743239313Sdim      if (BufferPtr != BufferEnd && *BufferPtr == '<')
744239313Sdim        BufferPtr++;
745239313Sdim
746239313Sdim      CommentState = LCS_InsideCComment;
747239313Sdim      State = LS_Normal;
748239313Sdim      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
749239313Sdim      goto again;
750239313Sdim    }
751239313Sdim    default:
752239313Sdim      llvm_unreachable("second character of comment should be '/' or '*'");
753239313Sdim    }
754239313Sdim
755239313Sdim  case LCS_BetweenComments: {
756239313Sdim    // Consecutive comments are extracted only if there is only whitespace
757239313Sdim    // between them.  So we can search for the start of the next comment.
758239313Sdim    const char *EndWhitespace = BufferPtr;
759239313Sdim    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
760239313Sdim      EndWhitespace++;
761239313Sdim
762239313Sdim    // Turn any whitespace between comments (and there is only whitespace
763239313Sdim    // between them -- guaranteed by comment extraction) into a newline.  We
764239313Sdim    // have two newlines between C comments in total (first one was synthesized
765239313Sdim    // after a comment).
766239313Sdim    formTokenWithChars(T, EndWhitespace, tok::newline);
767239313Sdim
768239313Sdim    CommentState = LCS_BeforeComment;
769239313Sdim    break;
770239313Sdim  }
771239313Sdim
772239313Sdim  case LCS_InsideBCPLComment:
773239313Sdim  case LCS_InsideCComment:
774239313Sdim    if (BufferPtr != CommentEnd) {
775239313Sdim      lexCommentText(T);
776239313Sdim      break;
777239313Sdim    } else {
778239313Sdim      // Skip C comment closing sequence.
779239313Sdim      if (CommentState == LCS_InsideCComment) {
780239313Sdim        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
781239313Sdim        BufferPtr += 2;
782239313Sdim        assert(BufferPtr <= BufferEnd);
783239313Sdim
784239313Sdim        // Synthenize newline just after the C comment, regardless if there is
785239313Sdim        // actually a newline.
786239313Sdim        formTokenWithChars(T, BufferPtr, tok::newline);
787239313Sdim
788239313Sdim        CommentState = LCS_BetweenComments;
789239313Sdim        break;
790239313Sdim      } else {
791239313Sdim        // Don't synthesized a newline after BCPL comment.
792239313Sdim        CommentState = LCS_BetweenComments;
793239313Sdim        goto again;
794239313Sdim      }
795239313Sdim    }
796239313Sdim  }
797239313Sdim}
798239313Sdim
799239313SdimStringRef Lexer::getSpelling(const Token &Tok,
800239313Sdim                             const SourceManager &SourceMgr,
801239313Sdim                             bool *Invalid) const {
802239313Sdim  SourceLocation Loc = Tok.getLocation();
803239313Sdim  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
804239313Sdim
805239313Sdim  bool InvalidTemp = false;
806239313Sdim  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
807239313Sdim  if (InvalidTemp) {
808239313Sdim    *Invalid = true;
809239313Sdim    return StringRef();
810239313Sdim  }
811239313Sdim
812239313Sdim  const char *Begin = File.data() + LocInfo.second;
813239313Sdim  return StringRef(Begin, Tok.getLength());
814239313Sdim}
815239313Sdim
816239313Sdim} // end namespace comments
817239313Sdim} // end namespace clang
818239313Sdim
819